diff --git a/.classpath b/.classpath
index 8cc301584..3e4f29314 100644
--- a/.classpath
+++ b/.classpath
@@ -23,6 +23,7 @@
+	<classpathentry kind="lib" path="lib/xz-1.8.jar"/>
diff --git a/build.xml b/build.xml
index 5cc6aefb2..af41a8802 100644
--- a/build.xml
+++ b/build.xml
@@ -179,6 +179,7 @@
+	<pathelement location="${lib}/xz-1.8.jar" />
diff --git a/lib/xz-1.8.License b/lib/xz-1.8.License
new file mode 100644
index 000000000..c1d404dc7
--- /dev/null
+++ b/lib/xz-1.8.License
@@ -0,0 +1,10 @@
+
+Licensing of XZ for Java
+========================
+
+ All the files in this package have been written by Lasse Collin
+ and/or Igor Pavlov. All these files have been put into the
+ public domain. You can do whatever you want with these files.
+
+ This software is provided "as is", without any warranty.
+
diff --git a/lib/xz-1.8.jar b/lib/xz-1.8.jar
new file mode 100644
index 000000000..9931efa3e
Binary files /dev/null and b/lib/xz-1.8.jar differ
diff --git a/pom.xml b/pom.xml
index f7a4d2556..37bf57004 100644
--- a/pom.xml
+++ b/pom.xml
@@ -380,6 +380,12 @@
             <version>1.17</version>
             <type>jar</type>
         </dependency>
+
+        <dependency>
+            <groupId>org.tukaani</groupId>
+            <artifactId>xz</artifactId>
+            <version>1.8</version>
+        </dependency>
         <dependency>
             <groupId>commons-fileupload</groupId>
             <artifactId>commons-fileupload</artifactId>
diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java
index 4a7bc49b5..3ee564da2 100644
--- a/source/net/yacy/document/TextParser.java
+++ b/source/net/yacy/document/TextParser.java
@@ -42,6 +42,7 @@ import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.StrictLimitInputStream;
import net.yacy.document.parser.GenericXMLParser;
+import net.yacy.document.parser.XZParser;
import net.yacy.document.parser.apkParser;
import net.yacy.document.parser.audioTagParser;
import net.yacy.document.parser.bzipParser;
@@ -93,6 +94,7 @@ public final class TextParser {
static {
initParser(new apkParser());
initParser(new bzipParser());
+ initParser(new XZParser());
initParser(new csvParser());
initParser(new docParser());
initParser(new gzipParser());
@@ -380,6 +382,32 @@ public final class TextParser {
Integer.MAX_VALUE, Long.MAX_VALUE);
}
+ /**
+ * Try to limit the parser processing with a maximum total number of links detection (anchors, images links, media links...)
+ * or a maximum amount of content bytes to parse. Limits apply only when the available parsers for the resource media type support parsing within limits
+ * (see {@link Parser#isParseWithLimitsSupported()}). When the available parsers do
+ * not support parsing within limits, an exception is thrown when
+ * the content size is beyond maxBytes.
+ * @param location the URL of the source
+ * @param mimeType the mime type of the source, if known
+ * @param charset the charset name of the source, if known
+ * @param ignoreClassNames an optional set of CSS class names whose matching html elements content should be ignored
+ * @param timezoneOffset the local time zone offset
+ * @param depth the current depth of the crawl
+ * @param contentLength the length of the source, if known (else -1 should be used)
+ * @param sourceStream an open input stream on the source
+ * @param maxLinks the maximum total number of links to parse and add to the result documents
+ * @param maxBytes the maximum number of content bytes to process
+ * @return a list of documents that result from parsing the source, with empty or null text.
+ * @throws Parser.Failure when the parser processing failed
+ */
+ public static Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset,
+ final Set<String> ignoreClassNames, final int timezoneOffset, final int depth, final long contentLength,
+ final InputStream sourceStream, final int maxLinks, final long maxBytes) throws Parser.Failure {
+ return parseSource(location, mimeType, charset, ignoreClassNames, new VocabularyScraper(), timezoneOffset, depth, contentLength,
+ sourceStream, maxLinks, maxBytes);
+ }
+
/**
* Try to limit the parser processing with a maximum total number of links detection (anchors, images links, media links...)
* or a maximum amount of content bytes to parse. Limits apply only when the available parsers for the resource media type support parsing within limits
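
A minimal usage sketch for the new TextParser.parseWithLimits overload added above. The resource name, charset, empty ignore set, and limit values are illustrative assumptions, not part of the patch:

    // Hypothetical driver for the new parseWithLimits overload; names and
    // limits below are assumptions chosen for illustration.
    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Paths;
    import java.util.HashSet;

    import net.yacy.cora.document.id.DigestURL;
    import net.yacy.document.Document;
    import net.yacy.document.TextParser;

    public class ParseWithLimitsSketch {
        public static void main(final String[] args) throws Exception {
            final DigestURL location = new DigestURL("http://localhost/sample.html.xz");
            try (final InputStream sourceStream = Files.newInputStream(Paths.get("sample.html.xz"))) {
                // Stop after at most 1000 detected links or 1 MiB of parsed content;
                // a document truncated by either limit is flagged as partially parsed.
                final Document[] docs = TextParser.parseWithLimits(location, "application/x-xz",
                        "UTF-8", new HashSet<String>(), 0 /* timezoneOffset */, 0 /* depth */,
                        -1 /* unknown content length */, sourceStream, 1000, 1024L * 1024L);
                System.out.println("Partially parsed: " + docs[0].isPartiallyParsed());
            }
        }
    }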
diff --git a/source/net/yacy/document/parser/AbstractCompressorParser.java b/source/net/yacy/document/parser/AbstractCompressorParser.java
new file mode 100644
index 000000000..753b894a4
--- /dev/null
+++ b/source/net/yacy/document/parser/AbstractCompressorParser.java
@@ -0,0 +1,188 @@
+// AbstractCompressorParser.java
+// ---------------------------
+// Copyright 2018 by luccioman; https://github.com/luccioman
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package net.yacy.document.parser;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.MalformedURLException;
+import java.util.Date;
+import java.util.Set;
+
+import org.apache.commons.compress.compressors.CompressorInputStream;
+
+import net.yacy.cora.document.id.DigestURL;
+import net.yacy.cora.document.id.MultiProtocolURL;
+import net.yacy.document.AbstractParser;
+import net.yacy.document.Document;
+import net.yacy.document.Parser;
+import net.yacy.document.TextParser;
+import net.yacy.document.VocabularyScraper;
+
+/**
+ * Base class for parsing compressed files relying on Apache commons-compress
+ * tools.
+ */
+public abstract class AbstractCompressorParser extends AbstractParser implements Parser {
+
+ /** Crawl depth applied when parsing internal compressed content */
+ protected static final int DEFAULT_DEPTH = 999;
+
+ /**
+ * @param name the human readable name of the parser
+ */
+ public AbstractCompressorParser(final String name) {
+ super(name);
+ }
+
+ /**
+ * @param source an open input stream on a compressed source
+ * @return a subclass of CompressorInputStream capable of uncompressing the source
+ * on the fly
+ * @throws IOException when an error occurs while trying to open the compressed
+ * stream
+ */
+ protected abstract CompressorInputStream createDecompressStream(final InputStream source) throws IOException;
+
+ /**
+ * Maps the given name of a compressed file to the name that the
+ * file should have after uncompression. For example, for "file.txt.xz", "file.txt" is returned.
+ *
+ * @param filename name of a compressed file
+ * @return name of the corresponding uncompressed file
+ */
+ protected abstract String getUncompressedFilename(final String filename);
+
+ @Override
+ public Document[] parse(final DigestURL location, final String mimeType, final String charset,
+ final Set<String> ignoreClassNames, final VocabularyScraper scraper, final int timezoneOffset,
+ final InputStream source) throws Parser.Failure, InterruptedException {
+
+ return parseWithLimits(location, mimeType, charset, ignoreClassNames, scraper, timezoneOffset, source,
+ Integer.MAX_VALUE, Long.MAX_VALUE);
+ }
+
+ @Override
+ public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset,
+ final Set<String> ignoreClassNames, final VocabularyScraper scraper, final int timezoneOffset,
+ final InputStream source, final int maxLinks, final long maxBytes) throws Parser.Failure {
+ Document maindoc;
+ final CompressorInputStream compressedInStream;
+ try {
+ compressedInStream = createDecompressStream(source);
+ } catch (final IOException | RuntimeException e) {
+ throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
+ }
+
+ try {
+ // create maindoc for this archive, register with supplied url & mime
+ maindoc = AbstractCompressorParser.createMainDocument(location, mimeType, charset, this);
+
+ final Document[] docs = this.parseCompressedInputStream(location, null, ignoreClassNames, timezoneOffset,
+ AbstractCompressorParser.DEFAULT_DEPTH, compressedInStream, maxLinks, maxBytes);
+ if (docs != null) {
+ maindoc.addSubDocuments(docs);
+ if (docs.length > 0 && docs[0].isPartiallyParsed()) {
+ maindoc.setPartiallyParsed(true);
+ }
+ }
+ } catch (final Parser.Failure e) {
+ throw e;
+ } catch (final IOException | RuntimeException e) {
+ throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
+ }
+ return new Document[] { maindoc };
+ }
+
+ /**
+ * Create the main parsed document for the compressed document at the given URL
+ * and Media type
+ *
+ * @param location the parsed resource URL
+ * @param mimeType the media type of the resource
+ * @param charset the charset name if known
+ * @param parser an instance of AbstractCompressorParser that is registered as the
+ * parser origin of the document
+ * @return a Document instance
+ */
+ protected static Document createMainDocument(final DigestURL location, final String mimeType, final String charset,
+ final AbstractCompressorParser parser) {
+ final String filename = location.getFileName();
+ return new Document(location, mimeType, charset, parser, null, null,
+ AbstractParser
+ .singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
+ null, null, null, null, 0.0d, 0.0d, (Object) null, null, null, null, false, new Date());
+ }
+
+ /**
+ * Parse content in an open stream uncompressing on the fly a compressed
+ * resource.
+ *
+ * @param location the URL of the compressed resource
+ * @param charset the charset name if known
+ * @param ignoreClassNames an optional set of CSS class names whose matching
+ * html elements content should be ignored
+ * @param timezoneOffset the local time zone offset
+ * @param depth the current depth of the crawl
+ * @param compressedInStream an open stream uncompressing on the fly the
+ * compressed content
+ * @param maxLinks the maximum total number of links to parse and add
+ * to the result documents
+ * @param maxBytes the maximum number of content bytes to process
+ * @return a list of documents that result from parsing the source, with empty
+ * or null text.
+ * @throws Parser.Failure when the parser processing failed
+ */
+ protected Document[] parseCompressedInputStream(final DigestURL location, final String charset,
+ final Set<String> ignoreClassNames, final int timezoneOffset, final int depth,
+ final CompressorInputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure {
+ final String compressedFileName = location.getFileName();
+ final String contentfilename = getUncompressedFilename(compressedFileName);
+ final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
+ try {
+ /*
+ * Use the uncompressed file name for sub parsers to not unnecessarily use again
+ * this same uncompressing parser
+ */
+ final String locationPath = location.getPath();
+ final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length())
+ + contentfilename;
+ final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(),
+ location.getPort(), contentPath);
+
+ /*
+ * Rely on the supporting parsers to respect the maxLinks and maxBytes limits on
+ * compressed content
+ */
+ return TextParser.parseWithLimits(contentLocation, mime, charset, ignoreClassNames, timezoneOffset, depth,
+ -1, compressedInStream, maxLinks, maxBytes);
+ } catch (final MalformedURLException e) {
+ throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
+ }
+ }
+
+ @Override
+ public boolean isParseWithLimitsSupported() {
+ return true;
+ }
+
+}
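
To make the two abstract hooks concrete, here is a hypothetical gzip-based subclass of AbstractCompressorParser. YaCy's real gzip support lives in the pre-existing gzipParser, so this class is only a sketch of the contract, assuming the commons-compress GzipCompressorInputStream and GzipUtils helpers:

    // Hypothetical subclass illustrating the AbstractCompressorParser contract;
    // not part of the patch.
    package net.yacy.document.parser;

    import java.io.IOException;
    import java.io.InputStream;

    import org.apache.commons.compress.compressors.CompressorInputStream;
    import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
    import org.apache.commons.compress.compressors.gzip.GzipUtils;

    public class ExampleGzipCompressorParser extends AbstractCompressorParser {

        public ExampleGzipCompressorParser() {
            super("Example gzip Compressed Archive Parser");
            this.SUPPORTED_EXTENSIONS.add("gz");
            this.SUPPORTED_MIME_TYPES.add("application/gzip");
        }

        @Override
        protected CompressorInputStream createDecompressStream(final InputStream source) throws IOException {
            // Uncompress on the fly; the base class routes the uncompressed
            // content to the sub parsers matching the inner file name.
            return new GzipCompressorInputStream(source);
        }

        @Override
        protected String getUncompressedFilename(final String filename) {
            // For example "file.txt.gz" becomes "file.txt"
            return GzipUtils.getUncompressedFilename(filename);
        }
    }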
diff --git a/source/net/yacy/document/parser/XZParser.java b/source/net/yacy/document/parser/XZParser.java
new file mode 100644
index 000000000..c1fc20cad
--- /dev/null
+++ b/source/net/yacy/document/parser/XZParser.java
@@ -0,0 +1,66 @@
+// XZParser.java
+// ---------------------------
+// Copyright 2018 by luccioman; https://github.com/luccioman
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package net.yacy.document.parser;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.compress.compressors.CompressorInputStream;
+import org.apache.commons.compress.compressors.xz.XZCompressorInputStream;
+import org.apache.commons.compress.compressors.xz.XZUtils;
+
+import net.yacy.kelondro.util.MemoryControl;
+
+/**
+ * Parser for xz archives. Uncompresses and parses the content and adds it to
+ * the created main parsed document.
+ *
+ * @see <a href="https://tukaani.org/xz/format.html">xz file format website</a>
+ */
+public class XZParser extends AbstractCompressorParser {
+
+ public XZParser() {
+ super("XZ Compressed Archive Parser");
+ this.SUPPORTED_EXTENSIONS.add("xz");
+ this.SUPPORTED_EXTENSIONS.add("txz");
+ this.SUPPORTED_MIME_TYPES.add("application/x-xz");
+ }
+
+ @Override
+ protected CompressorInputStream createDecompressStream(final InputStream source) throws IOException {
+ /*
+ * Limit the size dedicated to reading compressed blocks to at most 25% of the
+ * available memory. Any stricter limit should be applied by the caller
+ * (see for example the crawler.[protocol].maxFileSize configuration setting).
+ */
+ final long availableMemory = MemoryControl.available();
+ final long maxKBytes = (long) (availableMemory * 0.25 / 1024.0);
+ return new XZCompressorInputStream(source, false, (int) Math.min(Integer.MAX_VALUE, maxKBytes));
+ }
+
+ @Override
+ protected String getUncompressedFilename(final String filename) {
+ return XZUtils.getUncompressedFilename(filename);
+ }
+
+}
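
The memory cap passed to XZCompressorInputStream can be exercised standalone as below. The file name and the use of Runtime.freeMemory() in place of YaCy's MemoryControl are assumptions for the sake of a self-contained example:

    // Standalone sketch of memory-capped xz decompression.
    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Paths;

    import org.apache.commons.compress.compressors.xz.XZCompressorInputStream;

    public class XZMemoryLimitSketch {
        public static void main(final String[] args) throws Exception {
            // Budget at most 25% of the currently free heap for xz block decoding;
            // XZCompressorInputStream expects the limit in kilobytes.
            final long freeBytes = Runtime.getRuntime().freeMemory();
            final int memoryLimitKb = (int) Math.min(Integer.MAX_VALUE, freeBytes / 4 / 1024);

            try (final InputStream fileIn = Files.newInputStream(Paths.get("sample.txt.xz"));
                    final XZCompressorInputStream xzIn = new XZCompressorInputStream(fileIn, false, memoryLimitKb)) {
                // Decoding that would exceed the cap raises a MemoryLimitException
                final byte[] buffer = new byte[8192];
                int read;
                while ((read = xzIn.read(buffer)) != -1) {
                    System.out.write(buffer, 0, read);
                }
            }
        }
    }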
diff --git a/test/java/net/yacy/document/parser/XZParserTest.java b/test/java/net/yacy/document/parser/XZParserTest.java
new file mode 100644
index 000000000..5392bf21a
--- /dev/null
+++ b/test/java/net/yacy/document/parser/XZParserTest.java
@@ -0,0 +1,246 @@
+// XZParserTest.java
+// ---------------------------
+// Copyright 2018 by luccioman; https://github.com/luccioman
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package net.yacy.document.parser;
+
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Collection;
+
+import org.junit.Test;
+
+import net.yacy.cora.document.id.AnchorURL;
+import net.yacy.cora.document.id.DigestURL;
+import net.yacy.document.Document;
+import net.yacy.document.Parser.Failure;
+import net.yacy.document.VocabularyScraper;
+
+/**
+ * Unit tests for the {@link XZParser} class
+ *
+ * @author luccioman
+ *
+ */
+public class XZParserTest {
+
+ /** Folder containing test files */
+ private static final File TEST_FOLDER = new File("test", "parsertest");
+
+ /**
+ * Unit test for the XZParser.parse() function with some small xz test files.
+ *
+ * @throws Failure when a file could not be parsed
+ * @throws InterruptedException when the test was interrupted before its
+ * termination
+ * @throws IOException when a read/write error occurred
+ */
+ @Test
+ public void testParse() throws Failure, InterruptedException, IOException {
+ final String[] fileNames = { "umlaute_html_utf8.html.xz", "umlaute_linux.txt.xz" };
+ final XZParser parser = new XZParser();
+
+ for (final String fileName : fileNames) {
+ final DigestURL location = new DigestURL("http://localhost/" + fileName);
+ try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
+ final Document[] documents = parser.parse(location, "application/x-xz", StandardCharsets.UTF_8.name(),
+ new VocabularyScraper(), 0, inStream);
+ assertNotNull("Parser result must not be null for file " + fileName, documents);
+ assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
+ assertTrue("Parsed text must contain test word with umlaut char" + fileName,
+ documents[0].getTextString().contains("Maßkrügen"));
+ final Collection anchors = documents[0].getAnchors();
+ assertNotNull("Detected URLS must not be null for file " + fileName, anchors);
+ assertEquals("One URL must have been detected for file " + fileName, 1, anchors.size());
+ assertTrue(anchors.iterator().next().toString().startsWith("http://localhost/umlaute_"));
+ }
+ }
+ }
+
+ /**
+ * Testing parse integration with the tar parser on a test txz archive.
+ *
+ * @throws Failure when a file could not be parsed
+ * @throws InterruptedException when the test was interrupted before its
+ * termination
+ * @throws IOException when a read/write error occurred
+ */
+ @Test
+ public void testParseTxz() throws Failure, InterruptedException, IOException {
+ final String fileName = "umlaute_html_xml_txt_gnu.txz";
+ final XZParser parser = new XZParser();
+
+ final DigestURL location = new DigestURL("http://localhost/" + fileName);
+ try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
+ final Document[] documents = parser.parse(location, "application/x-xz", StandardCharsets.UTF_8.name(),
+ new VocabularyScraper(), 0, inStream);
+
+ assertNotNull("Parser result must not be null for file " + fileName, documents);
+
+ final String parsedText = documents[0].getTextString();
+ assertNotNull("Parsed text must not be empty for file " + fileName, parsedText);
+ assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
+ parsedText.contains("Maßkrügen"));
+ assertTrue(parsedText.contains("Example link in ISO-8859-1 encoded HTML"));
+ assertTrue(parsedText.contains("Example link in UTF-8 encoded HTML"));
+ assertTrue(parsedText.contains("URL reference in raw text file"));
+ assertTrue(parsedText.contains("UTF-8 encoded XML test file"));
+
+ final Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
+ assertNotNull(detectedAnchors);
+ assertEquals("Parsed URLs must contain all URLs from each test file included in the archive", 5,
+ detectedAnchors.size());
+ assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/02/22-rdf-syntax-ns#")));
+ assertTrue(detectedAnchors.contains(new AnchorURL("http://purl.org/dc/elements/1.1/")));
+ assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_iso.html")));
+ assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_utf8.html")));
+ assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_linux.txt")));
+ }
+ }
+
+ /**
+ * Unit test for the XZParser.parseWithLimits() function with some small xz test
+ * files whose content is within limits.
+ *
+ * @throws Failure when a file could not be parsed
+ * @throws InterruptedException when the test was interrupted before its
+ * termination
+ * @throws IOException when a read/write error occurred
+ */
+ @Test
+ public void testParseWithLimits() throws Failure, InterruptedException, IOException {
+ final String[] fileNames = { "umlaute_html_utf8.html.xz", "umlaute_linux.txt.xz" };
+ final XZParser parser = new XZParser();
+
+ for (final String fileName : fileNames) {
+ final DigestURL location = new DigestURL("http://localhost/" + fileName);
+ try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
+ final Document[] documents = parser.parseWithLimits(location, "application/x-xz",
+ StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, 10000, 10000);
+ assertNotNull("Parser result must not be null for file " + fileName, documents);
+ assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
+ assertTrue("Parsed text must contain test word with umlaut char" + fileName,
+ documents[0].getTextString().contains("Maßkrügen"));
+ final Collection anchors = documents[0].getAnchors();
+ assertNotNull("Detected URLs must not be null for file " + fileName, anchors);
+ assertEquals("One URL must have been detected for file " + fileName, 1, anchors.size());
+ assertTrue(anchors.iterator().next().toString().startsWith("http://localhost/umlaute_"));
+ assertFalse("Parse document must not be marked as partially parsed for file " + fileName,
+ documents[0].isPartiallyParsed());
+ }
+ }
+
+ }
+
+ /**
+ * Unit test for the XZParser.parseWithLimits() when maxLinks limit is exceeded
+ *
+ * @throws Failure when a file could not be parsed
+ * @throws InterruptedException when the test was interrupted before its
+ * termination
+ * @throws IOException when a read/write error occurred
+ */
+ @Test
+ public void testParseWithLimitsLinksExceeded() throws Failure, InterruptedException, IOException {
+ final String[] fileNames = { "umlaute_html_utf8.html.xz", "umlaute_linux.txt.xz" };
+ final XZParser parser = new XZParser();
+
+ /* maxLinks limit exceeded */
+ for (final String fileName : fileNames) {
+ final DigestURL location = new DigestURL("http://localhost/" + fileName);
+ try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
+ final Document[] documents = parser.parseWithLimits(location, "application/x-xz",
+ StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, 0, Long.MAX_VALUE);
+ assertNotNull("Parser result must not be null for file " + fileName, documents);
+ assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
+ assertTrue("Parsed text must contain test word with umlaut char" + fileName,
+ documents[0].getTextString().contains("Maßkrügen"));
+ final Collection anchors = documents[0].getAnchors();
+ assertTrue("Detected URLs must be empty for file " + fileName, anchors == null || anchors.isEmpty());
+ assertTrue("Parsed document must be marked as partially parsed for file " + fileName,
+ documents[0].isPartiallyParsed());
+ }
+ }
+ }
+
+ /**
+ * Unit test for the XZParser.parseWithLimits() when maxBytes limit is exceeded
+ *
+ * @throws Failure when a file could not be parsed
+ * @throws InterruptedException when the test was interrupted before its
+ * termination
+ * @throws IOException when a read/write error occurred
+ */
+ @Test
+ public void testParseWithLimitsBytesExceeded() throws Failure, InterruptedException, IOException {
+ final String[] fileNames = { "umlaute_html_utf8.html.xz", "umlaute_linux.txt.xz" };
+ final XZParser parser = new XZParser();
+
+ String fileName = fileNames[0];
+
+ DigestURL location = new DigestURL("http://localhost/" + fileName);
+ try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
+ /*
+ * The bytes limit is set so that the beginning text part is parsed,
+ * but parsing stops before reaching the <a> tag
+ */
+ final long maxBytes = 258;
+ final Document[] documents = parser.parseWithLimits(location, "application/x-xz",
+ StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, maxBytes);
+ assertNotNull("Parser result must not be null for file " + fileName, documents);
+ assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
+ assertTrue("Parsed text must contain test word with umlaut char" + fileName,
+ documents[0].getTextString().contains("Maßkrügen"));
+ final Collection anchors = documents[0].getAnchors();
+ assertTrue("Detected URLs must be empty for file " + fileName, anchors == null || anchors.isEmpty());
+ assertTrue("Parsed document must be marked as partially parsed for file " + fileName,
+ documents[0].isPartiallyParsed());
+ }
+
+ fileName = fileNames[1];
+ location = new DigestURL("http://localhost/" + fileName);
+ try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
+ /*
+ * The bytes limit is set so that the beginning of the text is parsed,
+ * but parsing stops before reaching the URL
+ */
+ final long maxBytes = 65;
+ final Document[] documents = parser.parseWithLimits(location, "application/x-xz",
+ StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, maxBytes);
+ assertNotNull("Parser result must not be null for file " + fileName, documents);
+ assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
+ assertTrue("Parsed text must contain test word with umlaut char" + fileName,
+ documents[0].getTextString().contains("Maßkrügen"));
+ final Collection anchors = documents[0].getAnchors();
+ assertTrue("Detected URLs must be empty for file " + fileName, anchors == null || anchors.isEmpty());
+ assertTrue("Parsed document must be marked as partially parsed for file " + fileName,
+ documents[0].isPartiallyParsed());
+ }
+ }
+
+}
diff --git a/test/parsertest/umlaute_html_utf8.html.xz b/test/parsertest/umlaute_html_utf8.html.xz
new file mode 100644
index 000000000..b487be27a
Binary files /dev/null and b/test/parsertest/umlaute_html_utf8.html.xz differ
diff --git a/test/parsertest/umlaute_html_xml_txt_gnu.txz b/test/parsertest/umlaute_html_xml_txt_gnu.txz
new file mode 100644
index 000000000..e791c9df9
Binary files /dev/null and b/test/parsertest/umlaute_html_xml_txt_gnu.txz differ
diff --git a/test/parsertest/umlaute_linux.txt.xz b/test/parsertest/umlaute_linux.txt.xz
new file mode 100755
index 000000000..889843209
Binary files /dev/null and b/test/parsertest/umlaute_linux.txt.xz differ