diff --git a/.classpath b/.classpath
index 8cc301584..3e4f29314 100644
--- a/.classpath
+++ b/.classpath
@@ -23,6 +23,7 @@
+	<classpathentry kind="lib" path="lib/xz-1.8.jar"/>
diff --git a/build.xml b/build.xml
index 5cc6aefb2..af41a8802 100644
--- a/build.xml
+++ b/build.xml
@@ -179,6 +179,7 @@
+		<pathelement location="${lib}/xz-1.8.jar" />
diff --git a/lib/xz-1.8.License b/lib/xz-1.8.License
new file mode 100644
index 000000000..c1d404dc7
--- /dev/null
+++ b/lib/xz-1.8.License
@@ -0,0 +1,10 @@
+
+Licensing of XZ for Java
+========================
+
+    All the files in this package have been written by Lasse Collin
+    and/or Igor Pavlov. All these files have been put into the
+    public domain. You can do whatever you want with these files.
+
+    This software is provided "as is", without any warranty.
+
diff --git a/lib/xz-1.8.jar b/lib/xz-1.8.jar
new file mode 100644
index 000000000..9931efa3e
Binary files /dev/null and b/lib/xz-1.8.jar differ
diff --git a/pom.xml b/pom.xml
index f7a4d2556..37bf57004 100644
--- a/pom.xml
+++ b/pom.xml
@@ -380,6 +380,12 @@
 			<version>1.17</version>
 			<type>jar</type>
 		</dependency>
+		<dependency>
+			<groupId>org.tukaani</groupId>
+			<artifactId>xz</artifactId>
+			<version>1.8</version>
+			<type>jar</type>
+		</dependency>
 		<dependency>
 			<groupId>commons-fileupload</groupId>
 			<artifactId>commons-fileupload</artifactId>
diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java
index 4a7bc49b5..3ee564da2 100644
--- a/source/net/yacy/document/TextParser.java
+++ b/source/net/yacy/document/TextParser.java
@@ -42,6 +42,7 @@ import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.util.CommonPattern;
 import net.yacy.cora.util.StrictLimitInputStream;
 import net.yacy.document.parser.GenericXMLParser;
+import net.yacy.document.parser.XZParser;
 import net.yacy.document.parser.apkParser;
 import net.yacy.document.parser.audioTagParser;
 import net.yacy.document.parser.bzipParser;
@@ -93,6 +94,7 @@ public final class TextParser {
     static {
         initParser(new apkParser());
         initParser(new bzipParser());
+        initParser(new XZParser());
         initParser(new csvParser());
         initParser(new docParser());
         initParser(new gzipParser());
@@ -380,6 +382,32 @@ public final class TextParser {
                 Integer.MAX_VALUE, Long.MAX_VALUE);
     }
 
+    /**
+     * Try to limit the parser processing with a maximum total number of links detection (anchors, images links, media links...)
+     * or a maximum amount of content bytes to parse. Limits apply only when the available parsers for the resource media type support parsing within limits
+     * (see {@link Parser#isParseWithLimitsSupported()}). When available parsers do
+     * not support parsing within limits, an exception is thrown when
+     * content size is beyond maxBytes.
+     * @param location the URL of the source
+     * @param mimeType the mime type of the source, if known
+     * @param charset the charset name of the source, if known
+     * @param ignoreClassNames an eventual set of CSS class names whose matching html elements content should be ignored
+     * @param timezoneOffset the local time zone offset
+     * @param depth the current depth of the crawl
+     * @param contentLength the length of the source, if known (else -1 should be used)
+     * @param sourceStream an open input stream on the source content
+     * @param maxLinks the maximum total number of links to parse and add to the result documents
+     * @param maxBytes the maximum number of content bytes to process
+     * @return a list of documents that result from parsing the source, with empty or null text
+     * @throws Parser.Failure when the parser processing failed
+     */
+    public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset,
+            final Set<String> ignoreClassNames, final int timezoneOffset, final int depth, final long contentLength,
+            final InputStream sourceStream, int maxLinks, long maxBytes) throws Parser.Failure {
+        return parseSource(location, mimeType, charset, ignoreClassNames, new VocabularyScraper(), timezoneOffset,
+                depth, contentLength, sourceStream, maxLinks, maxBytes);
+    }
+
     /**
      * Try to limit the parser processing with a maximum total number of links detection (anchors, images links, media links...)
      * or a maximum amount of content bytes to parse. Limits apply only when the available parsers for the resource media type support parsing within limits
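
A minimal sketch (not part of the patch) of how the new TextParser.parseWithLimits overload can be driven on an .xz resource; the URL, the local file source and the limit values are illustrative assumptions only:

import java.io.FileInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Collections;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;

public class ParseWithLimitsExample {
    public static void main(final String[] args) throws Exception {
        final DigestURL location = new DigestURL("http://localhost/sample.txt.xz");
        try (final InputStream in = new FileInputStream("sample.txt.xz")) {
            final Document[] docs = TextParser.parseWithLimits(location, "application/x-xz",
                    StandardCharsets.UTF_8.name(), Collections.<String>emptySet(),
                    0 /* timezoneOffset */, 0 /* depth */, -1 /* contentLength unknown */,
                    in, 1000 /* maxLinks */, 1024L * 1024L /* maxBytes: 1 MiB */);
            if (docs != null && docs.length > 0 && docs[0].isPartiallyParsed()) {
                System.out.println("Limits were reached: document only partially parsed");
            }
        } catch (final Parser.Failure e) {
            System.err.println("Parsing failed: " + e.getMessage());
        }
    }
}

When a limit is reached, the main document is marked as partially parsed rather than failing the whole crawl.
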
diff --git a/source/net/yacy/document/parser/AbstractCompressorParser.java b/source/net/yacy/document/parser/AbstractCompressorParser.java
new file mode 100644
index 000000000..753b894a4
--- /dev/null
+++ b/source/net/yacy/document/parser/AbstractCompressorParser.java
@@ -0,0 +1,187 @@
+// AbstractCompressorParser.java
+// ---------------------------
+// Copyright 2018 by luccioman; https://github.com/luccioman
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package net.yacy.document.parser;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.MalformedURLException;
+import java.util.Date;
+import java.util.Set;
+
+import org.apache.commons.compress.compressors.CompressorInputStream;
+
+import net.yacy.cora.document.id.DigestURL;
+import net.yacy.cora.document.id.MultiProtocolURL;
+import net.yacy.document.AbstractParser;
+import net.yacy.document.Document;
+import net.yacy.document.Parser;
+import net.yacy.document.TextParser;
+import net.yacy.document.VocabularyScraper;
+
+/**
+ * Base class for parsing compressed files relying on Apache commons-compress
+ * tools.
+ */
+public abstract class AbstractCompressorParser extends AbstractParser implements Parser {
+
+    /** Crawl depth applied when parsing internal compressed content */
+    protected static final int DEFAULT_DEPTH = 999;
+
+    /**
+     * @param name the human readable name of the parser
+     */
+    public AbstractCompressorParser(final String name) {
+        super(name);
+    }
+
+    /**
+     * @param source an open input stream on a compressed source
+     * @return a subclass of CompressorInputStream capable of uncompressing the
+     *         source on the fly
+     * @throws IOException when an error occurs while trying to open the
+     *         compressed stream
+     */
+    protected abstract CompressorInputStream createDecompressStream(final InputStream source) throws IOException;
+
+    /**
+     * Maps the given name of a compressed file to the name that the file should
+     * have after uncompression. For example, for "file.txt.xz", "file.txt" is
+     * returned.
+     *
+     * @param filename name of a compressed file
+     * @return name of the corresponding uncompressed file
+     */
+    protected abstract String getUncompressedFilename(final String filename);
+
+    @Override
+    public Document[] parse(final DigestURL location, final String mimeType, final String charset,
+            final Set<String> ignoreClassNames, final VocabularyScraper scraper, final int timezoneOffset,
+            final InputStream source) throws Parser.Failure, InterruptedException {
+        return parseWithLimits(location, mimeType, charset, ignoreClassNames, scraper, timezoneOffset, source,
+                Integer.MAX_VALUE, Long.MAX_VALUE);
+    }
+
+    @Override
+    public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset,
+            final Set<String> ignoreClassNames, final VocabularyScraper scraper, final int timezoneOffset,
+            final InputStream source, final int maxLinks, final long maxBytes) throws Parser.Failure {
+        Document maindoc;
+        final CompressorInputStream compressedInStream;
+        try {
+            compressedInStream = createDecompressStream(source);
+        } catch (final IOException | RuntimeException e) {
+            throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
+        }
+
+        try {
+            // create maindoc for this archive, register with supplied url & mime
+            maindoc = AbstractCompressorParser.createMainDocument(location, mimeType, charset, this);
+
+            final Document[] docs = this.parseCompressedInputStream(location, null, ignoreClassNames, timezoneOffset,
+                    AbstractCompressorParser.DEFAULT_DEPTH, compressedInStream, maxLinks, maxBytes);
+            if (docs != null) {
+                maindoc.addSubDocuments(docs);
+                if (docs.length > 0 && docs[0].isPartiallyParsed()) {
+                    maindoc.setPartiallyParsed(true);
+                }
+            }
+        } catch (final Parser.Failure e) {
+            throw e;
+        } catch (final IOException | RuntimeException e) {
+            throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
+        }
+        return new Document[] { maindoc };
+    }
+
+    /**
+     * Create the main parsed document for the compressed document at the given
+     * URL and media type
+     *
+     * @param location the parsed resource URL
+     * @param mimeType the media type of the resource
+     * @param charset  the charset name if known
+     * @param parser   an instance of AbstractCompressorParser that is registered
+     *                 as the parser origin of the document
+     * @return a Document instance
+     */
+    protected static Document createMainDocument(final DigestURL location, final String mimeType, final String charset,
+            final AbstractCompressorParser parser) {
+        final String filename = location.getFileName();
+        return new Document(location, mimeType, charset, parser, null, null,
+                AbstractParser
+                        .singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
+                null, null, null, null, 0.0d, 0.0d, (Object) null, null, null, null, false, new Date());
+    }
+
+    /**
+     * Parse content in an open stream uncompressing on the fly a compressed
+     * resource.
+     *
+     * @param location           the URL of the compressed resource
+     * @param charset            the charset name if known
+     * @param ignoreClassNames   an eventual set of CSS class names whose matching
+     *                           html elements content should be ignored
+     * @param timezoneOffset     the local time zone offset
+     * @param depth              the current depth of the crawl
+     * @param compressedInStream an open stream uncompressing on the fly the
+     *                           compressed content
+     * @param maxLinks           the maximum total number of links to parse and
+     *                           add to the result documents
+     * @param maxBytes           the maximum number of content bytes to process
+     * @return a list of documents that result from parsing the source, with
+     *         empty or null text
+     * @throws Parser.Failure when the parser processing failed
+     */
+    protected Document[] parseCompressedInputStream(final DigestURL location, final String charset,
+            final Set<String> ignoreClassNames, final int timezoneOffset, final int depth,
+            final CompressorInputStream compressedInStream, final int maxLinks, final long maxBytes)
+            throws Parser.Failure {
+        final String compressedFileName = location.getFileName();
+        final String contentfilename = getUncompressedFilename(compressedFileName);
+        final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
+        try {
+            /*
+             * Use the uncompressed file name for sub parsers to not unnecessarily use
+             * again this same uncompressing parser
+             */
+            final String locationPath = location.getPath();
+            final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length())
+                    + contentfilename;
+            final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(),
+                    location.getPort(), contentPath);
+
+            /*
+             * Rely on the supporting parsers to respect the maxLinks and maxBytes limits
+             * on compressed content
+             */
+            return TextParser.parseWithLimits(contentLocation, mime, charset, ignoreClassNames, timezoneOffset, depth,
+                    -1, compressedInStream, maxLinks, maxBytes);
+        } catch (final MalformedURLException e) {
+            throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
+        }
+    }
+
+    @Override
+    public boolean isParseWithLimitsSupported() {
+        return true;
+    }
+
+}
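
With this base class, a concrete compressor parser only has to supply the decompressing stream and the file name mapping. As an illustration, assuming the commons-compress LZMA classes (no such parser is part of this patch; the class name, extension and media type below are hypothetical):

import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.compressors.lzma.LZMACompressorInputStream;
import org.apache.commons.compress.compressors.lzma.LZMAUtils;

public class LzmaParser extends AbstractCompressorParser {

    public LzmaParser() {
        super("LZMA Compressed Archive Parser");
        this.SUPPORTED_EXTENSIONS.add("lzma");
        this.SUPPORTED_MIME_TYPES.add("application/x-lzma");
    }

    @Override
    protected CompressorInputStream createDecompressStream(final InputStream source) throws IOException {
        // commons-compress also ships an LZMA decompressing stream
        return new LZMACompressorInputStream(source);
    }

    @Override
    protected String getUncompressedFilename(final String filename) {
        // LZMAUtils applies the same suffix-stripping logic as XZUtils
        return LZMAUtils.getUncompressedFilename(filename);
    }
}

The base class template method then handles main document creation, delegation to the inner parser through TextParser.parseWithLimits, and propagation of the partially-parsed flag.
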
diff --git a/source/net/yacy/document/parser/XZParser.java b/source/net/yacy/document/parser/XZParser.java
new file mode 100644
index 000000000..c1fc20cad
--- /dev/null
+++ b/source/net/yacy/document/parser/XZParser.java
@@ -0,0 +1,66 @@
+// XZParser.java
+// ---------------------------
+// Copyright 2018 by luccioman; https://github.com/luccioman
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package net.yacy.document.parser;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.compress.compressors.CompressorInputStream;
+import org.apache.commons.compress.compressors.xz.XZCompressorInputStream;
+import org.apache.commons.compress.compressors.xz.XZUtils;
+
+import net.yacy.kelondro.util.MemoryControl;
+
+/**
+ * Parser for xz archives. Uncompresses and parses the content and adds it to
+ * the created main parsed document.
+ *
+ * @see <a href="https://tukaani.org/xz/">xz file format website</a>
+ */
+public class XZParser extends AbstractCompressorParser {
+
+    public XZParser() {
+        super("XZ Compressed Archive Parser");
+        this.SUPPORTED_EXTENSIONS.add("xz");
+        this.SUPPORTED_EXTENSIONS.add("txz");
+        this.SUPPORTED_MIME_TYPES.add("application/x-xz");
+    }
+
+    @Override
+    protected CompressorInputStream createDecompressStream(final InputStream source) throws IOException {
+        /*
+         * Limit the size dedicated to reading compressed blocks to at most 25% of
+         * the available memory. Eventual stricter limits should be handled by the
+         * caller (see for example the crawler.[protocol].maxFileSize configuration
+         * setting).
+         */
+        final long availableMemory = MemoryControl.available();
+        final long maxKBytes = (long) (availableMemory * 0.25 / 1024.0);
+        return new XZCompressorInputStream(source, false, (int) Math.min(Integer.MAX_VALUE, maxKBytes));
+    }
+
+    @Override
+    protected String getUncompressedFilename(final String filename) {
+        return XZUtils.getUncompressedFilename(filename);
+    }
+
+}
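
Two details of this parser are worth making concrete: the memory cap handed to XZCompressorInputStream is expressed in KiB, and the file name mapping decides which inner parser receives the uncompressed content. A short illustration, not part of the patch; the 1 GiB figure is an assumed example value:

import org.apache.commons.compress.compressors.xz.XZUtils;

public class XZParserIllustration {
    public static void main(final String[] args) {
        // Assume MemoryControl.available() reports 1 GiB of free memory:
        final long availableMemory = 1024L * 1024L * 1024L;
        final long maxKBytes = (long) (availableMemory * 0.25 / 1024.0);
        // 262144 KiB, i.e. 256 MiB, is passed as the third constructor argument of
        // XZCompressorInputStream, which then fails with a MemoryLimitException
        // instead of exhausting the heap when decoding needs a larger dictionary.
        System.out.println(maxKBytes);

        // File name mapping used to route uncompressed content to an inner parser:
        System.out.println(XZUtils.getUncompressedFilename("file.txt.xz")); // file.txt
        System.out.println(XZUtils.getUncompressedFilename("archive.txz")); // archive.tar, handled by the tar parser
    }
}
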
diff --git a/test/java/net/yacy/document/parser/XZParserTest.java b/test/java/net/yacy/document/parser/XZParserTest.java
new file mode 100644
index 000000000..5392bf21a
--- /dev/null
+++ b/test/java/net/yacy/document/parser/XZParserTest.java
@@ -0,0 +1,246 @@
+// XZParserTest.java
+// ---------------------------
+// Copyright 2018 by luccioman; https://github.com/luccioman
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package net.yacy.document.parser;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Collection;
+
+import org.junit.Test;
+
+import net.yacy.cora.document.id.AnchorURL;
+import net.yacy.cora.document.id.DigestURL;
+import net.yacy.document.Document;
+import net.yacy.document.Parser.Failure;
+import net.yacy.document.VocabularyScraper;
+
+/**
+ * Unit tests for the {@link XZParser} class
+ *
+ * @author luccioman
+ */
+public class XZParserTest {
+
+    /** Folder containing test files */
+    private static final File TEST_FOLDER = new File("test", "parsertest");
+
+    /**
+     * Unit test for the XZParser.parse() function with some small xz test files.
+     *
+     * @throws Failure when a file could not be parsed
+     * @throws InterruptedException when the test was interrupted before its
+     *             termination
+     * @throws IOException when a read/write error occurred
+     */
+    @Test
+    public void testParse() throws Failure, InterruptedException, IOException {
+        final String[] fileNames = { "umlaute_html_utf8.html.xz", "umlaute_linux.txt.xz" };
+        final XZParser parser = new XZParser();
+
+        for (final String fileName : fileNames) {
+            final DigestURL location = new DigestURL("http://localhost/" + fileName);
+            try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
+                final Document[] documents = parser.parse(location, "application/x-xz",
+                        StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream);
+                assertNotNull("Parser result must not be null for file " + fileName, documents);
+                assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
+                assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
+                        documents[0].getTextString().contains("Maßkrügen"));
+                final Collection<AnchorURL> anchors = documents[0].getAnchors();
+                assertNotNull("Detected URLs must not be null for file " + fileName, anchors);
+                assertEquals("One URL must have been detected for file " + fileName, 1, anchors.size());
+                assertTrue(anchors.iterator().next().toString().startsWith("http://localhost/umlaute_"));
+            }
+        }
+    }
+
+    /**
+     * Testing parse integration with the tar parser on a test txz archive.
+     *
+     * @throws Failure when a file could not be parsed
+     * @throws InterruptedException when the test was interrupted before its
+     *             termination
+     * @throws IOException when a read/write error occurred
+     */
+    @Test
+    public void testParseTxz() throws Failure, InterruptedException, IOException {
+        final String fileName = "umlaute_html_xml_txt_gnu.txz";
+        final XZParser parser = new XZParser();
+
+        final DigestURL location = new DigestURL("http://localhost/" + fileName);
+        try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
+            final Document[] documents = parser.parse(location, "application/x-xz",
+                    StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream);
+
+            assertNotNull("Parser result must not be null for file " + fileName, documents);
+
+            final String parsedText = documents[0].getTextString();
+            assertNotNull("Parsed text must not be empty for file " + fileName, parsedText);
+            assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
+                    parsedText.contains("Maßkrügen"));
+            assertTrue(parsedText.contains("Example link in ISO-8859-1 encoded HTML"));
+            assertTrue(parsedText.contains("Example link in UTF-8 encoded HTML"));
+            assertTrue(parsedText.contains("URL reference in raw text file"));
+            assertTrue(parsedText.contains("UTF-8 encoded XML test file"));
+
+            final Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
+            assertNotNull(detectedAnchors);
+            assertEquals("Parsed URLs must contain all URLs from each test file included in the archive", 5,
+                    detectedAnchors.size());
+            assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/02/22-rdf-syntax-ns#")));
+            assertTrue(detectedAnchors.contains(new AnchorURL("http://purl.org/dc/elements/1.1/")));
+            assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_iso.html")));
+            assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_utf8.html")));
+            assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_linux.txt")));
+        }
+    }
+
+    /**
+     * Unit test for the XZParser.parseWithLimits() function with some small xz
+     * test files whose content is within limits.
+     *
+     * @throws Failure when a file could not be parsed
+     * @throws InterruptedException when the test was interrupted before its
+     *             termination
+     * @throws IOException when a read/write error occurred
+     */
+    @Test
+    public void testParseWithLimits() throws Failure, InterruptedException, IOException {
+        final String[] fileNames = { "umlaute_html_utf8.html.xz", "umlaute_linux.txt.xz" };
+        final XZParser parser = new XZParser();
+
+        for (final String fileName : fileNames) {
+            final DigestURL location = new DigestURL("http://localhost/" + fileName);
+            try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
+                final Document[] documents = parser.parseWithLimits(location, "application/x-xz",
+                        StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, 10000, 10000);
+                assertNotNull("Parser result must not be null for file " + fileName, documents);
+                assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
+                assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
+                        documents[0].getTextString().contains("Maßkrügen"));
+                final Collection<AnchorURL> anchors = documents[0].getAnchors();
+                assertNotNull("Detected URLs must not be null for file " + fileName, anchors);
+                assertEquals("One URL must have been detected for file " + fileName, 1, anchors.size());
+                assertTrue(anchors.iterator().next().toString().startsWith("http://localhost/umlaute_"));
+                assertFalse("Parsed document must not be marked as partially parsed for file " + fileName,
+                        documents[0].isPartiallyParsed());
+            }
+        }
+    }
+
+    /**
+     * Unit test for the XZParser.parseWithLimits() function when the maxLinks
+     * limit is exceeded
+     *
+     * @throws Failure when a file could not be parsed
+     * @throws InterruptedException when the test was interrupted before its
+     *             termination
+     * @throws IOException when a read/write error occurred
+     */
+    @Test
+    public void testParseWithLimitsLinksExceeded() throws Failure, InterruptedException, IOException {
+        final String[] fileNames = { "umlaute_html_utf8.html.xz", "umlaute_linux.txt.xz" };
+        final XZParser parser = new XZParser();
+
+        /* maxLinks limit exceeded */
+        for (final String fileName : fileNames) {
+            final DigestURL location = new DigestURL("http://localhost/" + fileName);
+            try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
+                final Document[] documents = parser.parseWithLimits(location, "application/x-xz",
+                        StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, 0, Long.MAX_VALUE);
+                assertNotNull("Parser result must not be null for file " + fileName, documents);
+                assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
+                assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
+                        documents[0].getTextString().contains("Maßkrügen"));
+                final Collection<AnchorURL> anchors = documents[0].getAnchors();
+                assertTrue("Detected URLs must be empty for file " + fileName, anchors == null || anchors.isEmpty());
+                assertTrue("Parsed document must be marked as partially parsed for file " + fileName,
+                        documents[0].isPartiallyParsed());
+            }
+        }
+    }
+
+    /**
+     * Unit test for the XZParser.parseWithLimits() function when the maxBytes
+     * limit is exceeded
+     *
+     * @throws Failure when a file could not be parsed
+     * @throws InterruptedException when the test was interrupted before its
+     *             termination
+     * @throws IOException when a read/write error occurred
+     */
+    @Test
+    public void testParseWithLimitsBytesExceeded() throws Failure, InterruptedException, IOException {
+        final String[] fileNames = { "umlaute_html_utf8.html.xz", "umlaute_linux.txt.xz" };
+        final XZParser parser = new XZParser();
+
+        String fileName = fileNames[0];
+        DigestURL location = new DigestURL("http://localhost/" + fileName);
+        try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
+            /*
+             * The bytes limit is set to let the parser process the beginning text part,
+             * but stop before reaching the <a> tag
+             */
+            final long maxBytes = 258;
+            final Document[] documents = parser.parseWithLimits(location, "application/x-xz",
+                    StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE,
+                    maxBytes);
+            assertNotNull("Parser result must not be null for file " + fileName, documents);
+            assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
+            assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
+                    documents[0].getTextString().contains("Maßkrügen"));
+            final Collection<AnchorURL> anchors = documents[0].getAnchors();
+            assertTrue("Detected URLs must be empty for file " + fileName, anchors == null || anchors.isEmpty());
+            assertTrue("Parsed document must be marked as partially parsed for file " + fileName,
+                    documents[0].isPartiallyParsed());
+        }
+
+        fileName = fileNames[1];
+        location = new DigestURL("http://localhost/" + fileName);
+        try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
+            /*
+             * The bytes limit is set to let the parser process the beginning of the text,
+             * but stop before reaching the URL
+             */
+            final long maxBytes = 65;
+            final Document[] documents = parser.parseWithLimits(location, "application/x-xz",
+                    StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE,
+                    maxBytes);
+            assertNotNull("Parser result must not be null for file " + fileName, documents);
+            assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
+            assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
+                    documents[0].getTextString().contains("Maßkrügen"));
+            final Collection<AnchorURL> anchors = documents[0].getAnchors();
+            assertTrue("Detected URLs must be empty for file " + fileName, anchors == null || anchors.isEmpty());
+            assertTrue("Parsed document must be marked as partially parsed for file " + fileName,
+                    documents[0].isPartiallyParsed());
+        }
+    }
+
+}
diff --git a/test/parsertest/umlaute_html_utf8.html.xz b/test/parsertest/umlaute_html_utf8.html.xz
new file mode 100644
index 000000000..b487be27a
Binary files /dev/null and b/test/parsertest/umlaute_html_utf8.html.xz differ
diff --git a/test/parsertest/umlaute_html_xml_txt_gnu.txz b/test/parsertest/umlaute_html_xml_txt_gnu.txz
new file mode 100644
index 000000000..e791c9df9
Binary files /dev/null and b/test/parsertest/umlaute_html_xml_txt_gnu.txz differ
diff --git a/test/parsertest/umlaute_linux.txt.xz b/test/parsertest/umlaute_linux.txt.xz
new file mode 100755
index 000000000..889843209
Binary files /dev/null and b/test/parsertest/umlaute_linux.txt.xz differ