diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index a634fb598..cca71e75c 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -671,6 +671,19 @@ dc_rights return v; } + /** + * Adds the main content of subdocuments to this document. + * This is useful if the document is a container for other documents (like zip or other archives) + * to make the content of the subdocuments searchable, + * but has only one url (unlike container-urls as rss). + * + * This is similar to mergeDocuments but directly joins internal content variables, + * uses less parsed details and keeps this document's crawl data (like crawldepth, lastmodified) + * + * @see mergeDocuments() + * @param docs to be included + * @throws IOException + */ public void addSubDocuments(final Document[] docs) throws IOException { for (final Document doc: docs) { this.sections.addAll(doc.sections); diff --git a/source/net/yacy/document/parser/bzipParser.java b/source/net/yacy/document/parser/bzipParser.java index fe95e8ab7..0dc0daad6 100644 --- a/source/net/yacy/document/parser/bzipParser.java +++ b/source/net/yacy/document/parser/bzipParser.java @@ -30,6 +30,7 @@ package net.yacy.document.parser; import java.io.File; import java.io.FileOutputStream; import java.io.InputStream; +import java.util.Date; import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; @@ -43,7 +44,10 @@ import net.yacy.kelondro.util.FileUtils; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; import org.apache.commons.compress.compressors.bzip2.BZip2Utils; - +/** + * Parses a bz2 archive. 
+ * Unzips and parses the content and adds it to the created main document + */ public class bzipParser extends AbstractParser implements Parser { public bzipParser() { @@ -69,7 +73,7 @@ public class bzipParser extends AbstractParser implements Parser { throws Parser.Failure, InterruptedException { File tempFile = null; - Document[] docs; + Document maindoc = null; try { int read = 0; final byte[] data = new byte[1024]; @@ -82,18 +86,38 @@ public class bzipParser extends AbstractParser implements Parser { // creating a temp file to store the uncompressed data final FileOutputStream out = new FileOutputStream(tempFile); - // reading gzip file and store it uncompressed + // reading bzip file and store it uncompressed while((read = zippedContent.read(data, 0, 1024)) != -1) { out.write(data, 0, read); } zippedContent.close(); out.close(); + // create maindoc for this bzip container, register with supplied url & mime + maindoc = new Document( + location, + mimeType, + charset, + this, + null, + null, + null, + null, + null, + null, + null, + 0.0d, 0.0d, + (Object) null, + null, + null, + null, + false, + new Date()); // creating a new parser class to parse the unzipped content final String contentfilename = BZip2Utils.getUncompressedFilename(location.getFileName()); final String mime = TextParser.mimeOf(DigestURL.getFileExtension(contentfilename)); - docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile); - // TODO: this could return null from content parsing, even if bz2 successful read (see zipParser for alternative coding) + final Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile); + if (docs != null) maindoc.addSubDocuments(docs); } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; @@ -102,6 +126,6 @@ public class bzipParser extends AbstractParser implements Parser { } 
finally { if (tempFile != null) FileUtils.deletedelete(tempFile); } - return docs; + return maindoc == null ? null : new Document[]{maindoc}; } } diff --git a/source/net/yacy/document/parser/gzipParser.java b/source/net/yacy/document/parser/gzipParser.java index 58f788f37..504dd1116 100644 --- a/source/net/yacy/document/parser/gzipParser.java +++ b/source/net/yacy/document/parser/gzipParser.java @@ -30,17 +30,23 @@ package net.yacy.document.parser; import java.io.File; import java.io.FileOutputStream; import java.io.InputStream; +import java.util.Date; import java.util.zip.GZIPInputStream; import net.yacy.cora.document.id.AnchorURL; +import net.yacy.cora.document.id.DigestURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; import net.yacy.document.VocabularyScraper; import net.yacy.kelondro.util.FileUtils; +import org.apache.commons.compress.compressors.gzip.GzipUtils; - +/** + * Parses a gz archive. 
+ * Unzips and parses the content and adds it to the created main document + */ public class gzipParser extends AbstractParser implements Parser { public gzipParser() { @@ -65,7 +71,7 @@ public class gzipParser extends AbstractParser implements Parser { final InputStream source) throws Parser.Failure, InterruptedException { File tempFile = null; - Document[] docs = null; + Document maindoc = null; try { int read = 0; final byte[] data = new byte[1024]; @@ -84,9 +90,31 @@ public class gzipParser extends AbstractParser implements Parser { } zippedContent.close(); out.close(); - + // create maindoc for this gzip container, register with supplied url & mime + maindoc = new Document( + location, + mimeType, + charset, + this, + null, + null, + null, + null, + null, + null, + null, + 0.0d, 0.0d, + (Object) null, + null, + null, + null, + false, + new Date()); // creating a new parser class to parse the unzipped content - docs = TextParser.parseSource(location, null, null, scraper, timezoneOffset, 999, tempFile); + final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName()); + final String mime = TextParser.mimeOf(DigestURL.getFileExtension(contentfilename)); + Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile); + if (docs != null) maindoc.addSubDocuments(docs); } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; @@ -95,7 +123,7 @@ public class gzipParser extends AbstractParser implements Parser { } finally { if (tempFile != null) FileUtils.deletedelete(tempFile); } - return docs; + return maindoc == null ? 
null : new Document[]{maindoc}; } } diff --git a/source/net/yacy/document/parser/tarParser.java b/source/net/yacy/document/parser/tarParser.java index be4b515fd..c5a5fbd03 100644 --- a/source/net/yacy/document/parser/tarParser.java +++ b/source/net/yacy/document/parser/tarParser.java @@ -29,8 +29,7 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.RandomAccessFile; -import java.util.ArrayList; -import java.util.List; +import java.util.Date; import java.util.zip.GZIPInputStream; import net.yacy.cora.document.encoding.UTF8; @@ -47,7 +46,10 @@ import org.apache.commons.compress.archivers.tar.TarArchiveEntry; import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; // this is a new implementation of this parser idiom using multiple documents as result set - +/** + * Parses the tar file and each contained file, + * returns one document with combined content. + */ public class tarParser extends AbstractParser implements Parser { private final static String MAGIC = "ustar"; // A magic for a tar archive, may appear at #101h-#105 @@ -70,8 +72,6 @@ public class tarParser extends AbstractParser implements Parser { final int timezoneOffset, InputStream source) throws Parser.Failure, InterruptedException { - final List docacc = new ArrayList(); - Document[] subDocs = null; final String ext = MultiProtocolURL.getFileExtension(location.getFileName()); if (ext.equals("gz") || ext.equals("tgz")) { try { @@ -82,11 +82,31 @@ public class tarParser extends AbstractParser implements Parser { } TarArchiveEntry entry; final TarArchiveInputStream tis = new TarArchiveInputStream(source); - File tmp = null; - + + // create maindoc for this tar container + Document maindoc = new Document( + location, + mimeType, + charset, + this, + null, + null, + null, + null, + null, + null, + null, + 0.0d, 0.0d, + (Object) null, + null, + null, + null, + false, + new Date()); // loop through the elements in the tar file and 
parse every single file inside while (true) { try { + File tmp = null; entry = tis.getNextTarEntry(); if (entry == null) break; if (entry.isDirectory() || entry.getSize() <= 0) continue; @@ -96,9 +116,9 @@ public class tarParser extends AbstractParser implements Parser { try { tmp = FileUtils.createTempFile(this.getClass(), name); FileUtils.copy(tis, tmp, entry.getSize()); - subDocs = TextParser.parseSource(AnchorURL.newAnchor(location, "#" + name), mime, null, scraper, timezoneOffset, 999, tmp); + final Document[] subDocs = TextParser.parseSource(AnchorURL.newAnchor(location, "#" + name), mime, null, scraper, timezoneOffset, 999, tmp); if (subDocs == null) continue; - for (final Document d: subDocs) docacc.add(d); + maindoc.addSubDocuments(subDocs); } catch (final Parser.Failure e) { AbstractParser.log.warn("tar parser entry " + name + ": " + e.getMessage()); } finally { @@ -109,8 +129,7 @@ public class tarParser extends AbstractParser implements Parser { break; } } - if (docacc.isEmpty()) return null; - return docacc.toArray(new Document[docacc.size()]); + return new Document[]{maindoc}; } public final static boolean isTar(File f) { diff --git a/source/net/yacy/document/parser/zipParser.java b/source/net/yacy/document/parser/zipParser.java index a924a6e03..155d669ba 100644 --- a/source/net/yacy/document/parser/zipParser.java +++ b/source/net/yacy/document/parser/zipParser.java @@ -27,8 +27,7 @@ package net.yacy.document.parser; import java.io.File; import java.io.IOException; import java.io.InputStream; -import java.util.ArrayList; -import java.util.List; +import java.util.Date; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; @@ -43,7 +42,11 @@ import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.MemoryControl; // this is a new implementation of this parser idiom using multiple documents as result set - +/** + * Parses Zip archives. Creates a main document for the zip url/file. 
+ * Each file in the zip is parsed and the result added to the main document. + * parse returns one document with the combined content. + */ public class zipParser extends AbstractParser implements Parser { public zipParser() { @@ -74,15 +77,33 @@ public class zipParser extends AbstractParser implements Parser { if (!MemoryControl.request(200 * 1024 * 1024, false)) throw new Parser.Failure("Not enough Memory available for zip parser: " + MemoryControl.available(), location); - Document[] docs = null; - final List docacc = new ArrayList(); ZipEntry entry; final ZipInputStream zis = new ZipInputStream(source); - File tmp = null; + // create maindoc for this zip container with supplied url and mime + Document maindoc = new Document( + location, + mimeType, + charset, + this, + null, + null, + null, + null, + null, + null, + null, + 0.0d, 0.0d, + (Object)null, + null, + null, + null, + false, + new Date()); // loop through the elements in the zip file and parse every single file inside while (true) { try { + File tmp = null; if (zis.available() <= 0) break; entry = zis.getNextEntry(); if (entry == null) break; @@ -95,9 +116,9 @@ public class zipParser extends AbstractParser implements Parser { FileUtils.copy(zis, tmp, entry.getSize()); final DigestURL virtualURL = DigestURL.newURL(location, "#" + name); //this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false)); - docs = TextParser.parseSource(new AnchorURL(virtualURL), mime, null, scraper, timezoneOffset, 999, tmp); + final Document[] docs = TextParser.parseSource(new AnchorURL(virtualURL), mime, null, scraper, timezoneOffset, 999, tmp); if (docs == null) continue; - for (final Document d: docs) docacc.add(d); + maindoc.addSubDocuments(docs); } catch (final Parser.Failure e) { AbstractParser.log.warn("ZIP parser entry " + name + ": " + e.getMessage()); } finally { @@ -108,7 +129,6 @@ public class zipParser extends AbstractParser implements Parser { break; } } - if (docacc.isEmpty()) return null; - 
return docacc.toArray(new Document[docacc.size()]); + return new Document[]{maindoc}; } }