reuse code from htmlParser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7184 6c8d7289-2bf4-0310-a012-ef5d649a1542
f1ori 15 years ago
parent daeea96aea
commit 8fe1102452

net/yacy/document/parser/htmlParser.java

@@ -75,13 +75,12 @@ public class htmlParser extends AbstractParser implements Parser {
         SUPPORTED_MIME_TYPES.add("text/csv");
     }
 
-    public Document[] parse(
+    public static ContentScraper parseToScraper(
             final MultiProtocolURI location,
-            final String mimeType,
             final String documentCharset,
-            final InputStream sourceStream) throws Parser.Failure, InterruptedException {
+            final InputStream sourceStream) throws Parser.Failure {
-        // make a scraper and transformer
+        // make a scraper
         final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream,documentCharset,location,null,false);
         String charset = null;
         try {
@@ -95,10 +94,6 @@ public class htmlParser extends AbstractParser implements Parser {
             charset = patchCharsetEncoding(charset);
         }
-        if (documentCharset == null || !documentCharset.equalsIgnoreCase(charset)) {
-            log.logInfo("Charset transformation needed from '" + documentCharset + "' to '" + charset + "' for URL = " + location.toNormalform(true, true));
-        }
         Charset c;
         try {
             c = Charset.forName(charset);
@@ -122,10 +117,18 @@ public class htmlParser extends AbstractParser implements Parser {
             //hfos.close();
             if (writer.binarySuspect()) {
                 final String errorMsg = "Binary data found in resource";
-                log.logSevere("Unable to parse '" + location + "'. " + errorMsg);
-                throw new Parser.Failure(errorMsg,location);
+                throw new Parser.Failure(errorMsg, location);
             }
-            return transformScraper(location, mimeType, documentCharset, scraper);
+            return scraper;
+    }
+
+    public Document[] parse(
+            final MultiProtocolURI location,
+            final String mimeType,
+            final String documentCharset,
+            final InputStream sourceStream) throws Parser.Failure, InterruptedException {
+        return transformScraper(location, mimeType, documentCharset, parseToScraper(location, documentCharset, sourceStream));
     }
 
     private static Document[] transformScraper(final MultiProtocolURI location, final String mimeType, final String charSet, final ContentScraper scraper) {
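
The effect of this first change is that scraping is now reachable without building Document objects: parse() becomes a thin wrapper, and other callers can obtain a ContentScraper directly from the new static parseToScraper(). A minimal sketch of such a call; the example URL, the literal charset, the package of MultiProtocolURI and the getTitle() accessor are illustrative assumptions, not taken from this commit:

import java.io.ByteArrayInputStream;
import java.io.InputStream;

import net.yacy.cora.document.MultiProtocolURI;    // package name assumed for this revision
import net.yacy.document.Parser;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.html.ContentScraper;

public class ParseToScraperSketch {
    public static void main(final String[] args) throws Exception {
        final MultiProtocolURI location = new MultiProtocolURI("http://example.org/");
        final InputStream html = new ByteArrayInputStream(
                "<html><head><title>hello</title></head><body>world</body></html>".getBytes("UTF-8"));
        try {
            // scrape without producing Document[]; the new method throws only Parser.Failure
            final ContentScraper scraper = htmlParser.parseToScraper(location, "UTF-8", html);
            System.out.println(scraper.getTitle());    // accessor assumed to exist on ContentScraper
        } catch (final Parser.Failure e) {
            e.printStackTrace();
        }
    }
}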

net/yacy/repository/LoaderDispatcher.java

@@ -29,7 +29,6 @@ package net.yacy.repository;
 import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.IOException;
-import java.io.Writer;
 import java.net.MalformedURLException;
 import java.util.Arrays;
 import java.util.Date;
@@ -48,8 +47,8 @@ import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
 import net.yacy.document.TextParser;
+import net.yacy.document.parser.htmlParser;
 import net.yacy.document.parser.html.ContentScraper;
-import net.yacy.document.parser.html.TransformerWriter;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.FileUtils;
@@ -327,12 +326,11 @@ public final class LoaderDispatcher {
         byte[] page = (r == null) ? null : r.getContent();
         if (page == null) throw new IOException("no response from url " + location.toString());
 
-        // scrape content
-        final ContentScraper scraper = new ContentScraper(location);
-        final Writer writer = new TransformerWriter(null, null, scraper, null, false);
-        writer.write(new String(page, "UTF-8"));
-
-        return scraper;
+        try {
+            return htmlParser.parseToScraper(location, r.getCharacterEncoding(), new ByteArrayInputStream(page));
+        } catch(Parser.Failure e) {
+            throw new IOException(e.getMessage());
+        }
     }
 
     /**
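
Two caller-visible consequences of the LoaderDispatcher change: the page is no longer force-decoded as UTF-8 (the charset reported by the response is handed to parseToScraper, which can still patch it from the document itself), and scraper failures now surface as a plain IOException. A self-contained sketch of the same wrapping idiom; the helper name, URL, charset and page bytes are assumptions for illustration, only the exception contract mirrors this commit:

import java.io.ByteArrayInputStream;
import java.io.IOException;

import net.yacy.cora.document.MultiProtocolURI;    // package name assumed for this revision
import net.yacy.document.Parser;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.html.ContentScraper;

public class ScrapeOrIOException {
    // mirrors the new LoaderDispatcher body: charset comes from the response, not a constant
    static ContentScraper scrape(final MultiProtocolURI location, final String responseCharset, final byte[] page)
            throws IOException {
        try {
            return htmlParser.parseToScraper(location, responseCharset, new ByteArrayInputStream(page));
        } catch (final Parser.Failure e) {
            // translate the parser failure into the IOException contract used by the loader
            throw new IOException(e.getMessage());
        }
    }

    public static void main(final String[] args) throws Exception {
        final MultiProtocolURI location = new MultiProtocolURI("http://example.org/latin1.html");
        final byte[] page = "<html><head><title>latin-1 page</title></head><body>text</body></html>"
                .getBytes("ISO-8859-1");
        System.out.println(scrape(location, "ISO-8859-1", page).getTitle());    // getTitle() assumed
    }
}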
