reuse code from htmlParser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7184 6c8d7289-2bf4-0310-a012-ef5d649a1542
f1ori 15 years ago
parent daeea96aea
commit 8fe1102452

net/yacy/document/parser/htmlParser.java

@@ -75,13 +75,12 @@ public class htmlParser extends AbstractParser implements Parser {
         SUPPORTED_MIME_TYPES.add("text/csv");
     }
 
-    public Document[] parse(
+    public static ContentScraper parseToScraper(
             final MultiProtocolURI location,
-            final String mimeType,
             final String documentCharset,
-            final InputStream sourceStream) throws Parser.Failure, InterruptedException {
+            final InputStream sourceStream) throws Parser.Failure {
-        // make a scraper and transformer
+        // make a scraper
         final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream,documentCharset,location,null,false);
         String charset = null;
         try {
@@ -95,10 +94,6 @@ public class htmlParser extends AbstractParser implements Parser {
             charset = patchCharsetEncoding(charset);
         }
-        if (documentCharset == null || !documentCharset.equalsIgnoreCase(charset)) {
-            log.logInfo("Charset transformation needed from '" + documentCharset + "' to '" + charset + "' for URL = " + location.toNormalform(true, true));
-        }
         Charset c;
         try {
             c = Charset.forName(charset);
@@ -122,10 +117,18 @@ public class htmlParser extends AbstractParser implements Parser {
             //hfos.close();
             if (writer.binarySuspect()) {
                 final String errorMsg = "Binary data found in resource";
-                log.logSevere("Unable to parse '" + location + "'. " + errorMsg);
-                throw new Parser.Failure(errorMsg,location);
+                throw new Parser.Failure(errorMsg, location);
             }
-            return transformScraper(location, mimeType, documentCharset, scraper);
+            return scraper;
+    }
+
+    public Document[] parse(
+            final MultiProtocolURI location,
+            final String mimeType,
+            final String documentCharset,
+            final InputStream sourceStream) throws Parser.Failure, InterruptedException {
+        return transformScraper(location, mimeType, documentCharset, parseToScraper(location, documentCharset, sourceStream));
     }
 
     private static Document[] transformScraper(final MultiProtocolURI location, final String mimeType, final String charSet, final ContentScraper scraper) {
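
The effect of this first change is that scraping is now reachable without building Document objects: parse() becomes a thin wrapper, and other callers can obtain a ContentScraper directly from the new static parseToScraper(). A minimal sketch of such a call; the example URL, the literal charset, the package of MultiProtocolURI and the getTitle() accessor are illustrative assumptions, not taken from this commit:

import java.io.ByteArrayInputStream;
import java.io.InputStream;

import net.yacy.cora.document.MultiProtocolURI;    // package name assumed for this revision
import net.yacy.document.Parser;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.html.ContentScraper;

public class ParseToScraperSketch {
    public static void main(final String[] args) throws Exception {
        final MultiProtocolURI location = new MultiProtocolURI("http://example.org/");
        final InputStream html = new ByteArrayInputStream(
                "<html><head><title>hello</title></head><body>world</body></html>".getBytes("UTF-8"));
        try {
            // scrape without producing Document[]; the new method throws only Parser.Failure
            final ContentScraper scraper = htmlParser.parseToScraper(location, "UTF-8", html);
            System.out.println(scraper.getTitle());    // accessor assumed to exist on ContentScraper
        } catch (final Parser.Failure e) {
            e.printStackTrace();
        }
    }
}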

net/yacy/repository/LoaderDispatcher.java

@@ -29,7 +29,6 @@ package net.yacy.repository;
 import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.IOException;
-import java.io.Writer;
 import java.net.MalformedURLException;
 import java.util.Arrays;
 import java.util.Date;
@@ -48,8 +47,8 @@ import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
 import net.yacy.document.TextParser;
+import net.yacy.document.parser.htmlParser;
 import net.yacy.document.parser.html.ContentScraper;
-import net.yacy.document.parser.html.TransformerWriter;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.FileUtils;
@@ -327,12 +326,11 @@ public final class LoaderDispatcher {
         byte[] page = (r == null) ? null : r.getContent();
         if (page == null) throw new IOException("no response from url " + location.toString());
 
-        // scrape content
-        final ContentScraper scraper = new ContentScraper(location);
-        final Writer writer = new TransformerWriter(null, null, scraper, null, false);
-        writer.write(new String(page, "UTF-8"));
-
-        return scraper;
+        try {
+            return htmlParser.parseToScraper(location, r.getCharacterEncoding(), new ByteArrayInputStream(page));
+        } catch(Parser.Failure e) {
+            throw new IOException(e.getMessage());
+        }
     }
 
     /**
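
Two caller-visible consequences of the LoaderDispatcher change: the page is no longer force-decoded as UTF-8 (the charset reported by the response is handed to parseToScraper, which can still patch it from the document itself), and scraper failures now surface as a plain IOException. A self-contained sketch of the same wrapping idiom; the helper name, URL, charset and page bytes are assumptions for illustration, only the exception contract mirrors this commit:

import java.io.ByteArrayInputStream;
import java.io.IOException;

import net.yacy.cora.document.MultiProtocolURI;    // package name assumed for this revision
import net.yacy.document.Parser;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.html.ContentScraper;

public class ScrapeOrIOException {
    // mirrors the new LoaderDispatcher body: charset comes from the response, not a constant
    static ContentScraper scrape(final MultiProtocolURI location, final String responseCharset, final byte[] page)
            throws IOException {
        try {
            return htmlParser.parseToScraper(location, responseCharset, new ByteArrayInputStream(page));
        } catch (final Parser.Failure e) {
            // translate the parser failure into the IOException contract used by the loader
            throw new IOException(e.getMessage());
        }
    }

    public static void main(final String[] args) throws Exception {
        final MultiProtocolURI location = new MultiProtocolURI("http://example.org/latin1.html");
        final byte[] page = "<html><head><title>latin-1 page</title></head><body>text</body></html>"
                .getBytes("ISO-8859-1");
        System.out.println(scrape(location, "ISO-8859-1", page).getTitle());    // getTitle() assumed
    }
}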
