From f254fcfc67d0ed8c585987c4815c5da885a1159f Mon Sep 17 00:00:00 2001 From: reger Date: Fri, 24 Feb 2017 01:25:32 +0100 Subject: [PATCH] fix htmlParser handling of text inside script and style tags. Possible side effect, missing end-tag will truncate trailing content text. --- .../document/parser/html/ContentScraper.java | 2 +- .../parser/html/TransformerWriter.java | 6 ++++ .../yacy/document/parser/htmlParserTest.java | 31 +++++++++++++++++++ 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 8907ec08b..f83265fa9 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -274,7 +274,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { @Override public void scrapeText(final char[] newtext0, final String insideTag) { // System.out.println("SCRAPE: " + UTF8.String(newtext)); - if (insideTag != null && ("script".equals(insideTag) || "style".equals(insideTag))) return; + if (insideTag != null && (TagName.script.name().equals(insideTag) || TagName.style.name().equals(insideTag))) return; int p, pl, q, s = 0; char[] newtext = CharacterCoding.html2unicode(new String(newtext0)).toCharArray(); diff --git a/source/net/yacy/document/parser/html/TransformerWriter.java b/source/net/yacy/document/parser/html/TransformerWriter.java index 2f63ffb74..8c745cdbc 100644 --- a/source/net/yacy/document/parser/html/TransformerWriter.java +++ b/source/net/yacy/document/parser/html/TransformerWriter.java @@ -39,6 +39,7 @@ import java.nio.charset.Charset; import java.util.Enumeration; import java.util.Properties; import java.util.Stack; +import net.yacy.document.parser.html.ContentScraper.TagName; import net.yacy.kelondro.io.CharBuffer; @@ -199,6 +200,11 @@ public final class TransformerWriter extends Writer { return filterTag(text, quotechar, tag, false); } + // don't add text from within \n" + "\n" 
+ + "\n" + + "" + textSource + "\n" + + ""; + ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10); + + System.out.println(scraper.getText()); + String txt = scraper.getText(); + System.out.println("ScraperTagTest: [" + textSource + "] = [" + txt + "]"); + assertEquals(txt, textSource); + } }