From be928815fcb15132354aa8c217b6610f6e37b3ce Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sun, 17 Jun 2012 17:18:19 +0200 Subject: [PATCH] fixed wrong parsing of style and script --- .../net/yacy/document/parser/html/ContentScraper.java | 6 ++++-- .../yacy/document/parser/html/TransformerWriter.java | 10 ++++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index fe2f8c559..386722b59 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -99,7 +99,8 @@ public class ContentScraper extends AbstractScraper implements Scraper { strong(TagType.pair), i(TagType.pair), li(TagType.pair), - script(TagType.pair); + script(TagType.pair), + style(TagType.pair); public TagType type; private Tag(final TagType type) { @@ -201,6 +202,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { @Override public void scrapeText(final char[] newtext, final String insideTag) { // System.out.println("SCRAPE: " + UTF8.String(newtext)); + if (insideTag != null && ("script".equals(insideTag) || "style".equals(insideTag))) return; int p, pl, q, s = 0; // match evaluation pattern @@ -434,7 +436,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { } @Override - public void scrapeTag1(final String tagname, final Properties tagopts, final char[] text) { + public void scrapeTag1(final String tagname, final Properties tagopts, char[] text) { // System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + UTF8.String(text)); if (tagname.equalsIgnoreCase("a") && text.length < 2048) { final String href = tagopts.getProperty("href", EMPTY_STRING); diff --git a/source/net/yacy/document/parser/html/TransformerWriter.java b/source/net/yacy/document/parser/html/TransformerWriter.java index e2db4aa77..46c52dcaf 100644 --- a/source/net/yacy/document/parser/html/TransformerWriter.java +++ b/source/net/yacy/document/parser/html/TransformerWriter.java @@ -202,7 +202,7 @@ public final class TransformerWriter extends Writer { if (tag == null) { // case (1): this is not a tag opener/closer - if (this.scraper != null) this.scraper.scrapeText(content, null); + if (this.scraper != null && content.length > 0) this.scraper.scrapeText(content, null); if (this.transformer != null) return this.transformer.transformText(content); return content; } @@ -222,7 +222,9 @@ public final class TransformerWriter extends Writer { // we are collection tag text for the tag 'filterTag' -> case (4) - (7) if (tag == null || tag.equals("!")) { // case (4): getting no tag, go on collecting content - if (this.scraper != null) this.scraper.scrapeText(content, this.filterTag); + if (this.scraper != null) { + this.scraper.scrapeText(content, this.filterTag); + } if (this.transformer != null) { this.filterCont.append(this.transformer.transformText(content)); } else { @@ -330,7 +332,7 @@ public final class TransformerWriter extends Writer { if (in[1] == '/') { // a closing tag tagend = tagEnd(in, 2); - tag = new String(in, 2, tagend - 2); + tag = new String(in, 2, tagend - 2).toLowerCase(); final char[] text = new char[in.length - tagend - 1]; System.arraycopy(in, tagend, text, 0, in.length - tagend - 1); return filterTag(tag, false, text, quotechar); @@ -338,7 +340,7 @@ public final class TransformerWriter extends Writer { // an opening tag tagend = tagEnd(in, 1); - tag = new String(in, 1, tagend - 1); + tag = new String(in, 1, tagend - 1).toLowerCase(); final char[] text = new char[in.length - tagend - 1]; System.arraycopy(in, tagend, text, 0, in.length - tagend - 1); return filterTag(tag, true, text, quotechar);