From be928815fcb15132354aa8c217b6610f6e37b3ce Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Sun, 17 Jun 2012 17:18:19 +0200
Subject: [PATCH] fixed wrong parsing of style and script tag content

---
 .../net/yacy/document/parser/html/ContentScraper.java  |  6 ++++--
 .../yacy/document/parser/html/TransformerWriter.java   | 10 ++++++----
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java
index fe2f8c559..386722b59 100644
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -99,7 +99,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         strong(TagType.pair),
         i(TagType.pair),
         li(TagType.pair),
-        script(TagType.pair);
+        script(TagType.pair),
+        style(TagType.pair);
 
         public TagType type;
         private Tag(final TagType type) {
@@ -201,6 +202,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     @Override
     public void scrapeText(final char[] newtext, final String insideTag) {
         // System.out.println("SCRAPE: " + UTF8.String(newtext));
+        if (insideTag != null && ("script".equals(insideTag) || "style".equals(insideTag))) return;
         int p, pl, q, s = 0;
 
         // match evaluation pattern
@@ -434,7 +436,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     }
 
     @Override
-    public void scrapeTag1(final String tagname, final Properties tagopts, final char[] text) {
+    public void scrapeTag1(final String tagname, final Properties tagopts, char[] text) {
         // System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + UTF8.String(text));
         if (tagname.equalsIgnoreCase("a") && text.length < 2048) {
             final String href = tagopts.getProperty("href", EMPTY_STRING);
diff --git a/source/net/yacy/document/parser/html/TransformerWriter.java b/source/net/yacy/document/parser/html/TransformerWriter.java
index e2db4aa77..46c52dcaf 100644
--- a/source/net/yacy/document/parser/html/TransformerWriter.java
+++ b/source/net/yacy/document/parser/html/TransformerWriter.java
@@ -202,7 +202,7 @@ public final class TransformerWriter extends Writer {
 
             if (tag == null) {
                 // case (1): this is not a tag opener/closer
-                if (this.scraper != null) this.scraper.scrapeText(content, null);
+                if (this.scraper != null && content.length > 0) this.scraper.scrapeText(content, null);
                 if (this.transformer != null) return this.transformer.transformText(content);
                 return content;
             }
@@ -222,7 +222,9 @@ public final class TransformerWriter extends Writer {
         // we are collection tag text for the tag 'filterTag' -> case (4) - (7)
         if (tag == null || tag.equals("!")) {
             // case (4): getting no tag, go on collecting content
-            if (this.scraper != null) this.scraper.scrapeText(content, this.filterTag);
+            if (this.scraper != null) {
+                this.scraper.scrapeText(content, this.filterTag);
+            }
             if (this.transformer != null) {
                 this.filterCont.append(this.transformer.transformText(content));
             } else {
@@ -330,7 +332,7 @@ public final class TransformerWriter extends Writer {
             if (in[1] == '/') {
                 // a closing tag
                 tagend = tagEnd(in, 2);
-                tag = new String(in, 2, tagend - 2);
+                tag = new String(in, 2, tagend - 2).toLowerCase();
                 final char[] text = new char[in.length - tagend - 1];
                 System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
                 return filterTag(tag, false, text, quotechar);
@@ -338,7 +340,7 @@ public final class TransformerWriter extends Writer {
 
             // an opening tag
             tagend = tagEnd(in, 1);
-            tag = new String(in, 1, tagend - 1);
+            tag = new String(in, 1, tagend - 1).toLowerCase();
             final char[] text = new char[in.length - tagend - 1];
             System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
             return filterTag(tag, true, text, quotechar);