From f254fcfc67d0ed8c585987c4815c5da885a1159f Mon Sep 17 00:00:00 2001
From: reger <reger18@arcor.de>
Date: Fri, 24 Feb 2017 01:25:32 +0100
Subject: [PATCH] fix htmlParser <script> text extraction on code containing
 expression recognized as tag like 1<a reported in
 https://github.com/yacy/yacy_search_server/issues/109

Script content is ignored by default, but the text is still filtered for
html tags. Modified the scraper to skip tag filtering while within a <script>
section (until a closing </script> tag is detected).
Possible side effect: a missing </script> end tag will truncate the trailing
content text.
---
 .../document/parser/html/ContentScraper.java  |  2 +-
 .../parser/html/TransformerWriter.java        |  6 ++++
 .../yacy/document/parser/htmlParserTest.java  | 31 +++++++++++++++++++
 3 files changed, 38 insertions(+), 1 deletion(-)
diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java
index 8907ec08b..f83265fa9 100644
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -274,7 +274,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     @Override
     public void scrapeText(final char[] newtext0, final String insideTag) {
         // System.out.println("SCRAPE: " + UTF8.String(newtext));
-        if (insideTag != null && ("script".equals(insideTag) || "style".equals(insideTag))) return;
+        if (insideTag != null && (TagName.script.name().equals(insideTag) || TagName.style.name().equals(insideTag))) return;
         int p, pl, q, s = 0;
         char[] newtext = CharacterCoding.html2unicode(new String(newtext0)).toCharArray();
         
diff --git a/source/net/yacy/document/parser/html/TransformerWriter.java b/source/net/yacy/document/parser/html/TransformerWriter.java
index 2f63ffb74..8c745cdbc 100644
--- a/source/net/yacy/document/parser/html/TransformerWriter.java
+++ b/source/net/yacy/document/parser/html/TransformerWriter.java
@@ -39,6 +39,7 @@ import java.nio.charset.Charset;
 import java.util.Enumeration;
 import java.util.Properties;
 import java.util.Stack;
+import net.yacy.document.parser.html.ContentScraper.TagName;
 
 import net.yacy.kelondro.io.CharBuffer;
 
@@ -199,6 +200,11 @@ public final class TransformerWriter extends Writer {
             return filterTag(text, quotechar, tag, false);
         }
 
+        // don't add text from within <script> section, here e.g. a "if 1<a" expression could confuse tag detection
+        if (this.tagStack.size()>0 && this.tagStack.lastElement().name.equals(TagName.script.name())) {
+            return new char[0];
+        }
+
         // an opening tag
         tagend = tagEnd(in, 1);
         tag = new String(in, 1, tagend - 1).toLowerCase();
diff --git a/test/java/net/yacy/document/parser/htmlParserTest.java b/test/java/net/yacy/document/parser/htmlParserTest.java
index 97ce36717..02c79e651 100644
--- a/test/java/net/yacy/document/parser/htmlParserTest.java
+++ b/test/java/net/yacy/document/parser/htmlParserTest.java
@@ -3,6 +3,7 @@ package net.yacy.document.parser;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
+import java.io.IOException;
 import java.net.MalformedURLException;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
@@ -141,4 +142,34 @@ public class htmlParserTest extends TestCase {
         System.out.println("ScraperTagTest: [" + textSource + "] = [" + txt + "]");
         assertEquals(txt, textSource);
     }
+
+    /**
+     * Verifies that htmlParser.parseToScraper ignores the content of a
+     * {@code <script>} section even when the script code contains text that looks
+     * like an opening tag (e.g. {@code 50<a}); see https://github.com/yacy/yacy_search_server/issues/109
+     */
+    @Test
+    public void testParteToScraper_ScriptTag() throws MalformedURLException, IOException {
+        final AnchorURL url = new AnchorURL("http://localhost/");
+        final String charset = StandardCharsets.UTF_8.name();
+        final String textSource = "test text";
+        // extract from test case provided by https://github.com/yacy/yacy_search_server/issues/109
+        String testhtml = "<!doctype html>"
+                + "<html class=\"a-no-js\" data-19ax5a9jf=\"dingo\">"
+                + "<head><script>var aPageStart = (new Date()).getTime();</script><meta charset=\"utf-8\"><!--  emit CSM JS -->\n"
+                + "<script>\n"
+                + "function D(){if(E){var a=f.innerWidth?{w:f.innerWidth,h:f.innerHeight}:{w:k.clientWidth,h:k.clientHeight};5<Math.abs(a.w-\n"
+                // "50<a" in the next line resembles an opening <a tag and is the possible error case
+                + "P.w)||50<a.h-P.h?(P=a,Q=4,(a=l.mobile||l.tablet?450<a.w&&a.w>a.h:1250==a.w)?C(k,\"a-ws\"):ca(k,\"a-ws\")):Q--&&(ea=setTimeout(D,16))}}function na(a){(E=void 0===a?!E:!!a)&&D()}"
+                + "</script>\n"
+                + "</head>\n"
+                + "<body>" + textSource + "</body>\n"
+                + "</html>";
+        ContentScraper scraper = parseToScraper(url, charset, new VocabularyScraper(), 0, testhtml, 10);
+
+        final String txt = scraper.getText();
+        System.out.println("ScraperTagTest: [" + textSource + "] = [" + txt + "]");
+        // JUnit's assertEquals takes (expected, actual); this order keeps failure messages correct
+        assertEquals(textSource, txt);
+    }
 }