diff --git a/htroot/CrawlStartExpert.html b/htroot/CrawlStartExpert.html
index cba76c34d..78af86373 100644
--- a/htroot/CrawlStartExpert.html
+++ b/htroot/CrawlStartExpert.html
@@ -373,7 +373,7 @@
Filter div class names
diff --git a/source/net/yacy/document/parser/html/AbstractScraper.java b/source/net/yacy/document/parser/html/AbstractScraper.java
index e0980c21b..1f4a5fd0b 100644
--- a/source/net/yacy/document/parser/html/AbstractScraper.java
+++ b/source/net/yacy/document/parser/html/AbstractScraper.java
@@ -65,17 +65,6 @@ public abstract class AbstractScraper implements Scraper {
return (this.tags1 != null) && (this.tags1.contains(tag.toLowerCase()));
}
- //the 'missing' method that shall be implemented:
- @Override
- public abstract void scrapeText(char[] text, String insideTag);
-
- // the other methods must take into account to construct the return value correctly
- @Override
- public abstract void scrapeTag0(ContentScraper.Tag tag);
-
- @Override
- public abstract void scrapeTag1(ContentScraper.Tag tag);
-
public static String stripAllTags(final char[] s) {
if (s.length > 80 && !MemoryControl.request(s.length * 2, false)) return "";
final StringBuilder r = new StringBuilder(s.length);
diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java
index 76981ffc2..1a4d46bab 100644
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -145,6 +145,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public String name;
public Properties opts;
public CharBuffer content;
+
+ /** Set to true when this tag should be ignored from scraping */
+ private boolean ignore = false;
+
public Tag(final String name) {
this.name = name;
this.opts = new Properties();
@@ -174,6 +178,18 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public String toString() {
return "<" + name + " " + opts + ">" + content + "</" + name + ">";
}
+
+ /** @return true when this tag should be ignored from scraping */
+ public boolean isIgnore() {
+ return this.ignore;
+ }
+
+ /**
+ * @param ignore true when this tag should be ignored from scraping
+ */
+ public void setIgnore(final boolean ignore) {
+ this.ignore = ignore;
+ }
}
// all these tags must be given in lowercase, because the tags from the files are compared in lowercase
@@ -216,7 +232,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final int maxAnchors;
private final VocabularyScraper vocabularyScraper;
- private final Set<String> ignore_class_name;
+
+ /** Set of CSS class names whose matching div elements content should be ignored */
+ private final Set<String> ignoreDivClassNames;
+
private final int timezoneOffset;
private int breadcrumbs;
@@ -245,18 +264,19 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* @param root the document root url
* @param maxAnchors the maximum number of URLs to process and store in the anchors property.
* @param maxLinks the maximum number of links (other than a, area, and canonical and stylesheet links) to store
+ * @param ignoreDivClassNames an optional (may be null) set of CSS class names whose matching div elements content should be ignored
* @param vocabularyScraper handles maps from class names to vocabulary names and from documents to a map from vocabularies to terms
* @param timezoneOffset local time zone offset
*/
@SuppressWarnings("unchecked")
- public ContentScraper(final DigestURL root, final int maxAnchors, final int maxLinks, final Set<String> ignore_class_name, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
+ public ContentScraper(final DigestURL root, final int maxAnchors, final int maxLinks, final Set<String> ignoreDivClassNames, final VocabularyScraper vocabularyScraper, int timezoneOffset) {
// the root value here will not be used to load the resource.
// it is only the reference for relative links
super(linkTags0, linkTags1);
assert root != null;
this.root = root;
this.vocabularyScraper = vocabularyScraper;
- this.ignore_class_name = ignore_class_name;
+ this.ignoreDivClassNames = ignoreDivClassNames;
this.timezoneOffset = timezoneOffset;
this.evaluationScores = new Evaluation();
this.rss = new SizeLimitedMap(maxLinks);
@@ -314,9 +334,15 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
@Override
- public void scrapeText(final char[] newtext0, final String insideTag) {
- // System.out.println("SCRAPE: " + UTF8.String(newtext));
- if (insideTag != null && (TagName.script.name().equals(insideTag) || TagName.style.name().equals(insideTag))) return;
+ public void scrapeText(final char[] newtext0, final Tag insideTag) {
+ if (insideTag != null) {
+ if (insideTag.isIgnore()) {
+ return;
+ }
+ if ((TagName.script.name().equals(insideTag.name) || TagName.style.name().equals(insideTag.name))) {
+ return;
+ }
+ }
int p, pl, q, s = 0;
char[] newtext = CharacterCoding.html2unicode(new String(newtext0)).toCharArray();
@@ -377,7 +403,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
// find tags inside text
String b = cleanLine(stripAllTags(newtext));
- if ((insideTag != null) && (!(insideTag.equals("a")))) {
+ if ((insideTag != null) && (!(insideTag.name.equals(TagName.a.name())))) {
// texts inside tags sometimes have no punctuation at the line end
// this is bad for the text semantics, because it is not possible for the
// condenser to distinguish headlines from text beginnings.
@@ -697,6 +723,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
*/
@Override
public void scrapeTag0(final Tag tag) {
+ if (tag.isIgnore()) {
+ return;
+ }
checkOpts(tag);
if (tag.name.equalsIgnoreCase("img")) {
final String src = tag.opts.getProperty("src", EMPTY_STRING);
@@ -861,6 +890,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
*/
@Override
public void scrapeTag1(final Tag tag) {
+ if (tag.isIgnore()) {
+ return;
+ }
checkOpts(tag);
// System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text));
if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) {
@@ -882,18 +914,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
final String h;
if (tag.name.equalsIgnoreCase("div")) {
- final String classn = tag.opts.getProperty("class", EMPTY_STRING);
- if (classn.length() > 0 && this.ignore_class_name.contains(classn)) {
- // we remove everything inside that tag, so it can be ignored
- tag.content.clear();
- } else {
- final String id = tag.opts.getProperty("id", EMPTY_STRING);
- this.evaluationScores.match(Element.divid, id);
- final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING);
- if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) {
- breadcrumbs++;
- }
- }
+ final String id = tag.opts.getProperty("id", EMPTY_STRING);
+ this.evaluationScores.match(Element.divid, id);
+ final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING);
+ if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) {
+ breadcrumbs++;
+ }
} else if ((tag.name.equalsIgnoreCase("h1")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.headlines[0].add(h);
@@ -974,14 +1000,32 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* {@link ContentScraper#linkTags0} and {@link ContentScraper#linkTags1}.
*/
@Override
- public void scrapeAnyTagOpening(final String tagName, final Properties tagAttributes) {
- if (tagAttributes != null) {
+ public void scrapeAnyTagOpening(final Tag tag) {
+ if (tag != null && !tag.isIgnore() && tag.opts != null) {
/*
* HTML microdata can be annotated on any kind of tag, so we don't restrict this
* scraping to the limited sets in linkTags0 and linkTags1
*/
- this.linkedDataTypes.addAll(parseMicrodataItemType(tagAttributes));
+ this.linkedDataTypes.addAll(parseMicrodataItemType(tag.opts));
+ }
+ }
+
+ @Override
+ public boolean shouldIgnoreTag(final Tag tag, final Tag parentTag) {
+ boolean ignore = false;
+
+ /* First, inherit ignore property from eventual parent */
+ if (parentTag != null) {
+ ignore = parentTag.isIgnore();
+ }
+
+ /* Parent is not marked as ignored : let's check the current tag */
+ if (!ignore && this.ignoreDivClassNames != null && tag != null && TagName.div.name().equals(tag.name)) {
+ final String classAttr = tag.opts.getProperty("class", EMPTY_STRING);
+ final Set<String> classes = ContentScraper.parseSpaceSeparatedTokens(classAttr);
+ ignore = !Collections.disjoint(this.ignoreDivClassNames, classes);
}
+ return ignore;
}
/**
diff --git a/source/net/yacy/document/parser/html/Scraper.java b/source/net/yacy/document/parser/html/Scraper.java
index b483d5a8b..704b3560b 100644
--- a/source/net/yacy/document/parser/html/Scraper.java
+++ b/source/net/yacy/document/parser/html/Scraper.java
@@ -24,8 +24,6 @@
package net.yacy.document.parser.html;
-import java.util.Properties;
-
public interface Scraper {
/**
@@ -50,7 +48,12 @@ public interface Scraper {
*/
public boolean isTag1(String tag);
- public void scrapeText(char[] text, String insideTag);
+ /**
+ * Process plain text
+ * @param plain text to process
+ * @param insideTag the eventual direct parent tag. May be null.
+ */
+ public void scrapeText(char[] text, ContentScraper.Tag insideTag);
/**
* Process a tag belonging to the first category of tags according to the Scraper implementation
@@ -66,10 +69,18 @@ public interface Scraper {
/**
* Processing applied to any kind of tag opening.
- * @param tagName the tag name
- * @param tagAttributes the atttributes of the tag
+ * @param tag a parsed tag
*/
- public void scrapeAnyTagOpening(String tagName, Properties tagAttributes);
+ public void scrapeAnyTagOpening(ContentScraper.Tag tag);
+
+ /**
+ * @param tag
+ * a parsed tag
+ * @param parentTag the eventual parent tag
+ * @return true when the tag should be ignored according to the scraper
+ * implementation rules
+ */
+ public boolean shouldIgnoreTag(final ContentScraper.Tag tag, final ContentScraper.Tag parentTag);
public void scrapeComment(final char[] comment);
diff --git a/source/net/yacy/document/parser/html/TransformerWriter.java b/source/net/yacy/document/parser/html/TransformerWriter.java
index eb246a997..1bf300e5e 100644
--- a/source/net/yacy/document/parser/html/TransformerWriter.java
+++ b/source/net/yacy/document/parser/html/TransformerWriter.java
@@ -232,15 +232,19 @@ public final class TransformerWriter extends Writer {
if (this.tagStack.size() == 0) {
// we are not collection tag text -> case (1) - (3)
// case (1): this is not a tag opener/closer
- if (this.scraper != null && content.length > 0) this.scraper.scrapeText(content, null);
- if (this.transformer != null) return this.transformer.transformText(content);
+ if (this.scraper != null && content.length > 0) {
+ this.scraper.scrapeText(content, null);
+ }
+ if (this.transformer != null) {
+ return this.transformer.transformText(content);
+ }
return content;
}
// we are collection tag text for the tag 'filterTag' -> case (4) - (7)
// case (4): getting no tag, go on collecting content
if (this.scraper != null) {
- this.scraper.scrapeText(content, this.tagStack.lastElement().name);
+ this.scraper.scrapeText(content, this.tagStack.lastElement());
}
if (this.transformer != null) {
this.tagStack.lastElement().content.append(this.transformer.transformText(content));
@@ -293,8 +297,22 @@ public final class TransformerWriter extends Writer {
ContentScraper.Tag tag = new ContentScraper.Tag(tagname, charBuffer.propParser());
charBuffer.close();
+ final ContentScraper.Tag parentTag;
+ if(this.tagStack.size() > 0) {
+ parentTag = this.tagStack.lastElement();
+ } else {
+ parentTag = null;
+ }
+
+ /* Check scraper ignoring rules */
+ if (this.scraper != null && this.scraper.shouldIgnoreTag(tag, parentTag)) {
+ tag.setIgnore(true);
+ }
+
/* Apply processing relevant for any kind of tag opening */
- this.scraper.scrapeAnyTagOpening(tag.name, tag.opts);
+ if(this.scraper != null) {
+ this.scraper.scrapeAnyTagOpening(tag);
+ }
if (this.scraper != null && this.scraper.isTag0(tagname)) {
// this single tag is collected at once here
diff --git a/test/java/net/yacy/document/parser/htmlParserTest.java b/test/java/net/yacy/document/parser/htmlParserTest.java
index 4366d8c4b..5c4b62c28 100644
--- a/test/java/net/yacy/document/parser/htmlParserTest.java
+++ b/test/java/net/yacy/document/parser/htmlParserTest.java
@@ -13,6 +13,7 @@ import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
+import java.util.Set;
import org.junit.Test;
@@ -138,6 +139,107 @@ public class htmlParserTest extends TestCase {
}
}
}
+
+ /**
+ * Test the htmlParser.parse() method, when filtering out div elements on their CSS class.
+ *
+ * @throws Exception
+ * when an unexpected error occurred
+ */
+ @Test
+ public void testParseHtmlDivClassFilter() throws Exception {
+ final AnchorURL url = new AnchorURL("http://localhost/test.html");
+ final String mimetype = "text/html";
+ final StringBuilder testHtml = new StringBuilder("Test document");
+
+ testHtml.append("Top text");
+ testHtml.append("
Top link");
+ testHtml.append("
");
+
+ testHtml.append("");
+
+ testHtml.append("A paragraph
");
+
+ testHtml.append("Text-only optional block
");
+
+ testHtml.append("");
+ testHtml.append("
");
+ testHtml.append("
");
+ testHtml.append("
Child text at depth 3
");
+ testHtml.append("
");
+
+ testHtml.append("
");
+
+ final htmlParser parser = new htmlParser();
+
+ /* No CSS class filter */
+ try (InputStream sourceStream = new ByteArrayInputStream(
+ testHtml.toString().getBytes(StandardCharsets.UTF_8));) {
+ final Document[] docs = parser.parse(url, mimetype, null, new VocabularyScraper(), 0, sourceStream);
+ final Document doc = docs[0];
+ final String parsedDext = doc.getTextString();
+
+ /* Check everything has been parsed */
+ assertEquals(2, doc.getAnchors().size());
+ assertEquals(1, doc.getImages().size());
+ assertEquals(1, doc.getLinkedDataTypes().size());
+ assertTrue(parsedDext.contains("Top"));
+ assertTrue(parsedDext.contains("Some"));
+ assertTrue(parsedDext.contains("from"));
+ assertTrue(parsedDext.contains("paragraph"));
+ assertTrue(parsedDext.contains("Text-only"));
+ assertTrue(parsedDext.contains("depth"));
+ }
+
+ /* Filter on CSS classes with no matching elements */
+ try (InputStream sourceStream = new ByteArrayInputStream(
+ testHtml.toString().getBytes(StandardCharsets.UTF_8));) {
+ final Set<String> ignore = new HashSet<>();
+ ignore.add("opt");
+ ignore.add("head");
+ ignore.add("container");
+ final Document[] docs = parser.parse(url, mimetype, null, ignore, new VocabularyScraper(), 0, sourceStream);
+ final Document doc = docs[0];
+ final String parsedDext = doc.getTextString();
+
+ /* Check everything has been parsed */
+ assertEquals(2, doc.getAnchors().size());
+ assertEquals(1, doc.getImages().size());
+ assertEquals(1, doc.getLinkedDataTypes().size());
+ assertTrue(parsedDext.contains("Top"));
+ assertTrue(parsedDext.contains("Some"));
+ assertTrue(parsedDext.contains("from"));
+ assertTrue(parsedDext.contains("paragraph"));
+ assertTrue(parsedDext.contains("Text-only"));
+ assertTrue(parsedDext.contains("depth"));
+ }
+
+ /* Filter on CSS class with matching elements */
+ try (InputStream sourceStream = new ByteArrayInputStream(
+ testHtml.toString().getBytes(StandardCharsets.UTF_8));) {
+ final Set<String> ignore = new HashSet<>();
+ ignore.add("optional");
+ final Document[] docs = parser.parse(url, mimetype, null, ignore, new VocabularyScraper(), 0, sourceStream);
+ final Document doc = docs[0];
+ final String parsedDext = doc.getTextString();
+
+ /* Check matching blocks have been ignored */
+ assertEquals(1, doc.getAnchors().size());
+ assertEquals("http://localhost/top.html", doc.getAnchors().iterator().next().toString());
+ assertEquals(0, doc.getLinkedDataTypes().size());
+ assertEquals(0, doc.getImages().size());
+ assertFalse(parsedDext.contains("Some"));
+ assertFalse(parsedDext.contains("from"));
+ assertFalse(parsedDext.contains("depth"));
+
+ /* Check non-matching blocks have been normally parsed */
+ assertTrue(parsedDext.contains("Top"));
+ assertTrue(parsedDext.contains("Text-only"));
+ assertTrue(parsedDext.contains("paragraph"));
+ }
+ }
/**
* Test the htmlParser.parseWithLimits() method with test content within bounds.