From 8acae852a0dba1db846ff0a98c5972c4753db046 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 25 Jun 2014 11:51:11 +0200 Subject: [PATCH] write <em>-tagged texts also into the bold_txt field --- source/net/yacy/document/parser/html/ContentScraper.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 896ad89f0..a41a9cc4d 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -109,6 +109,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { h6(TagType.pair), title(TagType.pair), b(TagType.pair), + em(TagType.pair), strong(TagType.pair), u(TagType.pair), i(TagType.pair), @@ -563,6 +564,9 @@ public class ContentScraper extends AbstractScraper implements Scraper { } else if ((tag.name.equalsIgnoreCase("strong")) && (tag.content.length() < 1024)) { h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars()))); if (h.length() > 0) this.bold.inc(h); + } else if ((tag.name.equalsIgnoreCase("em")) && (tag.content.length() < 1024)) { + h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars()))); + if (h.length() > 0) this.bold.inc(h); } else if ((tag.name.equalsIgnoreCase("i")) && (tag.content.length() < 1024)) { h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars()))); if (h.length() > 0) this.italic.inc(h);