fixed documentation and some details of keyword handling

pull/575/head
Michael Peter Christen 2 years ago
parent 5cb7dc1fd7
commit 1c0f50985c

@ -180,7 +180,7 @@ description_txt
## flag shows if description is unique within all indexable documents of the same host with status code 200; if yes and another document appears with same description, the unique-flag is set to false, boolean ## flag shows if description is unique within all indexable documents of the same host with status code 200; if yes and another document appears with same description, the unique-flag is set to false, boolean
#description_unique_b #description_unique_b
## content of keywords tag; words are separated by space ## content of keywords tag; words are separated by comma, semicolon or space
keywords keywords
## character encoding, string ## character encoding, string

@ -76,7 +76,7 @@
<!-- content of author-tag --> <!-- content of author-tag -->
<field name="author" type="text_general" indexed="true" stored="true"/> <field name="author" type="text_general" indexed="true" stored="true"/>
<!-- content of keywords tag; words are separated by space --> <!-- content of keywords tag; words are separated by comma, semicolon or space -->
<field name="keywords" type="text_general" indexed="true" stored="true"/> <field name="keywords" type="text_general" indexed="true" stored="true"/>
<!-- all visible text --> <!-- all visible text -->

File diff suppressed because one or more lines are too long

@ -80,10 +80,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// statics: for initialization of the HTMLFilterAbstractScraper // statics: for initialization of the HTMLFilterAbstractScraper
/** Set of tag names processed as singletons (no end tag, or not processing the eventual end tag) */ /** Set of tag names processed as singletons (no end tag, or not processing the eventual end tag) */
private static final Set<String> linkTags0 = new HashSet<String>(12,0.99f); private static final Set<String> linkTags0 = new HashSet<>(12,0.99f);
/** Set of tag names processed by pairs of start and end tag */ /** Set of tag names processed by pairs of start and end tag */
private static final Set<String> linkTags1 = new HashSet<String>(15,0.99f); private static final Set<String> linkTags1 = new HashSet<>(15,0.99f);
private static final Pattern LB = Pattern.compile("\n"); private static final Pattern LB = Pattern.compile("\n");
@ -147,19 +147,19 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public Properties opts; public Properties opts;
public CharBuffer content; public CharBuffer content;
private TagValency tv; private TagValency tv;
public Tag(final String name, TagValency defaultValency) { public Tag(final String name, final TagValency defaultValency) {
this.name = name; this.name = name;
this.tv = defaultValency; this.tv = defaultValency;
this.opts = new Properties(); this.opts = new Properties();
this.content = new CharBuffer(MAX_TAGSIZE); this.content = new CharBuffer(MAX_TAGSIZE);
} }
public Tag(final String name, TagValency defaultValency, final Properties opts) { public Tag(final String name, final TagValency defaultValency, final Properties opts) {
this.name = name; this.name = name;
this.tv = defaultValency; this.tv = defaultValency;
this.opts = opts; this.opts = opts;
this.content = new CharBuffer(MAX_TAGSIZE); this.content = new CharBuffer(MAX_TAGSIZE);
} }
public Tag(final String name, TagValency defaultValency, final Properties opts, final CharBuffer content) { public Tag(final String name, final TagValency defaultValency, final Properties opts, final CharBuffer content) {
this.name = name; this.name = name;
this.tv = defaultValency; this.tv = defaultValency;
this.opts = opts; this.opts = opts;
@ -173,7 +173,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} }
@Override @Override
public String toString() { public String toString() {
return "<" + name + " " + opts + ">" + content + "</" + name + ">"; return "<" + this.name + " " + this.opts + ">" + this.content + "</" + this.name + ">";
} }
/** @return true when this tag should be ignored from scraping */ /** @return true when this tag should be ignored from scraping */
@ -212,7 +212,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final SizeLimitedMap<String, String> metas; private final SizeLimitedMap<String, String> metas;
private final SizeLimitedMap<String, DigestURL> hreflang, navigation; private final SizeLimitedMap<String, DigestURL> hreflang, navigation;
private LinkedHashSet<String> titles; private final LinkedHashSet<String> titles;
private final List<String> articles; private final List<String> articles;
private final List<Date> startDates, endDates; private final List<Date> startDates, endDates;
//private String headline; //private String headline;
@ -274,7 +274,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final Set<String> valencySwitchTagNames, final Set<String> valencySwitchTagNames,
final TagValency defaultValency, final TagValency defaultValency,
final VocabularyScraper vocabularyScraper, final VocabularyScraper vocabularyScraper,
int timezoneOffset) { final int timezoneOffset) {
// the root value here will not be used to load the resource. // the root value here will not be used to load the resource.
// it is only the reference for relative links // it is only the reference for relative links
super(linkTags0, linkTags1); super(linkTags0, linkTags1);
@ -285,31 +285,31 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.defaultValency = defaultValency; this.defaultValency = defaultValency;
this.timezoneOffset = timezoneOffset; this.timezoneOffset = timezoneOffset;
this.evaluationScores = new Evaluation(); this.evaluationScores = new Evaluation();
this.rss = new SizeLimitedMap<DigestURL, String>(maxLinks); this.rss = new SizeLimitedMap<>(maxLinks);
this.css = new SizeLimitedMap<DigestURL, String>(maxLinks); this.css = new SizeLimitedMap<>(maxLinks);
this.anchors = new ArrayList<AnchorURL>(); this.anchors = new ArrayList<>();
this.images = new ArrayList<ImageEntry>(); this.images = new ArrayList<>();
this.icons = new HashMap<>(); this.icons = new HashMap<>();
this.embeds = new SizeLimitedMap<AnchorURL, EmbedEntry>(maxLinks); this.embeds = new SizeLimitedMap<>(maxLinks);
this.frames = new SizeLimitedSet<AnchorURL>(maxLinks); this.frames = new SizeLimitedSet<>(maxLinks);
this.iframes = new SizeLimitedSet<AnchorURL>(maxLinks); this.iframes = new SizeLimitedSet<>(maxLinks);
this.linkedDataTypes = new SizeLimitedSet<>(maxLinks); this.linkedDataTypes = new SizeLimitedSet<>(maxLinks);
this.metas = new SizeLimitedMap<String, String>(maxLinks); this.metas = new SizeLimitedMap<>(maxLinks);
this.hreflang = new SizeLimitedMap<String, DigestURL>(maxLinks); this.hreflang = new SizeLimitedMap<>(maxLinks);
this.navigation = new SizeLimitedMap<String, DigestURL>(maxLinks); this.navigation = new SizeLimitedMap<>(maxLinks);
this.script = new SizeLimitedSet<AnchorURL>(maxLinks); this.script = new SizeLimitedSet<>(maxLinks);
this.titles = new LinkedHashSet<String>(); this.titles = new LinkedHashSet<>();
this.articles = new ArrayList<String>(); this.articles = new ArrayList<>();
this.startDates = new ArrayList<>(); this.startDates = new ArrayList<>();
this.endDates = new ArrayList<>(); this.endDates = new ArrayList<>();
this.headlines = (List<String>[]) Array.newInstance(ArrayList.class, 6); this.headlines = (List<String>[]) Array.newInstance(ArrayList.class, 6);
for (int i = 0; i < this.headlines.length; i++) this.headlines[i] = new ArrayList<String>(); for (int i = 0; i < this.headlines.length; i++) this.headlines[i] = new ArrayList<>();
this.bold = new ClusteredScoreMap<String>(false); this.bold = new ClusteredScoreMap<>(false);
this.italic = new ClusteredScoreMap<String>(false); this.italic = new ClusteredScoreMap<>(false);
this.underline = new ClusteredScoreMap<String>(false); this.underline = new ClusteredScoreMap<>(false);
this.li = new ArrayList<String>(); this.li = new ArrayList<>();
this.dt = new ArrayList<String>(); this.dt = new ArrayList<>();
this.dd = new ArrayList<String>(); this.dd = new ArrayList<>();
this.content = new CharBuffer(MAX_DOCSIZE, 1024); this.content = new CharBuffer(MAX_DOCSIZE, 1024);
this.htmlFilterEventListeners = new EventListenerList(); this.htmlFilterEventListeners = new EventListenerList();
this.lon = 0.0d; this.lon = 0.0d;
@ -336,10 +336,11 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final Set<String> valencySwitchTagNames, final Set<String> valencySwitchTagNames,
final TagValency defaultValency, final TagValency defaultValency,
final VocabularyScraper vocabularyScraper, final VocabularyScraper vocabularyScraper,
int timezoneOffset) { final int timezoneOffset) {
this(root, Integer.MAX_VALUE, maxLinks, valencySwitchTagNames, defaultValency, vocabularyScraper, timezoneOffset); this(root, Integer.MAX_VALUE, maxLinks, valencySwitchTagNames, defaultValency, vocabularyScraper, timezoneOffset);
} }
@Override
public TagValency defaultValency() { public TagValency defaultValency() {
return this.defaultValency; return this.defaultValency;
} }
@ -360,7 +361,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} }
} }
int p, pl, q, s = 0; int p, pl, q, s = 0;
char[] newtext = CharacterCoding.html2unicode(new String(newtext0)).toCharArray(); final char[] newtext = CharacterCoding.html2unicode(new String(newtext0)).toCharArray();
// match evaluation pattern // match evaluation pattern
this.evaluationScores.match(Element.text, newtext); this.evaluationScores.match(Element.text, newtext);
@ -430,7 +431,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} }
// find absolute URLs inside text // find absolute URLs inside text
final Object[] listeners = this.htmlFilterEventListeners.getListenerList(); final Object[] listeners = this.htmlFilterEventListeners.getListenerList();
List<ContentScraperListener> anchorListeners = new ArrayList<>(); final List<ContentScraperListener> anchorListeners = new ArrayList<>();
for (int i = 0; i < listeners.length; i += 2) { for (int i = 0; i < listeners.length; i += 2) {
if (listeners[i] == ContentScraperListener.class) { if (listeners[i] == ContentScraperListener.class) {
anchorListeners.add((ContentScraperListener)listeners[i+1]); anchorListeners.add((ContentScraperListener)listeners[i+1]);
@ -508,7 +509,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
urls.add(url); urls.add(url);
} }
if(listeners != null) { if(listeners != null) {
for(ContentScraperListener listener : listeners) { for(final ContentScraperListener listener : listeners) {
listener.anchorAdded(url.toNormalform(false)); listener.anchorAdded(url.toNormalform(false));
} }
} }
@ -636,7 +637,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.vocabularyScraper.check(this.root, classprop, tag.content); this.vocabularyScraper.check(this.root, classprop, tag.content);
// itemprop microdata property (standard definition at https://www.w3.org/TR/microdata/#dfn-attr-itemprop) // itemprop microdata property (standard definition at https://www.w3.org/TR/microdata/#dfn-attr-itemprop)
String itemprop = tag.opts.getProperty("itemprop"); final String itemprop = tag.opts.getProperty("itemprop");
if (itemprop != null) { if (itemprop != null) {
String propval = tag.opts.getProperty("content"); // value for <meta itemprop="" content=""> see https://html.spec.whatwg.org/multipage/microdata.html#values String propval = tag.opts.getProperty("content"); // value for <meta itemprop="" content=""> see https://html.spec.whatwg.org/multipage/microdata.html#values
if (propval == null) propval = tag.opts.getProperty("datetime"); // html5 + schema.org#itemprop example: <time itemprop="startDate" datetime="2016-01-26">today</time> while each prop is optional if (propval == null) propval = tag.opts.getProperty("datetime"); // html5 + schema.org#itemprop example: <time itemprop="startDate" datetime="2016-01-26">today</time> while each prop is optional
@ -654,16 +655,16 @@ public class ContentScraper extends AbstractScraper implements Scraper {
case "startDate": // <meta itemprop="startDate" content="2016-04-21T20:00"> case "startDate": // <meta itemprop="startDate" content="2016-04-21T20:00">
try { try {
// parse ISO 8601 date // parse ISO 8601 date
Date startDate = ISO8601Formatter.FORMATTER.parse(propval, this.timezoneOffset).getTime(); final Date startDate = ISO8601Formatter.FORMATTER.parse(propval, this.timezoneOffset).getTime();
this.startDates.add(startDate); this.startDates.add(startDate);
} catch (ParseException e) {} } catch (final ParseException e) {}
break; break;
case "endDate": case "endDate":
try { try {
// parse ISO 8601 date // parse ISO 8601 date
Date endDate = ISO8601Formatter.FORMATTER.parse(propval, this.timezoneOffset).getTime(); final Date endDate = ISO8601Formatter.FORMATTER.parse(propval, this.timezoneOffset).getTime();
this.endDates.add(endDate); this.endDates.add(endDate);
} catch (ParseException e) {} } catch (final ParseException e) {}
break; break;
} }
} }
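
For illustration only: the hunk above collects schema.org `startDate`/`endDate` values from `content` or `datetime` attributes and silently skips anything that does not parse as ISO 8601. A minimal, self-contained sketch of that behaviour, with java.time standing in for YaCy's `ISO8601Formatter` (which additionally applies a timezone offset); the class name and sample values are hypothetical.

```java
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.format.DateTimeParseException;
import java.util.ArrayList;
import java.util.List;

public class ItempropDateSketch {

    // sample values as they could arrive from <meta itemprop="startDate" content="2016-04-21T20:00">
    // or <time itemprop="startDate" datetime="2016-01-26">today</time> (examples taken from the hunk above)
    static final String[] SAMPLES = {"2016-04-21T20:00", "2016-01-26", "not a date"};

    public static void main(String[] args) {
        final List<LocalDate> startDates = new ArrayList<>();
        for (final String propval : SAMPLES) {
            try {
                // full ISO 8601 date-time first ...
                startDates.add(LocalDateTime.parse(propval).toLocalDate());
            } catch (final DateTimeParseException e1) {
                try {
                    // ... then a plain date; anything else is skipped,
                    // like the empty catch blocks in the scraper
                    startDates.add(LocalDate.parse(propval));
                } catch (final DateTimeParseException e2) {
                    // ignore unparseable values
                }
            }
        }
        System.out.println(startDates); // [2016-04-21, 2016-01-26]
    }
}
```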
@ -679,16 +680,16 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* sizes attribute string, may be null * sizes attribute string, may be null
* @return a set of sizes eventually empty. * @return a set of sizes eventually empty.
*/ */
public static Set<Dimension> parseSizes(String sizesAttr) { public static Set<Dimension> parseSizes(final String sizesAttr) {
Set<Dimension> sizes = new HashSet<Dimension>(); final Set<Dimension> sizes = new HashSet<>();
Set<String> tokens = parseSpaceSeparatedTokens(sizesAttr); final Set<String> tokens = parseSpaceSeparatedTokens(sizesAttr);
for (String token : tokens) { for (final String token : tokens) {
/* /*
* "any" keyword may be present, but doesn't have to produce a * "any" keyword may be present, but doesn't have to produce a
* dimension result * dimension result
*/ */
if (token != null) { if (token != null) {
Matcher matcher = IconEntry.SIZE_PATTERN.matcher(token); final Matcher matcher = IconEntry.SIZE_PATTERN.matcher(token);
if (matcher.matches()) { if (matcher.matches()) {
/* With given pattern no NumberFormatException can occur */ /* With given pattern no NumberFormatException can occur */
sizes.add(new Dimension(Integer.parseInt(matcher.group(1)), Integer.parseInt(matcher.group(2)))); sizes.add(new Dimension(Integer.parseInt(matcher.group(1)), Integer.parseInt(matcher.group(2))));
@ -708,11 +709,11 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* @return a set of tokens eventually empty * @return a set of tokens eventually empty
*/ */
public static Set<String> parseSpaceSeparatedTokens(final String attr) { public static Set<String> parseSpaceSeparatedTokens(final String attr) {
Set<String> tokens = new HashSet<>(); final Set<String> tokens = new HashSet<>();
/* Check attr string is not empty to avoid adding a single empty string /* Check attr string is not empty to avoid adding a single empty string
* in result */ * in result */
if (attr != null && !attr.trim().isEmpty()) { if (attr != null && !attr.trim().isEmpty()) {
String[] items = attr.trim().split(CommonPattern.SPACES.pattern()); final String[] items = attr.trim().split(CommonPattern.SPACES.pattern());
Collections.addAll(tokens, items); Collections.addAll(tokens, items);
} }
return tokens; return tokens;
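
For illustration, a self-contained sketch of how `parseSpaceSeparatedTokens` and `parseSizes` work together on a link's `sizes` attribute. The regex is only a stand-in for `IconEntry.SIZE_PATTERN`, a plain `\s+` split replaces the precompiled `CommonPattern.SPACES`, and the class and helper names are hypothetical.

```java
import java.awt.Dimension;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class SizesAttrSketch {

    // stand-in for IconEntry.SIZE_PATTERN: "<width>x<height>" as used in the HTML sizes attribute
    private static final Pattern SIZE = Pattern.compile("([0-9]+)[xX]([0-9]+)");

    // mirrors parseSpaceSeparatedTokens: trim, split on whitespace, never emit a lone empty token
    static Set<String> tokens(final String attr) {
        final Set<String> tokens = new HashSet<>();
        if (attr != null && !attr.trim().isEmpty()) {
            Collections.addAll(tokens, attr.trim().split("\\s+"));
        }
        return tokens;
    }

    public static void main(String[] args) {
        final Set<Dimension> sizes = new HashSet<>();
        for (final String token : tokens("16x16 32x32 any")) {
            final Matcher m = SIZE.matcher(token);
            if (m.matches()) { // the "any" keyword yields no dimension, exactly as in parseSizes
                sizes.add(new Dimension(Integer.parseInt(m.group(1)), Integer.parseInt(m.group(2))));
            }
        }
        System.out.println(sizes); // two dimensions: 16x16 and 32x32
    }
}
```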
@ -723,9 +724,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* @param relTokens relationship tokens (parsed from a rel attribute) * @param relTokens relationship tokens (parsed from a rel attribute)
* @return a Set of icon relations, eventually empty * @return a Set of icon relations, eventually empty
*/ */
public Set<String> retainIconRelations(Collection<String> relTokens) { public Set<String> retainIconRelations(final Collection<String> relTokens) {
HashSet<String> iconRels = new HashSet<>(); final HashSet<String> iconRels = new HashSet<>();
for(String token : relTokens) { for(final String token : relTokens) {
if(IconLinkRelations.isIconRel(token)) { if(IconLinkRelations.isIconRel(token)) {
iconRels.add(token.toLowerCase(Locale.ENGLISH)); iconRels.add(token.toLowerCase(Locale.ENGLISH));
} }
@ -803,7 +804,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final String href = tag.opts.getProperty("href", EMPTY_STRING); final String href = tag.opts.getProperty("href", EMPTY_STRING);
if (href.length() > 0) { if (href.length() > 0) {
tag.opts.put("name", areatitle); tag.opts.put("name", areatitle);
AnchorURL url = absolutePath(href); final AnchorURL url = absolutePath(href);
if(url != null) { if(url != null) {
tag.opts.put("href", url.toNormalform(true)); tag.opts.put("href", url.toNormalform(true));
url.setAll(tag.opts); url.setAll(tag.opts);
@ -816,19 +817,19 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (newLink != null) { if (newLink != null) {
tag.opts.put("href", newLink.toNormalform(true)); tag.opts.put("href", newLink.toNormalform(true));
String rel = tag.opts.getProperty("rel", EMPTY_STRING); final String rel = tag.opts.getProperty("rel", EMPTY_STRING);
/* Rel attribute is supposed to be a set of space-separated tokens */ /* Rel attribute is supposed to be a set of space-separated tokens */
Set<String> relTokens = parseSpaceSeparatedTokens(rel); final Set<String> relTokens = parseSpaceSeparatedTokens(rel);
final String linktitle = tag.opts.getProperty("title", EMPTY_STRING); final String linktitle = tag.opts.getProperty("title", EMPTY_STRING);
final String type = tag.opts.getProperty("type", EMPTY_STRING); final String type = tag.opts.getProperty("type", EMPTY_STRING);
final String hreflang = tag.opts.getProperty("hreflang", EMPTY_STRING); final String hreflang = tag.opts.getProperty("hreflang", EMPTY_STRING);
Set<String> iconRels = retainIconRelations(relTokens); final Set<String> iconRels = retainIconRelations(relTokens);
/* Distinguish icons from images. It will enable for example to later search only images and no icons */ /* Distinguish icons from images. It will enable for example to later search only images and no icons */
if (!iconRels.isEmpty()) { if (!iconRels.isEmpty()) {
String sizesAttr = tag.opts.getProperty("sizes", EMPTY_STRING); final String sizesAttr = tag.opts.getProperty("sizes", EMPTY_STRING);
Set<Dimension> sizes = parseSizes(sizesAttr); final Set<Dimension> sizes = parseSizes(sizesAttr);
IconEntry icon = this.icons.get(newLink); IconEntry icon = this.icons.get(newLink);
/* There is already an icon with same URL for this document : /* There is already an icon with same URL for this document :
* they may have different rel attribute or different sizes (multi sizes ico file) or this may be a duplicate */ * they may have different rel attribute or different sizes (multi sizes ico file) or this may be a duplicate */
@ -880,7 +881,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} else if(tag.name.equalsIgnoreCase("param")) { } else if(tag.name.equalsIgnoreCase("param")) {
final String name = tag.opts.getProperty("name", EMPTY_STRING); final String name = tag.opts.getProperty("name", EMPTY_STRING);
if (name.equalsIgnoreCase("movie")) { if (name.equalsIgnoreCase("movie")) {
AnchorURL url = absolutePath(tag.opts.getProperty("value", EMPTY_STRING)); final AnchorURL url = absolutePath(tag.opts.getProperty("value", EMPTY_STRING));
if(url != null) { if(url != null) {
tag.opts.put("value", url.toNormalform(true)); tag.opts.put("value", url.toNormalform(true));
url.setAll(tag.opts); url.setAll(tag.opts);
@ -918,7 +919,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
checkOpts(tag); checkOpts(tag);
// System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text)); // System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text));
if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) { if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) {
String href = tag.opts.getProperty("href", EMPTY_STRING); final String href = tag.opts.getProperty("href", EMPTY_STRING);
AnchorURL url; AnchorURL url;
if ((href.length() > 0) && ((url = absolutePath(href)) != null)) { if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
if (followDenied()) { if (followDenied()) {
@ -939,7 +940,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.evaluationScores.match(Element.divid, id); this.evaluationScores.match(Element.divid, id);
final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING); final String itemtype = tag.opts.getProperty("itemtype", EMPTY_STRING);
if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) { if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) {
breadcrumbs++; this.breadcrumbs++;
} }
} else if ((tag.name.equalsIgnoreCase("h1")) && (tag.content.length() < 1024)) { } else if ((tag.name.equalsIgnoreCase("h1")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars()))); h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
@ -990,7 +991,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} else if (tag.name.equalsIgnoreCase("script")) { } else if (tag.name.equalsIgnoreCase("script")) {
final String src = tag.opts.getProperty("src", EMPTY_STRING); final String src = tag.opts.getProperty("src", EMPTY_STRING);
if (src.length() > 0) { if (src.length() > 0) {
AnchorURL absoluteSrc = absolutePath(src); final AnchorURL absoluteSrc = absolutePath(src);
if(absoluteSrc != null) { if(absoluteSrc != null) {
this.script.add(absoluteSrc); this.script.add(absoluteSrc);
} }
@ -1005,9 +1006,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
h = tag.opts.getProperty("datetime"); // TODO: checkOpts() also parses datetime property if in combination with schema.org itemprop=startDate/endDate h = tag.opts.getProperty("datetime"); // TODO: checkOpts() also parses datetime property if in combination with schema.org itemprop=startDate/endDate
if (h != null) { // datetime property is optional if (h != null) { // datetime property is optional
try { try {
Date startDate = ISO8601Formatter.FORMATTER.parse(h, this.timezoneOffset).getTime(); final Date startDate = ISO8601Formatter.FORMATTER.parse(h, this.timezoneOffset).getTime();
this.startDates.add(startDate); this.startDates.add(startDate);
} catch (ParseException ex) { } } catch (final ParseException ex) { }
} }
} }
@ -1049,7 +1050,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* Add an anchor to the anchors list, and trigger any eventual listener * Add an anchor to the anchors list, and trigger any eventual listener
* @param anchor anchor to add. Must not be null. * @param anchor anchor to add. Must not be null.
*/ */
protected void addAnchor(AnchorURL anchor) { protected void addAnchor(final AnchorURL anchor) {
if(this.anchors.size() >= this.maxAnchors) { if(this.anchors.size() >= this.maxAnchors) {
this.maxAnchorsExceeded = true; this.maxAnchorsExceeded = true;
} else { } else {
@ -1067,7 +1068,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public List<String> getTitles() { public List<String> getTitles() {
// some documents have a title tag as meta tag // some documents have a title tag as meta tag
String s = this.metas.get("title"); final String s = this.metas.get("title");
if (s != null && s.length() > 0) { if (s != null && s.length() > 0) {
this.titles.add(s); this.titles.add(s);
} }
@ -1083,7 +1084,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} }
// extract headline from file name // extract headline from file name
ArrayList<String> t = new ArrayList<String>(); final ArrayList<String> t = new ArrayList<>();
t.addAll(this.titles); t.addAll(this.titles);
return t; return t;
} }
@ -1094,7 +1095,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} }
public String[] getBold() { public String[] getBold() {
final List<String> a = new ArrayList<String>(); final List<String> a = new ArrayList<>();
final Iterator<String> i = this.bold.keys(false); final Iterator<String> i = this.bold.keys(false);
while (i.hasNext()) a.add(i.next()); while (i.hasNext()) a.add(i.next());
return a.toArray(new String[a.size()]); return a.toArray(new String[a.size()]);
@ -1107,7 +1108,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} }
public String[] getItalic() { public String[] getItalic() {
final List<String> a = new ArrayList<String>(); final List<String> a = new ArrayList<>();
final Iterator<String> i = this.italic.keys(false); final Iterator<String> i = this.italic.keys(false);
while (i.hasNext()) a.add(i.next()); while (i.hasNext()) a.add(i.next());
return a.toArray(new String[a.size()]); return a.toArray(new String[a.size()]);
@ -1120,7 +1121,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} }
public String[] getUnderline() { public String[] getUnderline() {
final List<String> a = new ArrayList<String>(); final List<String> a = new ArrayList<>();
final Iterator<String> i = this.underline.keys(false); final Iterator<String> i = this.underline.keys(false);
while (i.hasNext()) a.add(i.next()); while (i.hasNext()) a.add(i.next());
return a.toArray(new String[a.size()]); return a.toArray(new String[a.size()]);
@ -1154,7 +1155,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public DigestURL[] getFlash() { public DigestURL[] getFlash() {
String ext; String ext;
ArrayList<DigestURL> f = new ArrayList<DigestURL>(); final ArrayList<DigestURL> f = new ArrayList<>();
for (final DigestURL url: this.anchors) { for (final DigestURL url: this.anchors) {
ext = MultiProtocolURL.getFileExtension(url.getFileName()); ext = MultiProtocolURL.getFileExtension(url.getFileName());
if (ext == null) continue; if (ext == null) continue;
@ -1323,7 +1324,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public List<String> getDescriptions() { public List<String> getDescriptions() {
String s = this.metas.get("description"); String s = this.metas.get("description");
if (s == null) s = this.metas.get("dc.description"); if (s == null) s = this.metas.get("dc.description");
List<String> descriptions = new ArrayList<String>(); final List<String> descriptions = new ArrayList<>();
if (s == null) return descriptions; if (s == null) return descriptions;
descriptions.add(s); descriptions.add(s);
return descriptions; return descriptions;
@ -1358,7 +1359,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
String s = this.metas.get("content-language"); String s = this.metas.get("content-language");
if (s == null) s = this.metas.get("dc.language"); if (s == null) s = this.metas.get("dc.language");
if (s == null) return null; if (s == null) return null;
final Set<String> hs = new HashSet<String>(); final Set<String> hs = new HashSet<>();
final String[] cl = commaSepPattern.split(s); final String[] cl = commaSepPattern.split(s);
int p; int p;
for (int i = 0; i < cl.length; i++) { for (int i = 0; i < cl.length; i++) {
@ -1378,9 +1379,32 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (s.isEmpty()) { if (s.isEmpty()) {
return new String[0]; return new String[0];
} }
if (s.contains(",")) return commaSepPattern.split(s); String[] k = null;
if (s.contains(";")) return semicSepPattern.split(s); if (s.contains(","))
return s.split("\\s"); k = commaSepPattern.split(s);
else if (s.contains(";"))
k = semicSepPattern.split(s);
else
k = s.split("\\s");
// trim the Strings
for (int i = 0; i < k.length; i++)
k[i] = k[i].trim();
// remove empty strings
int p = 0;
while (p < k.length) {
if (k[p].length() == 0) {
final String[] k1 = new String[k.length - 1];
System.arraycopy(k, 0, k1, 0, p);
System.arraycopy(k, p + 1, k1, p, k1.length - p);
k = k1;
} else {
p++;
}
}
return k;
} }
public int getRefreshSeconds() { public int getRefreshSeconds() {
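
The behavioural core of this commit is the reworked keyword splitting above: choose comma, then semicolon, then whitespace as the separator, then trim the tokens and drop empty ones. Below is a self-contained sketch of the resulting behaviour; the class name is hypothetical, `String.split` stands in for YaCy's precompiled `commaSepPattern`/`semicSepPattern`, and a list filter replaces the `System.arraycopy` loop (behaviourally equivalent).

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class KeywordSplitSketch {

    // mirrors the new keyword handling: pick one separator, then trim and drop empty entries
    static String[] splitKeywords(final String s) {
        final String[] raw;
        if (s.contains(",")) raw = s.split(",");
        else if (s.contains(";")) raw = s.split(";");
        else raw = s.split("\\s");
        final List<String> keywords = new ArrayList<>();
        for (final String k : raw) {
            final String t = k.trim();
            if (!t.isEmpty()) keywords.add(t); // the patch removes empty strings left by "a, ,b" or double spaces
        }
        return keywords.toArray(new String[0]);
    }

    public static void main(String[] args) {
        System.out.println(Arrays.toString(splitKeywords("search, , crawler")));   // [search, crawler]
        System.out.println(Arrays.toString(splitKeywords("search; crawler;p2p"))); // [search, crawler, p2p]
        System.out.println(Arrays.toString(splitKeywords("search crawler  p2p"))); // [search, crawler, p2p]
    }
}
```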
@ -1412,27 +1436,27 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// <meta name="date" content="YYYY-MM-DD..." /> // <meta name="date" content="YYYY-MM-DD..." />
content = this.metas.get("date"); content = this.metas.get("date");
if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (ParseException e) {} if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (final ParseException e) {}
// <meta name="DC.date.modified" content="YYYY-MM-DD" /> // <meta name="DC.date.modified" content="YYYY-MM-DD" />
content = this.metas.get("dc.date.modified"); content = this.metas.get("dc.date.modified");
if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (ParseException e) {} if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (final ParseException e) {}
// <meta name="DC.date.created" content="YYYY-MM-DD" /> // <meta name="DC.date.created" content="YYYY-MM-DD" />
content = this.metas.get("dc.date.created"); content = this.metas.get("dc.date.created");
if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (ParseException e) {} if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (final ParseException e) {}
// <meta name="DC.date" content="YYYY-MM-DD" /> // <meta name="DC.date" content="YYYY-MM-DD" />
content = this.metas.get("dc.date"); content = this.metas.get("dc.date");
if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (ParseException e) {} if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (final ParseException e) {}
// <meta name="DC:date" content="YYYY-MM-DD" /> // <meta name="DC:date" content="YYYY-MM-DD" />
content = this.metas.get("dc:date"); content = this.metas.get("dc:date");
if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (ParseException e) {} if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (final ParseException e) {}
// <meta http-equiv="last-modified" content="YYYY-MM-DD" /> // <meta http-equiv="last-modified" content="YYYY-MM-DD" />
content = this.metas.get("last-modified"); content = this.metas.get("last-modified");
if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (ParseException e) {} if (content != null) try {return ISO8601Formatter.FORMATTER.parse(content, this.timezoneOffset).getTime();} catch (final ParseException e) {}
return new Date(); return new Date();
} }
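
For illustration, the meta-date fallback chain probed in the hunk above, as a self-contained sketch: the first meta field that parses wins. It substitutes java.time for `ISO8601Formatter` and returns null instead of `new Date()` when nothing parses; class and method names are hypothetical.

```java
import java.time.LocalDate;
import java.time.format.DateTimeParseException;
import java.util.HashMap;
import java.util.Map;

public class MetaDateSketch {

    // the meta names probed by the hunk above, in the same order
    static final String[] DATE_METAS = {
        "date", "dc.date.modified", "dc.date.created", "dc.date", "dc:date", "last-modified"
    };

    static LocalDate lastModified(final Map<String, String> metas) {
        for (final String key : DATE_METAS) {
            final String content = metas.get(key);
            if (content == null) continue;
            try {
                return LocalDate.parse(content.substring(0, 10)); // accept "YYYY-MM-DD..." prefixes
            } catch (final DateTimeParseException | StringIndexOutOfBoundsException e) {
                // ignore and fall through to the next candidate, like the empty catch blocks above
            }
        }
        return null; // the real method falls back to new Date() here
    }

    public static void main(String[] args) {
        final Map<String, String> metas = new HashMap<>();
        metas.put("dc.date.modified", "2023-05-17");
        System.out.println(lastModified(metas)); // 2023-05-17
    }
}
```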
@ -1482,7 +1506,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} }
public String[] getEvaluationModelScoreNames(final String modelName) { public String[] getEvaluationModelScoreNames(final String modelName) {
final List<String> a = new ArrayList<String>(); final List<String> a = new ArrayList<>();
final ClusteredScoreMap<String> scores = this.evaluationScores.getScores(modelName); final ClusteredScoreMap<String> scores = this.evaluationScores.getScores(modelName);
if (scores != null) { if (scores != null) {
final Iterator<String> i = scores.keys(false); final Iterator<String> i = scores.keys(false);
@ -1537,7 +1561,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} }
public void print() { public void print() {
for (String t: this.titles) { for (final String t: this.titles) {
System.out.println("TITLE :" + t); System.out.println("TITLE :" + t);
} }
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {

@ -24,13 +24,13 @@ import java.util.Date;
import java.util.List; import java.util.List;
import java.util.Locale; import java.util.Locale;
import net.yacy.cora.federate.solr.SchemaDeclaration;
import net.yacy.cora.federate.solr.SolrType;
import org.apache.poi.ss.formula.atp.DateParser; import org.apache.poi.ss.formula.atp.DateParser;
import org.apache.poi.ss.formula.eval.EvaluationException; import org.apache.poi.ss.formula.eval.EvaluationException;
import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.federate.solr.SchemaDeclaration;
import net.yacy.cora.federate.solr.SolrType;
public enum CollectionSchema implements SchemaDeclaration { public enum CollectionSchema implements SchemaDeclaration {
// mandatory // mandatory
@ -97,7 +97,7 @@ public enum CollectionSchema implements SchemaDeclaration {
description_txt(SolrType.text_general, true, true, true, false, true, "content of description-tag(s)"), description_txt(SolrType.text_general, true, true, true, false, true, "content of description-tag(s)"),
description_exact_signature_l(SolrType.num_long, true, true, false, false, false, "the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of description, used to compute description_unique_b"), description_exact_signature_l(SolrType.num_long, true, true, false, false, false, "the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of description, used to compute description_unique_b"),
description_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if description is unique within all indexable documents of the same host with status code 200; if yes and another document appears with same description, the unique-flag is set to false"), description_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if description is unique within all indexable documents of the same host with status code 200; if yes and another document appears with same description, the unique-flag is set to false"),
keywords(SolrType.text_general, true, true, false, false, true, "content of keywords tag; words are separated by space"), keywords(SolrType.text_general, true, true, false, false, true, "content of keywords tag; words are separated by comma, semicolon or space"),
charset_s(SolrType.string, true, true, false, false, false, "character encoding"), charset_s(SolrType.string, true, true, false, false, false, "character encoding"),
wordcount_i(SolrType.num_integer, true, true, false, false, false, "number of words in visible area"), wordcount_i(SolrType.num_integer, true, true, false, false, false, "number of words in visible area"),
linkscount_i(SolrType.num_integer, true, true, false, false, false, "number of all outgoing links; including linksnofollowcount_i"), linkscount_i(SolrType.num_integer, true, true, false, false, false, "number of all outgoing links; including linksnofollowcount_i"),
@ -286,10 +286,10 @@ public enum CollectionSchema implements SchemaDeclaration {
this.mandatory = mandatory; this.mandatory = mandatory;
this.docValues = (type == SolrType.string || type == SolrType.date || type.name().startsWith("num_")); this.docValues = (type == SolrType.string || type == SolrType.date || type.name().startsWith("num_"));
// verify our naming scheme // verify our naming scheme
String name = this.name(); final String name = this.name();
int p = name.indexOf('_'); final int p = name.indexOf('_');
if (p > 0) { if (p > 0) {
String ext = name.substring(p + 1); final String ext = name.substring(p + 1);
assert !ext.equals("i") || (type == SolrType.num_integer && !multiValued) : name; assert !ext.equals("i") || (type == SolrType.num_integer && !multiValued) : name;
assert !ext.equals("l") || (type == SolrType.num_long && !multiValued) : name; assert !ext.equals("l") || (type == SolrType.num_long && !multiValued) : name;
assert !ext.equals("b") || (type == SolrType.bool && !multiValued) : name; assert !ext.equals("b") || (type == SolrType.bool && !multiValued) : name;
@ -320,7 +320,7 @@ public enum CollectionSchema implements SchemaDeclaration {
* @param theValue = the field name * @param theValue = the field name
*/ */
@Override @Override
public final void setSolrFieldName(String theValue) { public final void setSolrFieldName(final String theValue) {
// make sure no empty string is assigned // make sure no empty string is assigned
if ( (theValue != null) && (!theValue.isEmpty()) ) { if ( (theValue != null) && (!theValue.isEmpty()) ) {
this.solrFieldName = theValue.toLowerCase(Locale.ROOT); this.solrFieldName = theValue.toLowerCase(Locale.ROOT);
@ -444,11 +444,11 @@ public enum CollectionSchema implements SchemaDeclaration {
} else if (this.type == SolrType.date) { } else if (this.type == SolrType.date) {
assert (value.iterator().next() instanceof String) || (value.iterator().next() instanceof Date) : "type: " + value.iterator().next().getClass().getName(); assert (value.iterator().next() instanceof String) || (value.iterator().next() instanceof Date) : "type: " + value.iterator().next().getClass().getName();
if (value.iterator().next() instanceof String) { if (value.iterator().next() instanceof String) {
Date[] da = new Date[value.size()]; final Date[] da = new Date[value.size()];
for (int i = 0; i < value.size(); i++) { for (int i = 0; i < value.size(); i++) {
try { try {
da[i] = DateParser.parseDate((String) value.get(i)).getTime(); da[i] = DateParser.parseDate((String) value.get(i)).getTime();
} catch (EvaluationException e) { } catch (final EvaluationException e) {
da[i] = null; da[i] = null;
} }
} }
