From be67c70a4764c1aaa1c521dfef6711d4d462fc87 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Fri, 7 Sep 2012 22:06:51 +0200 Subject: [PATCH] added Solr fields: inboundlinks_text_chars_val inboundlinks_text_words_val inboundlinks_alttag_txt outboundlinks_text_chars_val outboundlinks_text_words_val outboundlinks_alttag_txt --- defaults/solr.keys.list | 18 ++++++ .../yacy/search/index/SolrConfiguration.java | 61 +++++++++++++------ source/net/yacy/search/index/YaCySchema.java | 6 ++ 3 files changed, 66 insertions(+), 19 deletions(-) diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list index 7ede1c94a..f24602d02 100644 --- a/defaults/solr.keys.list +++ b/defaults/solr.keys.list @@ -195,6 +195,15 @@ h6_txt ## internal links, the text content of the a-tag #inboundlinks_text_txt +## internal links, the length of the a-tag as number of characters +#inboundlinks_text_chars_val + +## internal links, the length of the a-tag as number of words +#inboundlinks_text_words_val + +##if the link is an image link, this contains the alt tag if the image is also liked as img link +#inboundlinks_alttag_txt + ## external links, normalized (absolute URLs), as - tag with anchor text and nofollow #outboundlinks_tag_txt @@ -216,6 +225,15 @@ h6_txt ## external links, the text content of the a-tag #outboundlinks_text_txt +## external links, the length of the a-tag as number of characters +#outboundlinks_text_chars_val + +## external links, the length of the a-tag as number of words +#outboundlinks_text_words_val + +##if the link is an image link, this contains the alt tag if the image is also liked as img link +#outboundlinks_alttag_txt + ## all image tags, encoded as tag inclusive alt- and title property #images_tag_txt diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java index 0799f7212..98659fe20 100644 --- a/source/net/yacy/search/index/SolrConfiguration.java +++ 
b/source/net/yacy/search/index/SolrConfiguration.java @@ -32,6 +32,7 @@ import java.net.MalformedURLException; import java.util.ArrayList; import java.util.Collection; import java.util.Date; +import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -419,12 +420,14 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable // get list of all links; they will be shrinked by urls that appear in other fields of the solr scheme Set inboundLinks = yacydoc.inboundLinks(); - Set ouboundLinks = yacydoc.outboundLinks(); + Set outboundLinks = yacydoc.outboundLinks(); int c = 0; final Object parser = yacydoc.getParserObject(); + Map images = new HashMap(); if (parser instanceof ContentScraper) { final ContentScraper html = (ContentScraper) parser; + images = html.getImages(); // header tags int h = 0; @@ -505,7 +508,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable if (li.length > 0) add(doc, YaCySchema.li_txt, li); // images - final Collection imagesc = html.getImages().values(); + final Collection imagesc = images.values(); final List imgtags = new ArrayList(imagesc.size()); final List imgprots = new ArrayList(imagesc.size()); final List imgstubs = new ArrayList(imagesc.size()); @@ -513,7 +516,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable for (final ImageEntry ie: imagesc) { final MultiProtocolURI uri = ie.url(); inboundLinks.remove(uri); - ouboundLinks.remove(uri); + outboundLinks.remove(uri); imgtags.add(ie.toString()); String protocol = uri.getProtocol(); imgprots.add(protocol); @@ -535,7 +538,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable for (final Map.Entry entry: csss.entrySet()) { final String url = entry.getKey().toNormalform(false, false); inboundLinks.remove(url); - ouboundLinks.remove(url); + outboundLinks.remove(url); css_tag[c] = ""; @@ -554,7 +557,7 @@ public class SolrConfiguration 
extends ConfigurationSet implements Serializable c = 0; for (final MultiProtocolURI url: scriptss) { inboundLinks.remove(url); - ouboundLinks.remove(url); + outboundLinks.remove(url); scripts[c++] = url.toNormalform(false, false); } add(doc, YaCySchema.scriptscount_i, scripts.length); @@ -568,7 +571,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable c = 0; for (final MultiProtocolURI url: framess) { inboundLinks.remove(url); - ouboundLinks.remove(url); + outboundLinks.remove(url); frames[c++] = url.toNormalform(false, false); } add(doc, YaCySchema.framesscount_i, frames.length); @@ -582,7 +585,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable c = 0; for (final MultiProtocolURI url: iframess) { inboundLinks.remove(url); - ouboundLinks.remove(url); + outboundLinks.remove(url); iframes[c++] = url.toNormalform(false, false); } add(doc, YaCySchema.iframesscount_i, iframes.length); @@ -594,7 +597,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable final MultiProtocolURI canonical = html.getCanonical(); if (canonical != null) { inboundLinks.remove(canonical); - ouboundLinks.remove(canonical); + outboundLinks.remove(canonical); add(doc, YaCySchema.canonical_t, canonical.toNormalform(false, false)); } } @@ -608,7 +611,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable refreshURL = refresh.startsWith("http") ? 
new MultiProtocolURI(html.getRefreshPath()) : new MultiProtocolURI(digestURI, html.getRefreshPath()); if (refreshURL != null) { inboundLinks.remove(refreshURL); - ouboundLinks.remove(refreshURL); + outboundLinks.remove(refreshURL); add(doc, YaCySchema.refresh_s, refreshURL.toNormalform(false, false)); } } catch (MalformedURLException e) { @@ -623,7 +626,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable for (MultiProtocolURI u: flashURLs) { // remove all flash links from ibound/outbound links inboundLinks.remove(u); - ouboundLinks.remove(u); + outboundLinks.remove(u); } add(doc, YaCySchema.flash_b, flashURLs.length > 0); } @@ -654,6 +657,9 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable final List inboundlinksName = new ArrayList(inboundLinks.size()); final List inboundlinksRel = new ArrayList(inboundLinks.size()); final List inboundlinksText = new ArrayList(inboundLinks.size()); + final List inboundlinksTextChars = new ArrayList(inboundLinks.size()); + final List inboundlinksTextWords = new ArrayList(inboundLinks.size()); + final List inboundlinksAltTag = new ArrayList(inboundLinks.size()); for (final MultiProtocolURI url: inboundLinks) { final Properties p = alllinks.get(url); if (p == null) continue; @@ -667,12 +673,16 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable inboundlinksName.add(name.length() > 0 ? name : ""); inboundlinksRel.add(rel.length() > 0 ? rel : ""); inboundlinksText.add(text.length() > 0 ? text : ""); + inboundlinksTextChars.add(text.length() > 0 ? text.length() : 0); + inboundlinksTextWords.add(text.length() > 0 ? text.split(" ").length : 0); inboundlinksTag.add( " 0 ? " rel=\"" + rel + "\"" : "") + (name.length() > 0 ? " name=\"" + name + "\"" : "") + ">" + ((text.length() > 0) ? text : "") + ""); + ImageEntry ientry = images.get(url); + inboundlinksAltTag.add(ientry == null ? 
"" : ientry.alt()); c++; } if (allAttr || contains(YaCySchema.inboundlinks_tag_txt)) add(doc, YaCySchema.inboundlinks_tag_txt, inboundlinksTag); @@ -682,17 +692,23 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable if (allAttr || contains(YaCySchema.inboundlinks_rel_sxt)) add(doc, YaCySchema.inboundlinks_rel_sxt, inboundlinksRel); if (allAttr || contains(YaCySchema.inboundlinks_relflags_val)) add(doc, YaCySchema.inboundlinks_relflags_val, relEval(inboundlinksRel)); if (allAttr || contains(YaCySchema.inboundlinks_text_txt)) add(doc, YaCySchema.inboundlinks_text_txt, inboundlinksText); + if (allAttr || contains(YaCySchema.inboundlinks_text_chars_val)) add(doc, YaCySchema.inboundlinks_text_chars_val, inboundlinksTextChars); + if (allAttr || contains(YaCySchema.inboundlinks_text_words_val)) add(doc, YaCySchema.inboundlinks_text_words_val, inboundlinksTextWords); + if (allAttr || contains(YaCySchema.inboundlinks_alttag_txt)) add(doc, YaCySchema.inboundlinks_alttag_txt, inboundlinksAltTag); c = 0; - if (allAttr || contains(YaCySchema.outboundlinkscount_i)) add(doc, YaCySchema.outboundlinkscount_i, ouboundLinks.size()); + if (allAttr || contains(YaCySchema.outboundlinkscount_i)) add(doc, YaCySchema.outboundlinkscount_i, outboundLinks.size()); if (allAttr || contains(YaCySchema.outboundlinksnofollowcount_i)) add(doc, YaCySchema.outboundlinksnofollowcount_i, yacydoc.outboundLinkNofollowCount()); - final List outboundlinksTag = new ArrayList(ouboundLinks.size()); - final List outboundlinksURLProtocol = new ArrayList(ouboundLinks.size()); - final List outboundlinksURLStub = new ArrayList(ouboundLinks.size()); - final List outboundlinksName = new ArrayList(ouboundLinks.size()); - final List outboundlinksRel = new ArrayList(ouboundLinks.size()); - final List outboundlinksText = new ArrayList(ouboundLinks.size()); - for (final MultiProtocolURI url: ouboundLinks) { + final List outboundlinksTag = new ArrayList(outboundLinks.size()); + final List 
outboundlinksURLProtocol = new ArrayList(outboundLinks.size()); + final List outboundlinksURLStub = new ArrayList(outboundLinks.size()); + final List outboundlinksName = new ArrayList(outboundLinks.size()); + final List outboundlinksRel = new ArrayList(outboundLinks.size()); + final List outboundlinksTextChars = new ArrayList(outboundLinks.size()); + final List outboundlinksTextWords = new ArrayList(outboundLinks.size()); + final List outboundlinksText = new ArrayList(outboundLinks.size()); + final List outboundlinksAltTag = new ArrayList(outboundLinks.size()); + for (final MultiProtocolURI url: outboundLinks) { final Properties p = alllinks.get(url); if (p == null) continue; final String name = p.getProperty("name", ""); // the name attribute @@ -705,12 +721,16 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable outboundlinksName.add(name.length() > 0 ? name : ""); outboundlinksRel.add(rel.length() > 0 ? rel : ""); outboundlinksText.add(text.length() > 0 ? text : ""); + outboundlinksTextChars.add(text.length() > 0 ? text.length() : 0); + outboundlinksTextWords.add(text.length() > 0 ? text.split(" ").length : 0); outboundlinksTag.add( " 0 ? " rel=\"" + rel + "\"" : "") + (name.length() > 0 ? " name=\"" + name + "\"" : "") + ">" + ((text.length() > 0) ? text : "") + ""); + ImageEntry ientry = images.get(url); + outboundlinksAltTag.add(ientry == null ? 
"" : ientry.alt()); /* append to the outbound list: it is the one stored as outboundlinks_alttag_txt below */ c++; } if (allAttr || contains(YaCySchema.outboundlinks_tag_txt)) add(doc, YaCySchema.outboundlinks_tag_txt, outboundlinksTag); @@ -718,8 +738,11 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable if (allAttr || contains(YaCySchema.outboundlinks_urlstub_txt)) add(doc, YaCySchema.outboundlinks_urlstub_txt, outboundlinksURLStub); if (allAttr || contains(YaCySchema.outboundlinks_name_txt)) add(doc, YaCySchema.outboundlinks_name_txt, outboundlinksName); if (allAttr || contains(YaCySchema.outboundlinks_rel_sxt)) add(doc, YaCySchema.outboundlinks_rel_sxt, outboundlinksRel); - if (allAttr || contains(YaCySchema.outboundlinks_relflags_val)) add(doc, YaCySchema.outboundlinks_relflags_val, relEval(inboundlinksRel)); + if (allAttr || contains(YaCySchema.outboundlinks_relflags_val)) add(doc, YaCySchema.outboundlinks_relflags_val, relEval(outboundlinksRel)); if (allAttr || contains(YaCySchema.outboundlinks_text_txt)) add(doc, YaCySchema.outboundlinks_text_txt, outboundlinksText); + if (allAttr || contains(YaCySchema.outboundlinks_text_chars_val)) add(doc, YaCySchema.outboundlinks_text_chars_val, outboundlinksTextChars); + if (allAttr || contains(YaCySchema.outboundlinks_text_words_val)) add(doc, YaCySchema.outboundlinks_text_words_val, outboundlinksTextWords); + if (allAttr || contains(YaCySchema.outboundlinks_alttag_txt)) add(doc, YaCySchema.outboundlinks_alttag_txt, outboundlinksAltTag); // charset if (allAttr || contains(YaCySchema.charset_s)) add(doc, YaCySchema.charset_s, yacydoc.getCharset()); diff --git a/source/net/yacy/search/index/YaCySchema.java b/source/net/yacy/search/index/YaCySchema.java index cfa44e605..ec8d7818a 100644 --- a/source/net/yacy/search/index/YaCySchema.java +++ b/source/net/yacy/search/index/YaCySchema.java @@ -106,6 +106,9 @@ public enum YaCySchema implements Schema { inboundlinks_rel_sxt(SolrType.string, true, 
true, true, "internal links, the rel property of the a-tag"), 
inboundlinks_relflags_val(SolrType.integer, true, true, true, "internal links, the rel property of the a-tag, coded binary"), inboundlinks_text_txt(SolrType.text_general, true, true, true, "internal links, the text content of the a-tag"), + inboundlinks_text_chars_val(SolrType.integer, true, true, true, "internal links, the length of the a-tag as number of characters"), + inboundlinks_text_words_val(SolrType.integer, true, true, true, "internal links, the length of the a-tag as number of words"), + inboundlinks_alttag_txt(SolrType.text_general, true, true, true, "if the link is an image link, this contains the alt tag if the image is also liked as img link"), outboundlinks_tag_txt(SolrType.text_general, true, true, true, "external links, normalized (absolute URLs), as - tag with anchor text and nofollow"), outboundlinks_protocol_sxt(SolrType.string, true, true, true, "external links, only the protocol"), outboundlinks_urlstub_txt(SolrType.text_general, true, true, true, "external links, the url only without the protocol"), @@ -113,6 +116,9 @@ public enum YaCySchema implements Schema { outboundlinks_rel_sxt(SolrType.string, true, true, true, "external links, the rel property of the a-tag"), outboundlinks_relflags_val(SolrType.integer, true, true, true, "external links, the rel property of the a-tag, coded binary"), outboundlinks_text_txt(SolrType.text_general, true, true, true, "external links, the text content of the a-tag"), + outboundlinks_text_chars_val(SolrType.integer, true, true, true, "external links, the length of the a-tag as number of characters"), + outboundlinks_text_words_val(SolrType.integer, true, true, true, "external links, the length of the a-tag as number of words"), + outboundlinks_alttag_txt(SolrType.text_general, true, true, true, "if the link is an image link, this contains the alt tag if the image is also liked as img link"), images_tag_txt(SolrType.text_general, true, true, true, " all image tags, encoded as tag inclusive alt- and title 
property"), images_urlstub_txt(SolrType.text_general, true, true, true, "all image links without the protocol and '://'"), images_protocol_sxt(SolrType.text_general, true, true, true, "all image link protocols"),