From cca851a4177ed6781263b4f0da56d44a009e77d7 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Wed, 2 Apr 2014 23:37:01 +0200
Subject: [PATCH] introduced new Solr field crawldepth_i which records the
 crawl depth of a document. This is an upper limit for the clickdepth_i
 value, which may be smaller if the crawler did not take the shortest path
 to the document.

---
 defaults/solr.collection.schema              |  3 +++
 htroot/HostBrowser.java                      | 16 ++++++++--------
 source/net/yacy/document/Document.java       | 19 +++++++++++++++++--
 source/net/yacy/search/Switchboard.java      |  3 +++
 .../schema/CollectionConfiguration.java      |  7 ++++++-
 .../yacy/search/schema/CollectionSchema.java |  1 +
 6 files changed, 38 insertions(+), 11 deletions(-)

diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema
index 595769132..6caad64f9 100644
--- a/defaults/solr.collection.schema
+++ b/defaults/solr.collection.schema
@@ -87,6 +87,9 @@ references_exthosts_i
 ## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url
 clickdepth_i
 
+## crawl depth of web page according to the number of steps that the crawler took to get to this document; if the crawl was started at a root document, then this is an upper limit for clickdepth_i
+crawldepth_i
+
 ## needed (post-)processing steps on this metadata set
 process_sxt
 
diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java
index 22626bc0a..c9cb48255 100644
--- a/htroot/HostBrowser.java
+++ b/htroot/HostBrowser.java
@@ -298,6 +298,7 @@ public class HostBrowser {
                 CollectionSchema.outboundlinks_protocol_sxt.getSolrFieldName(),
                 CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName(),
                 CollectionSchema.clickdepth_i.getSolrFieldName(),
+                CollectionSchema.crawldepth_i.getSolrFieldName(),
                 CollectionSchema.references_i.getSolrFieldName(),
                 CollectionSchema.references_internal_i.getSolrFieldName(),
                 CollectionSchema.references_external_i.getSolrFieldName(),
@@ -560,17 +561,19 @@ public class HostBrowser {
     public static final class InfoCacheEntry {
         public Integer cr_n;
         public Double cr_c;
-        public int clickdepth, references, references_internal, references_external, references_exthosts;
+        public int clickdepth, crawldepth, references, references_internal, references_external, references_exthosts;
         public List<String> references_internal_urls, references_external_urls;
         public InfoCacheEntry(final Fulltext fulltext, final ReferenceReportCache rrCache, final SolrDocument doc, final String urlhash, boolean fetchReferences) {
             this.cr_c = (Double) doc.getFieldValue(CollectionSchema.cr_host_chance_d.getSolrFieldName());
-            this.cr_n = (Integer) doc.getFieldValue(CollectionSchema.cr_host_norm_i.getSolrFieldName());
+            this.cr_n = (Integer) doc.getFieldValue(CollectionSchema.cr_host_norm_i.getSolrFieldName());
             Integer cd = (Integer) doc.getFieldValue(CollectionSchema.clickdepth_i.getSolrFieldName());
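+            // read the new crawldepth_i field; null or negative values (e.g. documents indexed before this field existed) fall back to the sentinel 999 below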
+            Integer cr = (Integer) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName());
             Integer rc = (Integer) doc.getFieldValue(CollectionSchema.references_i.getSolrFieldName());
             Integer rc_internal = (Integer) doc.getFieldValue(CollectionSchema.references_internal_i.getSolrFieldName());
             Integer rc_external = (Integer) doc.getFieldValue(CollectionSchema.references_external_i.getSolrFieldName());
             Integer rc_exthosts = (Integer) doc.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName());
             this.clickdepth = (cd == null || cd.intValue() < 0) ? 999 : cd.intValue();
+            this.crawldepth = (cr == null || cr.intValue() < 0) ? 999 : cr.intValue();
             this.references = (rc == null || rc.intValue() <= 0) ? 0 : rc.intValue();
             this.references_internal = (rc_internal == null || rc_internal.intValue() <= 0) ? 0 : rc_internal.intValue();
             // calculate the url reference list
@@ -622,14 +625,11 @@ public class HostBrowser {
             }
             if (sbe.length() > 0) sbe.insert(0, "<br/>external referrer:<br/>");
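+            // the info line shown in the host browser now lists the crawl depth next to the click depth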
             return
-                (this.clickdepth >= 0 ?
-                    "clickdepth: " + this.clickdepth :
-                    "") +
+                (this.clickdepth >= 0 ? "clickdepth: " + this.clickdepth : "") +
+                (this.crawldepth >= 0 ? ", crawldepth: " + this.crawldepth : "") +
                 (this.cr_c != null ? ", cr=" + (Math.round(this.cr_c * 1000.0d) / 1000.0d) : "") +
                 (this.cr_n != null ? ", crn=" + this.cr_n : "") +
-                (this.references >= 0 ?
-                    ", refs: " + this.references_exthosts + " hosts, " + this.references_external + " ext, " + this.references_internal + " int" + sbi.toString() + sbe.toString() :
-                    "");
+                (this.references >= 0 ? ", refs: " + this.references_exthosts + " hosts, " + this.references_external + " ext, " + this.references_internal + " int" + sbi.toString() + sbe.toString() : "");
         }
     }
 
diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java
index acc55e9d1..8b8b3cdfc 100644
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@@ -95,6 +95,7 @@ public class Document {
     private final Object parserObject; // the source object that was used to create the Document
     private final Map<String, Set<String>> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document
     private final Date date;
+    private int crawldepth;
 
     public Document(final DigestURL location, final String mimeType, final String charset, final Object parserObject,
@@ -146,8 +147,9 @@ public class Document {
         this.text = text == null ? "" : text;
         this.generic_facets = new HashMap<String, Set<String>>();
         this.date = date == null ? new Date() : date;
+        this.crawldepth = 999; // unknown yet
     }
-    
+
     /**
      * Get the content domain of a document. This tries to get the content domain from the mime type
      * and if this fails it uses alternatively the content domain from the file extension.
@@ -740,6 +742,14 @@ dc_rights
         return this.indexingDenied;
     }
 
+    public void setDepth(int depth) {
+        this.crawldepth = depth;
+    }
+
+    public int getDepth() {
+        return this.crawldepth;
+    }
+
     public void writeXML(final Writer os, final Date date) throws IOException {
         os.write("<record>\n");
         final String title = dc_title();
@@ -819,6 +829,7 @@ dc_rights
 
         double lon = 0.0d, lat = 0.0d;
         Date date = new Date();
+        int mindepth = 999;
 
         for (final Document doc: docs) {
             if (doc == null) continue;
@@ -857,6 +868,8 @@ dc_rights
             images.putAll(doc.getImages());
             if (doc.lon() != 0.0 && doc.lat() != 0.0) { lon = doc.lon(); lat = doc.lat(); }
             if (doc.date.before(date)) date = doc.date;
+
+            if (doc.getDepth() < mindepth) mindepth = doc.getDepth();
         }
 
         // clean up parser data
@@ -871,7 +884,7 @@ dc_rights
         // return consolidation
         ArrayList<String> titlesa = new ArrayList<String>();
         titlesa.addAll(titles);
-        return new Document(
+        Document newDoc = new Document(
                 location,
                 globalMime,
                 null,
@@ -890,6 +903,8 @@ dc_rights
                 images,
                 false,
                 date);
+        newDoc.setDepth(mindepth);
+        return newDoc;
     }
 
     public static Map<DigestURL, String> getHyperlinks(final Document[] documents) {
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index d81a18500..0cf9c6dfb 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -2591,6 +2591,9 @@ public final class Switchboard extends serverSwitch {
                 response.profile().crawlerNoDepthLimitMatchPattern().matcher(response.url().toNormalform(true)).matches()
                 )
             ) {
+
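+            // hand the crawl depth of the response over to all parsed documents so that it can be stored in the crawldepth_i field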
+            for (Document d: documents) d.setDepth(response.depth());
+
             // get the hyperlinks
             final Map<DigestURL, String> hl = Document.getHyperlinks(documents);
             if (response.profile().indexMedia()) {
diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java
index a18f3c285..4a13cd9c9 100644
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@@ -377,10 +377,15 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
             } else {
                 clickdepth = 999;
             }
-            processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
+            if (document.getDepth() < 2) clickdepth = Math.min(clickdepth, document.getDepth()); // that's not true if the start URL was not a root URL; we need a test for that
+            if (clickdepth > 2) processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
             CollectionSchema.clickdepth_i.add(doc, clickdepth); // no lazy value checking to get a '0' into the index
         }
 
+        if ((allAttr || contains(CollectionSchema.crawldepth_i))) {
+            CollectionSchema.crawldepth_i.add(doc, document.getDepth());
+        }
+
         if (allAttr || (contains(CollectionSchema.cr_host_chance_d) && contains(CollectionSchema.cr_host_count_i) && contains(CollectionSchema.cr_host_norm_i))) {
             processTypes.add(ProcessType.CITATION); // postprocessing needed
         }
diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java
index 361167270..2b62c24f6 100644
--- a/source/net/yacy/search/schema/CollectionSchema.java
+++ b/source/net/yacy/search/schema/CollectionSchema.java
@@ -58,6 +58,7 @@ public enum CollectionSchema implements SchemaDeclaration {
     references_external_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from external hosts"),
     references_exthosts_i(SolrType.num_integer, true, true, false, false, false, "number of external hosts which provide http references"),
     clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url"),
+    crawldepth_i(SolrType.num_integer, true, true, false, false, false, "crawl depth of web page according to the number of steps that the crawler took to get to this document; if the crawl was started at a root document, then this is an upper limit for clickdepth_i"),
     process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set"),
     harvestkey_s(SolrType.string, true, true, false, false, false, "key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated."),