From cca851a4177ed6781263b4f0da56d44a009e77d7 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Wed, 2 Apr 2014 23:37:01 +0200
Subject: [PATCH] introduced new Solr field crawldepth_i which records the
 crawl depth of a document. This is an upper limit for the clickdepth_i
 value, which may be smaller if the crawler did not take the shortest path
 to the document.

---
 defaults/solr.collection.schema              |  3 +++
 htroot/HostBrowser.java                      | 16 ++++++++--------
 source/net/yacy/document/Document.java       | 19 +++++++++++++++++--
 source/net/yacy/search/Switchboard.java      |  3 +++
 .../schema/CollectionConfiguration.java      |  7 ++++++-
 .../yacy/search/schema/CollectionSchema.java |  1 +
 6 files changed, 38 insertions(+), 11 deletions(-)

diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema
index 595769132..6caad64f9 100644
--- a/defaults/solr.collection.schema
+++ b/defaults/solr.collection.schema
@@ -87,6 +87,9 @@ references_exthosts_i
 ## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url
 clickdepth_i
 
+## crawl depth of web page according to the number of steps that the crawler took to get to this document; if the crawl was started at a root document, then this is an upper limit for clickdepth_i
+crawldepth_i
+
 ## needed (post-)processing steps on this metadata set
 process_sxt
 
diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java
index 22626bc0a..c9cb48255 100644
--- a/htroot/HostBrowser.java
+++ b/htroot/HostBrowser.java
@@ -298,6 +298,7 @@ public class HostBrowser {
                 CollectionSchema.outboundlinks_protocol_sxt.getSolrFieldName(),
                 CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName(),
                 CollectionSchema.clickdepth_i.getSolrFieldName(),
+                CollectionSchema.crawldepth_i.getSolrFieldName(),
                 CollectionSchema.references_i.getSolrFieldName(),
                 CollectionSchema.references_internal_i.getSolrFieldName(),
                 CollectionSchema.references_external_i.getSolrFieldName(),
@@ -560,17 +561,19 @@ public class HostBrowser {
     public static final class InfoCacheEntry {
         public Integer cr_n;
         public Double cr_c;
-        public int clickdepth, references, references_internal, references_external, references_exthosts;
+        public int clickdepth, crawldepth, references, references_internal, references_external, references_exthosts;
         public List<String> references_internal_urls, references_external_urls;
         public InfoCacheEntry(final Fulltext fulltext, final ReferenceReportCache rrCache, final SolrDocument doc, final String urlhash, boolean fetchReferences) {
             this.cr_c = (Double) doc.getFieldValue(CollectionSchema.cr_host_chance_d.getSolrFieldName());
-            this.cr_n = (Integer) doc.getFieldValue(CollectionSchema.cr_host_norm_i.getSolrFieldName());
+            this.cr_n = (Integer) doc.getFieldValue(CollectionSchema.cr_host_norm_i.getSolrFieldName());
             Integer cd = (Integer) doc.getFieldValue(CollectionSchema.clickdepth_i.getSolrFieldName());
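+            // read the new crawldepth_i field; null or negative values (e.g. documents indexed before this field existed) fall back to the sentinel 999 below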
+            Integer cr = (Integer) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName());
             Integer rc = (Integer) doc.getFieldValue(CollectionSchema.references_i.getSolrFieldName());
             Integer rc_internal = (Integer) doc.getFieldValue(CollectionSchema.references_internal_i.getSolrFieldName());
             Integer rc_external = (Integer) doc.getFieldValue(CollectionSchema.references_external_i.getSolrFieldName());
             Integer rc_exthosts = (Integer) doc.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName());
             this.clickdepth = (cd == null || cd.intValue() < 0) ? 999 : cd.intValue();
+            this.crawldepth = (cr == null || cr.intValue() < 0) ? 999 : cr.intValue();
             this.references = (rc == null || rc.intValue() <= 0) ? 0 : rc.intValue();
             this.references_internal = (rc_internal == null || rc_internal.intValue() <= 0) ? 0 : rc_internal.intValue();
             // calculate the url reference list
@@ -622,14 +625,11 @@ public class HostBrowser {
             }
             if (sbe.length() > 0) sbe.insert(0, "<br/>external referrer:<br/>");
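+            // the info line shown in the host browser now lists the crawl depth next to the click depth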
             return
-                (this.clickdepth >= 0 ?
-                    "clickdepth: " + this.clickdepth :
-                    "") +
+                (this.clickdepth >= 0 ? "clickdepth: " + this.clickdepth : "") +
+                (this.crawldepth >= 0 ? ", crawldepth: " + this.crawldepth : "") +
                 (this.cr_c != null ? ", cr=" + (Math.round(this.cr_c * 1000.0d) / 1000.0d) : "") +
                 (this.cr_n != null ? ", crn=" + this.cr_n : "") +
-                (this.references >= 0 ?
-                    ", refs: " + this.references_exthosts + " hosts, " + this.references_external + " ext, " + this.references_internal + " int" + sbi.toString() + sbe.toString() :
-                    "");
+                (this.references >= 0 ? ", refs: " + this.references_exthosts + " hosts, " + this.references_external + " ext, " + this.references_internal + " int" + sbi.toString() + sbe.toString() : "");
         }
     }
 
diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java
index acc55e9d1..8b8b3cdfc 100644
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@@ -95,6 +95,7 @@ public class Document {
     private final Object parserObject; // the source object that was used to create the Document
     private final Map<String, Set<String>> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document
     private final Date date;
+    private int crawldepth;
 
     public Document(final DigestURL location, final String mimeType, final String charset, final Object parserObject,
@@ -146,8 +147,9 @@ public class Document {
         this.text = text == null ? "" : text;
         this.generic_facets = new HashMap<String, Set<String>>();
         this.date = date == null ? new Date() : date;
+        this.crawldepth = 999; // unknown yet
     }
-    
+
     /**
      * Get the content domain of a document. This tries to get the content domain from the mime type
      * and if this fails it uses alternatively the content domain from the file extension.
@@ -740,6 +742,14 @@ dc_rights
         return this.indexingDenied;
     }
 
+    public void setDepth(int depth) {
+        this.crawldepth = depth;
+    }
+
+    public int getDepth() {
+        return this.crawldepth;
+    }
+
     public void writeXML(final Writer os, final Date date) throws IOException {
         os.write("<record>\n");
         final String title = dc_title();
@@ -819,6 +829,7 @@ dc_rights
 
         double lon = 0.0d, lat = 0.0d;
         Date date = new Date();
+        int mindepth = 999;
 
         for (final Document doc: docs) {
             if (doc == null) continue;
@@ -857,6 +868,8 @@ dc_rights
             images.putAll(doc.getImages());
             if (doc.lon() != 0.0 && doc.lat() != 0.0) { lon = doc.lon(); lat = doc.lat(); }
             if (doc.date.before(date)) date = doc.date;
+
+            if (doc.getDepth() < mindepth) mindepth = doc.getDepth();
         }
 
         // clean up parser data
@@ -871,7 +884,7 @@ dc_rights
         // return consolidation
         ArrayList<String> titlesa = new ArrayList<String>();
         titlesa.addAll(titles);
-        return new Document(
+        Document newDoc = new Document(
                 location,
                 globalMime,
                 null,
@@ -890,6 +903,8 @@ dc_rights
                 images,
                 false,
                 date);
+        newDoc.setDepth(mindepth);
+        return newDoc;
     }
 
     public static Map<DigestURL, String> getHyperlinks(final Document[] documents) {
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index d81a18500..0cf9c6dfb 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -2591,6 +2591,9 @@ public final class Switchboard extends serverSwitch {
                 response.profile().crawlerNoDepthLimitMatchPattern().matcher(response.url().toNormalform(true)).matches()
                 )
             ) {
+
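+            // hand the crawl depth of the response over to all parsed documents so that it can be stored in the crawldepth_i field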
+            for (Document d: documents) d.setDepth(response.depth());
+
             // get the hyperlinks
             final Map<DigestURL, String> hl = Document.getHyperlinks(documents);
             if (response.profile().indexMedia()) {
diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java
index a18f3c285..4a13cd9c9 100644
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@@ -377,10 +377,15 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
             } else {
                 clickdepth = 999;
             }
-            processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
+            if (document.getDepth() < 2) clickdepth = Math.min(clickdepth, document.getDepth()); // that's not true if the start URL was not a root URL; we need a test for that
+            if (clickdepth > 2) processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
             CollectionSchema.clickdepth_i.add(doc, clickdepth); // no lazy value checking to get a '0' into the index
         }
 
+        if ((allAttr || contains(CollectionSchema.crawldepth_i))) {
+            CollectionSchema.crawldepth_i.add(doc, document.getDepth());
+        }
+
         if (allAttr || (contains(CollectionSchema.cr_host_chance_d) && contains(CollectionSchema.cr_host_count_i) && contains(CollectionSchema.cr_host_norm_i))) {
             processTypes.add(ProcessType.CITATION); // postprocessing needed
         }
diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java
index 361167270..2b62c24f6 100644
--- a/source/net/yacy/search/schema/CollectionSchema.java
+++ b/source/net/yacy/search/schema/CollectionSchema.java
@@ -58,6 +58,7 @@ public enum CollectionSchema implements SchemaDeclaration {
     references_external_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from external hosts"),
     references_exthosts_i(SolrType.num_integer, true, true, false, false, false, "number of external hosts which provide http references"),
     clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url"),
+    crawldepth_i(SolrType.num_integer, true, true, false, false, false, "crawl depth of web page according to the number of steps that the crawler took to get to this document; if the crawl was started at a root document, then this is an upper limit for clickdepth_i"),
     process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set"),
     harvestkey_s(SolrType.string, true, true, false, false, false, "key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated."),