## number of external hosts which provide http references
references_exthosts_i
## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url
clickdepth_i
## crawl depth of web page according to the number of steps that the crawler did to get to this document; if the crawl was started at a root document, then this is the maximum of clickdepth_i
## crawl depth of web page according to the number of steps that the crawler did to get to this document; if the crawl was started at a root document, then this is equal to the clickdepth
crawldepth_i
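The two depth fields measure different things: crawldepth_i is the length of the path the crawler actually followed, while clickdepth_i is the shortest click path from the host's root page and may be recomputed during postprocessing (see the CLICKDEPTH process type below). A minimal sketch, with illustrative values and plain SolrJ rather than the YaCy writer, of filling both fields:

```java
import org.apache.solr.common.SolrInputDocument;

// Illustrative values only: crawl depth is known at indexing time from the crawler's path,
// click depth is the shortest click path from http://host/ and may be corrected later.
int stepsTakenByCrawler = 2;
int clicksFromRootPage = 1;
SolrInputDocument doc = new SolrInputDocument();
doc.setField("crawldepth_i", stepsTakenByCrawler);
doc.setField("clickdepth_i", clicksFromRootPage);
```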
## needed (post-)processing steps on this metadata set
## tags that are attached to crawls/index generation to separate the search result into user-defined subsets
collection_sxt
## needed (post-)processing steps on this metadata set, used i.e. for clickdepth-computation.
## needed (post-)processing steps on this metadata set
#process_sxt
## key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated.
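process_sxt and harvestkey_s together select the documents that near-realtime postprocessing still has to visit. A minimal sketch of that selection as a plain SolrJ query (not the YaCy connector API; the harvest key value is illustrative):

```java
import org.apache.solr.client.solrj.SolrQuery;

// Select the documents of one harvest process that still carry postprocessing tags.
String harvestkey = "xYzAbCdEf";  // illustrative crawl profile hash key
SolrQuery query = new SolrQuery("process_sxt:[* TO *] AND harvestkey_s:\"" + harvestkey + "\"");
query.setRows(100);               // a real postprocessing loop would page through the full result
```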
@@ -72,7 +72,7 @@ source_id_s
#source_parameter_value_sxt
## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)
#source_clickdepth_i
#source_crawldepth_i
## copy of the citation rank norm value from the source link
#source_cr_host_norm_i
@@ -173,7 +173,7 @@ target_path_folders_sxt
#target_parameter_value_sxt
## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)
#target_clickdepth_i
#target_crawldepth_i
## copy of the citation rank norm value from the target link; this is only filled if the target host is identical to the source host
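A minimal sketch of the guard described in the comment above: the citation rank norm is only copied into the webgraph edge when the link stays on the same host. sourceHost, targetHost, targetCitationRankNorm and edgeDocument are hypothetical stand-ins.

```java
// Copy the host-local citation rank only for same-host links; ranks of foreign hosts
// are not comparable here, so the field stays empty for cross-host edges.
if (sourceHost.equals(targetHost)) {
    edgeDocument.setField("target_cr_host_norm_i", targetCitationRankNorm);
}
```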
try {postprocessingCount[0] = (int) fulltext.getDefaultConnector().getCountByQuery(CollectionSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM);} catch (IOException e) {} // should be zero but you never know
@@ -2331,7 +2327,7 @@ public final class Switchboard extends serverSwitch {
try {postprocessingCount[0] = (int) fulltext.getDefaultConnector().getCountByQuery(CollectionSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM);} catch (IOException e) {} // should be zero but you never know
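The count above asks Solr how many documents still hold any process_sxt value. A minimal sketch of the same check with plain SolrJ, assuming CATCHALL_DTERM expands to a range term such as ":[* TO *]":

```java
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;

// Counts documents whose process_sxt field still has a value; the query string is
// assumed to end up as "process_sxt:[* TO *]".
static long remainingPostprocessingDocs(SolrClient client) throws Exception {
    return client.query(new SolrQuery("process_sxt:[* TO *]").setRows(0))
                 .getResults().getNumFound(); // expected to be zero after postprocessing
}
```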
if (rootCandidates.has(searchhash)) return 0; // the url is a root candidate itself
Set<String> ignore = new HashSet<String>(); // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent endless loops
Set<String> levelhashes = new HashSet<String>(); // all hashes of a clickdepth. The first call contains the target hash only and therefore just one entry
levelhashes.add(ASCII.String(searchhash));
final byte[] hosthash = new byte[6]; // the host of the url to be checked
if (document.getDepth() < 2) clickdepth = Math.min(clickdepth, document.getDepth()); // that's not true if the start url was not a root URL. We need a test for that.
if (clickdepth > 2) processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
CollectionSchema.clickdepth_i.add(doc, clickdepth); // no lazy value checking to get a '0' into the index
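The lines above hint at how the click depth is handled: at indexing time the crawl depth serves as a first approximation, and documents with clickdepth > 2 are tagged with ProcessType.CLICKDEPTH so the value can be recomputed later. A minimal sketch of such a recomputation as a level-by-level search over incoming links; 'citations' and 'rootCandidates' are hypothetical stand-ins for the citation index and the host's root page hashes, and a real implementation would additionally restrict the search to the same host (see the hosthash variable above):

```java
import java.util.HashSet;
import java.util.Set;
import java.util.function.Function;

// The level at which a root candidate is reached while walking incoming links from the
// target document is its click depth.
static int clickDepth(String targetHash, Set<String> rootCandidates,
                      Function<String, Set<String>> citations, int maxDepth) {
    Set<String> ignore = new HashSet<String>(); // hashes seen so far, prevents endless loops
    Set<String> level = new HashSet<String>();  // all hashes of one click depth; starts with the target only
    level.add(targetHash);
    for (int depth = 0; depth <= maxDepth && !level.isEmpty(); depth++) {
        for (String hash : level) if (rootCandidates.contains(hash)) return depth; // root page reached
        ignore.addAll(level);
        Set<String> next = new HashSet<String>();
        for (String hash : level)
            for (String citing : citations.apply(hash))   // pages that link to 'hash'
                if (!ignore.contains(citing)) next.add(citing);
        level = next;
    }
    return 999; // no root page reached within maxDepth levels (illustrative sentinel)
}
```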
//webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, true, framess, citations); // add here because links have been removed from remaining inbound/outbound
//webgraph.addEdges(subgraph, digestURI, responseHeader, collections, crawldepth, alllinks, images, true, framess, citations); // add here because links have been removed from remaining inbound/outbound
}
}
@@ -687,7 +674,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
//webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, true, iframess, citations); // add here because links have been removed from remaining inbound/outbound
//webgraph.addEdges(subgraph, digestURI, responseHeader, collections, crawldepth, alllinks, images, true, iframess, citations); // add here because links have been removed from remaining inbound/outbound
}
}
@@ -856,9 +843,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
@@ -1228,7 +1188,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
if (count != countcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous collection document count for harvestkey " + harvestkey + ": expected=" + count + ", counted=" + countcheck); // big gap for harvestkey = null
ConcurrentLog.info("CollectionConfiguration", "cleanup_processing: re-calculated " + proccount + " new documents, " +
@@ -57,8 +57,7 @@ public enum CollectionSchema implements SchemaDeclaration {
references_internal_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from same host to referenced url"),
references_external_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from external hosts"),
references_exthosts_i(SolrType.num_integer, true, true, false, false, false, "number of external hosts which provide http references"),
clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url"),
crawldepth_i(SolrType.num_integer, true, true, false, false, false, "crawl depth of web page according to the number of steps that the crawler did to get to this document; if the crawl was started at a root document, then this is the maximum of clickdepth_i"),
crawldepth_i(SolrType.num_integer, true, true, false, false, false, "crawl depth of web page according to the number of steps that the crawler did to get to this document; if the crawl was started at a root document, then this is equal to the clickdepth"),
process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set"),
harvestkey_s(SolrType.string, true, true, false, false, false, "key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated."),
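The comment on harvestkey_s says the bookkeeping fields shall be deleted once postprocessing has terminated. A minimal sketch of that cleanup as a plain SolrJ atomic update (not the YaCy connector API; "id" is assumed to be the collection's unique key):

```java
import java.util.Collections;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.common.SolrInputDocument;

// Atomic update that removes the bookkeeping fields from one document after postprocessing;
// the "set" modifier with a null value deletes the field.
static void clearPostprocessingMarks(SolrClient client, String documentId) throws Exception {
    SolrInputDocument update = new SolrInputDocument();
    update.setField("id", documentId);                                     // unique key (assumed)
    update.setField("process_sxt", Collections.singletonMap("set", null));
    update.setField("harvestkey_s", Collections.singletonMap("set", null));
    client.add(update);
    client.commit();
}
```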
@@ -35,7 +35,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
last_modified(SolrType.date, true, true, false, false, false, "last-modified from http header"),
load_date_dt(SolrType.date, true, true, false, false, false, "time when resource was loaded"),
collection_sxt(SolrType.string, true, true, true, false, false, "tags that are attached to crawls/index generation to separate the search result into user-defined subsets"),
process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set, used i.e. for clickdepth-computation."),
process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set."),
harvestkey_s(SolrType.string, true, true, false, false, false, "key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated."),
// source information
@@ -51,7 +51,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
source_parameter_count_i(SolrType.num_integer, true, true, false, false, false, "number of key-value pairs in search part of the url (source)"),
source_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (source)"),
source_parameter_value_sxt(SolrType.string, true, true, true, false, false, "the values from key-value pairs in the search part of the url (source)"),
source_clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)"),
source_crawldepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)"),
source_cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "copy of the citation rank norm value from the source link"),
source_host_s(SolrType.string, true, true, false, false, false, "host of the url (source)"),
@@ -86,7 +86,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
target_parameter_count_i(SolrType.num_integer, true, true, false, false, false, "number of key-value pairs in search part of the url (target)"),
target_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (target)"),
target_parameter_value_sxt(SolrType.string, true, true, true, false, true, "the values from key-value pairs in the search part of the url (target)"),
target_clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)"),
target_crawldepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)"),
target_cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "copy of the citation rank norm value from the target link; this is only filled if the target host is identical to the source host"),
target_host_s(SolrType.string, true, true, false, false, true, "host of the url (target)"),
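Taken together, the source_* and target_* fields above describe one link as a webgraph edge document. A minimal sketch with illustrative values, written against plain SolrJ rather than the YaCy webgraph writer:

```java
import org.apache.solr.common.SolrInputDocument;

// One webgraph edge: a link from a page at crawl depth 1 to a page at crawl depth 2 on the
// same host; the citation rank copy is only set because source and target host are identical.
SolrInputDocument edge = new SolrInputDocument();
edge.setField("source_host_s", "example.org");
edge.setField("source_crawldepth_i", 1);
edge.setField("target_host_s", "example.org");
edge.setField("target_crawldepth_i", 2);
edge.setField("target_cr_host_norm_i", 7);
```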