diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema index 6caad64f9..9de9eee2a 100644 --- a/defaults/solr.collection.schema +++ b/defaults/solr.collection.schema @@ -84,10 +84,7 @@ references_external_i ## number of external hosts which provide http references references_exthosts_i -## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url -clickdepth_i - -## crawl depth of web page according to the number of steps that the crawler did to get to this document; if the crawl was started at a root document, then this is the maximum of clickdepth_i +## crawl depth of web page according to the number of steps the crawler took to reach this document; if the crawl was started at a root document, then this is equal to the click depth crawldepth_i ## needed (post-)processing steps on this metadata set diff --git a/defaults/solr.webgraph.schema b/defaults/solr.webgraph.schema index f0a9dc987..65fe663ea 100644 --- a/defaults/solr.webgraph.schema +++ b/defaults/solr.webgraph.schema @@ -24,7 +24,7 @@ last_modified ## tags that are attached to crawls/index generation to separate the search result into user-defined subsets collection_sxt -## needed (post-)processing steps on this metadata set, used i.e. for clickdepth-computation. +## needed (post-)processing steps on this metadata set #process_sxt ## key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated. @@ -72,7 +72,7 @@ source_id_s #source_parameter_value_sxt -## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source) -#source_clickdepth_i +## crawl depth of web page according to the number of steps the crawler took to reach this document (source) +#source_crawldepth_i ## copy of the citation rank norm value from the source link #source_cr_host_norm_i @@ -173,7 +173,7 @@ target_path_folders_sxt #target_parameter_value_sxt -## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target) -#target_clickdepth_i +## crawl depth of web page according to the number of steps the crawler took to reach this document (target) +#target_crawldepth_i ## copy of the citation rank norm value from the target link; this is only filled if the target host is identical to the source host #target_cr_host_norm_i diff --git a/defaults/yacy.init b/defaults/yacy.init index ccc19c77f..dc787fd32 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -1218,7 +1218,3 @@ browser.load4everyone = false # with some heuristics like: loading linked documents and adding a twitter search. # When the learning mode is finished, the user may switch on individual heuristics by himself. 
greedylearning.active = true - -# postprocessing parametrization -postprocessing.clickdepth.maxtime = 100 -postprocessing.clickdepth.maxdepth = 6 \ No newline at end of file diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java index dbcc56bba..9e6b5bd5f 100644 --- a/htroot/HostBrowser.java +++ b/htroot/HostBrowser.java @@ -297,7 +297,6 @@ public class HostBrowser { CollectionSchema.inboundlinks_urlstub_sxt.getSolrFieldName(), CollectionSchema.outboundlinks_protocol_sxt.getSolrFieldName(), CollectionSchema.outboundlinks_urlstub_sxt.getSolrFieldName(), - CollectionSchema.clickdepth_i.getSolrFieldName(), CollectionSchema.crawldepth_i.getSolrFieldName(), CollectionSchema.references_i.getSolrFieldName(), CollectionSchema.references_internal_i.getSolrFieldName(), @@ -564,18 +563,16 @@ public class HostBrowser { public static final class InfoCacheEntry { public Integer cr_n; public Double cr_c; - public int clickdepth, crawldepth, references, references_internal, references_external, references_exthosts; + public int crawldepth, references, references_internal, references_external, references_exthosts; public List references_internal_urls, references_external_urls; public InfoCacheEntry(final Fulltext fulltext, final ReferenceReportCache rrCache, final SolrDocument doc, final String urlhash, boolean fetchReferences) { this.cr_c = (Double) doc.getFieldValue(CollectionSchema.cr_host_chance_d.getSolrFieldName()); this.cr_n = (Integer) doc.getFieldValue(CollectionSchema.cr_host_norm_i.getSolrFieldName()); - Integer cd = (Integer) doc.getFieldValue(CollectionSchema.clickdepth_i.getSolrFieldName()); Integer cr = (Integer) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName()); Integer rc = (Integer) doc.getFieldValue(CollectionSchema.references_i.getSolrFieldName()); Integer rc_internal = (Integer) doc.getFieldValue(CollectionSchema.references_internal_i.getSolrFieldName()); Integer rc_external = (Integer) doc.getFieldValue(CollectionSchema.references_external_i.getSolrFieldName()); Integer rc_exthosts = (Integer) doc.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName()); - this.clickdepth = (cd == null || cd.intValue() < 0) ? 999 : cd.intValue(); this.crawldepth = (cr == null || cr.intValue() < 0) ? 999 : cr.intValue(); this.references = (rc == null || rc.intValue() <= 0) ? 0 : rc.intValue(); this.references_internal = (rc_internal == null || rc_internal.intValue() <= 0) ? 0 : rc_internal.intValue(); @@ -628,7 +625,6 @@ public class HostBrowser { } if (sbe.length() > 0) sbe.insert(0, "
external referrer:
"); return - (this.clickdepth >= 0 ? "clickdepth: " + this.clickdepth : "") + (this.crawldepth >= 0 ? ", crawldepth: " + this.crawldepth : "") + (this.cr_c != null ? ", cr=" + (Math.round(this.cr_c * 1000.0d) / 1000.0d) : "") + (this.cr_n != null ? ", crn=" + this.cr_n : "") + diff --git a/htroot/RankingSolr_p.java b/htroot/RankingSolr_p.java index 91e543a11..55f1547d6 100644 --- a/htroot/RankingSolr_p.java +++ b/htroot/RankingSolr_p.java @@ -81,7 +81,7 @@ public class RankingSolr_p { } } if (post != null && post.containsKey("ResetBQ")) { - String bq = "clickdepth_i:0^0.8 clickdepth_i:1^0.4"; + String bq = "crawldepth_i:0^0.8 crawldepth_i:1^0.4"; if (bq != null) { sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTQUERY_ + profileNr, bq); sb.index.fulltext().getDefaultConfiguration().getRanking(profileNr).setBoostQuery(bq); diff --git a/source/net/yacy/cora/document/id/DigestURL.java b/source/net/yacy/cora/document/id/DigestURL.java index 74a05e2b2..b4c9b0822 100644 --- a/source/net/yacy/cora/document/id/DigestURL.java +++ b/source/net/yacy/cora/document/id/DigestURL.java @@ -290,10 +290,6 @@ public class DigestURL extends MultiProtocolURL implements Serializable { } public final static Pattern rootPattern = Pattern.compile("/|/\\?|/index.htm(l?)|/index.php|/home.htm(l?)|/home.php|/default.htm(l?)|/default.php"); - - public final boolean probablyRootURL() { - return this.path.length() <= 1 || rootPattern.matcher(this.path).matches(); - } private static final String hosthash5(final String protocol, final String host, final int port) { if (host == null) { diff --git a/source/net/yacy/cora/federate/solr/ProcessType.java b/source/net/yacy/cora/federate/solr/ProcessType.java index 1d4439e1f..8c64da5b3 100644 --- a/source/net/yacy/cora/federate/solr/ProcessType.java +++ b/source/net/yacy/cora/federate/solr/ProcessType.java @@ -26,6 +26,6 @@ package net.yacy.cora.federate.solr; */ public enum ProcessType { - CLICKDEPTH, CITATION, UNIQUE; + CITATION, UNIQUE; } diff --git a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java index 10805fbb5..cf760f068 100644 --- a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java +++ b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java @@ -41,7 +41,6 @@ import net.yacy.cora.storage.Configuration; import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.ConcurrentLog; import net.yacy.search.index.Segment; -import net.yacy.search.index.Segment.ClickdepthCache; import net.yacy.search.index.Segment.ReferenceReport; import net.yacy.search.index.Segment.ReferenceReportCache; import net.yacy.search.schema.CollectionSchema; @@ -177,21 +176,6 @@ public class SchemaConfiguration extends Configuration implements Serializable { } return changed; } - - public boolean postprocessing_clickdepth(final ClickdepthCache clickdepthCache, final SolrInputDocument sid, final DigestURL url, final SchemaDeclaration clickdepthfield) { - // get new click depth and compare with old - Integer oldclickdepth = (Integer) sid.getFieldValue(clickdepthfield.getSolrFieldName()); - if (oldclickdepth != null && oldclickdepth.intValue() != 999) return false; // we do not want to compute that again - try { - int clickdepth = clickdepthCache.getClickdepth(url); - if (oldclickdepth == null || oldclickdepth.intValue() != clickdepth) { - sid.setField(clickdepthfield.getSolrFieldName(), clickdepth); - return true; - } - } catch (final IOException e) { - } - return false; - } public boolean 
postprocessing_references(final ReferenceReportCache rrCache, final SolrInputDocument sid, final DigestURL url, final Map hostExtentCount) { if (!(this.contains(CollectionSchema.references_i) || diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 05b91ae37..63a07ce98 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -190,7 +190,6 @@ import net.yacy.repository.FilterEngine; import net.yacy.repository.LoaderDispatcher; import net.yacy.search.index.Fulltext; import net.yacy.search.index.Segment; -import net.yacy.search.index.Segment.ClickdepthCache; import net.yacy.search.index.Segment.ReferenceReportCache; import net.yacy.search.query.AccessTracker; import net.yacy.search.query.SearchEvent; @@ -484,9 +483,9 @@ public final class Switchboard extends serverSwitch { String bq = this.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTQUERY_ + i, ""); String bf = this.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTFUNCTION_ + i, ""); // apply some hard-coded patches for earlier experiments we do not want any more - if (bf.equals("product(recip(rord(last_modified),1,1000,1000),div(product(log(product(references_external_i,references_exthosts_i)),div(references_internal_i,host_extent_i)),add(clickdepth_i,1)))") || + if (bf.equals("product(recip(rord(last_modified),1,1000,1000),div(product(log(product(references_external_i,references_exthosts_i)),div(references_internal_i,host_extent_i)),add(crawldepth_i,1)))") || bf.equals("scale(cr_host_norm_i,1,20)")) bf = ""; - if (i == 0 && bq.equals("fuzzy_signature_unique_b:true^100000.0")) bq = "clickdepth_i:0^0.8 clickdepth_i:1^0.4"; + if (i == 0 && bq.equals("fuzzy_signature_unique_b:true^100000.0")) bq = "crawldepth_i:0^0.8 crawldepth_i:1^0.4"; if (boosts.equals("url_paths_sxt^1000.0,synonyms_sxt^1.0,title^10000.0,text_t^2.0,h1_txt^1000.0,h2_txt^100.0,host_organization_s^100000.0")) boosts = "url_paths_sxt^3.0,synonyms_sxt^0.5,title^5.0,text_t^1.0,host_s^6.0,h1_txt^5.0,url_file_name_tokens_t^4.0,h2_txt^2.0"; r.setName(name); r.updateBoosts(boosts); @@ -2307,9 +2306,6 @@ public final class Switchboard extends serverSwitch { // we optimize first because that is useful for postprocessing ReferenceReportCache rrCache = index.getReferenceReportCache(); - int clickdepth_maxtime = this.getConfigInt("postprocessing.clickdepth.maxtime", 100); - int clickdepth_maxdepth = this.getConfigInt("postprocessing.clickdepth.maxdepth", 6); - ClickdepthCache clickdepthCache = index.getClickdepthCache(rrCache, clickdepth_maxtime, clickdepth_maxdepth); Set deletionCandidates = collection1Configuration.contains(CollectionSchema.harvestkey_s.getSolrFieldName()) ? 
this.crawler.getFinishesProfiles(this.crawlQueues) : new HashSet(); int cleanupByHarvestkey = deletionCandidates.size(); @@ -2320,7 +2316,7 @@ public final class Switchboard extends serverSwitch { postprocessingRunning = true; postprocessingStartTime[0] = System.currentTimeMillis(); try {postprocessingCount[0] = (int) fulltext.getDefaultConnector().getCountByQuery(CollectionSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM);} catch (IOException e) {} - for (String profileHash: deletionCandidates) proccount += collection1Configuration.postprocessing(index, rrCache, clickdepthCache, profileHash); + for (String profileHash: deletionCandidates) proccount += collection1Configuration.postprocessing(index, rrCache, profileHash); postprocessingStartTime[0] = 0; try {postprocessingCount[0] = (int) fulltext.getDefaultConnector().getCountByQuery(CollectionSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM);} catch (IOException e) {} // should be zero but you never know @@ -2331,7 +2327,7 @@ public final class Switchboard extends serverSwitch { postprocessingRunning = true; postprocessingStartTime[0] = System.currentTimeMillis(); try {postprocessingCount[0] = (int) fulltext.getDefaultConnector().getCountByQuery(CollectionSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM);} catch (IOException e) {} - proccount += collection1Configuration.postprocessing(index, rrCache, clickdepthCache, null); + proccount += collection1Configuration.postprocessing(index, rrCache, null); postprocessingStartTime[0] = 0; try {postprocessingCount[0] = (int) fulltext.getDefaultConnector().getCountByQuery(CollectionSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM);} catch (IOException e) {} // should be zero but you never know diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 7825e5b03..4628806cc 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -30,8 +30,6 @@ import java.io.File; import java.io.IOException; import java.net.MalformedURLException; import java.util.Date; -import java.util.HashMap; -import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -82,7 +80,6 @@ import net.yacy.repository.LoaderDispatcher; import net.yacy.search.query.SearchEvent; import net.yacy.search.schema.CollectionConfiguration; import net.yacy.search.schema.CollectionSchema; -import net.yacy.search.schema.HyperlinkGraph; import net.yacy.search.schema.WebgraphConfiguration; import net.yacy.search.schema.WebgraphSchema; @@ -204,77 +201,7 @@ public class Segment { public IndexCell urlCitation() { return this.urlCitationIndex; } - - /** - * compute the click level using the citation reference database - * @param citations the citation database - * @param searchhash the hash of the url to be checked - * @return the clickdepth level or 999 if the root url cannot be found or a recursion limit is reached - * @throws IOException - */ - private int getClickDepth(final ReferenceReportCache rrc, final DigestURL url, final int maxtime, final int maxdepth) throws IOException { - - final byte[] searchhash = url.hash(); - RowHandleSet rootCandidates = getPossibleRootHashes(url); - if (rootCandidates.has(searchhash)) return 0; // the url is a root candidate itself - - Set ignore = new HashSet(); // a set of urlhashes to be ignored. 
This is generated from all hashes that are seen during recursion to prevent endless loops - Set levelhashes = new HashSet(); // all hashes of a clickdepth. The first call contains the target hash only and therefore just one entry - levelhashes.add(ASCII.String(searchhash)); - final byte[] hosthash = new byte[6]; // the host of the url to be checked - System.arraycopy(searchhash, 6, hosthash, 0, 6); - - long timeout = System.currentTimeMillis() + maxtime; - mainloop: for (int leveldepth = 0; leveldepth < maxdepth && System.currentTimeMillis() < timeout; leveldepth++) { - - Set checknext = new HashSet(); - - // loop over all hashes at this clickdepth; the first call to this loop should contain only one hash and a leveldepth = 0 - checkloop: for (String urlhashs: levelhashes) { - - // get all the citations for this url and iterate - ReferenceReport rr = rrc.getReferenceReport(urlhashs, false); - //ReferenceContainer references = this.urlCitationIndex.get(urlhash, null); - if (rr == null || rr.getInternalCount() == 0) continue checkloop; // don't know - Iterator i = rr.getInternallIDs().iterator(); - nextloop: while (i.hasNext()) { - byte[] u = i.next(); - if (u == null) continue nextloop; - - // check if this is from the same host - assert (ByteBuffer.equals(u, 6, hosthash, 0, 6)); - String us = ASCII.String(u); - // check ignore - if (ignore.contains(us)) continue nextloop; - - // check if the url is a root url - if (rootCandidates.has(u)) { - return leveldepth + 1; - } - - checknext.add(us); - ignore.add(us); - } - if (System.currentTimeMillis() > timeout) break mainloop; - } - levelhashes = checknext; - } - return 999; - } - - private static RowHandleSet getPossibleRootHashes(final DigestURL url) { - RowHandleSet rootCandidates = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 10); - String rootStub = url.getProtocol() + "://" + url.getHost() + (url.getProtocol().equals("http") && url.getPort() != 80 ? 
(":" + url.getPort()) : ""); - try { - rootCandidates.put(new DigestURL(rootStub).hash()); - for (String rootfn: HyperlinkGraph.ROOTFNS) rootCandidates.put(new DigestURL(rootStub + rootfn).hash()); - rootCandidates.optimize(); - } catch (final Throwable e) {} - rootCandidates.optimize(); - return rootCandidates; - } - public ReferenceReportCache getReferenceReportCache() { return new ReferenceReportCache(); } @@ -299,54 +226,6 @@ public class Segment { } } - public ClickdepthCache getClickdepthCache(ReferenceReportCache rrc, final int maxtime, final int maxdepth) { - return new ClickdepthCache(rrc, maxtime, maxdepth); - } - - public class ClickdepthCache { - private final ReferenceReportCache rrc; - private final Map hyperlinkGraphCache; // map from host name to a HyperlinkGraph for that host name - private final Map cache; - public final int maxdepth; // maximum clickdepth - public final int maxtime; // maximum time to compute clickdepth - public ClickdepthCache(final ReferenceReportCache rrc, final int maxtime, final int maxdepth) { - this.rrc = rrc; - this.hyperlinkGraphCache = new HashMap(); - this.cache = new ConcurrentHashMap(); - this.maxdepth = maxdepth; - this.maxtime = maxtime; - } - public int getClickdepth(final DigestURL url) throws IOException { - // first try: get the clickdepth from the cache - Integer clickdepth = cache.get(ASCII.String(url.hash())); - if (MemoryControl.shortStatus()) cache.clear(); - if (clickdepth != null) { - //ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth + " CACHE HIT"); - return clickdepth.intValue(); - } - - // second try: get the clickdepth from a hyperlinGraphCache (forward clickdepth) - HyperlinkGraph hlg = hyperlinkGraphCache.get(url.getHost()); - if (hlg == null) { - hlg = new HyperlinkGraph(); - hlg.fill(fulltext.getDefaultConnector(), url.getHost(), null, 300000, 10000000); - hlg.findLinkDepth(); - hyperlinkGraphCache.put(url.getHost(), hlg); - } - clickdepth = hlg.getDepth(url); - if (clickdepth != null) { - return clickdepth.intValue(); - } - - - // third try: get the clickdepth from a reverse link graph - clickdepth = Segment.this.getClickDepth(this.rrc, url, this.maxtime, this.maxdepth); - //ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth); - this.cache.put(ASCII.String(url.hash()), clickdepth); - return clickdepth.intValue(); - } - } - /** * A ReferenceReport object is a container for all referenced to a specific url. * The class stores the number of links from domain-internal and domain-external backlinks, @@ -654,7 +533,7 @@ public class Segment { char docType = Response.docType(document.dc_format()); // CREATE SOLR DOCUMENT - final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(collections, responseHeader, document, condenser, referrerURL, language, this.fulltext.getWebgraphConfiguration(), sourceName); + final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(collections, responseHeader, document, condenser, referrerURL, language, this.fulltext().useWebgraph() ? 
this.fulltext.getWebgraphConfiguration() : null, sourceName); // ENRICH DOCUMENT WITH RANKING INFORMATION this.fulltext.getDefaultConfiguration().postprocessing_references(this.getReferenceReportCache(), vector, url, null); diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index abdb61c72..fce4aefd5 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -82,7 +82,6 @@ import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.util.Bitfield; import net.yacy.kelondro.util.MemoryControl; import net.yacy.search.index.Segment; -import net.yacy.search.index.Segment.ClickdepthCache; import net.yacy.search.index.Segment.ReferenceReport; import net.yacy.search.index.Segment.ReferenceReportCache; import net.yacy.search.query.QueryParams; @@ -367,22 +366,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri Set processTypes = new LinkedHashSet(); String us = digestURL.toNormalform(true); - - int clickdepth = 999; - if ((allAttr || contains(CollectionSchema.clickdepth_i))) { - if (digestURL.probablyRootURL()) { - clickdepth = 0; - } else { - clickdepth = 999; - } - if (document.getDepth() < 2) clickdepth = Math.min(clickdepth, document.getDepth()); // thats not true if the start url was not a root URL. We need a test for that. - if (clickdepth > 2) processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut - CollectionSchema.clickdepth_i.add(doc, clickdepth); // no lazy value checking to get a '0' into the index - } + int crawldepth = document.getDepth(); if ((allAttr || contains(CollectionSchema.crawldepth_i))) { - int depth = document.getDepth(); - CollectionSchema.crawldepth_i.add(doc, depth); + CollectionSchema.crawldepth_i.add(doc, crawldepth); } if (allAttr || (contains(CollectionSchema.cr_host_chance_d) && contains(CollectionSchema.cr_host_count_i) && contains(CollectionSchema.cr_host_norm_i))) { @@ -670,7 +657,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri add(doc, CollectionSchema.framesscount_i, frames.length); if (frames.length > 0) { add(doc, CollectionSchema.frames_sxt, frames); - //webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, true, framess, citations); // add here because links have been removed from remaining inbound/outbound + //webgraph.addEdges(subgraph, digestURI, responseHeader, collections, crawldepth, alllinks, images, true, framess, citations); // add here because links have been removed from remaining inbound/outbound } } @@ -687,7 +674,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri add(doc, CollectionSchema.iframesscount_i, iframes.length); if (iframes.length > 0) { add(doc, CollectionSchema.iframes_sxt, iframes); - //webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, true, iframess, citations); // add here because links have been removed from remaining inbound/outbound + //webgraph.addEdges(subgraph, digestURI, responseHeader, collections, crawldepth, alllinks, images, true, iframess, citations); // add here because links have been removed from remaining inbound/outbound } } @@ -856,9 +843,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (allAttr || 
contains(CollectionSchema.outboundlinksnofollowcount_i)) add(doc, CollectionSchema.outboundlinksnofollowcount_i, document.outboundLinkNofollowCount()); // create a subgraph - if (!containsCanonical) { + if (!containsCanonical && webgraph != null) { // a document with canonical tag should not get a webgraph relation, because that belongs to the canonical document - webgraph.addEdges(subgraph, digestURL, responseHeader, collections, clickdepth, images, document.getAnchors(), sourceName); + webgraph.addEdges(subgraph, digestURL, responseHeader, collections, crawldepth, images, document.getAnchors(), sourceName); } // list all links @@ -923,7 +910,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri * @param urlCitation * @return */ - public int postprocessing(final Segment segment, final ReferenceReportCache rrCache, final ClickdepthCache clickdepthCache, final String harvestkey) { + public int postprocessing(final Segment segment, final ReferenceReportCache rrCache, final String harvestkey) { if (!this.contains(CollectionSchema.process_sxt)) return 0; if (!segment.connectedCitation() && !segment.fulltext().useWebgraph()) return 0; final SolrConnector collectionConnector = segment.fulltext().getDefaultConnector(); @@ -1054,7 +1041,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri @Override public void run() { Thread.currentThread().setName(name); - SolrDocument doc; String protocol, urlstub, id; DigestURL url; + SolrDocument doc; String id; try { while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { SolrInputDocument sid = webgraph.toSolrInputDocument(doc, omitFields); @@ -1081,30 +1068,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri } } - // set clickdepth - if (process.contains(ProcessType.CLICKDEPTH)) { - if (webgraph.contains(WebgraphSchema.source_clickdepth_i) && webgraph.contains(WebgraphSchema.source_protocol_s) && webgraph.contains(WebgraphSchema.source_urlstub_s) && webgraph.contains(WebgraphSchema.source_id_s)) { - protocol = (String) doc.getFieldValue(WebgraphSchema.source_protocol_s.getSolrFieldName()); - urlstub = (String) doc.getFieldValue(WebgraphSchema.source_urlstub_s.getSolrFieldName()); - id = (String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName()); - try { - url = new DigestURL(protocol + "://" + urlstub, ASCII.getBytes(id)); - postprocessing_clickdepth(clickdepthCache, sid, url, WebgraphSchema.source_clickdepth_i); - } catch (MalformedURLException e) { - } - } - if (webgraph.contains(WebgraphSchema.target_clickdepth_i) && webgraph.contains(WebgraphSchema.target_protocol_s) && webgraph.contains(WebgraphSchema.target_urlstub_s) && webgraph.contains(WebgraphSchema.target_id_s)) { - protocol = (String) doc.getFieldValue(WebgraphSchema.target_protocol_s.getSolrFieldName()); - urlstub = (String) doc.getFieldValue(WebgraphSchema.target_urlstub_s.getSolrFieldName()); - id = (String) doc.getFieldValue(WebgraphSchema.target_id_s.getSolrFieldName()); - try { - url = new DigestURL(protocol + "://" + urlstub, ASCII.getBytes(id)); - postprocessing_clickdepth(clickdepthCache, sid, url, WebgraphSchema.target_clickdepth_i); - } catch (MalformedURLException e) { - } - } - } - // write document back to index try { sid.removeField(WebgraphSchema.process_sxt.getSolrFieldName()); @@ -1148,7 +1111,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri Set omitFields = new HashSet(); 
omitFields.add(CollectionSchema.process_sxt.getSolrFieldName()); omitFields.add(CollectionSchema.harvestkey_s.getSolrFieldName()); - int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0, proccount_citationchange = 0, proccount_uniquechange = 0; + int proccount = 0, proccount_referencechange = 0, proccount_citationchange = 0, proccount_uniquechange = 0; long count = collectionConnector.getCountByQuery(query); long start = System.currentTimeMillis(); ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the collection for harvestkey " + harvestkey); @@ -1170,9 +1133,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // switch over tag types ProcessType tagtype = ProcessType.valueOf((String) tag); - if (tagtype == ProcessType.CLICKDEPTH && collection.contains(CollectionSchema.clickdepth_i)) { - if (postprocessing_clickdepth(clickdepthCache, sid, url, CollectionSchema.clickdepth_i)) proccount_clickdepthchange++; - } if (tagtype == ProcessType.CITATION && collection.contains(CollectionSchema.cr_host_count_i) && @@ -1228,7 +1188,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri } if (count != countcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous collection document count for harvestkey " + harvestkey + ": expected=" + count + ", counted=" + countcheck); // big gap for harvestkey = null ConcurrentLog.info("CollectionConfiguration", "cleanup_processing: re-calculated " + proccount+ " new documents, " + - proccount_clickdepthchange + " clickdepth changes, " + proccount_referencechange + " reference-count changes, " + proccount_uniquechange + " unique field changes, " + proccount_citationchange + " citation ranking changes."); @@ -1534,12 +1493,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri configuration.add(doc, CollectionSchema.collection_sxt, cs); } - // clickdepth, cr and postprocessing + // cr and postprocessing Set processTypes = new LinkedHashSet(); - if ((allAttr || configuration.contains(CollectionSchema.clickdepth_i))) { - processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut - CollectionSchema.clickdepth_i.add(doc, digestURL.probablyRootURL() ? 
0 : 999); // no lazy value checking to get a '0' into the index - } if (allAttr || (configuration.contains(CollectionSchema.cr_host_chance_d) && configuration.contains(CollectionSchema.cr_host_count_i) && configuration.contains(CollectionSchema.cr_host_norm_i))) { processTypes.add(ProcessType.CITATION); // postprocessing needed } diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java index 2b62c24f6..afc97e073 100644 --- a/source/net/yacy/search/schema/CollectionSchema.java +++ b/source/net/yacy/search/schema/CollectionSchema.java @@ -57,8 +57,7 @@ public enum CollectionSchema implements SchemaDeclaration { references_internal_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from same host to referenced url"), references_external_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from external hosts"), references_exthosts_i(SolrType.num_integer, true, true, false, false, false, "number of external hosts which provide http references"), - clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url"), - crawldepth_i(SolrType.num_integer, true, true, false, false, false, "crawl depth of web page according to the number of steps that the crawler did to get to this document; if the crawl was started at a root document, then this is the maximum of clickdepth_i"), + crawldepth_i(SolrType.num_integer, true, true, false, false, false, "crawl depth of web page according to the number of steps the crawler took to reach this document; if the crawl was started at a root document, then this is equal to the click depth"), process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set"), harvestkey_s(SolrType.string, true, true, false, false, false, "key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. 
This shall be deleted as soon as postprocessing has been terminated."), diff --git a/source/net/yacy/search/schema/WebgraphConfiguration.java b/source/net/yacy/search/schema/WebgraphConfiguration.java index 22ae657d1..f04aa0230 100644 --- a/source/net/yacy/search/schema/WebgraphConfiguration.java +++ b/source/net/yacy/search/schema/WebgraphConfiguration.java @@ -112,7 +112,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial public void addEdges( final Subgraph subgraph, - final DigestURL source, final ResponseHeader responseHeader, Map collections, int clickdepth_source, + final DigestURL source, final ResponseHeader responseHeader, Map collections, int crawldepth_source, final List images, final Collection links, final String sourceName) { boolean allAttr = this.isEmpty(); @@ -120,7 +120,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial int target_order = 0; for (final AnchorURL target_url: links) { SolrInputDocument edge = getEdge( - subgraph, source, responseHeader, collections, clickdepth_source, images, + subgraph, source, responseHeader, collections, crawldepth_source, images, sourceName, allAttr, generalNofollow, target_order, target_url); target_order++; // add the edge to the subgraph @@ -130,7 +130,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial public SolrInputDocument getEdge( final Subgraph subgraph, - final DigestURL source_url, final ResponseHeader responseHeader, Map collections, int clickdepth_source, + final DigestURL source_url, final ResponseHeader responseHeader, Map collections, int crawldepth_source, final List images, final String sourceName, boolean allAttr, boolean generalNofollow, int target_order, AnchorURL target_url) { @@ -217,9 +217,8 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial add(edge, WebgraphSchema.source_path_folders_count_i, paths.length); add(edge, WebgraphSchema.source_path_folders_sxt, paths); } - if ((allAttr || contains(WebgraphSchema.source_clickdepth_i)) && this.contains(WebgraphSchema.source_protocol_s) && this.contains(WebgraphSchema.source_urlstub_s) && this.contains(WebgraphSchema.source_id_s)) { - add(edge, WebgraphSchema.source_clickdepth_i, clickdepth_source); - processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut + if ((allAttr || contains(WebgraphSchema.source_crawldepth_i)) && this.contains(WebgraphSchema.source_protocol_s) && this.contains(WebgraphSchema.source_urlstub_s) && this.contains(WebgraphSchema.source_id_s)) { + add(edge, WebgraphSchema.source_crawldepth_i, crawldepth_source); } // parse text to find images and clear text @@ -289,15 +288,8 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial add(edge, WebgraphSchema.target_path_folders_sxt, paths); } - if ((allAttr || contains(WebgraphSchema.target_clickdepth_i)) && this.contains(WebgraphSchema.target_protocol_s) && this.contains(WebgraphSchema.target_urlstub_s) && this.contains(WebgraphSchema.target_id_s)) { - if (target_url.probablyRootURL()) { - boolean lc = this.lazy; this.lazy = false; - add(edge, WebgraphSchema.target_clickdepth_i, 0); - this.lazy = lc; - } else { - add(edge, WebgraphSchema.target_clickdepth_i, 999); - processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut - } + if ((allAttr || 
contains(WebgraphSchema.target_crawldepth_i)) && this.contains(WebgraphSchema.target_protocol_s) && this.contains(WebgraphSchema.target_urlstub_s) && this.contains(WebgraphSchema.target_id_s)) { + add(edge, WebgraphSchema.target_crawldepth_i, 999); } if (allAttr || contains(WebgraphSchema.process_sxt)) { diff --git a/source/net/yacy/search/schema/WebgraphSchema.java b/source/net/yacy/search/schema/WebgraphSchema.java index 117b7ab99..a9d220007 100644 --- a/source/net/yacy/search/schema/WebgraphSchema.java +++ b/source/net/yacy/search/schema/WebgraphSchema.java @@ -35,7 +35,7 @@ public enum WebgraphSchema implements SchemaDeclaration { last_modified(SolrType.date, true, true, false, false, false, "last-modified from http header"), load_date_dt(SolrType.date, true, true, false, false, false, "time when resource was loaded"), collection_sxt(SolrType.string, true, true, true, false, false, "tags that are attached to crawls/index generation to separate the search result into user-defined subsets"), - process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set, used i.e. for clickdepth-computation."), + process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set"), harvestkey_s(SolrType.string, true, true, false, false, false, "key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated."), // source information @@ -51,7 +51,7 @@ public enum WebgraphSchema implements SchemaDeclaration { source_parameter_count_i(SolrType.num_integer, true, true, false, false, false, "number of key-value pairs in search part of the url (source)"), source_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (source)"), source_parameter_value_sxt(SolrType.string, true, true, true, false, false, "the values from key-value pairs in the search part of the url (source)"), - source_clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)"), + source_crawldepth_i(SolrType.num_integer, true, true, false, false, false, "crawl depth of web page according to the number of steps the crawler took to reach this document (source)"), source_cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "copy of the citation rank norm value from the source link"), source_host_s(SolrType.string, true, true, false, false, false, "host of the url (source)"), @@ -86,7 +86,7 @@ public enum WebgraphSchema implements SchemaDeclaration { target_parameter_count_i(SolrType.num_integer, true, true, false, false, false, "number of key-value pairs in search part of the url (target)"), target_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (target)"), target_parameter_value_sxt(SolrType.string, true, true, true, false, true, "the values from key-value pairs in the search part of the url (target)"), - target_clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)"), + target_crawldepth_i(SolrType.num_integer, true, true, false, false, false, "crawl depth of web page according to the number of steps the crawler took to reach this document (target)"), target_cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "copy of the citation rank norm value from the target link; this is only filled if the target host is identical to the source host"), target_host_s(SolrType.string, true, true, false, false, true, "host of the url (target)"),
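
Reviewer note: the crawldepth_i description above claims that crawl depth equals click depth when the crawl starts at a root document. That equivalence is what makes dropping the expensive clickdepth postprocessing safe for root-started crawls: a breadth-first crawler discovers each page at its minimum link distance from the seed. Below is a minimal, self-contained Java sketch of that bookkeeping; the toy link graph and all names are hypothetical illustrations, not code from this change.

import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Collections;
import java.util.Deque;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class CrawlDepthDemo {
    public static void main(String[] args) {
        // toy link graph: the root page links to /a and /b, and /a links to /c
        Map<String, List<String>> links = new HashMap<>();
        links.put("/", Arrays.asList("/a", "/b"));
        links.put("/a", Arrays.asList("/c"));

        // breadth-first crawl from the root: the BFS level at which a page is
        // first discovered is its crawl depth, and because the seed is the
        // root page it is also the click depth from the 'main' page
        Map<String, Integer> crawldepth = new HashMap<>();
        Deque<String> frontier = new ArrayDeque<>();
        crawldepth.put("/", 0);
        frontier.add("/");
        while (!frontier.isEmpty()) {
            String page = frontier.remove();
            int depth = crawldepth.get(page);
            for (String next : links.getOrDefault(page, Collections.<String>emptyList())) {
                // record the depth only on first discovery, then enqueue
                if (crawldepth.putIfAbsent(next, depth + 1) == null) frontier.add(next);
            }
        }
        // expected depths: /=0, /a=1, /b=1, /c=2 (print order unspecified)
        crawldepth.forEach((page, d) -> System.out.println(page + " -> depth " + d));
    }
}

If the seed is not a root page, crawl depth and click depth can diverge; the 999 written for webgraph targets above reflects that the true depth of a target is unknown until that document is itself crawled.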
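Reviewer note: RankingSolr_p.java and Switchboard.java above move the default boost query from clickdepth_i to crawldepth_i. For anyone who wants to try the effect of that boost query against a Solr core outside of YaCy, here is a minimal SolrJ sketch; the core URL and the search term are placeholders, and only the bq value is taken from this change.

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.common.SolrDocument;

public class CrawldepthBoostDemo {
    public static void main(String[] args) throws Exception {
        // placeholder URL for a Solr core holding a YaCy collection index
        SolrClient solr = new HttpSolrClient.Builder("http://localhost:8983/solr/collection1").build();
        try {
            SolrQuery q = new SolrQuery("text_t:yacy"); // placeholder search term
            q.set("defType", "edismax");
            // the new default: boost documents at crawl depth 0, then depth 1
            q.set("bq", "crawldepth_i:0^0.8 crawldepth_i:1^0.4");
            q.setFields("sku", "crawldepth_i", "score");
            q.setRows(10);
            for (SolrDocument d : solr.query(q).getResults()) {
                System.out.println(d.getFieldValue("crawldepth_i") + "\t" + d.getFieldValue("sku"));
            }
        } finally {
            solr.close();
        }
    }
}

Shallow documents should now float up among otherwise equal text scores, which is the same intent the old clickdepth boost had, but based on a value known at index time instead of one that required the removed postprocessing.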