## number of external hosts which provide http references
references_exthosts_i
## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url
clickdepth_i
## crawl depth of web page according to the number of steps that the crawler did to get to this document; if the crawl was started at a root document, then this is the maximum of clickdepth_i
## crawl depth of web page according to the number of steps that the crawler did to get to this document; if the crawl was started at a root document, then this is equal to the clickdepth
crawldepth_i
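The two depth fields measure different things: crawldepth_i is the length of the path the crawler actually followed, while clickdepth_i is the shortest click path from the host's root page and may be recomputed during postprocessing (see the CLICKDEPTH process type below). A minimal sketch, with illustrative values and plain SolrJ rather than the YaCy writer, of filling both fields:

```java
import org.apache.solr.common.SolrInputDocument;

// Illustrative values only: crawl depth is known at indexing time from the crawler's path,
// click depth is the shortest click path from http://host/ and may be corrected later.
int stepsTakenByCrawler = 2;
int clicksFromRootPage = 1;
SolrInputDocument doc = new SolrInputDocument();
doc.setField("crawldepth_i", stepsTakenByCrawler);
doc.setField("clickdepth_i", clicksFromRootPage);
```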
## needed (post-)processing steps on this metadata set
## tags that are attached to crawls/index generation to separate the search result into user-defined subsets
collection_sxt
## needed (post-)processing steps on this metadata set, used i.e. for clickdepth-computation.
## needed (post-)processing steps on this metadata set
#process_sxt
## key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated.
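process_sxt and harvestkey_s together select the documents that near-realtime postprocessing still has to visit. A minimal sketch of that selection as a plain SolrJ query (not the YaCy connector API; the harvest key value is illustrative):

```java
import org.apache.solr.client.solrj.SolrQuery;

// Select the documents of one harvest process that still carry postprocessing tags.
String harvestkey = "xYzAbCdEf";  // illustrative crawl profile hash key
SolrQuery query = new SolrQuery("process_sxt:[* TO *] AND harvestkey_s:\"" + harvestkey + "\"");
query.setRows(100);               // a real postprocessing loop would page through the full result
```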
@@ -72,7 +72,7 @@ source_id_s
#source_parameter_value_sxt
## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)
#source_clickdepth_i
#source_crawldepth_i
## copy of the citation rank norm value from the source link
#source_cr_host_norm_i
@@ -173,7 +173,7 @@ target_path_folders_sxt
#target_parameter_value_sxt
## depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)
#target_clickdepth_i
#target_crawldepth_i
## copy of the citation rank norm value from the target link; this is only filled if the target host is identical to the source host
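A minimal sketch of the guard described in the comment above: the citation rank norm is only copied into the webgraph edge when the link stays on the same host. sourceHost, targetHost, targetCitationRankNorm and edgeDocument are hypothetical stand-ins.

```java
// Copy the host-local citation rank only for same-host links; ranks of foreign hosts
// are not comparable here, so the field stays empty for cross-host edges.
if (sourceHost.equals(targetHost)) {
    edgeDocument.setField("target_cr_host_norm_i", targetCitationRankNorm);
}
```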
try {postprocessingCount[0] = (int) fulltext.getDefaultConnector().getCountByQuery(CollectionSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM);} catch (IOException e) {} // should be zero but you never know
@@ -2331,7 +2327,7 @@ public final class Switchboard extends serverSwitch {
try {postprocessingCount[0] = (int) fulltext.getDefaultConnector().getCountByQuery(CollectionSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM);} catch (IOException e) {} // should be zero but you never know
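The count above asks Solr how many documents still hold any process_sxt value. A minimal sketch of the same check with plain SolrJ, assuming CATCHALL_DTERM expands to a range term such as ":[* TO *]":

```java
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;

// Counts documents whose process_sxt field still has a value; the query string is
// assumed to end up as "process_sxt:[* TO *]".
static long remainingPostprocessingDocs(SolrClient client) throws Exception {
    return client.query(new SolrQuery("process_sxt:[* TO *]").setRows(0))
                 .getResults().getNumFound(); // expected to be zero after postprocessing
}
```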
if (rootCandidates.has(searchhash)) return 0; // the url is a root candidate itself
Set<String> ignore = new HashSet<String>(); // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent endless loops
Set<String> levelhashes = new HashSet<String>(); // all hashes of a clickdepth. The first call contains the target hash only and therefore just one entry
levelhashes.add(ASCII.String(searchhash));
final byte[] hosthash = new byte[6]; // the host of the url to be checked
if (document.getDepth() < 2) clickdepth = Math.min(clickdepth, document.getDepth()); // that's not true if the start url was not a root URL. We need a test for that.
if (clickdepth > 2) processTypes.add(ProcessType.CLICKDEPTH); // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
CollectionSchema.clickdepth_i.add(doc, clickdepth); // no lazy value checking to get a '0' into the index
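The lines above hint at how the click depth is handled: at indexing time the crawl depth serves as a first approximation, and documents with clickdepth > 2 are tagged with ProcessType.CLICKDEPTH so the value can be recomputed later. A minimal sketch of such a recomputation as a level-by-level search over incoming links; 'citations' and 'rootCandidates' are hypothetical stand-ins for the citation index and the host's root page hashes, and a real implementation would additionally restrict the search to the same host (see the hosthash variable above):

```java
import java.util.HashSet;
import java.util.Set;
import java.util.function.Function;

// The level at which a root candidate is reached while walking incoming links from the
// target document is its click depth.
static int clickDepth(String targetHash, Set<String> rootCandidates,
                      Function<String, Set<String>> citations, int maxDepth) {
    Set<String> ignore = new HashSet<String>(); // hashes seen so far, prevents endless loops
    Set<String> level = new HashSet<String>();  // all hashes of one click depth; starts with the target only
    level.add(targetHash);
    for (int depth = 0; depth <= maxDepth && !level.isEmpty(); depth++) {
        for (String hash : level) if (rootCandidates.contains(hash)) return depth; // root page reached
        ignore.addAll(level);
        Set<String> next = new HashSet<String>();
        for (String hash : level)
            for (String citing : citations.apply(hash))   // pages that link to 'hash'
                if (!ignore.contains(citing)) next.add(citing);
        level = next;
    }
    return 999; // no root page reached within maxDepth levels (illustrative sentinel)
}
```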
//webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, true, framess, citations); // add here because links have been removed from remaining inbound/outbound
//webgraph.addEdges(subgraph, digestURI, responseHeader, collections, crawldepth, alllinks, images, true, framess, citations); // add here because links have been removed from remaining inbound/outbound
}
}
@@ -687,7 +674,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
//webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, true, iframess, citations); // add here because links have been removed from remaining inbound/outbound
//webgraph.addEdges(subgraph, digestURI, responseHeader, collections, crawldepth, alllinks, images, true, iframess, citations); // add here because links have been removed from remaining inbound/outbound
}
}
@@ -856,9 +843,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
@@ -1228,7 +1188,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
if (count != countcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous collection document count for harvestkey " + harvestkey + ": expected=" + count + ", counted=" + countcheck); // big gap for harvestkey = null
ConcurrentLog.info("CollectionConfiguration", "cleanup_processing: re-calculated " + proccount + " new documents, " +
@@ -57,8 +57,7 @@ public enum CollectionSchema implements SchemaDeclaration {
references_internal_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from same host to referenced url"),
references_external_i(SolrType.num_integer, true, true, false, false, false, "number of unique http references from external hosts"),
references_exthosts_i(SolrType.num_integer, true, true, false, false, false, "number of external hosts which provide http references"),
clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url"),
crawldepth_i(SolrType.num_integer, true, true, false, false, false, "crawl depth of web page according to the number of steps that the crawler did to get to this document; if the crawl was started at a root document, then this is the maximum of clickdepth_i"),
crawldepth_i(SolrType.num_integer, true, true, false, false, false, "crawl depth of web page according to the number of steps that the crawler did to get to this document; if the crawl was started at a root document, then this is equal to the clickdepth"),
process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set"),
harvestkey_s(SolrType.string, true, true, false, false, false, "key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated."),
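The comment on harvestkey_s says the bookkeeping fields shall be deleted once postprocessing has terminated. A minimal sketch of that cleanup as a plain SolrJ atomic update (not the YaCy connector API; "id" is assumed to be the collection's unique key):

```java
import java.util.Collections;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.common.SolrInputDocument;

// Atomic update that removes the bookkeeping fields from one document after postprocessing;
// the "set" modifier with a null value deletes the field.
static void clearPostprocessingMarks(SolrClient client, String documentId) throws Exception {
    SolrInputDocument update = new SolrInputDocument();
    update.setField("id", documentId);                                     // unique key (assumed)
    update.setField("process_sxt", Collections.singletonMap("set", null));
    update.setField("harvestkey_s", Collections.singletonMap("set", null));
    client.add(update);
    client.commit();
}
```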
@@ -35,7 +35,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
last_modified(SolrType.date, true, true, false, false, false, "last-modified from http header"),
load_date_dt(SolrType.date, true, true, false, false, false, "time when resource was loaded"),
collection_sxt(SolrType.string, true, true, true, false, false, "tags that are attached to crawls/index generation to separate the search result into user-defined subsets"),
process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set, used i.e. for clickdepth-computation."),
process_sxt(SolrType.string, true, true, true, false, false, "needed (post-)processing steps on this metadata set."),
harvestkey_s(SolrType.string, true, true, false, false, false, "key from a harvest process (i.e. the crawl profile hash key) which is needed for near-realtime postprocessing. This shall be deleted as soon as postprocessing has been terminated."),
// source information
@@ -51,7 +51,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
source_parameter_count_i(SolrType.num_integer, true, true, false, false, false, "number of key-value pairs in search part of the url (source)"),
source_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (source)"),
source_parameter_value_sxt(SolrType.string, true, true, true, false, false, "the values from key-value pairs in the search part of the url (source)"),
source_clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)"),
source_crawldepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (source)"),
source_cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "copy of the citation rank norm value from the source link"),
source_host_s(SolrType.string, true, true, false, false, false, "host of the url (source)"),
@@ -86,7 +86,7 @@ public enum WebgraphSchema implements SchemaDeclaration {
target_parameter_count_i(SolrType.num_integer, true, true, false, false, false, "number of key-value pairs in search part of the url (target)"),
target_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (target)"),
target_parameter_value_sxt(SolrType.string, true, true, true, false, true, "the values from key-value pairs in the search part of the url (target)"),
target_clickdepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)"),
target_crawldepth_i(SolrType.num_integer, true, true, false, false, false, "depth of web page according to number of clicks from the 'main' page, which is the page that appears if only the host is entered as url (target)"),
target_cr_host_norm_i(SolrType.num_integer, true, true, false, false, false, "copy of the citation rank norm value from the target link; this is only filled if the target host is identical to the source host"),
target_host_s(SolrType.string, true, true, false, false, true, "host of the url (target)"),
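Taken together, the source_* and target_* fields above describe one link as a webgraph edge document. A minimal sketch with illustrative values, written against plain SolrJ rather than the YaCy webgraph writer:

```java
import org.apache.solr.common.SolrInputDocument;

// One webgraph edge: a link from a page at crawl depth 1 to a page at crawl depth 2 on the
// same host; the citation rank copy is only set because source and target host are identical.
SolrInputDocument edge = new SolrInputDocument();
edge.setField("source_host_s", "example.org");
edge.setField("source_crawldepth_i", 1);
edge.setField("target_host_s", "example.org");
edge.setField("target_crawldepth_i", 2);
edge.setField("target_cr_host_norm_i", 7);
```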