- enhanced generation of url objects

- enhanced computation of link structure graphics
- enhanced collection of data for link structures
pull/1/head
Michael Peter Christen 13 years ago
parent 4023d88b0b
commit 21fe8339b4

@ -181,7 +181,7 @@ public class CrawlStartScanner_p
final Scanner.Service u = se.next().getKey(); final Scanner.Service u = se.next().getKey();
DigestURI uu; DigestURI uu;
try { try {
uu = new DigestURI(u.url()); uu = DigestURI.toDigestURI(u.url());
pkmap.put(uu.hash(), uu); pkmap.put(uu.hash(), uu);
} catch ( final MalformedURLException e ) { } catch ( final MalformedURLException e ) {
Log.logException(e); Log.logException(e);
@ -236,7 +236,7 @@ public class CrawlStartScanner_p
while ( se.hasNext() ) { while ( se.hasNext() ) {
host = se.next(); host = se.next();
try { try {
u = new DigestURI(host.getKey().url()); u = DigestURI.toDigestURI(host.getKey().url());
urlString = u.toNormalform(true); urlString = u.toNormalform(true);
if ( host.getValue() == Access.granted if ( host.getValue() == Access.granted
&& Scanner.inIndex(apiCommentCache, urlString) == null ) { && Scanner.inIndex(apiCommentCache, urlString) == null ) {

@ -296,7 +296,7 @@ public class Crawler_p {
scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay); scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
// get links and generate filter // get links and generate filter
for (MultiProtocolURI u: scraper.getAnchors().keySet()) { for (MultiProtocolURI u: scraper.getAnchors().keySet()) {
newRootURLs.add(new DigestURI(u)); newRootURLs.add(DigestURI.toDigestURI(u));
} }
} catch (IOException e) { } catch (IOException e) {
Log.logException(e); Log.logException(e);

@ -63,7 +63,7 @@ public class ServerScannerList {
while (se.hasNext()) { while (se.hasNext()) {
host = se.next(); host = se.next();
try { try {
u = new DigestURI(host.getKey().url()); u = DigestURI.toDigestURI(host.getKey().url());
urlString = u.toNormalform(true); urlString = u.toNormalform(true);
prop.put("servertable_list_" + i + "_edit", edit ? 1 : 0); prop.put("servertable_list_" + i + "_edit", edit ? 1 : 0);
prop.put("servertable_list_" + i + "_edit_pk", ASCII.String(u.hash())); prop.put("servertable_list_" + i + "_edit_pk", ASCII.String(u.hash()));

@ -68,6 +68,17 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<h2>Web Structure</h2> <h2>Web Structure</h2>
#(hosts)#::
<fieldset><legend>Host List</legend>
#{list}#
<div style="float:left; padding:1px 5px 1px 5px;">
<div style="width:160px; text-align:left; float: left; white-space:nowrap; overflow:hidden;"><a href="/WatchWebStructure_p.html?host=#[host]#&depth=3&time=1000">#[host]#</a></div>
<div style="width:80px; text-align:right; float: left; white-space:nowrap; overflow:hidden;">#[count]# outlinks</div>
</div>
#{/list}#
</fieldset>
#(/hosts)#
<div id="left"> <div id="left">
<form action="/WatchWebStructure_p.html" accept-charset="UTF-8" onsubmit="return checkform(this);"> <form action="/WatchWebStructure_p.html" accept-charset="UTF-8" onsubmit="return checkform(this);">
<fieldset> <fieldset>

@ -4,7 +4,10 @@
//$LastChangedBy$ //$LastChangedBy$
// //
import java.util.Iterator;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.crawler.CrawlSwitchboard; import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.data.CrawlProfile;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
@ -27,7 +30,7 @@ public class WatchWebStructure_p {
int width = 1024; int width = 1024;
int height = 576; int height = 576;
int depth = 3; int depth = 3;
int nodes = 500; // maximum number of host nodes that are painted int nodes = 300; // maximum number of host nodes that are painted
int time = -1; int time = -1;
String host = "auto"; String host = "auto";
String besthost; String besthost;
@ -36,7 +39,7 @@ public class WatchWebStructure_p {
width = post.getInt("width", 1024); width = post.getInt("width", 1024);
height = post.getInt("height", 576); height = post.getInt("height", 576);
depth = post.getInt("depth", 3); depth = post.getInt("depth", 3);
nodes = post.getInt("nodes", width * height * 100 / 1024 / 576); nodes = post.getInt("nodes", width * height * 300 / 1024 / 576);
time = post.getInt("time", -1); time = post.getInt("time", -1);
host = post.get("host", "auto"); host = post.get("host", "auto");
color_text = post.get("colortext", color_text); color_text = post.get("colortext", color_text);
@ -71,6 +74,22 @@ public class WatchWebStructure_p {
} }
} }
if (post != null && post.containsKey("hosts")) {
int maxcount = 200;
ReversibleScoreMap<String> score = sb.webStructure.hostReferenceScore();
int c = 0;
Iterator<String> i = score.keys(false);
String h;
while (i.hasNext() && c < maxcount) {
h = i.next();
prop.put("hosts_list_" + c + "_host", h);
prop.put("hosts_list_" + c + "_count", score.get(h));
c++;
}
prop.put("hosts_list", c);
prop.put("hosts", 1);
}
// find start point // find start point
if (host == null || if (host == null ||
host.isEmpty() || host.isEmpty() ||

@ -28,7 +28,6 @@ import java.net.MalformedURLException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.HashMap; import java.util.HashMap;
import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@ -61,7 +60,7 @@ public class WebStructurePicture_p {
int width = 1024; int width = 1024;
int height = 576; int height = 576;
int depth = 3; int depth = 3;
int nodes = 100; // maximum number of host nodes that are painted int nodes = 300; // maximum number of host nodes that are painted
int time = -1; int time = -1;
String host = null; String host = null;
int cyc = 0; int cyc = 0;
@ -156,16 +155,13 @@ public class WebStructurePicture_p {
final double radius = 1.0 / (1 << nextlayer); final double radius = 1.0 / (1 << nextlayer);
final WebStructureGraph.StructureEntry sr = structure.outgoingReferences(centerhash); final WebStructureGraph.StructureEntry sr = structure.outgoingReferences(centerhash);
final Map<String, Integer> next = (sr == null) ? new HashMap<String, Integer>() : sr.references; final Map<String, Integer> next = (sr == null) ? new HashMap<String, Integer>() : sr.references;
Map.Entry<String, Integer> entry;
String targethash, targethost; String targethash, targethost;
// first set points to next hosts // first set points to next hosts
final Iterator<Map.Entry<String, Integer>> i = next.entrySet().iterator();
final List<String[]> targets = new ArrayList<String[]>(); final List<String[]> targets = new ArrayList<String[]>();
int maxtargetrefs = 8, maxthisrefs = 8; int maxtargetrefs = 8, maxthisrefs = 8;
int targetrefs, thisrefs; int targetrefs, thisrefs;
double rr, re; double rr, re;
while (i.hasNext() && maxnodes > 0 && System.currentTimeMillis() < timeout) { for (Map.Entry<String, Integer> entry: next.entrySet()) {
entry = i.next();
targethash = entry.getKey(); targethash = entry.getKey();
targethost = structure.hostHash2hostName(targethash); targethost = structure.hostHash2hostName(targethash);
if (targethost == null) continue; if (targethost == null) continue;
@ -181,15 +177,12 @@ public class WebStructurePicture_p {
rr = radius * 0.25 * (1 - targetrefs / (double) maxtargetrefs); rr = radius * 0.25 * (1 - targetrefs / (double) maxtargetrefs);
re = radius * 0.5 * (thisrefs / (double) maxthisrefs); re = radius * 0.5 * (thisrefs / (double) maxthisrefs);
graph.addNode(targethost, x + (radius - rr - re) * Math.cos(angle), y + (radius - rr - re) * Math.sin(angle), nextlayer); graph.addNode(targethost, x + (radius - rr - re) * Math.cos(angle), y + (radius - rr - re) * Math.sin(angle), nextlayer);
maxnodes--;
mynodes++; mynodes++;
if (maxnodes-- <= 0 || System.currentTimeMillis() >= timeout) break;
} }
// recursively set next hosts // recursively set next hosts
final Iterator<String[]> j = targets.iterator();
String[] target;
int nextnodes; int nextnodes;
while (j.hasNext()) { for (String[] target: targets) {
target = j.next();
targethash = target[0]; targethash = target[0];
targethost = target[1]; targethost = target[1];
final GraphPlotter.Point c = graph.getNode(targethost); final GraphPlotter.Point c = graph.getNode(targethost);

@ -79,14 +79,14 @@ public class webstructure {
prop.put("out", 1); prop.put("out", 1);
prop.put("in", 1); prop.put("in", 1);
WebStructureGraph.StructureEntry sentry = sb.webStructure.outgoingReferences(hosthash); WebStructureGraph.StructureEntry sentry = sb.webStructure.outgoingReferences(hosthash);
if (sentry != null) { if (sentry != null && sentry.references.size() > 0) {
reference(prop, "out", 0, sentry, sb.webStructure); reference(prop, "out", 0, sentry, sb.webStructure);
prop.put("out_domains", 1); prop.put("out_domains", 1);
} else { } else {
prop.put("out_domains", 0); prop.put("out_domains", 0);
} }
sentry = sb.webStructure.incomingReferences(hosthash); sentry = sb.webStructure.incomingReferences(hosthash);
if (sentry != null) { if (sentry != null && sentry.references.size() > 0) {
reference(prop, "in", 0, sentry, sb.webStructure); reference(prop, "in", 0, sentry, sb.webStructure);
prop.put("in_domains", 1); prop.put("in_domains", 1);
} else { } else {
@ -113,7 +113,7 @@ public class webstructure {
int d = 0; int d = 0;
Iterator<MultiProtocolURI> i = scraper.inboundLinks().iterator(); Iterator<MultiProtocolURI> i = scraper.inboundLinks().iterator();
while (i.hasNext()) { while (i.hasNext()) {
DigestURI refurl = new DigestURI(i.next()); DigestURI refurl = DigestURI.toDigestURI(i.next());
byte[] refhash = refurl.hash(); byte[] refhash = refurl.hash();
prop.putXML("references_documents_0_anchors_" + d + "_url", refurl.toNormalform(true)); prop.putXML("references_documents_0_anchors_" + d + "_url", refurl.toNormalform(true));
prop.put("references_documents_0_anchors_" + d + "_hash", refhash); prop.put("references_documents_0_anchors_" + d + "_hash", refhash);
@ -122,7 +122,7 @@ public class webstructure {
} }
i = scraper.outboundLinks().iterator(); i = scraper.outboundLinks().iterator();
while (i.hasNext()) { while (i.hasNext()) {
DigestURI refurl = new DigestURI(i.next()); DigestURI refurl = DigestURI.toDigestURI(i.next());
byte[] refhash = refurl.hash(); byte[] refhash = refurl.hash();
prop.putXML("references_documents_0_anchors_" + d + "_url", refurl.toNormalform(true)); prop.putXML("references_documents_0_anchors_" + d + "_url", refurl.toNormalform(true));
prop.put("references_documents_0_anchors_" + d + "_hash", refhash); prop.put("references_documents_0_anchors_" + d + "_hash", refhash);

@ -204,7 +204,7 @@ public final class CrawlStacker {
if (e.getKey() == null) continue; if (e.getKey() == null) continue;
// delete old entry, if exists to force a re-load of the url (thats wanted here) // delete old entry, if exists to force a re-load of the url (thats wanted here)
final DigestURI url = new DigestURI(e.getKey()); final DigestURI url = DigestURI.toDigestURI(e.getKey());
final byte[] urlhash = url.hash(); final byte[] urlhash = url.hash();
if (replace) { if (replace) {
this.indexSegment.fulltext().remove(urlhash); this.indexSegment.fulltext().remove(urlhash);

@ -150,7 +150,7 @@ public final class HTTPLoader {
} }
// normalize URL // normalize URL
final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString)); final DigestURI redirectionUrl = DigestURI.toDigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString));
// restart crawling with new url // restart crawling with new url
this.log.logInfo("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + requestURLString); this.log.logInfo("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + requestURLString);
@ -289,7 +289,7 @@ public final class HTTPLoader {
} }
// normalizing URL // normalizing URL
final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString)); final DigestURI redirectionUrl = DigestURI.toDigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString));
// if we are already doing a shutdown we don't need to retry crawling // if we are already doing a shutdown we don't need to retry crawling

@ -155,7 +155,7 @@ public class BookmarkHelper {
if ("".equals(title)) {//cannot be displayed if ("".equals(title)) {//cannot be displayed
title = url.toString(); title = url.toString();
} }
bm = db.new Bookmark(new DigestURI(url)); bm = db.new Bookmark(DigestURI.toDigestURI(url));
bm.setProperty(Bookmark.BOOKMARK_TITLE, title); bm.setProperty(Bookmark.BOOKMARK_TITLE, title);
bm.setTags(tags); bm.setTags(tags);
bm.setPublic(importPublic); bm.setPublic(importPublic);

@ -168,7 +168,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
Document[] theDocs; Document[] theDocs;
// workaround for relative links in file, normally '#' shall be used behind the location, see // workaround for relative links in file, normally '#' shall be used behind the location, see
// below for reversion of the effects // below for reversion of the effects
final DigestURI url = new DigestURI(MultiProtocolURI.newURL(this.doc.dc_source(), this.prefix + "/" + super.filePath)); final DigestURI url = DigestURI.toDigestURI(MultiProtocolURI.newURL(this.doc.dc_source(), this.prefix + "/" + super.filePath));
final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1)); final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
theDocs = TextParser.parseSource(url, mime, null, this.cfos.toByteArray()); theDocs = TextParser.parseSource(url, mime, null, this.cfos.toByteArray());

@ -90,7 +90,7 @@ public class tarParser extends AbstractParser implements Parser {
try { try {
tmp = FileUtils.createTempFile(this.getClass(), name); tmp = FileUtils.createTempFile(this.getClass(), name);
FileUtils.copy(tis, tmp, entry.getSize()); FileUtils.copy(tis, tmp, entry.getSize());
subDocs = TextParser.parseSource(new DigestURI(MultiProtocolURI.newURL(url,"#" + name)), mime, null, tmp); subDocs = TextParser.parseSource(DigestURI.toDigestURI(MultiProtocolURI.newURL(url,"#" + name)), mime, null, tmp);
if (subDocs == null) continue; if (subDocs == null) continue;
for (final Document d: subDocs) docacc.add(d); for (final Document d: subDocs) docacc.add(d);
} catch (final Parser.Failure e) { } catch (final Parser.Failure e) {

@ -87,7 +87,7 @@ public class zipParser extends AbstractParser implements Parser {
try { try {
tmp = FileUtils.createTempFile(this.getClass(), name); tmp = FileUtils.createTempFile(this.getClass(), name);
FileUtils.copy(zis, tmp, entry.getSize()); FileUtils.copy(zis, tmp, entry.getSize());
final DigestURI virtualURL = new DigestURI(MultiProtocolURI.newURL(url, "#" + name)); final DigestURI virtualURL = DigestURI.toDigestURI(MultiProtocolURI.newURL(url, "#" + name));
//this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false)); //this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false));
docs = TextParser.parseSource(virtualURL, mime, null, tmp); docs = TextParser.parseSource(virtualURL, mime, null, tmp);
if (docs == null) continue; if (docs == null) continue;

@ -133,11 +133,16 @@ public class DigestURI extends MultiProtocolURI implements Serializable {
/** /**
* DigestURI from general URI * DigestURI from general URI
* @param baseURL * @param u
*/ */
public DigestURI(final MultiProtocolURI baseURL) { private DigestURI(final MultiProtocolURI u) {
super(baseURL); super(u);
this.hash = (baseURL instanceof DigestURI) ? ((DigestURI) baseURL).hash : null; this.hash = (u instanceof DigestURI) ? ((DigestURI) u).hash : null;
}
public static DigestURI toDigestURI(MultiProtocolURI u) {
return (u instanceof DigestURI) ? ((DigestURI) u) : new DigestURI(u);
} }
/** /**

@ -49,6 +49,8 @@ import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8; import net.yacy.cora.document.UTF8;
import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.Base64Order;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.util.LookAheadIterator; import net.yacy.cora.util.LookAheadIterator;
import net.yacy.cora.util.SpaceExceededException; import net.yacy.cora.util.SpaceExceededException;
import net.yacy.document.Document; import net.yacy.document.Document;
@ -64,8 +66,7 @@ import net.yacy.kelondro.rwi.ReferenceFactory;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
public class WebStructureGraph public class WebStructureGraph {
{
public static int maxref = 300; // maximum number of references, to avoid overflow when a large link farm occurs (i.e. wikipedia) public static int maxref = 300; // maximum number of references, to avoid overflow when a large link farm occurs (i.e. wikipedia)
public static int maxhosts = 50000; // maximum number of hosts in web structure map public static int maxhosts = 50000; // maximum number of hosts in web structure map
@ -75,17 +76,16 @@ public class WebStructureGraph
private final File structureFile; private final File structureFile;
private final TreeMap<String, byte[]> structure_old; // <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}* private final TreeMap<String, byte[]> structure_old; // <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*
private final TreeMap<String, byte[]> structure_new; private final TreeMap<String, byte[]> structure_new;
private final BlockingQueue<leanrefObject> publicRefDNSResolvingQueue; private final BlockingQueue<LearnObject> publicRefDNSResolvingQueue;
private final PublicRefDNSResolvingProcess publicRefDNSResolvingWorker; private final PublicRefDNSResolvingProcess publicRefDNSResolvingWorker;
private final static leanrefObject leanrefObjectPOISON = new leanrefObject(null, null); private final static LearnObject leanrefObjectPOISON = new LearnObject(null, null);
private static class leanrefObject private static class LearnObject {
{
private final DigestURI url; private final DigestURI url;
private final Set<MultiProtocolURI> globalRefURLs; private final Set<MultiProtocolURI> globalRefURLs;
private leanrefObject(final DigestURI url, final Set<MultiProtocolURI> globalRefURLs) { private LearnObject(final DigestURI url, final Set<MultiProtocolURI> globalRefURLs) {
this.url = url; this.url = url;
this.globalRefURLs = globalRefURLs; this.globalRefURLs = globalRefURLs;
} }
@ -95,7 +95,7 @@ public class WebStructureGraph
this.structure_old = new TreeMap<String, byte[]>(); this.structure_old = new TreeMap<String, byte[]>();
this.structure_new = new TreeMap<String, byte[]>(); this.structure_new = new TreeMap<String, byte[]>();
this.structureFile = structureFile; this.structureFile = structureFile;
this.publicRefDNSResolvingQueue = new LinkedBlockingQueue<leanrefObject>(); this.publicRefDNSResolvingQueue = new LinkedBlockingQueue<LearnObject>();
// load web structure // load web structure
Map<String, byte[]> loadedStructureB; Map<String, byte[]> loadedStructureB;
@ -142,7 +142,7 @@ public class WebStructureGraph
@Override @Override
public void run() { public void run() {
leanrefObject lro; LearnObject lro;
try { try {
while ( (lro = WebStructureGraph.this.publicRefDNSResolvingQueue.take()) != leanrefObjectPOISON ) { while ( (lro = WebStructureGraph.this.publicRefDNSResolvingQueue.take()) != leanrefObjectPOISON ) {
learnrefs(lro); learnrefs(lro);
@ -170,7 +170,7 @@ public class WebStructureGraph
globalRefURLs.add(u); globalRefURLs.add(u);
} }
} }
final leanrefObject lro = new leanrefObject(url, globalRefURLs); final LearnObject lro = new LearnObject(url, globalRefURLs);
if ( !globalRefURLs.isEmpty() ) { if ( !globalRefURLs.isEmpty() ) {
try { try {
if ( this.publicRefDNSResolvingWorker.isAlive() ) { if ( this.publicRefDNSResolvingWorker.isAlive() ) {
@ -184,34 +184,6 @@ public class WebStructureGraph
} }
} }
private void learnrefs(final leanrefObject lro) {
final StringBuilder cpg = new StringBuilder(240);
assert cpg.length() % 12 == 0 : "cpg.length() = " + cpg.length() + ", cpg = " + cpg.toString();
//final String refhashp = ASCII.String(lro.url.hash(), 6, 6); // ref hash part
String nexturlhash;
for ( final MultiProtocolURI u : lro.globalRefURLs ) {
if (Switchboard.getSwitchboard().shallTerminate()) break;
final byte[] nexturlhashb = new DigestURI(u).hash();
assert nexturlhashb != null;
if ( nexturlhashb != null ) {
nexturlhash = ASCII.String(nexturlhashb);
assert nexturlhash.length() == 12 : "nexturlhash.length() = "
+ nexturlhash.length()
+ ", nexturlhash = "
+ nexturlhash;
//assert !nexturlhash.substring(6).equals(refhashp);
// this is a global link
cpg.append(nexturlhash); // store complete hash
assert cpg.length() % 12 == 0 : "cpg.length() = "
+ cpg.length()
+ ", cpg = "
+ cpg.toString();
}
}
assert cpg.length() % 12 == 0 : "cpg.length() = " + cpg.length() + ", cpg = " + cpg.toString();
learn(lro.url, cpg);
}
private static int refstr2count(final String refs) { private static int refstr2count(final String refs) {
if ( (refs == null) || (refs.length() <= 8) ) { if ( (refs == null) || (refs.length() <= 8) ) {
return 0; return 0;
@ -220,7 +192,7 @@ public class WebStructureGraph
return (refs.length() - 8) / 10; return (refs.length() - 8) / 10;
} }
static Map<String, Integer> refstr2map(final String refs) { private static Map<String, Integer> refstr2map(final String refs) {
if ( (refs == null) || (refs.length() <= 8) ) { if ( (refs == null) || (refs.length() <= 8) ) {
return new HashMap<String, Integer>(); return new HashMap<String, Integer>();
} }
@ -240,8 +212,12 @@ public class WebStructureGraph
return map; return map;
} }
private static String none2refstr() {
return GenericFormatter.SHORT_DAY_FORMATTER.format();
}
private static String map2refstr(final Map<String, Integer> map) { private static String map2refstr(final Map<String, Integer> map) {
final StringBuilder s = new StringBuilder(map.size() * 10); final StringBuilder s = new StringBuilder(GenericFormatter.PATTERN_SHORT_DAY.length() + map.size() * 10);
s.append(GenericFormatter.SHORT_DAY_FORMATTER.format()); s.append(GenericFormatter.SHORT_DAY_FORMATTER.format());
String h; String h;
for ( final Map.Entry<String, Integer> entry : map.entrySet() ) { for ( final Map.Entry<String, Integer> entry : map.entrySet() ) {
@ -265,6 +241,31 @@ public class WebStructureGraph
return s.toString(); return s.toString();
} }
public boolean exists(final String hosthash) {
// returns a map with a hosthash(String):refcount(Integer) relation
assert hosthash.length() == 6;
SortedMap<String, byte[]> tailMap;
synchronized ( this.structure_old ) {
tailMap = this.structure_old.tailMap(hosthash);
if ( !tailMap.isEmpty() ) {
final String key = tailMap.firstKey();
if ( key.startsWith(hosthash) ) {
return true;
}
}
}
synchronized ( this.structure_new ) {
tailMap = this.structure_new.tailMap(hosthash);
if ( !tailMap.isEmpty() ) {
final String key = tailMap.firstKey();
if ( key.startsWith(hosthash) ) {
return true;
}
}
}
return false;
}
public StructureEntry outgoingReferences(final String hosthash) { public StructureEntry outgoingReferences(final String hosthash) {
// returns a map with a hosthash(String):refcount(Integer) relation // returns a map with a hosthash(String):refcount(Integer) relation
assert hosthash.length() == 6; assert hosthash.length() == 6;
@ -279,7 +280,7 @@ public class WebStructureGraph
final String key = tailMap.firstKey(); final String key = tailMap.firstKey();
if ( key.startsWith(hosthash) ) { if ( key.startsWith(hosthash) ) {
hostname = key.substring(7); hostname = key.substring(7);
ref = UTF8.String(tailMap.get(key)); ref = ASCII.String(tailMap.get(key));
date = ref.substring(0, 8); date = ref.substring(0, 8);
h = refstr2map(ref); h = refstr2map(ref);
} }
@ -290,7 +291,7 @@ public class WebStructureGraph
if ( !tailMap.isEmpty() ) { if ( !tailMap.isEmpty() ) {
final String key = tailMap.firstKey(); final String key = tailMap.firstKey();
if ( key.startsWith(hosthash) ) { if ( key.startsWith(hosthash) ) {
ref = UTF8.String(tailMap.get(key)); ref = ASCII.String(tailMap.get(key));
if ( hostname.isEmpty() ) { if ( hostname.isEmpty() ) {
hostname = key.substring(7); hostname = key.substring(7);
} }
@ -371,7 +372,7 @@ public class WebStructureGraph
private final Row.Entry entry; private final Row.Entry entry;
public HostReference(final byte[] hostHash, final long modified, final int count) { private HostReference(final byte[] hostHash, final long modified, final int count) {
assert (hostHash.length == 6) : "hostHash = " + ASCII.String(hostHash); assert (hostHash.length == 6) : "hostHash = " + ASCII.String(hostHash);
this.entry = hostReferenceFactory.getRow().newEntry(); this.entry = hostReferenceFactory.getRow().newEntry();
this.entry.setCol(0, hostHash); this.entry.setCol(0, hostHash);
@ -383,7 +384,7 @@ public class WebStructureGraph
this.entry = hostReferenceFactory.getRow().newEntry(json, true); this.entry = hostReferenceFactory.getRow().newEntry(json, true);
} }
public HostReference(final Row.Entry entry) { private HostReference(final Row.Entry entry) {
this.entry = entry; this.entry = entry;
} }
@ -402,7 +403,7 @@ public class WebStructureGraph
return this.entry.getPrimaryKeyBytes(); return this.entry.getPrimaryKeyBytes();
} }
public int count() { private int count() {
return (int) this.entry.getColLong(2); return (int) this.entry.getColLong(2);
} }
@ -436,9 +437,9 @@ public class WebStructureGraph
} }
public static final HostReferenceFactory hostReferenceFactory = new HostReferenceFactory(); public static final HostReferenceFactory hostReferenceFactory = new HostReferenceFactory();
public static ReferenceContainerCache<HostReference> hostReferenceIndexCache = null; private static ReferenceContainerCache<HostReference> hostReferenceIndexCache = null;
public static long hostReferenceIndexCacheTime = 0; private static long hostReferenceIndexCacheTime = 0;
public static final long hostReferenceIndexCacheTTL = 1000 * 60 * 60 * 12; // 12 hours time to live for cache private static final long hostReferenceIndexCacheTTL = 1000 * 60 * 60 * 12; // 12 hours time to live for cache
public synchronized ReferenceContainerCache<HostReference> incomingReferences() { public synchronized ReferenceContainerCache<HostReference> incomingReferences() {
// we return a cache if the cache is filled and not stale // we return a cache if the cache is filled and not stale
@ -508,22 +509,6 @@ public class WebStructureGraph
} }
} }
/*
private void incomingReferencesTest(ReferenceContainerCache<HostReference> idx) {
for (ReferenceContainer<HostReference> references: idx) {
log.logInfo("Term-Host: " + hostHash2hostName(UTF8.String(references.getTermHash())));
Iterator<HostReference> referenceIterator = references.entries();
StringBuilder s = new StringBuilder();
HostReference reference;
while (referenceIterator.hasNext()) {
reference = referenceIterator.next();
s.append(reference.toPropertyForm());
log.logInfo(" ... referenced by " + hostHash2hostName(UTF8.String(reference.metadataHash())) + ", " + reference.count() + " references");
}
}
}
*/
public int referencesCount(final String hosthash) { public int referencesCount(final String hosthash) {
// returns the number of hosts that are referenced by this hosthash // returns the number of hosts that are referenced by this hosthash
assert hosthash.length() == 6 : "hosthash = " + hosthash; assert hosthash.length() == 6 : "hosthash = " + hosthash;
@ -578,21 +563,31 @@ public class WebStructureGraph
return null; return null;
} }
private void learn(final DigestURI url, final StringBuilder reference /*string of b64(12digits)-hashes*/) {
final String hosthash = ASCII.String(url.hash(), 6, 6); private void learnrefs(final LearnObject lro) {
final Set<String> refhosts = new HashSet<String>();
DigestURI du;
String hosthash;
for ( final MultiProtocolURI u : lro.globalRefURLs ) {
if (Switchboard.getSwitchboard().shallTerminate()) break;
du = DigestURI.toDigestURI(u);
hosthash = ASCII.String(du.hash(), 6, 12);
if (!exists(hosthash)) {
// this must be recorded as an host with no references
synchronized ( this.structure_new ) {
this.structure_new.put(hosthash + "," + u.getHost(), UTF8.getBytes(none2refstr()));
}
}
refhosts.add(hosthash);
}
final DigestURI url = lro.url;
hosthash = ASCII.String(url.hash(), 6, 6);
// parse the new reference string and join it with the stored references // parse the new reference string and join it with the stored references
final StructureEntry structure = outgoingReferences(hosthash); final StructureEntry structure = outgoingReferences(hosthash);
final Map<String, Integer> refs = final Map<String, Integer> refs = (structure == null) ? new HashMap<String, Integer>() : structure.references;
(structure == null) ? new HashMap<String, Integer>() : structure.references;
assert reference.length() % 12 == 0 : "reference.length() = "
+ reference.length()
+ ", reference = "
+ reference.toString();
String dom;
int c; int c;
for ( int i = 0; i < reference.length() / 12; i++ ) { for (String dom: refhosts) {
dom = reference.substring(i * 12 + 6, (i + 1) * 12);
c = 0; c = 0;
if ( refs.containsKey(dom) ) { if ( refs.containsKey(dom) ) {
c = (refs.get(dom)).intValue(); c = (refs.get(dom)).intValue();
@ -682,13 +677,26 @@ public class WebStructureGraph
return maxhost; return maxhost;
} }
public ReversibleScoreMap<String> hostReferenceScore() {
ReversibleScoreMap<String> result = new ClusteredScoreMap<String>(ASCII.identityASCIIComparator);
synchronized ( this.structure_old ) {
for ( final Map.Entry<String, byte[]> entry : this.structure_old.entrySet() ) {
result.set(entry.getKey().substring(7), (entry.getValue().length - 8) / 10);
}
}
synchronized ( this.structure_new ) {
for ( final Map.Entry<String, byte[]> entry : this.structure_new.entrySet() ) {
result.set(entry.getKey().substring(7), (entry.getValue().length - 8) / 10);
}
}
return result;
}
public Iterator<StructureEntry> structureEntryIterator(final boolean latest) { public Iterator<StructureEntry> structureEntryIterator(final boolean latest) {
return new StructureIterator(latest); return new StructureIterator(latest);
} }
private class StructureIterator extends LookAheadIterator<StructureEntry> implements private class StructureIterator extends LookAheadIterator<StructureEntry> implements Iterator<StructureEntry> {
Iterator<StructureEntry>
{
private final Iterator<Map.Entry<String, byte[]>> i; private final Iterator<Map.Entry<String, byte[]>> i;
@ -727,13 +735,17 @@ public class WebStructureGraph
} }
} }
public static class StructureEntry public static class StructureEntry implements Comparable<StructureEntry> {
{
public String hosthash; // the tail of the host hash public String hosthash; // the tail of the host hash
public String hostname; // the host name public String hostname; // the host name
public String date; // date of latest change public String date; // date of latest change
public Map<String, Integer> references; // a map from the referenced host hash to the number of referenced to that host public Map<String, Integer> references; // a map from the referenced host hash to the number of referenced to that host
private StructureEntry(final String hosthash, final String hostname) {
this(hosthash, hostname, GenericFormatter.SHORT_DAY_FORMATTER.format(), new HashMap<String, Integer>());
}
private StructureEntry( private StructureEntry(
final String hosthash, final String hosthash,
final String hostname, final String hostname,
@ -744,6 +756,22 @@ public class WebStructureGraph
this.date = date; this.date = date;
this.references = references; this.references = references;
} }
@Override
public int compareTo(StructureEntry arg0) {
return hosthash.compareTo(arg0.hosthash);
}
@Override
public boolean equals(Object o) {
if (!(o instanceof StructureEntry)) return false;
return hosthash.equals(((StructureEntry)o).hosthash);
}
        /**
         * Hash code derived solely from the host hash, matching the
         * host-hash-only definition of equals().
         */
        @Override
        public int hashCode() {
            return this.hosthash.hashCode();
        }
} }
public synchronized void close() { public synchronized void close() {

@ -2552,7 +2552,7 @@ public final class Switchboard extends serverSwitch
// CREATE INDEX // CREATE INDEX
final String dc_title = document.dc_title(); final String dc_title = document.dc_title();
final DigestURI url = new DigestURI(document.dc_source()); final DigestURI url = DigestURI.toDigestURI(document.dc_source());
final DigestURI referrerURL = queueEntry.referrerURL(); final DigestURI referrerURL = queueEntry.referrerURL();
EventOrigin processCase = queueEntry.processCase(this.peers.mySeed().hash); EventOrigin processCase = queueEntry.processCase(this.peers.mySeed().hash);
@ -2620,7 +2620,7 @@ public final class Switchboard extends serverSwitch
rssRow.put("title", UTF8.getBytes(rssEntry.getValue())); rssRow.put("title", UTF8.getBytes(rssEntry.getValue()));
rssRow.put("recording_date", new Date()); rssRow.put("recording_date", new Date());
try { try {
this.tables.update("rss", new DigestURI(rssEntry.getKey()).hash(), rssRow); this.tables.update("rss", DigestURI.toDigestURI(rssEntry.getKey()).hash(), rssRow);
} catch ( final IOException e ) { } catch ( final IOException e ) {
Log.logException(e); Log.logException(e);
} }
@ -3180,7 +3180,7 @@ public final class Switchboard extends serverSwitch
final Iterator<MultiProtocolURI> i = links.keySet().iterator(); final Iterator<MultiProtocolURI> i = links.keySet().iterator();
final boolean globalcrawljob = sb.getConfigBool("heuristic.searchresults.crawlglobal",false); final boolean globalcrawljob = sb.getConfigBool("heuristic.searchresults.crawlglobal",false);
while (i.hasNext()) { while (i.hasNext()) {
url = new DigestURI(i.next()); url = DigestURI.toDigestURI(i.next());
boolean islocal = url.getHost().contentEquals(startUrl.getHost()); boolean islocal = url.getHost().contentEquals(startUrl.getHost());
// add all external links or links to different page to crawler // add all external links or links to different page to crawler
if ( !islocal ) {// || (!startUrl.getPath().endsWith(url.getPath()))) { if ( !islocal ) {// || (!startUrl.getPath().endsWith(url.getPath()))) {

@ -266,7 +266,7 @@ public class Segment {
final long urldate = urlModified.getTime(); final long urldate = urlModified.getTime();
for (Map.Entry<MultiProtocolURI, Properties> anchorEntry: anchors.entrySet()) { for (Map.Entry<MultiProtocolURI, Properties> anchorEntry: anchors.entrySet()) {
MultiProtocolURI anchor = anchorEntry.getKey(); MultiProtocolURI anchor = anchorEntry.getKey();
byte[] refhash = new DigestURI(anchor).hash(); byte[] refhash = DigestURI.toDigestURI(anchor).hash();
//System.out.println("*** addCitationIndex: urlhash = " + ASCII.String(urlhash) + ", refhash = " + ASCII.String(refhash) + ", urldate = " + urlModified.toString()); //System.out.println("*** addCitationIndex: urlhash = " + ASCII.String(urlhash) + ", refhash = " + ASCII.String(refhash) + ", urldate = " + urlModified.toString());
if (this.urlCitationIndex != null) try { if (this.urlCitationIndex != null) try {
this.urlCitationIndex.add(refhash, new CitationReference(urlhash, urldate)); this.urlCitationIndex.add(refhash, new CitationReference(urlhash, urldate));

@ -213,7 +213,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
} }
final SolrInputDocument doc = new SolrInputDocument(); final SolrInputDocument doc = new SolrInputDocument();
final DigestURI digestURI = new DigestURI(md.url()); final DigestURI digestURI = DigestURI.toDigestURI(md.url());
boolean allAttr = this.isEmpty(); boolean allAttr = this.isEmpty();
if (allAttr || contains(YaCySchema.failreason_t)) add(doc, YaCySchema.failreason_t, ""); if (allAttr || contains(YaCySchema.failreason_t)) add(doc, YaCySchema.failreason_t, "");
@ -341,7 +341,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
public SolrInputDocument yacy2solr(final String id, final CrawlProfile profile, final ResponseHeader header, final Document yacydoc, Condenser condenser, final URIMetadata metadata) { public SolrInputDocument yacy2solr(final String id, final CrawlProfile profile, final ResponseHeader header, final Document yacydoc, Condenser condenser, final URIMetadata metadata) {
// we use the SolrCell design as index scheme // we use the SolrCell design as index scheme
final SolrInputDocument doc = new SolrInputDocument(); final SolrInputDocument doc = new SolrInputDocument();
final DigestURI digestURI = new DigestURI(yacydoc.dc_source()); final DigestURI digestURI = DigestURI.toDigestURI(yacydoc.dc_source());
boolean allAttr = this.isEmpty(); boolean allAttr = this.isEmpty();
add(doc, YaCySchema.id, id); add(doc, YaCySchema.id, id);
if (allAttr || contains(YaCySchema.failreason_t)) add(doc, YaCySchema.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before) if (allAttr || contains(YaCySchema.failreason_t)) add(doc, YaCySchema.failreason_t, ""); // overwrite a possible fail reason (in case that there was a fail reason before)

@ -177,7 +177,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
final List<MediaSnippet> result = new ArrayList<MediaSnippet>(); final List<MediaSnippet> result = new ArrayList<MediaSnippet>();
while (i.hasNext()) { while (i.hasNext()) {
entry = i.next(); entry = i.next();
url = new DigestURI(entry.getKey()); url = DigestURI.toDigestURI(entry.getKey());
desc = entry.getValue(); desc = entry.getValue();
if (isUrlBlacklisted(BlacklistType.SEARCH, url)) continue; if (isUrlBlacklisted(BlacklistType.SEARCH, url)) continue;
final int ranking = removeAppearanceHashes(url.toNormalform(true), queryhashes).size() + final int ranking = removeAppearanceHashes(url.toNormalform(true), queryhashes).size() +
@ -202,7 +202,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
final List<MediaSnippet> result = new ArrayList<MediaSnippet>(); final List<MediaSnippet> result = new ArrayList<MediaSnippet>();
while (i.hasNext()) { while (i.hasNext()) {
ientry = i.next(); ientry = i.next();
url = new DigestURI(ientry.url()); url = DigestURI.toDigestURI(ientry.url());
final String u = url.toString(); final String u = url.toString();
if (isUrlBlacklisted(BlacklistType.SEARCH, url)) continue; if (isUrlBlacklisted(BlacklistType.SEARCH, url)) continue;
if (u.indexOf(".ico",0) >= 0 || u.indexOf("favicon",0) >= 0) continue; if (u.indexOf(".ico",0) >= 0 || u.indexOf("favicon",0) >= 0) continue;

@ -308,7 +308,7 @@ public final class HTTPDProxyHandler {
DigestURI url = null; DigestURI url = null;
try { try {
url = new DigestURI(HeaderFramework.getRequestURL(conProp)); url = DigestURI.toDigestURI(HeaderFramework.getRequestURL(conProp));
if (log.isFine()) log.logFine(reqID +" GET "+ url); if (log.isFine()) log.logFine(reqID +" GET "+ url);
if (log.isFinest()) log.logFinest(reqID +" header: "+ requestHeader); if (log.isFinest()) log.logFinest(reqID +" header: "+ requestHeader);
@ -391,7 +391,7 @@ public final class HTTPDProxyHandler {
final Request request = new Request( final Request request = new Request(
null, null,
url, url,
requestHeader.referer() == null ? null : new DigestURI(requestHeader.referer()).hash(), requestHeader.referer() == null ? null : DigestURI.toDigestURI(requestHeader.referer()).hash(),
"", "",
cachedResponseHeader.lastModified(), cachedResponseHeader.lastModified(),
sb.crawler.defaultProxyProfile.handle(), sb.crawler.defaultProxyProfile.handle(),
@ -527,7 +527,7 @@ public final class HTTPDProxyHandler {
final Request request = new Request( final Request request = new Request(
null, null,
url, url,
requestHeader.referer() == null ? null : new DigestURI(requestHeader.referer()).hash(), requestHeader.referer() == null ? null : DigestURI.toDigestURI(requestHeader.referer()).hash(),
"", "",
responseHeader.lastModified(), responseHeader.lastModified(),
sb.crawler.defaultProxyProfile.handle(), sb.crawler.defaultProxyProfile.handle(),

Loading…
Cancel
Save