enhanced data structures for balancer and latency computation which

should produce a bit better prognosis about forced waiting times.
pull/1/head
Michael Peter Christen 13 years ago
parent ac9540dfb6
commit 0fe8be7981

@ -121,7 +121,7 @@ public class IndexCreateQueues_p {
prop.put("crawler_embed_deletepattern", deletepattern); prop.put("crawler_embed_deletepattern", deletepattern);
prop.put("crawler_embed_queuename", stackType.name()); prop.put("crawler_embed_queuename", stackType.name());
final Map<String, Integer[]> hosts = sb.crawlQueues.noticeURL.getDomainStackHosts(stackType); final Map<String, Integer[]> hosts = sb.crawlQueues.noticeURL.getDomainStackHosts(stackType, sb.robots);
int hc = 0; int hc = 0;
for (Map.Entry<String, Integer[]> host: hosts.entrySet()) { for (Map.Entry<String, Integer[]> host: hosts.entrySet()) {

@ -76,13 +76,22 @@ public class Balancer {
private BufferedObjectIndex urlFileIndex; private BufferedObjectIndex urlFileIndex;
// class variables computed during operation // class variables computed during operation
private final ConcurrentMap<String, HandleSet> domainStacks; // a map from host name to lists with url hashs private final ConcurrentMap<String, HostHandles> domainStacks; // a map from host name to lists with url hashs
private final HandleSet double_push_check; // for debugging private final HandleSet double_push_check; // for debugging
private long lastDomainStackFill; private long lastDomainStackFill;
private int domStackInitSize; private int domStackInitSize;
private final List<Map.Entry<String, byte[]>> zeroWaitingCandidates; private final List<Map.Entry<String, byte[]>> zeroWaitingCandidates;
private final Random random; // used to alternate between choose-from-maxstack or choose from any zero-waiting private final Random random; // used to alternate between choose-from-maxstack or choose from any zero-waiting
/**
 * Immutable pairing of a host hash with the set of url hash handles that
 * are currently stacked for that host. Instances are the values of the
 * domainStacks map, which is keyed by the host name.
 *
 * Both references are fixed at construction time; the handle set itself
 * remains mutable and is modified in place by the balancer.
 */
private static class HostHandles {

    /** the host hash that is handed to the latency computation for this host */
    public final String hosthash;

    /** the url hash handles queued for this host */
    public final HandleSet handleSet;

    /**
     * @param hosthash the hash of the host this stack belongs to
     * @param handleSet the set of url hash handles stacked for the host
     */
    public HostHandles(final String hosthash, final HandleSet handleSet) {
        this.hosthash = hosthash;
        this.handleSet = handleSet;
    }
}
public Balancer( public Balancer(
final File cachePath, final File cachePath,
final String stackname, final String stackname,
@ -92,7 +101,7 @@ public class Balancer {
final boolean useTailCache, final boolean useTailCache,
final boolean exceed134217727) { final boolean exceed134217727) {
this.cacheStacksPath = cachePath; this.cacheStacksPath = cachePath;
this.domainStacks = new ConcurrentHashMap<String, HandleSet>(); this.domainStacks = new ConcurrentHashMap<String, HostHandles>();
this.minimumLocalDelta = minimumLocalDelta; this.minimumLocalDelta = minimumLocalDelta;
this.minimumGlobalDelta = minimumGlobalDelta; this.minimumGlobalDelta = minimumGlobalDelta;
this.myAgentIDs = myAgentIDs; this.myAgentIDs = myAgentIDs;
@ -204,10 +213,10 @@ public class Balancer {
assert this.urlFileIndex.size() + removedCounter == s : "urlFileIndex.size() = " + this.urlFileIndex.size() + ", s = " + s; assert this.urlFileIndex.size() + removedCounter == s : "urlFileIndex.size() = " + this.urlFileIndex.size() + ", s = " + s;
// iterate through the domain stacks // iterate through the domain stacks
final Iterator<Map.Entry<String, HandleSet>> q = this.domainStacks.entrySet().iterator(); final Iterator<Map.Entry<String, HostHandles>> q = this.domainStacks.entrySet().iterator();
HandleSet stack; HandleSet stack;
while (q.hasNext()) { while (q.hasNext()) {
stack = q.next().getValue(); stack = q.next().getValue().handleSet;
for (final byte[] handle: urlHashes) stack.remove(handle); for (final byte[] handle: urlHashes) stack.remove(handle);
if (stack.isEmpty()) q.remove(); if (stack.isEmpty()) q.remove();
} }
@ -242,8 +251,8 @@ public class Balancer {
private boolean domainStacksNotEmpty() { private boolean domainStacksNotEmpty() {
if (this.domainStacks == null) return false; if (this.domainStacks == null) return false;
synchronized (this.domainStacks) { synchronized (this.domainStacks) {
for (final HandleSet l: this.domainStacks.values()) { for (final HostHandles l: this.domainStacks.values()) {
if (!l.isEmpty()) return true; if (!l.handleSet.isEmpty()) return true;
} }
} }
return false; return false;
@ -285,11 +294,11 @@ public class Balancer {
* get a list of domains that are currently maintained as domain stacks * get a list of domains that are currently maintained as domain stacks
* @return a map of clear text strings of host names to an integer array: {the size of the domain stack, guessed delta waiting time} * @return a map of clear text strings of host names to an integer array: {the size of the domain stack, guessed delta waiting time}
*/ */
public Map<String, Integer[]> getDomainStackHosts() { public Map<String, Integer[]> getDomainStackHosts(RobotsTxt robots) {
Map<String, Integer[]> map = new TreeMap<String, Integer[]>(); // we use a tree map to get a stable ordering Map<String, Integer[]> map = new TreeMap<String, Integer[]>(); // we use a tree map to get a stable ordering
for (Map.Entry<String, HandleSet> entry: this.domainStacks.entrySet()) { for (Map.Entry<String, HostHandles> entry: this.domainStacks.entrySet()) {
int size = entry.getValue().size(); int size = entry.getValue().handleSet.size();
int delta = Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta); int delta = Latency.waitingRemainingGuessed(entry.getKey(), entry.getValue().hosthash, robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta);
map.put(entry.getKey(), new Integer[]{size, delta}); map.put(entry.getKey(), new Integer[]{size, delta});
} }
return map; return map;
@ -333,8 +342,10 @@ public class Balancer {
* @return a list of crawl loader requests * @return a list of crawl loader requests
*/ */
public List<Request> getDomainStackReferences(String host, int maxcount) { public List<Request> getDomainStackReferences(String host, int maxcount) {
HandleSet domainList = this.domainStacks.get(host); HostHandles hh = this.domainStacks.get(host);
if (domainList == null || domainList.isEmpty()) return new ArrayList<Request>(0); if (hh == null) return new ArrayList<Request>(0);
HandleSet domainList = hh.handleSet;
if (domainList.isEmpty()) return new ArrayList<Request>(0);
ArrayList<Request> cel = new ArrayList<Request>(maxcount); ArrayList<Request> cel = new ArrayList<Request>(maxcount);
for (int i = 0; i < maxcount; i++) { for (int i = 0; i < maxcount; i++) {
if (domainList.size() <= i) break; if (domainList.size() <= i) break;
@ -358,16 +369,17 @@ public class Balancer {
return cel; return cel;
} }
private void pushHashToDomainStacks(String host, final byte[] urlhash) throws SpaceExceededException { private void pushHashToDomainStacks(String host, String hosthash, final byte[] urlhash) throws SpaceExceededException {
// extend domain stack // extend domain stack
if (host == null) host = Domains.LOCALHOST; if (host == null) host = Domains.LOCALHOST;
HandleSet domainList = this.domainStacks.get(host); HostHandles hh = this.domainStacks.get(host);
if (domainList == null) { if (hh == null) {
// create new list // create new list
domainList = new RowHandleSet(12, Base64Order.enhancedCoder, 1); HandleSet domainList = new RowHandleSet(12, Base64Order.enhancedCoder, 1);
domainList.put(urlhash); domainList.put(urlhash);
this.domainStacks.put(host, domainList); this.domainStacks.put(host, new HostHandles(hosthash, domainList));
} else { } else {
HandleSet domainList = hh.handleSet;
// extend existent domain list // extend existent domain list
domainList.put(urlhash); domainList.put(urlhash);
} }
@ -376,11 +388,12 @@ public class Balancer {
private void removeHashFromDomainStacks(String host, final byte[] urlhash) { private void removeHashFromDomainStacks(String host, final byte[] urlhash) {
// reduce domain stack // reduce domain stack
if (host == null) host = Domains.LOCALHOST; if (host == null) host = Domains.LOCALHOST;
final HandleSet domainList = this.domainStacks.get(host); HostHandles hh = this.domainStacks.get(host);
if (domainList == null) { if (hh == null) {
this.domainStacks.remove(host); this.domainStacks.remove(host);
return; return;
} }
HandleSet domainList = hh.handleSet;
domainList.remove(urlhash); domainList.remove(urlhash);
if (domainList.isEmpty()) this.domainStacks.remove(host); if (domainList.isEmpty()) this.domainStacks.remove(host);
} }
@ -495,26 +508,24 @@ public class Balancer {
} }
// iterate over the domain stacks // iterate over the domain stacks
final Iterator<Map.Entry<String, HandleSet>> i = this.domainStacks.entrySet().iterator(); final Iterator<Map.Entry<String, HostHandles>> i = this.domainStacks.entrySet().iterator();
Map.Entry<String, HandleSet> entry; Map.Entry<String, HostHandles> entry;
long smallestWaiting = Long.MAX_VALUE; OrderedScoreMap<Map.Entry<String, byte[]>> nextZeroCandidates = new OrderedScoreMap<Map.Entry<String, byte[]>>(null);
byte[] besturlhash = null; OrderedScoreMap<Map.Entry<String, byte[]>> failoverCandidates = new OrderedScoreMap<Map.Entry<String, byte[]>>(null);
String besthost = null; int newCandidatesForward = 1;
OrderedScoreMap<Map.Entry<String, byte[]>> nextZeroCandidates = new OrderedScoreMap<Map.Entry<String, byte[]>>(null);
int newCandidatesForward = 10;
while (i.hasNext() && nextZeroCandidates.size() < 1000) { while (i.hasNext() && nextZeroCandidates.size() < 1000) {
entry = i.next(); entry = i.next();
// clean up empty entries // clean up empty entries
if (entry.getValue().isEmpty()) { if (entry.getValue().handleSet.isEmpty()) {
i.remove(); i.remove();
continue; continue;
} }
final byte[] urlhash = entry.getValue().getOne(0); final byte[] urlhash = entry.getValue().handleSet.getOne(0);
if (urlhash == null) continue; if (urlhash == null) continue;
long w; int w;
Row.Entry rowEntry; Row.Entry rowEntry;
try { try {
rowEntry = this.urlFileIndex.get(urlhash, false); rowEntry = this.urlFileIndex.get(urlhash, false);
@ -526,50 +537,55 @@ public class Balancer {
//System.out.println("*** waitingRemaining = " + w + ", guessed = " + Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta)); //System.out.println("*** waitingRemaining = " + w + ", guessed = " + Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta));
//System.out.println("*** explained: " + Latency.waitingRemainingExplain(crawlEntry.url(), robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta)); //System.out.println("*** explained: " + Latency.waitingRemainingExplain(crawlEntry.url(), robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta));
} catch (IOException e1) { } catch (IOException e1) {
w = Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta); w = Latency.waitingRemainingGuessed(entry.getKey(), entry.getValue().hosthash, robots, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta);
} }
if (w <= 0) { if (w <= 0) {
if (w == Integer.MIN_VALUE && newCandidatesForward > 0) { if (w == Integer.MIN_VALUE) {
// give new domains a chance, but not too much; otherwise a massive downloading of robots.txt from too much domains (dns lock!) will more likely block crawling if (newCandidatesForward-- > 0) {
newCandidatesForward--; nextZeroCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(entry.getKey(), urlhash), 10000);
nextZeroCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(entry.getKey(), urlhash), 1000); } else {
failoverCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(entry.getKey(), urlhash), 0);
}
} else { } else {
nextZeroCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(entry.getKey(), urlhash), entry.getValue().size()); nextZeroCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(entry.getKey(), urlhash), entry.getValue().handleSet.size());
} }
} } else {
if (w < smallestWaiting || (w == smallestWaiting && this.random.nextBoolean())) { failoverCandidates.set(new AbstractMap.SimpleEntry<String, byte[]>(entry.getKey(), urlhash), w);
smallestWaiting = w;
besturlhash = urlhash;
besthost = entry.getKey();
} }
} }
Log.logInfo("Balancer", "*** getbest: created new nextZeroCandidates-list, size = " + nextZeroCandidates.size() + ", domainStacks.size = " + this.domainStacks.size()); Log.logInfo("Balancer", "*** getbest: created new nextZeroCandidates-list, size = " + nextZeroCandidates.size() + ", domainStacks.size = " + this.domainStacks.size());
if (besturlhash == null) { if (!nextZeroCandidates.isEmpty()) {
Log.logInfo("Balancer", "*** getbest: besturlhash == null"); // take some of the nextZeroCandidates and put the best into the zeroWaitingCandidates
return null; // this should never happen int pick = nextZeroCandidates.size() <= 10 ? nextZeroCandidates.size() : Math.max(1, nextZeroCandidates.size() / 3);
} Iterator<Map.Entry<String, byte[]>> k = nextZeroCandidates.keys(false);
while (k.hasNext() && pick-- > 0) {
// best case would be, if we have some zeroWaitingCandidates, this.zeroWaitingCandidates.add(k.next());
// then we select that one with the largest stack }
Log.logInfo("Balancer", "*** getbest: created new zeroWaitingCandidates-list, size = " + zeroWaitingCandidates.size() + ", domainStacks.size = " + this.domainStacks.size());
if (nextZeroCandidates.isEmpty()) { return pickFromZeroWaiting();
// bad luck: just take that one with least waiting
removeHashFromDomainStacks(besthost, besturlhash);
Log.logInfo("Balancer", "*** getbest: no zero waiting candidates, besthost = " + besthost);
return besturlhash;
} }
// now take some of the nextZeroCandidates and put the best into the zeroWaitingCandidates if (!failoverCandidates.isEmpty()) {
int pick = nextZeroCandidates.size() <= 10 ? nextZeroCandidates.size() : Math.max(1, nextZeroCandidates.size() / 3); // bad luck: just take that one with least waiting
Iterator<Map.Entry<String, byte[]>> k = nextZeroCandidates.keys(false); Iterator<Map.Entry<String, byte[]>> k = failoverCandidates.keys(true);
while (k.hasNext() && pick-- > 0) { String besthost;
this.zeroWaitingCandidates.add(k.next()); byte[] besturlhash;
} Map.Entry<String, byte[]> hosthash;
Log.logInfo("Balancer", "*** getbest: created new zeroWaitingCandidates-list, size = " + zeroWaitingCandidates.size() + ", domainStacks.size = " + this.domainStacks.size()); while (k.hasNext()) {
hosthash = k.next();
besthost = hosthash.getKey();
besturlhash = hosthash.getValue();
removeHashFromDomainStacks(besthost, besturlhash);
Log.logInfo("Balancer", "*** getbest: no zero waiting candidates, besthost = " + besthost);
return besturlhash;
}
}
return pickFromZeroWaiting(); Log.logInfo("Balancer", "*** getbest: besturlhash == null");
return null; // this should never happen
} }
} }
@ -579,8 +595,8 @@ public class Balancer {
byte[] hash = null; byte[] hash = null;
while (this.zeroWaitingCandidates.size() > 0) { while (this.zeroWaitingCandidates.size() > 0) {
Map.Entry<String, byte[]> z = this.zeroWaitingCandidates.remove(this.random.nextInt(this.zeroWaitingCandidates.size())); Map.Entry<String, byte[]> z = this.zeroWaitingCandidates.remove(this.random.nextInt(this.zeroWaitingCandidates.size()));
HandleSet hs = this.domainStacks.get(z.getKey()); HostHandles hh = this.domainStacks.get(z.getKey());
if (hs == null) continue; if (hh == null) continue;
host = z.getKey(); if (host == null) continue; host = z.getKey(); if (host == null) continue;
hash = z.getValue(); if (hash == null) continue; hash = z.getValue(); if (hash == null) continue;
removeHashFromDomainStacks(host, hash); removeHashFromDomainStacks(host, hash);
@ -604,6 +620,7 @@ public class Balancer {
String host; String host;
Request request; Request request;
int count = 0; int count = 0;
long timeout = System.currentTimeMillis() + 5000;
while (i.hasNext()) { while (i.hasNext()) {
handle = i.next(); handle = i.next();
final Row.Entry entry = this.urlFileIndex.get(handle, false); final Row.Entry entry = this.urlFileIndex.get(handle, false);
@ -611,12 +628,12 @@ public class Balancer {
request = new Request(entry); request = new Request(entry);
host = request.url().getHost(); host = request.url().getHost();
try { try {
pushHashToDomainStacks(host, handle); pushHashToDomainStacks(host, request.url().hosthash(), handle);
} catch (final SpaceExceededException e) { } catch (final SpaceExceededException e) {
break; break;
} }
count++; count++;
if (this.domainStacks.size() >= 100 || (!this.domainStacks.isEmpty() && count > 600 * this.domainStacks.size())) break; if (this.domainStacks.size() >= 1000 || count >= 100000 || System.currentTimeMillis() > timeout) break;
} }
Log.logInfo("BALANCER", "re-fill of domain stacks; fileIndex.size() = " + this.urlFileIndex.size() + ", domainStacks.size = " + this.domainStacks.size() + ", collection time = " + (System.currentTimeMillis() - this.lastDomainStackFill) + " ms"); Log.logInfo("BALANCER", "re-fill of domain stacks; fileIndex.size() = " + this.urlFileIndex.size() + ", domainStacks.size = " + this.domainStacks.size() + ", collection time = " + (System.currentTimeMillis() - this.lastDomainStackFill) + " ms");
this.domStackInitSize = this.domainStacks.size(); this.domStackInitSize = this.domainStacks.size();

@ -32,6 +32,7 @@ import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.Domains;
import net.yacy.crawler.robots.RobotsTxt; import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.crawler.robots.RobotsTxtEntry; import net.yacy.crawler.robots.RobotsTxtEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.MemoryControl; import net.yacy.kelondro.util.MemoryControl;
@ -47,14 +48,15 @@ public class Latency {
* @param url * @param url
* @param time the time to load the file in milliseconds * @param time the time to load the file in milliseconds
*/ */
public static void updateAfterLoad(final MultiProtocolURI url, final long time) { public static void updateAfterLoad(final DigestURI url, final long time) {
final String host = url.getHost(); final String host = url.getHost();
if (host == null) return; if (host == null) return;
Host h = map.get(host); String hosthash = url.hosthash();
Host h = map.get(hosthash);
if (h == null) { if (h == null) {
h = new Host(host, time); h = new Host(host, time);
if (map.size() > 1000 || MemoryControl.shortStatus()) map.clear(); if (map.size() > 1000 || MemoryControl.shortStatus()) map.clear();
map.put(host, h); map.put(hosthash, h);
} else { } else {
h.update(time); h.update(time);
} }
@ -65,23 +67,24 @@ public class Latency {
* @param url * @param url
* @param robotsCrawlDelay the crawl-delay given by the robots; 0 if not exist * @param robotsCrawlDelay the crawl-delay given by the robots; 0 if not exist
*/ */
public static void updateAfterSelection(final MultiProtocolURI url, final long robotsCrawlDelay) { public static void updateAfterSelection(final DigestURI url, final long robotsCrawlDelay) {
final String host = url.getHost(); final String host = url.getHost();
if (host == null) return; if (host == null) return;
Host h = map.get(host); String hosthash = url.hosthash();
Host h = map.get(hosthash);
if (h == null) { if (h == null) {
h = new Host(host, DEFAULT_AVERAGE, robotsCrawlDelay); h = new Host(host, DEFAULT_AVERAGE, robotsCrawlDelay);
if (map.size() > 1000 || MemoryControl.shortStatus()) map.clear(); if (map.size() > 1000 || MemoryControl.shortStatus()) map.clear();
map.put(host, h); map.put(hosthash, h);
} else { } else {
h.update(); h.update();
} }
} }
private static Host host(final MultiProtocolURI url) { private static Host host(final DigestURI url) {
final String host = url.getHost(); final String host = url.getHost();
if (host == null) return null; if (host == null) return null;
return map.get(host); return map.get(url.hosthash());
} }
public static Iterator<Map.Entry<String, Host>> iterator() { public static Iterator<Map.Entry<String, Host>> iterator() {
@ -105,21 +108,31 @@ public class Latency {
return robotsDelay; return robotsDelay;
} }
/**
 * Look up the crawl delay that the robots.txt of a host demands.
 *
 * @param hostport the host and port string used as key for the robots lookup
 * @param robots the robots.txt handler that is asked for the entry
 * @param thisAgents the agent names of this peer
 * @param fetchOnlineIfNotAvailableOrNotFresh passed through to the robots lookup
 * @return 0 if no robots entry was returned, -1 if the entry has a zero
 *         crawl delay and names an agent (no limits if granted exclusively
 *         for this peer), otherwise the crawl delay in milliseconds
 */
private static int waitingRobots(final String hostport, final RobotsTxt robots, final Set<String> thisAgents, final boolean fetchOnlineIfNotAvailableOrNotFresh) {
    final RobotsTxtEntry entry = robots.getEntry(hostport, thisAgents, fetchOnlineIfNotAvailableOrNotFresh);
    if (entry == null) {
        // nothing known about this host: no delay
        return 0;
    }
    final int crawlDelay = entry.getCrawlDelayMillis();
    if (crawlDelay == 0 && entry.getAgentName() != null) {
        // no limits if granted exclusively for this peer
        return -1;
    }
    return crawlDelay;
}
/** /**
* guess a minimum waiting time * guess a minimum waiting time
* the time is not correct, because if the domain was not checked yet by the robots.txt delay value, it is too low * the time is not correct, because if the domain was not checked yet by the robots.txt delay value, it is too low
* also the 'isCGI' property is missing, because the full text of the domain is unknown here * also the 'isCGI' property is missing, because the full text of the domain is unknown here
* @param hostname * @param hostname
* @param hosthash
* @param robots
* @param thisAgents
* @param minimumLocalDelta * @param minimumLocalDelta
* @param minimumGlobalDelta * @param minimumGlobalDelta
* @return the remaining waiting time in milliseconds. The return value may be negative * @return the remaining waiting time in milliseconds. The return value may be negative
* which expresses how long the time is over the minimum waiting time. * which expresses how long the time is over the minimum waiting time.
*/ */
public static int waitingRemainingGuessed(final String hostname, final int minimumLocalDelta, final int minimumGlobalDelta) { public static int waitingRemainingGuessed(final String hostname, final String hosthash, final RobotsTxt robots, final Set<String> thisAgents, final int minimumLocalDelta, final int minimumGlobalDelta) {
if (hostname == null) return Integer.MIN_VALUE;
// first check if the domain was _ever_ accessed before // first check if the domain was _ever_ accessed before
final Host host = map.get(hostname); final Host host = map.get(hosthash);
if (host == null) return Integer.MIN_VALUE; // no delay if host is new; use Integer because there is a cast to int somewhere if (host == null) return Integer.MIN_VALUE; // no delay if host is new; use Integer because there is a cast to int somewhere
// find the minimum waiting time based on the network domain (local or global) // find the minimum waiting time based on the network domain (local or global)
@ -128,14 +141,21 @@ public class Latency {
// if we have accessed the domain many times, get slower (the flux factor) // if we have accessed the domain many times, get slower (the flux factor)
waiting += host.flux(waiting); waiting += host.flux(waiting);
// the time since last access to the domain is the basis of the remaining calculation
final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc());
// use the access latency as rule how fast we can access the server // use the access latency as rule how fast we can access the server
// this applies also to localhost, but differently, because it is not necessary to // this applies also to localhost, but differently, because it is not necessary to
// consider so many external accesses // consider so many external accesses
waiting = Math.max(waiting, host.average() * 3 / 2); waiting = Math.max(waiting, host.average() * 3 / 2);
// the time since last access to the domain is the basis of the remaining calculation
final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc());
// find the delay as given by robots.txt on target site
if (robots != null) {
int robotsDelay = waitingRobots(hostname + ":80", robots, thisAgents, false);
if (robotsDelay < 0) return -timeSinceLastAccess; // no limits if granted exclusively for this peer
waiting = Math.max(waiting, robotsDelay);
}
return Math.min(60000, waiting) - timeSinceLastAccess; return Math.min(60000, waiting) - timeSinceLastAccess;
} }
@ -151,7 +171,7 @@ public class Latency {
* @param minimumGlobalDelta * @param minimumGlobalDelta
* @return the remaining waiting time in milliseconds. can be negative to reflect the due-time after a possible nex loading time * @return the remaining waiting time in milliseconds. can be negative to reflect the due-time after a possible nex loading time
*/ */
public static int waitingRemaining(final MultiProtocolURI url, final RobotsTxt robots, final Set<String> thisAgents, final int minimumLocalDelta, final int minimumGlobalDelta) { public static int waitingRemaining(final DigestURI url, final RobotsTxt robots, final Set<String> thisAgents, final int minimumLocalDelta, final int minimumGlobalDelta) {
// first check if the domain was _ever_ accessed before // first check if the domain was _ever_ accessed before
final Host host = host(url); final Host host = host(url);
@ -184,8 +204,7 @@ public class Latency {
return Math.min(60000, waiting) - timeSinceLastAccess; return Math.min(60000, waiting) - timeSinceLastAccess;
} }
public static String waitingRemainingExplain(final DigestURI url, final RobotsTxt robots, final Set<String> thisAgents, final int minimumLocalDelta, final int minimumGlobalDelta) {
public static String waitingRemainingExplain(final MultiProtocolURI url, final RobotsTxt robots, final Set<String> thisAgents, final int minimumLocalDelta, final int minimumGlobalDelta) {
// first check if the domain was _ever_ accessed before // first check if the domain was _ever_ accessed before
final Host host = host(url); final Host host = host(url);

@ -232,12 +232,12 @@ public class NoticedURL {
* get a list of domains that are currently maintained as domain stacks * get a list of domains that are currently maintained as domain stacks
* @return a map of clear text strings of host names to two integers: the size of the domain stacks and the access delta time * @return a map of clear text strings of host names to two integers: the size of the domain stacks and the access delta time
*/ */
public Map<String, Integer[]> getDomainStackHosts(final StackType stackType) { public Map<String, Integer[]> getDomainStackHosts(final StackType stackType, RobotsTxt robots) {
switch (stackType) { switch (stackType) {
case LOCAL: return this.coreStack.getDomainStackHosts(); case LOCAL: return this.coreStack.getDomainStackHosts(robots);
case GLOBAL: return this.limitStack.getDomainStackHosts(); case GLOBAL: return this.limitStack.getDomainStackHosts(robots);
case REMOTE: return this.remoteStack.getDomainStackHosts(); case REMOTE: return this.remoteStack.getDomainStackHosts(robots);
case NOLOAD: return this.noloadStack.getDomainStackHosts(); case NOLOAD: return this.noloadStack.getDomainStackHosts(robots);
default: return null; default: return null;
} }
} }

@ -93,12 +93,11 @@ public class RobotsTxt {
public RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents) { public RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents) {
if (theURL == null) throw new IllegalArgumentException(); if (theURL == null) throw new IllegalArgumentException();
if (!theURL.getProtocol().startsWith("http")) return null; if (!theURL.getProtocol().startsWith("http")) return null;
return getEntry(theURL, thisAgents, true); return getEntry(getHostPort(theURL), thisAgents, true);
} }
private RobotsTxtEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents, final boolean fetchOnlineIfNotAvailableOrNotFresh) { public RobotsTxtEntry getEntry(final String urlHostPort, final Set<String> thisAgents, final boolean fetchOnlineIfNotAvailableOrNotFresh) {
// this method will always return a non-null value // this method will always return a non-null value
final String urlHostPort = getHostPort(theURL);
RobotsTxtEntry robotsTxt4Host = null; RobotsTxtEntry robotsTxt4Host = null;
Map<String, byte[]> record; Map<String, byte[]> record;
BEncodedHeap robotsTable = null; BEncodedHeap robotsTable = null;

@ -159,7 +159,7 @@ public class YMarkCrawlStart extends HashMap<String,String>{
} }
} }
public static String crawlStart( protected static String crawlStart(
final Switchboard sb, final Switchboard sb,
final DigestURI startURL, final DigestURI startURL,
final String urlMustMatch, final String urlMustMatch,

@ -293,7 +293,7 @@ public class RemoteSearch extends Thread {
} }
} }
}; };
solr.start(); if (targetPeer == null) solr.run(); else solr.start();
return solr; return solr;
} }

Loading…
Cancel
Save