From 0e8d7524625b34774b35b686023f0c6dd8cff65f Mon Sep 17 00:00:00 2001
From: orbiter
Date: Tue, 24 Sep 2013 19:55:59 +0200
Subject: [PATCH] refactoring

---
 source/net/yacy/crawler/Balancer.java     | 38 ++---------------------
 source/net/yacy/crawler/data/Latency.java | 32 +++++++++++++++++++
 2 files changed, 34 insertions(+), 36 deletions(-)

diff --git a/source/net/yacy/crawler/Balancer.java b/source/net/yacy/crawler/Balancer.java
index 67cbabeb5..ce48c96e6 100644
--- a/source/net/yacy/crawler/Balancer.java
+++ b/source/net/yacy/crawler/Balancer.java
@@ -42,8 +42,6 @@ import org.openjena.atlas.logging.Log;
 
 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.encoding.UTF8;
-import net.yacy.cora.document.id.DigestURL;
-import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.order.Base64Order;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.Domains;
@@ -51,7 +49,6 @@ import net.yacy.cora.sorting.OrderedScoreMap;
 import net.yacy.cora.storage.HandleSet;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.cora.util.SpaceExceededException;
-import net.yacy.crawler.data.Cache;
 import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.crawler.data.Latency;
 import net.yacy.crawler.retrieval.Request;
@@ -293,37 +290,6 @@ public class Balancer {
         return map;
     }
 
-    /**
-     * Get the minimum sleep time for a given url. The result can also be negative to reflect the time since the last access
-     * The time can be as low as Integer.MIN_VALUE to show that there should not be any limitation at all.
-     * @param robots
-     * @param profileEntry
-     * @param crawlURL
-     * @return the sleep time in milliseconds; may be negative for no sleep time
-     */
-    private long getDomainSleepTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURL crawlURL) {
-        if (profileEntry == null) return 0;
-        long sleeptime = (
-            profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
-            (profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlURL.hash()))
-            ) ? Integer.MIN_VALUE : Latency.waitingRemaining(crawlURL, robots, profileEntry.getAgent()); // this uses the robots.txt database and may cause a loading of robots.txt from the server
-        return sleeptime;
-    }
-
-    /**
-     * load a robots.txt to get the robots time.
-     * ATTENTION: this method causes that a robots.txt is loaded from the web which may cause a longer delay in execution.
-     * This shall therefore not be called in synchronized environments.
-     * @param robots
-     * @param profileEntry
-     * @param crawlURL
-     * @return
-     */
-    private long getRobotsTime(final RobotsTxt robots, final DigestURL crawlURL, ClientIdentification.Agent agent) {
-        long sleeptime = Latency.waitingRobots(crawlURL, robots, agent); // this uses the robots.txt database and may cause a loading of robots.txt from the server
-        return sleeptime < 0 ? 0 : sleeptime;
-    }
-
     /**
      * get lists of crawl request entries for a specific host
      * @param host
@@ -434,7 +400,7 @@ public class Balancer {
                     continue;
                 }
                 // depending on the caching policy we need sleep time to avoid DoS-like situations
-                sleeptime = getDomainSleepTime(robots, profileEntry, crawlEntry.url());
+                sleeptime = Latency.getDomainSleepTime(robots, profileEntry, crawlEntry.url());
 
                 assert Base64Order.enhancedCoder.equal(nexthash, rowEntry.getPrimaryKeyBytes()) : "result = " + ASCII.String(nexthash) + ", rowEntry.getPrimaryKeyBytes() = " + ASCII.String(rowEntry.getPrimaryKeyBytes());
                 assert Base64Order.enhancedCoder.equal(nexthash, crawlEntry.url().hash()) : "result = " + ASCII.String(nexthash) + ", crawlEntry.url().hash() = " + ASCII.String(crawlEntry.url().hash());
@@ -445,7 +411,7 @@
         }
         if (crawlEntry == null) return null;
         ClientIdentification.Agent agent = profileEntry == null ? ClientIdentification.yacyInternetCrawlerAgent : profileEntry.getAgent();
-        long robotsTime = getRobotsTime(robots, crawlEntry.url(), agent);
+        long robotsTime = Latency.getRobotsTime(robots, crawlEntry.url(), agent);
         Latency.updateAfterSelection(crawlEntry.url(), profileEntry == null ? 0 : robotsTime);
         if (delay && sleeptime > 0) {
             // force a busy waiting here
diff --git a/source/net/yacy/crawler/data/Latency.java b/source/net/yacy/crawler/data/Latency.java
index 34bd026eb..2e74dabb7 100644
--- a/source/net/yacy/crawler/data/Latency.java
+++ b/source/net/yacy/crawler/data/Latency.java
@@ -31,6 +31,7 @@ import java.util.concurrent.atomic.AtomicLong;
 
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
+import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.crawler.robots.RobotsTxt;
 import net.yacy.crawler.robots.RobotsTxtEntry;
@@ -262,6 +263,37 @@ public class Latency {
         return s.toString();
     }
 
+    /**
+     * Get the minimum sleep time for a given URL. The result can also be negative to reflect the time since the last access.
+     * The time can be as low as Integer.MIN_VALUE to show that there should not be any limitation at all.
+     * @param robots
+     * @param profileEntry
+     * @param crawlURL
+     * @return the sleep time in milliseconds; may be negative for no sleep time
+     */
+    public static long getDomainSleepTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURL crawlURL) {
+        if (profileEntry == null) return 0;
+        long sleeptime = (
+            profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
+            (profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlURL.hash()))
+            ) ? Integer.MIN_VALUE : waitingRemaining(crawlURL, robots, profileEntry.getAgent()); // this uses the robots.txt database and may cause robots.txt to be loaded from the server
+        return sleeptime;
+    }
+
+    /**
+     * Load robots.txt to get the robots wait time.
+     * ATTENTION: this method may load a robots.txt file from the web, which can cause a longer delay in execution.
+     * It shall therefore not be called in synchronized environments.
+     * @param robots
+     * @param crawlURL
+     * @param agent
+     * @return the robots wait time in milliseconds; never negative
+     */
+    public static long getRobotsTime(final RobotsTxt robots, final DigestURL crawlURL, ClientIdentification.Agent agent) {
+        long sleeptime = waitingRobots(crawlURL, robots, agent); // this uses the robots.txt database and may cause robots.txt to be loaded from the server
+        return sleeptime < 0 ? 0 : sleeptime;
+    }
+
     public static final class Host {
         private AtomicLong timeacc;
         private AtomicLong lastacc;
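
Note (not part of the patch): a minimal sketch of how a caller such as Balancer uses the two relocated helpers after this refactoring. The wrapper class LatencySketch and its method name are illustrative assumptions; the Latency calls themselves mirror the call sites in the hunks above.

import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.Latency;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.robots.RobotsTxt;

// Illustrative caller; not part of the YaCy source tree.
public final class LatencySketch {

    // Returns the number of milliseconds the caller should still wait
    // before fetching the given crawl request.
    public static long sleepTimeFor(final RobotsTxt robots, final CrawlProfile profileEntry, final Request crawlEntry) {
        // May be negative (no wait needed) or Integer.MIN_VALUE (no limitation
        // at all, e.g. when the cache strategy avoids any remote access).
        final long sleeptime = Latency.getDomainSleepTime(robots, profileEntry, crawlEntry.url());

        // Fall back to the internal crawler agent when no profile is available.
        final ClientIdentification.Agent agent = profileEntry == null
                ? ClientIdentification.yacyInternetCrawlerAgent
                : profileEntry.getAgent();

        // ATTENTION: may load robots.txt from the web, so this must not be
        // called from synchronized code; the result is never negative.
        final long robotsTime = Latency.getRobotsTime(robots, crawlEntry.url(), agent);

        // Record the selection so later latency calculations see this access.
        Latency.updateAfterSelection(crawlEntry.url(), profileEntry == null ? 0 : robotsTime);

        return sleeptime;
    }
}

Moving this logic into Latency also removes Balancer's compile-time dependencies on Cache, CacheStrategy and DigestURL, which is why the corresponding imports are deleted in the first two hunks.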