From 0e8d7524625b34774b35b686023f0c6dd8cff65f Mon Sep 17 00:00:00 2001
From: orbiter
Date: Tue, 24 Sep 2013 19:55:59 +0200
Subject: [PATCH] refactoring

---
 source/net/yacy/crawler/Balancer.java     | 38 ++---------------------
 source/net/yacy/crawler/data/Latency.java | 32 +++++++++++++++++++
 2 files changed, 34 insertions(+), 36 deletions(-)

diff --git a/source/net/yacy/crawler/Balancer.java b/source/net/yacy/crawler/Balancer.java
index 67cbabeb5..ce48c96e6 100644
--- a/source/net/yacy/crawler/Balancer.java
+++ b/source/net/yacy/crawler/Balancer.java
@@ -42,8 +42,6 @@ import org.openjena.atlas.logging.Log;
 
 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.encoding.UTF8;
-import net.yacy.cora.document.id.DigestURL;
-import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.order.Base64Order;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.Domains;
@@ -51,7 +49,6 @@ import net.yacy.cora.sorting.OrderedScoreMap;
 import net.yacy.cora.storage.HandleSet;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.cora.util.SpaceExceededException;
-import net.yacy.crawler.data.Cache;
 import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.crawler.data.Latency;
 import net.yacy.crawler.retrieval.Request;
@@ -293,37 +290,6 @@ public class Balancer {
         return map;
     }
 
-    /**
-     * Get the minimum sleep time for a given url. The result can also be negative to reflect the time since the last access
-     * The time can be as low as Integer.MIN_VALUE to show that there should not be any limitation at all.
-     * @param robots
-     * @param profileEntry
-     * @param crawlURL
-     * @return the sleep time in milliseconds; may be negative for no sleep time
-     */
-    private long getDomainSleepTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURL crawlURL) {
-        if (profileEntry == null) return 0;
-        long sleeptime = (
-            profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
-            (profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlURL.hash()))
-            ) ? Integer.MIN_VALUE : Latency.waitingRemaining(crawlURL, robots, profileEntry.getAgent()); // this uses the robots.txt database and may cause a loading of robots.txt from the server
-        return sleeptime;
-    }
-
-    /**
-     * load a robots.txt to get the robots time.
-     * ATTENTION: this method causes that a robots.txt is loaded from the web which may cause a longer delay in execution.
-     * This shall therefore not be called in synchronized environments.
-     * @param robots
-     * @param profileEntry
-     * @param crawlURL
-     * @return
-     */
-    private long getRobotsTime(final RobotsTxt robots, final DigestURL crawlURL, ClientIdentification.Agent agent) {
-        long sleeptime = Latency.waitingRobots(crawlURL, robots, agent); // this uses the robots.txt database and may cause a loading of robots.txt from the server
-        return sleeptime < 0 ? 0 : sleeptime;
-    }
-
     /**
      * get lists of crawl request entries for a specific host
      * @param host
@@ -434,7 +400,7 @@ public class Balancer {
                     continue;
                 }
                 // depending on the caching policy we need sleep time to avoid DoS-like situations
-                sleeptime = getDomainSleepTime(robots, profileEntry, crawlEntry.url());
+                sleeptime = Latency.getDomainSleepTime(robots, profileEntry, crawlEntry.url());
 
                 assert Base64Order.enhancedCoder.equal(nexthash, rowEntry.getPrimaryKeyBytes()) : "result = " + ASCII.String(nexthash) + ", rowEntry.getPrimaryKeyBytes() = " + ASCII.String(rowEntry.getPrimaryKeyBytes());
                 assert Base64Order.enhancedCoder.equal(nexthash, crawlEntry.url().hash()) : "result = " + ASCII.String(nexthash) + ", crawlEntry.url().hash() = " + ASCII.String(crawlEntry.url().hash());
@@ -445,7 +411,7 @@
         }
         if (crawlEntry == null) return null;
         ClientIdentification.Agent agent = profileEntry == null ? ClientIdentification.yacyInternetCrawlerAgent : profileEntry.getAgent();
-        long robotsTime = getRobotsTime(robots, crawlEntry.url(), agent);
+        long robotsTime = Latency.getRobotsTime(robots, crawlEntry.url(), agent);
         Latency.updateAfterSelection(crawlEntry.url(), profileEntry == null ? 0 : robotsTime);
         if (delay && sleeptime > 0) {
             // force a busy waiting here
diff --git a/source/net/yacy/crawler/data/Latency.java b/source/net/yacy/crawler/data/Latency.java
index 34bd026eb..2e74dabb7 100644
--- a/source/net/yacy/crawler/data/Latency.java
+++ b/source/net/yacy/crawler/data/Latency.java
@@ -31,6 +31,7 @@ import java.util.concurrent.atomic.AtomicLong;
 
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
+import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.crawler.robots.RobotsTxt;
 import net.yacy.crawler.robots.RobotsTxtEntry;
@@ -262,6 +263,37 @@ public class Latency {
         return s.toString();
     }
 
+    /**
+     * Get the minimum sleep time for a given URL. The result can also be negative to reflect the time since the last access.
+     * The time can be as low as Integer.MIN_VALUE to show that there should not be any limitation at all.
+     * @param robots
+     * @param profileEntry
+     * @param crawlURL
+     * @return the sleep time in milliseconds; may be negative for no sleep time
+     */
+    public static long getDomainSleepTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURL crawlURL) {
+        if (profileEntry == null) return 0;
+        long sleeptime = (
+            profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
+            (profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlURL.hash()))
+            ) ? Integer.MIN_VALUE : waitingRemaining(crawlURL, robots, profileEntry.getAgent()); // this uses the robots.txt database and may cause robots.txt to be loaded from the server
+        return sleeptime;
+    }
+
+    /**
+     * Load robots.txt to get the robots wait time.
+     * ATTENTION: this method may load a robots.txt file from the web, which can cause a longer delay in execution.
+     * It shall therefore not be called in synchronized environments.
+     * @param robots
+     * @param crawlURL
+     * @param agent
+     * @return the robots wait time in milliseconds; never negative
+     */
+    public static long getRobotsTime(final RobotsTxt robots, final DigestURL crawlURL, ClientIdentification.Agent agent) {
+        long sleeptime = waitingRobots(crawlURL, robots, agent); // this uses the robots.txt database and may cause robots.txt to be loaded from the server
+        return sleeptime < 0 ? 0 : sleeptime;
+    }
+
     public static final class Host {
         private AtomicLong timeacc;
         private AtomicLong lastacc;
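
Note (not part of the patch): a minimal sketch of how a caller such as Balancer uses the two relocated helpers after this refactoring. The wrapper class LatencySketch and its method name are illustrative assumptions; the Latency calls themselves mirror the call sites in the hunks above.

import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.Latency;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.robots.RobotsTxt;

// Illustrative caller; not part of the YaCy source tree.
public final class LatencySketch {

    // Returns the number of milliseconds the caller should still wait
    // before fetching the given crawl request.
    public static long sleepTimeFor(final RobotsTxt robots, final CrawlProfile profileEntry, final Request crawlEntry) {
        // May be negative (no wait needed) or Integer.MIN_VALUE (no limitation
        // at all, e.g. when the cache strategy avoids any remote access).
        final long sleeptime = Latency.getDomainSleepTime(robots, profileEntry, crawlEntry.url());

        // Fall back to the internal crawler agent when no profile is available.
        final ClientIdentification.Agent agent = profileEntry == null
                ? ClientIdentification.yacyInternetCrawlerAgent
                : profileEntry.getAgent();

        // ATTENTION: may load robots.txt from the web, so this must not be
        // called from synchronized code; the result is never negative.
        final long robotsTime = Latency.getRobotsTime(robots, crawlEntry.url(), agent);

        // Record the selection so later latency calculations see this access.
        Latency.updateAfterSelection(crawlEntry.url(), profileEntry == null ? 0 : robotsTime);

        return sleeptime;
    }
}

Moving this logic into Latency also removes Balancer's compile-time dependencies on Cache, CacheStrategy and DigestURL, which is why the corresponding imports are deleted in the first two hunks.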