From 3e742d1e34b0b2eea1a92e3ca3ed8589b55ecc0d Mon Sep 17 00:00:00 2001
From: reger
Date: Sat, 23 May 2015 02:06:39 +0200
Subject: [PATCH] Init remote crawler on demand

If the remote crawl option is not activated, skip the init of the
remoteCrawlJob to save the resources of its queue and idling thread.
Deployment of the remoteCrawlJob is deferred until the option is
activated. (A short illustrative sketch of this on-demand pattern is
appended after the patch.)
---
 htroot/ConfigNetwork_p.java | 4 +-
 htroot/RemoteCrawl_p.java | 4 +-
 htroot/Status.java | 2 +-
 htroot/api/status_p.java | 2 +-
 htroot/yacy/crawlReceipt.java | 8 +-
 htroot/yacy/urls.java | 2 +-
 source/net/yacy/crawler/Balancer.java | 4 +
 source/net/yacy/crawler/HostBalancer.java | 11 +-
 source/net/yacy/crawler/HostQueue.java | 10 ++
 source/net/yacy/crawler/LegacyBalancer.java | 10 ++
 source/net/yacy/crawler/data/CrawlQueues.java | 44 ++++---
 source/net/yacy/crawler/data/NoticedURL.java | 78 ++++++++----
 source/net/yacy/search/Switchboard.java | 113 +++++++++++-------
 .../net/yacy/search/SwitchboardConstants.java | 1 +
 14 files changed, 199 insertions(+), 94 deletions(-)

diff --git a/htroot/ConfigNetwork_p.java b/htroot/ConfigNetwork_p.java
index 341e1a02d..554878943 100644
--- a/htroot/ConfigNetwork_p.java
+++ b/htroot/ConfigNetwork_p.java
@@ -143,7 +143,7 @@ public class ConfigNetwork_p
 prop.put("commit", commit);
 // write remote crawl request settings
- prop.put("crawlResponse", sb.getConfigBool("crawlResponse", false) ? "1" : "0");
+ prop.put("crawlResponse", sb.getConfigBool(SwitchboardConstants.CRAWLJOB_REMOTE, false) ? "1" : "0");
 final long RTCbusySleep = Math .max(1, env.getConfigInt(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, 100));
@@ -166,7 +166,7 @@ public class ConfigNetwork_p
 prop.put("indexReceiveSearchChecked", indexReceiveSearch);
 // set seed information directly
- sb.peers.mySeed().setFlagAcceptRemoteCrawl(sb.getConfigBool("crawlResponse", false));
+ sb.peers.mySeed().setFlagAcceptRemoteCrawl(sb.getConfigBool(SwitchboardConstants.CRAWLJOB_REMOTE, false));
 sb.peers.mySeed().setFlagAcceptRemoteIndex(indexReceive);
 // set p2p/robinson mode flags and values
diff --git a/htroot/RemoteCrawl_p.java b/htroot/RemoteCrawl_p.java
index bbdb9f0d7..1117d47cf 100644
--- a/htroot/RemoteCrawl_p.java
+++ b/htroot/RemoteCrawl_p.java
@@ -56,7 +56,7 @@ public class RemoteCrawl_p {
 boolean crawlResponse = post.get("crawlResponse", "off").equals("on");
 // read remote crawl request settings
- sb.setConfig("crawlResponse", crawlResponse);
+ sb.initRemoteCrawler(crawlResponse);
 }
 if (post.containsKey("acceptCrawlLimit")) {
@@ -70,7 +70,7 @@ public class RemoteCrawl_p {
 }
 // set seed information directly
- sb.peers.mySeed().setFlagAcceptRemoteCrawl(sb.getConfigBool("crawlResponse", false));
+ sb.peers.mySeed().setFlagAcceptRemoteCrawl(sb.getConfigBool(SwitchboardConstants.CRAWLJOB_REMOTE, false));
 // write remote crawl request settings
 prop.put("disabled", !sb.peers.mySeed().isActive() && !sb.peers.mySeed().getFlagAcceptRemoteCrawl() ? 1 : 0);
diff --git a/htroot/Status.java b/htroot/Status.java
index 5278c0ded..3e44717fb 100644
--- a/htroot/Status.java
+++ b/htroot/Status.java
@@ -356,7 +356,7 @@ public class Status
 prop.putNum( "remoteTriggeredCrawlQueueSize",
- sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL).getJobCount());
+ sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL) != null ? sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL).getJobCount() : 0);
 prop.put( "remoteTriggeredCrawlPaused", sb.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL) ? "1" : "0");
"1" : "0"); diff --git a/htroot/api/status_p.java b/htroot/api/status_p.java index a46021f3b..919a4ae12 100644 --- a/htroot/api/status_p.java +++ b/htroot/api/status_p.java @@ -105,7 +105,7 @@ public class status_p { prop.put("limitCrawlState", STATE_RUNNING); //remote crawl queue - prop.putNum("remoteCrawlSize", sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL).getJobCount()); + prop.putNum("remoteCrawlSize", sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL) != null ? sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL).getJobCount() : 0); prop.put("remoteCrawlState", sb.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL) ? STATE_PAUSED : STATE_RUNNING); //noload crawl queue diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java index cdf5fc0a2..553b6bf6a 100644 --- a/htroot/yacy/crawlReceipt.java +++ b/htroot/yacy/crawlReceipt.java @@ -143,7 +143,7 @@ public final class crawlReceipt { return prop; } - if ("fill".equals(result)) try { + if ("fill".equals(result) && sb.crawlQueues.delegatedURL != null) try { // put new entry into database sb.index.fulltext().putMetadata(entry); ResultURLs.stack(ASCII.String(entry.url().hash()), entry.url().getHost(), youare.getBytes(), iam.getBytes(), EventOrigin.REMOTE_RECEIPTS); @@ -159,8 +159,10 @@ public final class crawlReceipt { return prop; } - sb.crawlQueues.delegatedURL.remove(entry.hash()); // the delegated work is transformed into an error case - sb.crawlQueues.errorURL.push(entry.url(), 997, null, FailCategory.FINAL_LOAD_CONTEXT, result + ":" + reason, -1); + if (sb.crawlQueues.delegatedURL != null) { // the delegated work is transformed into an error case + sb.crawlQueues.delegatedURL.remove(entry.hash()); + sb.crawlQueues.errorURL.push(entry.url(), 997, null, FailCategory.FINAL_LOAD_CONTEXT, result + ":" + reason, -1); + } //switchboard.noticeURL.remove(receivedUrlhash); prop.put("delay", "3600"); return prop; diff --git a/htroot/yacy/urls.java b/htroot/yacy/urls.java index 02232c845..6fd5b48a9 100644 --- a/htroot/yacy/urls.java +++ b/htroot/yacy/urls.java @@ -83,7 +83,7 @@ public class urls { } // place url to notice-url db - sb.crawlQueues.delegatedURL.put(ASCII.String(entry.url().hash()), entry.url()); + if (sb.crawlQueues.delegatedURL != null) sb.crawlQueues.delegatedURL.put(ASCII.String(entry.url().hash()), entry.url()); // create RSS entry prop.put("item_" + c + "_title", ""); diff --git a/source/net/yacy/crawler/Balancer.java b/source/net/yacy/crawler/Balancer.java index 35b70aecb..164a26c79 100644 --- a/source/net/yacy/crawler/Balancer.java +++ b/source/net/yacy/crawler/Balancer.java @@ -21,6 +21,7 @@ package net.yacy.crawler; +import java.io.File; import java.io.IOException; import java.util.Iterator; import java.util.List; @@ -90,6 +91,9 @@ public interface Balancer { */ public int size(); + public int getOnDemandLimit(); + + public boolean getExceed134217727(); /** * check if stack is empty * @return true iff size() == 0 diff --git a/source/net/yacy/crawler/HostBalancer.java b/source/net/yacy/crawler/HostBalancer.java index f3421ddb6..cbd65141e 100644 --- a/source/net/yacy/crawler/HostBalancer.java +++ b/source/net/yacy/crawler/HostBalancer.java @@ -202,7 +202,16 @@ public class HostBalancer implements Balancer { } return true; } - + + @Override + public int getOnDemandLimit() { + return this.onDemandLimit; + } + + @Override + public boolean getExceed134217727() { + return this.exceed134217727; + } /** * push a request to one of the host queues. 
 If the queue does not exist, it is created
 * @param entry
diff --git a/source/net/yacy/crawler/HostQueue.java b/source/net/yacy/crawler/HostQueue.java
index d106d91e3..6157efb80 100644
--- a/source/net/yacy/crawler/HostQueue.java
+++ b/source/net/yacy/crawler/HostQueue.java
@@ -544,4 +544,14 @@ public class HostQueue implements Balancer {
 return cel; }
+ @Override
+ public int getOnDemandLimit() {
+ throw new UnsupportedOperationException("Not supported yet.");
+ }
+
+ @Override
+ public boolean getExceed134217727() {
+ return this.exceed134217727;
+ }
+
 }
diff --git a/source/net/yacy/crawler/LegacyBalancer.java b/source/net/yacy/crawler/LegacyBalancer.java
index 42fab99c4..c679e859c 100644
--- a/source/net/yacy/crawler/LegacyBalancer.java
+++ b/source/net/yacy/crawler/LegacyBalancer.java
@@ -76,6 +76,16 @@ public class LegacyBalancer implements Balancer {
 private final List> zeroWaitingCandidates;
 private final Random random; // used to alternate between choose-from-maxstack or choose from any zero-waiting
+ @Override
+ public int getOnDemandLimit() {
+ throw new UnsupportedOperationException("Not supported yet.");
+ }
+
+ @Override
+ public boolean getExceed134217727() {
+ throw new UnsupportedOperationException("Not supported yet.");
+ }
+
 private static class HostHandles { public String hosthash; public HandleSet handleSet;
diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java
index 68b36c013..454ba15db 100644
--- a/source/net/yacy/crawler/data/CrawlQueues.java
+++ b/source/net/yacy/crawler/data/CrawlQueues.java
@@ -72,7 +72,7 @@ public class CrawlQueues {
 private final Switchboard sb;
 private final Loader[] worker;
 private final ArrayBlockingQueue workerQueue;
- private final ArrayList remoteCrawlProviderHashes;
+ private ArrayList remoteCrawlProviderHashes;
 public NoticedURL noticeURL;
 public ErrorCache errorURL;
@@ -83,7 +83,7 @@ public class CrawlQueues {
 final int maxWorkers = (int) sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10);
 this.worker = new Loader[maxWorkers];
 this.workerQueue = new ArrayBlockingQueue(200);
- this.remoteCrawlProviderHashes = new ArrayList();
+ this.remoteCrawlProviderHashes = null;
 // start crawling management
 log.config("Starting Crawling Management");
@@ -92,10 +92,16 @@ public class CrawlQueues {
 log.config("Opening errorURL..");
 this.errorURL = new ErrorCache(sb.index.fulltext());
 log.config("Opening delegatedURL..");
- this.delegatedURL = new ConcurrentHashMap();
- log.config("Finishted Startup of Crawling Management");
+ this.delegatedURL = null;
+ }
+
+ public void initRemoteCrawlQueues() {
+ if (this.remoteCrawlProviderHashes == null) this.remoteCrawlProviderHashes = new ArrayList();
+ if (this.delegatedURL == null) {
+ this.delegatedURL = new ConcurrentHashMap();
+ log.config("Finished Startup of Crawling Management");
+ }
+ }
-
 /**
 * Relocation is necessary if the user switches the network.
 * Because this object is part of the scheduler we cannot simply close that object and create a new one.
@@ -106,10 +112,10 @@
 // removed pending requests
 this.workerQueue.clear();
 this.errorURL.clearCache();
- this.remoteCrawlProviderHashes.clear();
+ if (this.remoteCrawlProviderHashes != null) this.remoteCrawlProviderHashes.clear();
 this.noticeURL.close();
 this.noticeURL = new NoticedURL(newQueuePath, sb.getConfigInt("crawler.onDemandLimit", 1000), this.sb.exceed134217727);
- this.delegatedURL.clear();
+ if (this.delegatedURL != null) this.delegatedURL.clear();
 }
 public synchronized void close() {
@@ -130,16 +136,16 @@
 } }
 this.noticeURL.close();
- this.delegatedURL.clear();
+ if (this.delegatedURL != null) this.delegatedURL.clear();
 }
 public void clear() {
 // wait for all workers to finish
 this.workerQueue.clear();
 for (final Loader w: this.worker) if (w != null) w.interrupt();
- this.remoteCrawlProviderHashes.clear();
+ if (this.remoteCrawlProviderHashes != null) this.remoteCrawlProviderHashes.clear();
 this.noticeURL.clear();
- this.delegatedURL.clear();
+ if (this.delegatedURL != null) this.delegatedURL.clear();
 }
 /**
@@ -148,7 +154,7 @@
 * @param hash
 * @return if the hash exists, the name of the database is returned, otherwise null is returned */
 public HarvestProcess exists(final byte[] hash) {
- if (this.delegatedURL.containsKey(ASCII.String(hash))) {
+ if (this.delegatedURL != null && this.delegatedURL.containsKey(ASCII.String(hash))) {
 return HarvestProcess.DELEGATED;
 }
 //if (this.noticeURL.existsInStack(hash)) {
@@ -181,7 +187,7 @@
 public void removeURL(final byte[] hash) {
 assert hash != null && hash.length == 12;
 this.noticeURL.removeByURLHash(hash);
- this.delegatedURL.remove(hash);
+ if (this.delegatedURL != null) this.delegatedURL.remove(hash);
 }
 public int removeHosts(final Set hosthashes) {
@@ -194,9 +200,11 @@
 if (urlhash == null || urlhash.length == 0) { return null; }
- DigestURL u = this.delegatedURL.get(ASCII.String(urlhash));
- if (u != null) {
- return u;
+ if (this.delegatedURL != null) {
+ DigestURL u = this.delegatedURL.get(ASCII.String(urlhash));
+ if (u != null) {
+ return u;
+ }
 }
 for (final DigestURL url: activeWorkerEntries().keySet()) {
 if (Base64Order.enhancedCoder.equal(url.hash(), urlhash)) {
@@ -456,7 +464,7 @@
 // check if we have an entry in the provider list, otherwise fill the list
 Seed seed;
- if (this.remoteCrawlProviderHashes.isEmpty()) {
+ if (this.remoteCrawlProviderHashes != null && this.remoteCrawlProviderHashes.isEmpty()) {
 if (this.sb.peers != null && this.sb.peers.sizeConnected() > 0) {
 final Iterator e = DHTSelection.getProvidesRemoteCrawlURLs(this.sb.peers);
 while (e.hasNext()) {
@@ -467,14 +475,14 @@
 } } }
- if (this.remoteCrawlProviderHashes.isEmpty()) {
+ if (this.remoteCrawlProviderHashes == null || this.remoteCrawlProviderHashes.isEmpty()) {
 return false;
 }
 // take one entry from the provider list and load the entries from the remote peer
 seed = null;
 String hash = null;
- while (seed == null && !this.remoteCrawlProviderHashes.isEmpty()) {
+ while (seed == null && (this.remoteCrawlProviderHashes != null && !this.remoteCrawlProviderHashes.isEmpty())) {
 hash = this.remoteCrawlProviderHashes.remove(this.remoteCrawlProviderHashes.size() - 1);
 if (hash == null) {
 continue;
diff --git a/source/net/yacy/crawler/data/NoticedURL.java b/source/net/yacy/crawler/data/NoticedURL.java
index 1c9673acd..baeab151b 100644
--- a/source/net/yacy/crawler/data/NoticedURL.java
+++ b/source/net/yacy/crawler/data/NoticedURL.java
@@ -46,6 +46,7 @@
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.crawler.robots.RobotsTxt;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.index.RowHandleSet;
+import net.yacy.kelondro.util.MemoryControl;
 public class NoticedURL {
@@ -55,8 +56,9 @@ public class NoticedURL {
 private Balancer coreStack; // links found by crawling to depth-1
 private Balancer limitStack; // links found by crawling at target depth
- private Balancer remoteStack; // links from remote crawl orders
+ private Balancer remoteStack; // links from remote crawl orders (init on demand)
 private Balancer noloadStack; // links that are not passed to a loader; the index will be generated from the Request entry
+ private final File cachePath;
 protected NoticedURL( final File cachePath,
@@ -64,16 +66,28 @@ public class NoticedURL {
 final boolean exceed134217727) {
 ConcurrentLog.info("NoticedURL", "START CREATING STACKS at " + cachePath.toString());
 ConcurrentLog.info("NoticedURL", "opening CrawlerCoreStacks..");
+ this.cachePath = cachePath;
 this.coreStack = new HostBalancer(new File(cachePath, "CrawlerCoreStacks"), onDemandLimit, exceed134217727);
 ConcurrentLog.info("NoticedURL", "opening CrawlerLimitStacks..");
 this.limitStack = new HostBalancer(new File(cachePath, "CrawlerLimitStacks"), onDemandLimit, exceed134217727);
- ConcurrentLog.info("NoticedURL", "opening CrawlerRemoteStacks..");
- this.remoteStack = new HostBalancer(new File(cachePath, "CrawlerRemoteStacks"), onDemandLimit, exceed134217727);
+
+ this.remoteStack = null; // init on demand (on first push)
+
 ConcurrentLog.info("NoticedURL", "opening CrawlerNoLoadStacks..");
 this.noloadStack = new HostBalancer(new File(cachePath, "CrawlerNoLoadStacks"), onDemandLimit, exceed134217727);
 ConcurrentLog.info("NoticedURL", "FINISHED CREATING STACKS at " + cachePath.toString());
 }
+ /**
+ * Initialize the remote crawl stack; called internally on the first push to the remoteStack
+ */
+ protected void initRemoteStack() {
+ if (this.remoteStack == null && !MemoryControl.shortStatus()) {
+ ConcurrentLog.info("NoticedURL", "opening CrawlerRemoteStacks..");
+ this.remoteStack = new HostBalancer(new File(this.cachePath, "CrawlerRemoteStacks"), this.coreStack.getOnDemandLimit(), this.coreStack.getExceed134217727());
+ }
+ }
+
 public void clear() {
 ConcurrentLog.info("NoticedURL", "CLEARING ALL STACKS");
 if (this.coreStack != null) this.coreStack.clear();
@@ -113,7 +127,6 @@ public class NoticedURL {
 }
 public int size() {
- // this does not count the overhang stack size
 return ((this.coreStack == null) ? 0 : this.coreStack.size()) + ((this.limitStack == null) ? 0 : this.limitStack.size()) + ((this.remoteStack == null) ?
 0 : this.remoteStack.size()); }
@@ -127,7 +140,7 @@
 public boolean isEmpty() {
 if (!isEmptyLocal()) return false;
- if (!this.remoteStack.isEmpty()) return false;
+ if (this.remoteStack != null && !this.remoteStack.isEmpty()) return false;
 return true; }
@@ -155,8 +168,7 @@
 return this.coreStack.has(urlhashb) || this.limitStack.has(urlhashb) ||
- //overhangStack.has(urlhashb) ||
- this.remoteStack.has(urlhashb) ||
+ (this.remoteStack != null && this.remoteStack.has(urlhashb)) ||
 this.noloadStack.has(urlhashb); }
@@ -169,11 +181,16 @@
 public String push(final StackType stackType, final Request entry, CrawlProfile profile, final RobotsTxt robots) {
 try { switch (stackType) {
- case LOCAL: return this.coreStack.push(entry, profile, robots);
+ case LOCAL: return this.coreStack.push(entry, profile, robots);
 case GLOBAL: return this.limitStack.push(entry, profile, robots);
- case REMOTE: return this.remoteStack.push(entry, profile, robots);
+ case REMOTE: {
+ if (this.remoteStack == null) {
+ this.initRemoteStack();
+ }
+ return (this.remoteStack != null) ? this.remoteStack.push(entry, profile, robots) : "remote crawler stack deactivated";
+ }
 case NOLOAD: return this.noloadStack.push(entry, profile, robots);
- default: return "stack type unknown";
+ default: return "stack type unknown";
 }
 } catch (final Exception er) {
 ConcurrentLog.logException(er);
@@ -186,7 +203,7 @@
 try {if ((entry = this.noloadStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
 try {if ((entry = this.coreStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
 try {if ((entry = this.limitStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
- try {if ((entry = this.remoteStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
+ try {if (this.remoteStack != null && (entry = this.remoteStack.get(urlhash)) != null) return entry;} catch (final IOException e) {}
 return null; }
@@ -204,7 +221,7 @@
 try {ret |= this.noloadStack.remove(urlHashes) > 0;} catch (final IOException e) {}
 try {ret |= this.coreStack.remove(urlHashes) > 0;} catch (final IOException e) {}
 try {ret |= this.limitStack.remove(urlHashes) > 0;} catch (final IOException e) {}
- try {ret |= this.remoteStack.remove(urlHashes) > 0;} catch (final IOException e) {}
+ try {ret |= this.remoteStack != null && this.remoteStack.remove(urlHashes) > 0;} catch (final IOException e) {}
 return ret;
 } catch (final SpaceExceededException e) {
 ConcurrentLog.logException(e);
@@ -217,7 +234,7 @@
 try {removed += this.noloadStack.removeAllByProfileHandle(handle, timeout);} catch (final IOException e) {}
 try {removed += this.coreStack.removeAllByProfileHandle(handle, timeout);} catch (final IOException e) {}
 try {removed += this.limitStack.removeAllByProfileHandle(handle, timeout);} catch (final IOException e) {}
- try {removed += this.remoteStack.removeAllByProfileHandle(handle, timeout);} catch (final IOException e) {}
+ if (this.remoteStack != null) try {removed += this.remoteStack.removeAllByProfileHandle(handle, timeout);} catch (final IOException e) {}
 return removed; }
@@ -226,7 +243,7 @@
 removed += this.noloadStack.removeAllByHostHashes(hosthashes);
 removed += this.coreStack.removeAllByHostHashes(hosthashes);
 removed += this.limitStack.removeAllByHostHashes(hosthashes);
- removed += this.remoteStack.removeAllByHostHashes(hosthashes);
+ if (this.remoteStack != null) removed += this.remoteStack.removeAllByHostHashes(hosthashes);
 return removed; }
@@ -238,7 +255,7 @@
 switch (stackType) {
 case LOCAL: return this.coreStack.getDomainStackHosts(robots);
 case GLOBAL: return this.limitStack.getDomainStackHosts(robots);
- case REMOTE: return this.remoteStack.getDomainStackHosts(robots);
+ case REMOTE: return (this.remoteStack != null) ? this.remoteStack.getDomainStackHosts(robots) : null;
 case NOLOAD: return this.noloadStack.getDomainStackHosts(robots);
 default: return null; }
@@ -254,7 +271,7 @@
 switch (stackType) {
 case LOCAL: return this.coreStack.getDomainStackReferences(host, maxcount, maxtime);
 case GLOBAL: return this.limitStack.getDomainStackReferences(host, maxcount, maxtime);
- case REMOTE: return this.remoteStack.getDomainStackReferences(host, maxcount, maxtime);
+ case REMOTE: return (this.remoteStack != null) ? this.remoteStack.getDomainStackReferences(host, maxcount, maxtime) : null;
 case NOLOAD: return this.noloadStack.getDomainStackReferences(host, maxcount, maxtime);
 default: return null; }
@@ -264,7 +281,7 @@
 switch (stackType) {
 case LOCAL: return pop(this.coreStack, delay, cs, robots);
 case GLOBAL: return pop(this.limitStack, delay, cs, robots);
- case REMOTE: return pop(this.remoteStack, delay, cs, robots);
+ case REMOTE: return (this.remoteStack != null) ? pop(this.remoteStack, delay, cs, robots) : null;
 case NOLOAD: return pop(this.noloadStack, false, cs, robots);
 default: return null; }
@@ -285,14 +302,25 @@
 }
 public void clear(final StackType stackType) {
- ConcurrentLog.info("NoticedURL", "CLEARING STACK " + stackType);
+ ConcurrentLog.info("NoticedURL", "CLEARING STACK " + stackType);
 switch (stackType) {
- case LOCAL: this.coreStack.clear(); break;
- case GLOBAL: this.limitStack.clear(); break;
- case REMOTE: this.remoteStack.clear(); break;
- case NOLOAD: this.noloadStack.clear(); break;
- default: return;
- }
+ case LOCAL:
+ this.coreStack.clear();
+ break;
+ case GLOBAL:
+ this.limitStack.clear();
+ break;
+ case REMOTE:
+ if (this.remoteStack != null) {
+ this.remoteStack.clear();
+ }
+ break;
+ case NOLOAD:
+ this.noloadStack.clear();
+ break;
+ default:
+ return;
+ }
 }
 private static Request pop(final Balancer balancer, final boolean delay, final CrawlSwitchboard cs, final RobotsTxt robots) throws IOException {
@@ -331,7 +359,7 @@
 try {switch (stackType) {
 case LOCAL: return this.coreStack.iterator();
 case GLOBAL: return this.limitStack.iterator();
- case REMOTE: return this.remoteStack.iterator();
+ case REMOTE: return (this.remoteStack != null) ? this.remoteStack.iterator() : null;
 case NOLOAD: return this.noloadStack.iterator();
 default: return null;
 }} catch (final IOException e) {
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 56308ebb2..ea4ee9685 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -1051,32 +1051,9 @@ public final class Switchboard extends serverSwitch {
 20000, 0), 10000);
- deployThread(
- SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL,
- "Remote Crawl Job",
- "thread that performes a single crawl/indexing step triggered by a remote peer",
- "/IndexCreateQueues_p.html?stack=REMOTE",
- new InstantBusyThread(
- this.crawlQueues,
- SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_START,
- SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_JOBCOUNT,
- SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_FREEMEM,
- 0,
- 0),
- 10000);
- deployThread(
- SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER,
- "Remote Crawl URL Loader",
- "thread that loads remote crawl lists from other peers",
- null,
- new InstantBusyThread(
- this.crawlQueues,
- SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_START,
- SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_JOBCOUNT,
- SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_FREEMEM,
- 10000,
- 10000),
- 10000); // error here?
+
+ this.initRemoteCrawler(this.getConfigBool(SwitchboardConstants.CRAWLJOB_REMOTE, false));
+
 deployThread(
 SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL,
 "Local Crawl",
@@ -1472,21 +1449,77 @@
 // propagate to crawler
 final BusyThread rct = getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
 setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, newBusySleep);
- setConfig(
- SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_IDLESLEEP,
- Math.min(10000, newBusySleep * 10));
- rct.setBusySleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, 1000));
- rct
- .setIdleSleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_IDLESLEEP, 10000));
+ setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_IDLESLEEP,
+ Math.min(10000, newBusySleep * 10));
+ if (rct != null) {
+ rct.setBusySleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, 1000));
+ rct.setIdleSleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_IDLESLEEP, 10000));
+ }
 // propagate to loader
 final BusyThread rcl = getThread(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER);
 setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_BUSYSLEEP, newBusySleep * 4);
- setConfig(
- SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP,
- Math.min(10000, newBusySleep * 20));
- rcl.setBusySleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_BUSYSLEEP, 1000));
- rcl.setIdleSleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP, 10000));
+ setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP,
+ Math.min(10000, newBusySleep * 20));
+ if (rcl != null) {
+ rcl.setBusySleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_BUSYSLEEP, 1000));
+ rcl.setIdleSleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP, 10000));
+ }
 }
+
+ /**
+ * Initialize and apply all settings to enable remote crawls
+ * (if remote crawl is not in use, save the resources)
+ * @param activate true=enable, false=disable
+ */
+ public void initRemoteCrawler(final boolean activate) {
+
+ this.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE, activate);
+ this.peers.mySeed().setFlagAcceptRemoteCrawl(activate);
+ if (activate) {
+ this.crawlQueues.initRemoteCrawlQueues();
+
+ BusyThread rct = getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
+ if (rct == null) {
+ deployThread(
+ SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL,
+ "Remote Crawl Job",
+ "thread that performs a single crawl/indexing step triggered by a remote peer",
+ "/IndexCreateQueues_p.html?stack=REMOTE",
+ new InstantBusyThread(
+ this.crawlQueues,
+ SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_START,
+ SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_JOBCOUNT,
+ SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_FREEMEM,
+ 0,
+ 0),
+ 10000);
+ rct = getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
+ }
+ rct.setBusySleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, 1000));
+ rct.setIdleSleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_IDLESLEEP, 10000));
+
+ BusyThread rcl = getThread(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER);
+ if (rcl == null) {
+ deployThread(
+ SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER,
+ "Remote Crawl URL Loader",
+ "thread that loads remote crawl lists from other peers",
+ null,
+ new InstantBusyThread(
+ this.crawlQueues,
+ SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_START,
+ SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_JOBCOUNT,
+ SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_FREEMEM,
+ 10000,
+ 10000),
+ 10000);
+
+ rcl = getThread(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER);
+ }
+ rcl.setBusySleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_BUSYSLEEP, 1000));
+ rcl.setIdleSleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP, 10000));
+ }
+ }
 public void initMessages() throws IOException {
@@ -2160,7 +2193,7 @@
 public int cleanupJobSize() {
 int c = 1; // run this always!
- if ( (this.crawlQueues.delegatedURL.size() > 1000) ) {
+ if (this.crawlQueues.delegatedURL != null && (this.crawlQueues.delegatedURL.size() > 1000) ) {
 c++;
 }
 if ( (this.crawlQueues.errorURL.stackSize() > 1000) ) {
@@ -2256,7 +2289,7 @@
 // clean up delegated stack
 checkInterruption();
- if ( (this.crawlQueues.delegatedURL.size() > 1000) ) {
+ if (this.crawlQueues.delegatedURL != null && (this.crawlQueues.delegatedURL.size() > 1000) ) {
 if ( this.log.isFine() ) {
 this.log.fine("Cleaning Delegated-URLs report stack, " + this.crawlQueues.delegatedURL.size()
@@ -3778,7 +3811,7 @@
 mySeed.setFlagDirectConnect(true);
 mySeed.setLastSeenUTC();
 mySeed.put(Seed.UTC, GenericFormatter.UTCDiffString());
- mySeed.setFlagAcceptRemoteCrawl(getConfigBool("crawlResponse", true));
+ mySeed.setFlagAcceptRemoteCrawl(getConfigBool(SwitchboardConstants.CRAWLJOB_REMOTE, false));
 mySeed.setFlagAcceptRemoteIndex(getConfigBool("allowReceiveIndex", true));
 mySeed.setFlagSSLAvailable(this.getHttpServer() != null && this.getHttpServer().withSSL() && getConfigBool("server.https", false));
 if (mySeed.getFlagSSLAvailable()) mySeed.put(Seed.PORTSSL, Integer.toString(getPublicPort("port.ssl", 8443)));
diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java
index 928f1887c..82295624f 100644
--- a/source/net/yacy/search/SwitchboardConstants.java
+++ b/source/net/yacy/search/SwitchboardConstants.java
@@ -113,6 +113,7 @@ public final class SwitchboardConstants {
 * * @see Switchboard#CRAWLJOB_REMOTE_CRAWL_LOADER */
+ public static final String CRAWLJOB_REMOTE = "crawlResponse"; // enable/disable response to remote crawl requests
 public static final String CRAWLJOB_REMOTE_CRAWL_LOADER = "60_remotecrawlloader";
 public static final String CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_START = "remoteCrawlLoaderJob";
 public static final String CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_JOBCOUNT = null;
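
The sketch referenced in the commit message: a minimal, self-contained Java illustration of the on-demand pattern this patch applies. Nothing is allocated while the option is off, the queue and its worker thread are created only on activation, and every consumer tolerates the uninitialized (null) state. The names OnDemandRemoteCrawl, setRemoteCrawlEnabled, offer, and the 10-second idle sleep are illustrative assumptions, not the YaCy API.

import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;

// Illustrative stand-in, not the actual YaCy Switchboard/CrawlQueues classes.
public class OnDemandRemoteCrawl {

    private volatile Queue<String> remoteQueue; // stays null while the option is off
    private volatile Thread remoteWorker;       // deployed only on activation

    // Roughly analogous to Switchboard.initRemoteCrawler(true): allocate lazily, once.
    public synchronized void setRemoteCrawlEnabled(final boolean activate) {
        if (!activate) return; // option off: no queue, no idling thread
        if (this.remoteQueue == null) {
            this.remoteQueue = new ConcurrentLinkedQueue<String>();
        }
        if (this.remoteWorker == null) {
            this.remoteWorker = new Thread(new Runnable() {
                @Override
                public void run() {
                    while (!Thread.currentThread().isInterrupted()) {
                        final String url = remoteQueue.poll();
                        if (url == null) {
                            // nothing queued: idle sleep instead of busy spinning
                            try { Thread.sleep(10000L); } catch (final InterruptedException e) { return; }
                        } else {
                            System.out.println("crawling " + url); // placeholder for the real crawl step
                        }
                    }
                }
            }, "remoteCrawlJob");
            this.remoteWorker.setDaemon(true);
            this.remoteWorker.start();
        }
    }

    // Consumers tolerate the uninitialized state, mirroring the patch's
    // getThread(...) != null ? ... : 0 and delegatedURL != null guards.
    public int remoteQueueSize() {
        final Queue<String> q = this.remoteQueue;
        return (q == null) ? 0 : q.size();
    }

    public boolean offer(final String url) {
        final Queue<String> q = this.remoteQueue;
        return (q != null) && q.offer(url); // a no-op while deactivated
    }
}

The patch itself wires this idea through deployThread/getThread and BusyThread busy/idle sleeps rather than a raw Thread, and NoticedURL additionally skips the lazy init under memory pressure via MemoryControl.shortStatus().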