From eca68fa197101a64546920dae015ffd2f35e09c8 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sun, 25 Nov 2012 15:43:42 +0100 Subject: [PATCH] added debug code to crawler monitor --- htroot/Crawler_p.html | 4 +++- htroot/Crawler_p.java | 11 ++++++++- source/net/yacy/crawler/CrawlSwitchboard.java | 24 +++++++++++++++---- 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/htroot/Crawler_p.html b/htroot/Crawler_p.html index ec9fe4181..ea0c737b4 100644 --- a/htroot/Crawler_p.html +++ b/htroot/Crawler_p.html @@ -159,7 +159,7 @@ #(crawlProfilesShow)#::
-Running Crawls +Running Crawls (#[count]#) @@ -167,11 +167,13 @@ + #(debug)#::#(/debug)# #{list}# + #(debug)#::#(/debug)#
NameCountStatus
#[name]##[count]##(terminateButton)#::
Running
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index e225033a1..352acba71 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -49,6 +49,7 @@ import net.yacy.document.Document; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.TransformerWriter; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.index.RowHandleSet; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.FileUtils; import net.yacy.peers.NewsPool; @@ -87,7 +88,8 @@ public class Crawler_p { prop.put("forwardToCrawlStart", "0"); prop.put("info", "0"); - + boolean debug = (post != null && post.containsKey("debug")); + if (post != null) { String c = post.toString(); if (c.length() < 1000) Log.logInfo("Crawl Start", c); @@ -520,13 +522,20 @@ public class Crawler_p { profile = sb.crawler.getActive(h); if (CrawlProfile.ignoreNames.contains(profile.name())) continue; profile.putProfileEntry("crawlProfilesShow_list_", prop, true, dark, count, domlistlength); + prop.put("crawlProfilesShow_list_" + count + "_debug", debug ? 1 : 0); + if (debug) { + RowHandleSet urlhashes = sb.crawler.getURLHashes(h); + prop.put("crawlProfilesShow_list_" + count + "_debug_count", urlhashes == null ? "unknown" : Integer.toString(urlhashes.size())); + } if (profile.urlMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN) { hosts = hosts + "," + profile.name(); } dark = !dark; count++; } + prop.put("crawlProfilesShow_debug", debug ? 1 : 0); prop.put("crawlProfilesShow_list", count); + prop.put("crawlProfilesShow_count", count); prop.put("crawlProfilesShow", count == 0 ? 0 : 1); if (count > 0) { diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java index d6421d7a5..3233c1e6f 100644 --- a/source/net/yacy/crawler/CrawlSwitchboard.java +++ b/source/net/yacy/crawler/CrawlSwitchboard.java @@ -34,6 +34,7 @@ import java.util.Iterator; import java.util.Map; import java.util.Set; import java.util.TreeMap; +import java.util.concurrent.ConcurrentHashMap; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.UTF8; @@ -46,7 +47,9 @@ import net.yacy.crawler.data.CrawlQueues; import net.yacy.crawler.data.NoticedURL.StackType; import net.yacy.crawler.retrieval.Request; import net.yacy.kelondro.blob.MapHeap; +import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; +import net.yacy.kelondro.index.RowHandleSet; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.kelondroException; @@ -75,6 +78,7 @@ public final class CrawlSwitchboard { private MapHeap profilesActiveCrawls; private final MapHeap profilesPassiveCrawls; private final Map profilesActiveCrawlsCache; //TreeMap(Base64Order.enhancedCoder); + private final Map profilesActiveCrawlsCounter; public CrawlProfile defaultProxyProfile; public CrawlProfile defaultRemoteProfile; public CrawlProfile defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile; @@ -91,8 +95,8 @@ public final class CrawlSwitchboard { System.exit(0); } this.log = log; - this.profilesActiveCrawlsCache = - Collections.synchronizedMap(new TreeMap(Base64Order.enhancedCoder)); + this.profilesActiveCrawlsCache = Collections.synchronizedMap(new TreeMap(Base64Order.enhancedCoder)); + this.profilesActiveCrawlsCounter = new ConcurrentHashMap(); // make crawl profiles database and default profiles this.queuesRoot = queuesRoot; @@ -229,6 +233,11 @@ public final class CrawlSwitchboard { this.profilesPassiveCrawls.put(profileKey, profile); } + public RowHandleSet getURLHashes(final byte[] profileKey) { + return this.profilesActiveCrawlsCounter.get(ASCII.String(profileKey)); + } + + private void initActiveCrawlProfiles() { // generate new default entry for proxy crawling this.defaultProxyProfile = @@ -470,7 +479,10 @@ public final class CrawlSwitchboard { return hasDoneSomething; } - public int cleanFinishesProfiles(CrawlQueues crawlQueues) { + public int cleanFinishesProfiles(CrawlQueues crawlQueues) { + // clear the counter cache + this.profilesActiveCrawlsCounter.clear(); + // find all profiles that are candidates for deletion Set deletionCandidate = new HashSet(); for (final byte[] handle: this.getActive()) { @@ -498,7 +510,11 @@ public final class CrawlSwitchboard { Request r; while (sei.hasNext()) { r = sei.next(); - deletionCandidate.remove(r.profileHandle()); + String handle = r.profileHandle(); + RowHandleSet us = this.profilesActiveCrawlsCounter.get(handle); + if (us == null) {us = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); this.profilesActiveCrawlsCounter.put(handle, us);} + us.put(r.url().hash()); + deletionCandidate.remove(handle); if (deletionCandidate.size() == 0) return 0; if (System.currentTimeMillis() > timeout) return 0; // give up; this is too large }