- enabled fetching new crawl URLs from other peers via /yacy/list.html?list=queueUrls (for testing purposes)

- URLs sent to a requesting peer are taken off the limit-stack (the one used by the global crawl trigger); this may be moved somewhere else in future versions
- added an option to set the requested chunk-size (the number of URLs to request per fetch)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3367 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
karlchenofhell 18 years ago
parent 67d96249b4
commit e6ddf135bb

@ -33,11 +33,13 @@
#(peersKnown)#:: #(peersKnown)#::
<dt><label for="peer">Fetch from Peer</label>:</dt> <dt><label for="peer">Fetch from Peer</label>:</dt>
<dd> <dd>
<input type="radio" name="source" value="peer" id="peer" disabled="disabled" /> <input type="radio" name="source" value="peer" id="peer" />
<select name="peerhash" disabled="disabled"> <select name="peerhash">
<option value="random" selected="selected">Choose a random peer</option>#{peers}# <option value="random" selected="selected">Choose a random peer</option>#{peers}#
<option value="#[hash]#">#[name]#</option>#{/peers}# <option value="#[hash]#">#[name]#</option>#{/peers}#
</select> </select>
&nbsp;<label for="amount">Amount of URLs to request</label>:
<input type="text" name="amount" id="amount" value="50" maxlength="3" size="3" />
#(peerError)#:: #(peerError)#::
&nbsp;<span class="error">Error fetching URL-list from <span class="tt">#[hash]#:#[name]#</span></span>:: &nbsp;<span class="error">Error fetching URL-list from <span class="tt">#[hash]#:#[name]#</span></span>::
&nbsp;<span class="error">Peer with hash <span class="tt">#[hash]#</span> doesn't seem to be online anymore</span>#(/peerError)# &nbsp;<span class="error">Peer with hash <span class="tt">#[hash]#</span> doesn't seem to be online anymore</span>#(/peerError)#
@ -49,7 +51,7 @@
<input type="radio" name="reg" value="self_det" id="self_det" disabled="disabled"/> <label for="self_det">Fetch when queue is empty</label><br /> <input type="radio" name="reg" value="self_det" id="self_det" disabled="disabled"/> <label for="self_det">Fetch when queue is empty</label><br />
<input type="radio" name="reg" value="delay" id="delay" /> <label for="delay">Fetch in a specified delay</label>: <input type="radio" name="reg" value="delay" id="delay" /> <label for="delay">Fetch in a specified delay</label>:
<label for="frequency">every</label> <label for="frequency">every</label>
&nbsp;<input type="text" name="frequency" id="frequency" text-align="left" size="2" style="text-align: right;" maxlength="2"/> &nbsp;<input type="text" name="frequency" id="frequency" size="2" style="text-align: right;" maxlength="2"/>
<select name="freq_type"> <select name="freq_type">
<option value="weeks">Weeks</option> <option value="weeks">Weeks</option>
<option value="days" selected="selected">Days</option> <option value="days" selected="selected">Days</option>

@ -37,6 +37,8 @@ public class CrawlURLFetch_p {
private static final long STAT_THREAD_STOPPED = 1; private static final long STAT_THREAD_STOPPED = 1;
private static final long STAT_THREAD_PAUSED = 2; private static final long STAT_THREAD_PAUSED = 2;
public static final float MIN_PEER_VERSION_LIST_SERVLET = 0.504033F;
private static URLFetcher fetcher = null; private static URLFetcher fetcher = null;
private static plasmaCrawlProfile.entry profile = null; private static plasmaCrawlProfile.entry profile = null;
private static ArrayList savedURLs = new ArrayList(); private static ArrayList savedURLs = new ArrayList();
@ -74,6 +76,12 @@ public class CrawlURLFetch_p {
} }
} }
int count = 50;
if (post.get("amount", "").matches("\\d+")) {
count = Integer.parseInt(post.get("amount", ""));
if (count > 999) count = 999;
}
if (fetcher != null) fetcher.interrupt(); if (fetcher != null) fetcher.interrupt();
fetcher = null; fetcher = null;
if (post.get("source", "").equals("peer") && if (post.get("source", "").equals("peer") &&
@ -81,6 +89,7 @@ public class CrawlURLFetch_p {
fetcher = new URLFetcher( fetcher = new URLFetcher(
env, env,
profile, profile,
count,
frequency); frequency);
} else { } else {
URL url = null; URL url = null;
@ -103,9 +112,9 @@ public class CrawlURLFetch_p {
} else if (post.get("source", "").equals("peer")) { } else if (post.get("source", "").equals("peer")) {
yacySeed ys = null; yacySeed ys = null;
try { try {
ys = yacyCore.seedDB.getConnected(post.get("peerhash", "")); ys = yacyCore.seedDB.get(post.get("peerhash", ""));
if (ys != null) { if (ys != null) {
url = new URL("http://" + ys.getAddress() + "/yacy/urllist.html"); url = new URL("http://" + ys.getAddress() + URLFetcher.LIST_SERVLET);
} else { } else {
prop.put("peerError", ERR_PEER_OFFLINE); prop.put("peerError", ERR_PEER_OFFLINE);
prop.put("peerError_hash", post.get("peerhash", "")); prop.put("peerError_hash", post.get("peerhash", ""));
@ -122,6 +131,7 @@ public class CrawlURLFetch_p {
env, env,
profile, profile,
url, url,
count,
frequency); frequency);
} }
} }
@ -142,12 +152,14 @@ public class CrawlURLFetch_p {
fetcher = new URLFetcher( fetcher = new URLFetcher(
env, env,
profile, profile,
fetcher.count,
fetcher.delay); fetcher.delay);
} else { } else {
fetcher = new URLFetcher( fetcher = new URLFetcher(
env, env,
profile, profile,
fetcher.url, fetcher.url,
fetcher.count,
fetcher.delay); fetcher.delay);
} }
fetcher.start(); fetcher.start();
@ -200,7 +212,7 @@ public class CrawlURLFetch_p {
prop.put("peersKnown", 1); prop.put("peersKnown", 1);
try { try {
TreeMap hostList = new TreeMap(); TreeMap hostList = new TreeMap();
final Enumeration e = yacyCore.seedDB.seedsConnected(true, false, null, (float) 0.0); final Enumeration e = yacyCore.seedDB.seedsConnected(true, false, null, MIN_PEER_VERSION_LIST_SERVLET);
while (e.hasMoreElements()) { while (e.hasMoreElements()) {
yacySeed seed = (yacySeed) e.nextElement(); yacySeed seed = (yacySeed) e.nextElement();
if (seed != null) hostList.put(seed.get(yacySeed.NAME, "nameless"),seed.hash); if (seed != null) hostList.put(seed.get(yacySeed.NAME, "nameless"),seed.hash);
@ -209,6 +221,7 @@ public class CrawlURLFetch_p {
String peername; String peername;
while ((peername = (String) hostList.firstKey()) != null) { while ((peername = (String) hostList.firstKey()) != null) {
final String Hash = (String) hostList.get(peername); final String Hash = (String) hostList.get(peername);
if (Hash.equals(yacyCore.seedDB.mySeed.hash)) continue;
prop.put("peersKnown_peers_" + peerCount + "_hash", Hash); prop.put("peersKnown_peers_" + peerCount + "_hash", Hash);
prop.put("peersKnown_peers_" + peerCount + "_name", peername); prop.put("peersKnown_peers_" + peerCount + "_name", peername);
hostList.remove(peername); hostList.remove(peername);
@ -239,6 +252,8 @@ public class CrawlURLFetch_p {
public static final long DELAY_ONCE = -1; public static final long DELAY_ONCE = -1;
public static final long DELAY_SELF_DET = 0; public static final long DELAY_SELF_DET = 0;
private static final String LIST_SERVLET = "/yacy/list.html?list=queueUrls";
public static int totalRuns = 0; public static int totalRuns = 0;
public static int totalFetchedURLs = 0; public static int totalFetchedURLs = 0;
public static int totalFailed = 0; public static int totalFailed = 0;
@ -251,6 +266,7 @@ public class CrawlURLFetch_p {
public int lastFailed = 0; public int lastFailed = 0;
public final URL url; public final URL url;
public final int count;
public final long delay; public final long delay;
public final plasmaSwitchboard sb; public final plasmaSwitchboard sb;
public final plasmaCrawlProfile.entry profile; public final plasmaCrawlProfile.entry profile;
@ -261,12 +277,14 @@ public class CrawlURLFetch_p {
serverSwitch env, serverSwitch env,
plasmaCrawlProfile.entry profile, plasmaCrawlProfile.entry profile,
URL url, URL url,
int count,
long delayMs) { long delayMs) {
if (env == null || profile == null || url == null) if (env == null || profile == null || url == null)
throw new NullPointerException("env, profile or url must not be null"); throw new NullPointerException("env, profile or url must not be null");
this.sb = (plasmaSwitchboard)env; this.sb = (plasmaSwitchboard)env;
this.profile = profile; this.profile = profile;
this.url = url; this.url = url;
this.count = count;
this.delay = delayMs; this.delay = delayMs;
this.setName("URLFetcher"); this.setName("URLFetcher");
} }
@ -274,12 +292,14 @@ public class CrawlURLFetch_p {
public URLFetcher( public URLFetcher(
serverSwitch env, serverSwitch env,
plasmaCrawlProfile.entry profile, plasmaCrawlProfile.entry profile,
int count,
long delayMs) { long delayMs) {
if (env == null || profile == null) if (env == null || profile == null)
throw new NullPointerException("env or profile must not be null"); throw new NullPointerException("env or profile must not be null");
this.sb = (plasmaSwitchboard)env; this.sb = (plasmaSwitchboard)env;
this.profile = profile; this.profile = profile;
this.url = null; this.url = null;
this.count = count;
this.delay = delayMs; this.delay = delayMs;
this.setName("URLFetcher"); this.setName("URLFetcher");
} }
@ -297,7 +317,7 @@ public class CrawlURLFetch_p {
return; return;
} }
totalFetchedURLs += stackURLs(getURLs(url)); totalFetchedURLs += stackURLs(getURLs(url));
lastRun = System.currentTimeMillis() - start; this.lastRun = System.currentTimeMillis() - start;
totalRuns++; totalRuns++;
if (this.delay < 0 || isInterrupted()) { if (this.delay < 0 || isInterrupted()) {
return; return;
@ -320,7 +340,7 @@ public class CrawlURLFetch_p {
// choose random seed // choose random seed
yacySeed ys = null; yacySeed ys = null;
Enumeration e = yacyCore.seedDB.seedsConnected(true, false, null, 0F); Enumeration e = yacyCore.seedDB.seedsConnected(true, false, null, MIN_PEER_VERSION_LIST_SERVLET);
int num = new Random().nextInt(yacyCore.seedDB.sizeConnected()) + 1; int num = new Random().nextInt(yacyCore.seedDB.sizeConnected()) + 1;
Object o; Object o;
for (int i=0; i<num && e.hasMoreElements(); i++) { for (int i=0; i<num && e.hasMoreElements(); i++) {
@ -330,16 +350,18 @@ public class CrawlURLFetch_p {
if (ys == null) return null; if (ys == null) return null;
try { try {
return new URL("http://" + ys.getAddress() + "/yacy/urllist.html"); return new URL("http://" + ys.getAddress() + LIST_SERVLET + "&count=" + this.count);
} catch (MalformedURLException ee) { return null; } } catch (MalformedURLException ee) { return null; }
} }
private int stackURLs(String[] urls) throws InterruptedException { private int stackURLs(String[] urls) throws InterruptedException {
this.lastFailed = 0; this.lastFailed = 0;
this.lastFetchedURLs = 0;
if (urls == null) return 0; if (urls == null) return 0;
String reason; String reason;
for (int i=0; i<urls.length && !isInterrupted(); i++) { for (int i=0; i<urls.length && !isInterrupted(); i++) {
serverLog.logFinest(this.getName(), "stacking " + urls[i]); if (urls[i].trim().length() == 0) continue;
serverLog.logFine(this.getName(), "stacking " + urls[i]);
reason = this.sb.sbStackCrawlThread.stackCrawl( reason = this.sb.sbStackCrawlThread.stackCrawl(
urls[i], urls[i],
null, null,
@ -348,7 +370,9 @@ public class CrawlURLFetch_p {
new Date(), new Date(),
this.profile.generalDepth(), this.profile.generalDepth(),
this.profile); this.profile);
if (reason != null) { if (reason == null) {
this.lastFetchedURLs++;
} else {
this.lastFailed++; this.lastFailed++;
totalFailed++; totalFailed++;
this.failed.put(urls[i], reason); this.failed.put(urls[i], reason);
@ -366,7 +390,7 @@ public class CrawlURLFetch_p {
} catch (MalformedURLException e) { } } catch (MalformedURLException e) { }
} }
} }
return urls.length - this.lastFailed; return this.lastFetchedURLs;
} }
private String[] getURLs(URL url) { private String[] getURLs(URL url) {
@ -384,7 +408,7 @@ public class CrawlURLFetch_p {
header.put(httpHeader.ACCEPT_ENCODING, "US-ASCII"); header.put(httpHeader.ACCEPT_ENCODING, "US-ASCII");
header.put(httpHeader.HOST, url.getHost()); header.put(httpHeader.HOST, url.getHost());
httpc.response res = con.GET(url.getPath(), header); httpc.response res = con.GET(url.getPath() + "?" + url.getQuery(), header);
serverLog.logFine(this.getName(), "downloaded URL-list from " + url + " (" + res.statusCode + ")"); serverLog.logFine(this.getName(), "downloaded URL-list from " + url + " (" + res.statusCode + ")");
this.lastServerResponse = res.statusCode + " (" + res.statusText + ")"; this.lastServerResponse = res.statusCode + " (" + res.statusText + ")";
if (res.status.startsWith("2")) { if (res.status.startsWith("2")) {

@ -49,17 +49,23 @@
// if the shell's current path is HTROOT // if the shell's current path is HTROOT
import java.io.File; import java.io.File;
import java.io.IOException;
import de.anomic.data.listManager; import de.anomic.data.listManager;
import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCore; import de.anomic.server.serverCore;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch; import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
public final class list { public final class list {
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch ss) { public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch ss) {
if (post == null || ss == null ) { return null; } if (post == null || ss == null)
throw new NullPointerException("post: " + post + ", sb: " + ss);
// return variable that accumulates replacements // return variable that accumulates replacements
final serverObjects prop = new serverObjects(); final serverObjects prop = new serverObjects();
@ -83,11 +89,27 @@ public final class list {
} // if filenamesarray.length > 0 } // if filenamesarray.length > 0
prop.put("list",out); prop.put("list",out);
} else if (col.length() == 0 && post.get("list", "").equals("queueUrls")) {
// list urls from remote crawler queue for other peers
int count = 50;
if (post.get("count", "").length() > 0 && post.get("count", "").matches("\\d+"))
count = Integer.parseInt(post.get("count", ""));
final StringBuffer sb = new StringBuffer();
plasmaCrawlNURL.Entry entry;
for (int i=0; i<count && count - i<((plasmaSwitchboard)ss).noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT); i++) {
try {
entry = ((plasmaSwitchboard)ss).noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT);
sb.append(wikiCode.deReplaceHTMLEntities(entry.url().toNormalform())).append("\n");
} catch (IOException e) {
serverLog.logSevere("/yacy/list.html", "CANNOT FETCH ENTRY " + i + "/" + count + ": " + e.getMessage());
}
}
prop.put("list", sb);
} else { } else {
prop.putASIS("list",""); prop.putASIS("list","");
} }
return prop; return prop;
} }
} }

Loading…
Cancel
Save