update to HostBrowser:

- time out after 3 seconds to speed up the display (the result may be incomplete)
- also show all links from the balancer queue, both in the host list (as the pending count after the '/') and in the result browser view, tagged 'loading'
Michael Peter Christen 13 years ago
parent e2c4c3c7d3
commit 75dd706e1b
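
The 3-second cut-off appears twice in this commit (draining the Solr result queue in HostBrowser.java and walking the domain stack in Balancer.java) and always follows the same pattern: compute a wall-clock deadline once, then break out of the loop after each unit of work once the deadline has passed. A minimal, self-contained sketch of the pattern (class and member names are hypothetical, not YaCy API):

    import java.util.concurrent.BlockingQueue;
    import java.util.concurrent.LinkedBlockingQueue;

    public final class DeadlineDrainSketch {
        // hypothetical sentinel playing the role of AbstractSolrConnector.POISON_DOCUMENT
        static final Object POISON = new Object();

        // drain the queue, but stop once maxtime milliseconds have elapsed;
        // an incomplete result is the accepted trade-off for a fast display
        static int drain(final BlockingQueue<Object> queue, final long maxtime) throws InterruptedException {
            final long timeout = System.currentTimeMillis() + maxtime;
            int count = 0;
            while (queue.take() != POISON) {
                count++; // a real caller would evaluate the document here
                if (System.currentTimeMillis() > timeout) break;
            }
            return count;
        }

        public static void main(final String[] args) throws InterruptedException {
            final BlockingQueue<Object> queue = new LinkedBlockingQueue<Object>();
            for (int i = 0; i < 5; i++) queue.put(new Object());
            queue.put(POISON);
            System.out.println(drain(queue, 3000)); // prints 5
        }
    }

Checking the clock only after taking an element guarantees progress: at least one element is handled even when the deadline has already expired.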

@@ -77,7 +77,7 @@ function updatepage(str) {
 #{list}#
 <div style="float:left; padding:1px 5px 1px 5px;">
 <div style="width:160px; text-align:left; float: left; white-space:nowrap; overflow:hidden;"><div id="info"><a href="/HostBrowser.html?path=#[host]#">#[host]#</a><span>browse #[host]#</span></div></div>
-<div style="width:80px; text-align:right; float: left; white-space:nowrap; overflow:hidden;">#[count]# URLs</div>
+<div style="width:80px; text-align:right; float: left; white-space:nowrap; overflow:hidden;">#[count]##(crawler)#::/#[pending]##(/crawler)# URLs</div>
 </div>
 #{/list}#
 </fieldset>
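
The #(crawler)#::/#[pending]##(/crawler)# markup added above is a conditional in YaCy's servlet template language: the servlet sets the crawler property to 0 or 1, the text before :: is rendered for 0 (here: nothing) and the text after it for 1. A host with queued links therefore renders as, for example, "example.com 1234/56 URLs", where 56 is the balancer's pending count substituted for #[pending]# (host name and numbers invented for illustration).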

@@ -41,6 +41,7 @@ import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.sorting.ClusteredScoreMap;
 import net.yacy.cora.sorting.ReversibleScoreMap;
+import net.yacy.crawler.data.NoticedURL.StackType;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
@@ -130,8 +131,17 @@ public class HostBrowser {
         if (post.containsKey("hosts")) {
             // generate host list
             try {
-                int maxcount = 200;
+                int maxcount = 360; // == 6!/2 which makes nice matrices for 3, 4, 5, 6 rows/columns
+                // collect from index
                 ReversibleScoreMap<String> score = fulltext.getSolr().getFacet(YaCySchema.host_s.name(), maxcount);
+                // collect from crawler
+                final Map<String, Integer[]> crawler = (admin) ? sb.crawlQueues.noticeURL.getDomainStackHosts(StackType.LOCAL, sb.robots) : new HashMap<String, Integer[]>();
+                for (Map.Entry<String, Integer[]> host: crawler.entrySet()) {
+                    score.inc(host.getKey(), host.getValue()[0]);
+                }
                 int c = 0;
                 Iterator<String> i = score.keys(false);
                 String host;
@@ -139,6 +149,9 @@ public class HostBrowser {
                     host = i.next();
                     prop.put("hosts_list_" + c + "_host", host);
                     prop.put("hosts_list_" + c + "_count", score.get(host));
+                    boolean inCrawler = crawler.containsKey(host);
+                    prop.put("hosts_list_" + c + "_crawler", inCrawler ? 1 : 0);
+                    if (inCrawler) prop.put("hosts_list_" + c + "_crawler_pending", crawler.get(host)[0]);
                     c++;
                 }
                 prop.put("hosts_list", c);
@@ -166,9 +179,8 @@ public class HostBrowser {
                 if (p < 8) {
                     prop.put("files_root", 1);
                 } else {
-                    path = path.substring(0, p + 1);
                     prop.put("files_root", 0);
-                    prop.put("files_root_path", path);
+                    prop.put("files_root_path", path.substring(0, p + 1));
                 }
                 try {
                     // generate file list from path
@@ -179,13 +191,14 @@
                     String hosthash = ASCII.String(uri.hash(), 6, 6);
                     // get all files for a specific host from the index
-                    BlockingQueue<SolrDocument> docs = fulltext.getSolr().concurrentQuery(YaCySchema.host_s.name() + ":" + host, 0, 100000, 60000);
+                    BlockingQueue<SolrDocument> docs = fulltext.getSolr().concurrentQuery(YaCySchema.host_s.name() + ":" + host, 0, 100000, 3000, 100);
                     SolrDocument doc;
                     Set<String> storedDocs = new HashSet<String>();
                     Set<String> inboundLinks = new HashSet<String>();
                     Map<String, ReversibleScoreMap<String>> outboundHosts = new HashMap<String, ReversibleScoreMap<String>>();
                     int hostsize = 0;
                     final List<byte[]> deleteIDs = new ArrayList<byte[]>();
+                    long timeout = System.currentTimeMillis() + 3000;
                     while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
                         String u = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName());
                         hostsize++;
@@ -221,10 +234,16 @@
                                 }
                             } catch (MalformedURLException e) {}
                         }
+                        if (System.currentTimeMillis() > timeout) break;
                     }
                     if (deleteIDs.size() > 0) sb.index.fulltext().remove(deleteIDs, true);
-                    // now combine both lists into one
+                    // collect from crawler
+                    List<Request> domainStackReferences = sb.crawlQueues.noticeURL.getDomainStackReferences(StackType.LOCAL, host, 1000, 3000);
+                    Set<String> loadingLinks = new HashSet<String>();
+                    for (Request crawlEntry: domainStackReferences) loadingLinks.add(crawlEntry.url().toNormalform(true));
+                    // now combine all lists into one
                     Map<String, Boolean> files = new HashMap<String, Boolean>();
                     for (String u: storedDocs) files.put(u, true);
                     for (String u: inboundLinks) if (!storedDocs.contains(u)) files.put(u, false);
@@ -268,8 +287,7 @@
                         prop.put("files_list_" + c + "_type_url", entry.getKey());
                         boolean indexed = ((Boolean) entry.getValue()).booleanValue();
                         try {uri = new DigestURI(entry.getKey());} catch (MalformedURLException e) {uri = null;}
-                        boolean loading = load.equals(entry.getKey()) ||
-                            (uri != null && sb.crawlQueues.urlExists(uri.hash()) != null);
+                        boolean loading = load.equals(entry.getKey()) || (uri != null && sb.crawlQueues.urlExists(uri.hash()) != null);
                         //String failr = fulltext.failReason(ASCII.String(uri.hash()));
                         prop.put("files_list_" + c + "_type_stored", indexed ? 1 : loading ? 2 : 0);
                         prop.put("files_list_" + c + "_type_stored_load", loadRight ? 1 : 0);
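
Two things change in this servlet: the pending-per-host counts from the crawler are folded into the Solr facet counts via score.inc, so hosts that are only queued (with no indexed documents yet) still appear in the host list, and a file is marked 'loading' when it is either the URL just submitted or still present in the crawl queues (urlExists); the loadingLinks set collected from the balancer feeds the 'loading' tag named in the commit message. A reduced sketch of the merge step using plain collections (data and class name invented for illustration):

    import java.util.HashMap;
    import java.util.Map;

    public final class HostScoreMergeSketch {
        public static void main(final String[] args) {
            final Map<String, Integer> score = new HashMap<String, Integer>(); // facet counts from the index
            score.put("example.com", 1234);
            final Map<String, Integer> pending = new HashMap<String, Integer>(); // pending counts from the balancer
            pending.put("example.com", 56);
            pending.put("example.org", 7); // queued host without any indexed documents yet
            // fold the crawler counts in, as score.inc(host, pending) does above
            for (final Map.Entry<String, Integer> e : pending.entrySet()) {
                score.merge(e.getKey(), e.getValue(), Integer::sum);
            }
            System.out.println(score); // example.com=1290, example.org=7 (iteration order unspecified)
        }
    }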

@@ -131,7 +131,7 @@ public class IndexCreateQueues_p {
                 prop.putHTML("crawler_host_" + hc + "_queuename", stackType.name());
                 prop.put("crawler_host_" + hc + "_hostcount", host.getValue()[0]);
                 prop.put("crawler_host_" + hc + "_hostdelta", host.getValue()[1] == Integer.MIN_VALUE ? "not accessed" : Integer.toString(host.getValue()[1]));
-                List<Request> domainStackReferences = sb.crawlQueues.noticeURL.getDomainStackReferences(stackType, host.getKey(), urlsPerHost);
+                List<Request> domainStackReferences = sb.crawlQueues.noticeURL.getDomainStackReferences(stackType, host.getKey(), urlsPerHost, 10000);
                 Seed initiator;
                 String profileHandle;
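
IndexCreateQueues_p gets a 10-second budget here instead of HostBrowser's 3 seconds, presumably because displaying the queue is this servlet's whole purpose, while HostBrowser only decorates its listing with queue data.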

@@ -22,6 +22,7 @@ package net.yacy.cora.federate.solr.connector;
 import java.io.IOException;
 import java.util.Iterator;
+import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.LinkedBlockingQueue;
 import java.util.concurrent.TimeUnit;
@@ -68,9 +69,20 @@ public abstract class AbstractSolrConnector implements SolrConnector {
         }
     }
+    /**
+     * Get a query result from solr as a stream of documents.
+     * The result queue is considered as terminated if AbstractSolrConnector.POISON_DOCUMENT is returned.
+     * The method returns immediately and feeds the search results into the queue
+     * @param querystring the solr query string
+     * @param offset first result offset
+     * @param maxcount the maximum number of results
+     * @param maxtime the maximum time in milliseconds
+     * @param buffersize the size of an ArrayBlockingQueue; if <= 0 then a LinkedBlockingQueue is used
+     * @return a blocking queue which is terminated with AbstractSolrConnector.POISON_DOCUMENT as last element
+     */
     @Override
-    public BlockingQueue<SolrDocument> concurrentQuery(final String querystring, final int offset, final int maxcount, final long maxtime) {
-        final BlockingQueue<SolrDocument> queue = new LinkedBlockingQueue<SolrDocument>();
+    public BlockingQueue<SolrDocument> concurrentQuery(final String querystring, final int offset, final int maxcount, final long maxtime, final int buffersize) {
+        final BlockingQueue<SolrDocument> queue = buffersize <= 0 ? new LinkedBlockingQueue<SolrDocument>() : new ArrayBlockingQueue<SolrDocument>(buffersize);
         final long endtime = System.currentTimeMillis() + maxtime;
         final Thread t = new Thread() {
             @Override
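
The new buffersize parameter controls back-pressure: with an ArrayBlockingQueue of capacity n, the producer thread blocks once it is n documents ahead of the consumer, whereas the old unbounded LinkedBlockingQueue let it buffer the entire result set. A caller sketch against the new signature (the wrapper class and countHostDocs are invented for illustration):

    import java.util.concurrent.BlockingQueue;

    import org.apache.solr.common.SolrDocument;

    import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
    import net.yacy.cora.federate.solr.connector.SolrConnector;

    public final class ConcurrentQueryCallerSketch {
        // count all documents of one host, giving the producer a 3s budget and
        // keeping it at most 100 documents ahead of this consumer loop
        static int countHostDocs(final SolrConnector connector, final String host) throws InterruptedException {
            final BlockingQueue<SolrDocument> docs =
                connector.concurrentQuery("host_s:" + host, 0, 100000, 3000, 100);
            int n = 0;
            while (docs.take() != AbstractSolrConnector.POISON_DOCUMENT) {
                n++; // a real caller would inspect the SolrDocument here
            }
            return n;
        }
    }

Passing a buffersize <= 0 keeps the old unbounded behaviour, which the background deleteDomain thread in Fulltext below selects with -1.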

@@ -146,12 +146,14 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
      * Get a query result from solr as a stream of documents.
      * The result queue is considered as terminated if AbstractSolrConnector.POISON_DOCUMENT is returned.
      * The method returns immediately and feeds the search results into the queue
-     * @param querystring
-     * @param offset
-     * @param maxcount
-     * @return
-     */
-    public BlockingQueue<SolrDocument> concurrentQuery(final String querystring, final int offset, final int maxcount, final long maxtime);
+     * @param querystring the solr query string
+     * @param offset first result offset
+     * @param maxcount the maximum number of results
+     * @param maxtime the maximum time in milliseconds
+     * @param buffersize the size of an ArrayBlockingQueue; if <= 0 then a LinkedBlockingQueue is used
+     * @return a blocking queue which is terminated with AbstractSolrConnector.POISON_DOCUMENT as last element
+     */
+    public BlockingQueue<SolrDocument> concurrentQuery(final String querystring, final int offset, final int maxcount, final long maxtime, final int buffersize);

     /**
      * get a document id result stream from a solr query.

@@ -339,16 +339,18 @@ public class Balancer {
      * get lists of crawl request entries for a specific host
      * @param host
      * @param maxcount
+     * @param maxtime
      * @return a list of crawl loader requests
      */
-    public List<Request> getDomainStackReferences(String host, int maxcount) {
-        HostHandles hh = this.domainStacks.get(host);
+    public List<Request> getDomainStackReferences(final String host, int maxcount, final long maxtime) {
+        final HostHandles hh = this.domainStacks.get(host);
         if (hh == null) return new ArrayList<Request>(0);
-        HandleSet domainList = hh.handleSet;
+        final HandleSet domainList = hh.handleSet;
         if (domainList.isEmpty()) return new ArrayList<Request>(0);
-        ArrayList<Request> cel = new ArrayList<Request>(maxcount);
+        maxcount = Math.min(maxcount, domainList.size());
+        final ArrayList<Request> cel = new ArrayList<Request>(maxcount);
+        long timeout = System.currentTimeMillis() + maxtime;
         for (int i = 0; i < maxcount; i++) {
-            if (domainList.size() <= i) break;
             final byte[] urlhash = domainList.getOne(i);
             if (urlhash == null) continue;
             Row.Entry rowEntry;
@@ -365,6 +367,7 @@
                 continue;
             }
             cel.add(crawlEntry);
+            if (System.currentTimeMillis() > timeout) break;
         }
         return cel;
     }
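
Two details of the new Balancer code: maxcount is clamped to the stack size up front, which replaces the old in-loop bounds check, and the deadline is tested only after cel.add(crawlEntry), so at least one request is returned whenever the stack is non-empty, even if maxtime has already elapsed.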

@@ -248,12 +248,12 @@ public class NoticedURL {
      * @param maxcount
      * @return a list of crawl loader requests
      */
-    public List<Request> getDomainStackReferences(final StackType stackType, String host, int maxcount) {
+    public List<Request> getDomainStackReferences(final StackType stackType, String host, int maxcount, final long maxtime) {
         switch (stackType) {
-            case LOCAL: return this.coreStack.getDomainStackReferences(host, maxcount);
-            case GLOBAL: return this.limitStack.getDomainStackReferences(host, maxcount);
-            case REMOTE: return this.remoteStack.getDomainStackReferences(host, maxcount);
-            case NOLOAD: return this.noloadStack.getDomainStackReferences(host, maxcount);
+            case LOCAL: return this.coreStack.getDomainStackReferences(host, maxcount, maxtime);
+            case GLOBAL: return this.limitStack.getDomainStackReferences(host, maxcount, maxtime);
+            case REMOTE: return this.remoteStack.getDomainStackReferences(host, maxcount, maxtime);
+            case NOLOAD: return this.noloadStack.getDomainStackReferences(host, maxcount, maxtime);
             default: return null;
         }
     }

@@ -318,7 +318,7 @@ public final class Fulltext implements Iterable<byte[]> {
         final String host = uri.getHost();
         Thread t = new Thread(){
             public void run() {
-                final BlockingQueue<SolrDocument> docs = getSolr().concurrentQuery(YaCySchema.host_s.name() + ":" + host, 0, 100000, 60000);
+                final BlockingQueue<SolrDocument> docs = getSolr().concurrentQuery(YaCySchema.host_s.name() + ":" + host, 0, 1000000, 600000, -1);
                 try {
                     SolrDocument doc;
                     boolean removed = false;
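
The budgets move in the opposite direction here: the interactive HostBrowser query above was tightened to 3 seconds with a bounded buffer, while this concurrent domain-delete thread is widened to 1,000,000 documents and 600 seconds with an unbounded queue (buffersize -1), since it runs in the background and must visit every document of the host.
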
@@ -342,12 +342,25 @@
      * @param concurrently if true, then the method returnes immediately and runs concurrently
      */
     public void remove(final List<byte[]> deleteIDs, final boolean concurrently) {
+        if (deleteIDs == null || deleteIDs.size() == 0) return;
         Thread t = new Thread() {
             public void run() {
-                for (byte[] id: deleteIDs) {remove(id);}
-                Fulltext.this.solr.commit();
-            }
-        };
+                try {
+                    synchronized (Fulltext.this.solr) {
+                        for (byte[] urlHash: deleteIDs) {
+                            Fulltext.this.solr.delete(ASCII.String(urlHash));
+                        }
+                    }
+                } catch (final Throwable e) {
+                    Log.logException(e);
+                }
+                if (Fulltext.this.urlIndexFile != null) try {
+                    for (byte[] urlHash: deleteIDs) {
+                        final Row.Entry r = Fulltext.this.urlIndexFile.remove(urlHash);
+                        if (r != null) Fulltext.this.statsDump = null;
+                    }
+                } catch (final IOException e) {}
+            }};
         if (concurrently) t.start(); else t.run();
     }
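
The rewritten remove() now returns early for an empty list, issues the Solr deletes as one synchronized batch instead of per-id remove(id) calls followed by an explicit commit, and additionally clears the matching entries from the legacy urlIndexFile, invalidating the cached statsDump whenever an entry was actually removed.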
