more fixes in postprocessing: partitioning of the complete queue to enable smaller queries
pull/1/head
Michael Peter Christen 11 years ago
parent 2bc6199408
commit 327e83bfe7

@@ -269,24 +269,27 @@ public abstract class AbstractSolrConnector implements SolrConnector {
             public void run() {
                 this.setName("AbstractSolrConnector:concurrentIDsByQuery(" + querystring + ")");
                 int o = offset;
-                while (System.currentTimeMillis() < endtime) {
-                    try {
-                        SolrDocumentList sdl = getDocumentListByQuery(querystring, sort, o, Math.min(maxcount, pagesize_ids), CollectionSchema.id.getSolrFieldName());
-                        int count = 0;
-                        for (SolrDocument d: sdl) {
-                            try {queue.put((String) d.getFieldValue(CollectionSchema.id.getSolrFieldName()));} catch (final InterruptedException e) {break;}
-                            count++;
-                        }
-                        if (count < pagesize_ids) break;
-                        o += pagesize_ids;
-                    } catch (final SolrException e) {
-                        break;
-                    } catch (final IOException e) {
-                        break;
-                    }
-                }
-                for (int i = 0; i < concurrency; i++) {
-                    try {queue.put(AbstractSolrConnector.POISON_ID);} catch (final InterruptedException e1) {}
+                try {
+                    while (System.currentTimeMillis() < endtime) {
+                        try {
+                            SolrDocumentList sdl = getDocumentListByQuery(querystring, sort, o, Math.min(maxcount, pagesize_ids), CollectionSchema.id.getSolrFieldName());
+                            int count = 0;
+                            for (SolrDocument d: sdl) {
+                                try {queue.put((String) d.getFieldValue(CollectionSchema.id.getSolrFieldName()));} catch (final InterruptedException e) {break;}
+                                count++;
+                            }
+                            if (count < pagesize_ids) break;
+                            o += pagesize_ids;
+                        } catch (final SolrException e) {
+                            break;
+                        } catch (final IOException e) {
+                            break;
+                        }
+                    }
+                } catch (Throwable e) {} finally {
+                    for (int i = 0; i < concurrency; i++) {
+                        try {queue.put(AbstractSolrConnector.POISON_ID);} catch (final InterruptedException e1) {}
+                    }
                 }
             }
         };
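The hunk above fixes a consumer-hang hazard: before the change, any throwable other than the caught SolrException/IOException would abort run() before the poison IDs were enqueued, leaving every consumer blocked forever on queue.take(). Moving the poison loop into a finally block guarantees the queue is always terminated. Below is a minimal, self-contained sketch of this producer/consumer poison-pill pattern; all names (PoisonPillDemo, POISON, "id-…") are illustrative, not YaCy API.

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;

public class PoisonPillDemo {
    static final String POISON = "\u0000POISON"; // sentinel object; consumers compare by reference

    public static void main(String[] args) throws InterruptedException {
        final BlockingQueue<String> queue = new ArrayBlockingQueue<>(100);
        final int concurrency = 3;

        Thread producer = new Thread(() -> {
            try {
                for (int i = 0; i < 10; i++) queue.put("id-" + i); // may fail mid-way
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            } finally {
                // the finally block guarantees one poison pill per consumer,
                // even if production aborted, so every consumer can terminate
                for (int i = 0; i < concurrency; i++) {
                    try { queue.put(POISON); } catch (InterruptedException e) { Thread.currentThread().interrupt(); }
                }
            }
        });

        Thread[] consumers = new Thread[concurrency];
        for (int c = 0; c < concurrency; c++) {
            consumers[c] = new Thread(() -> {
                try {
                    String id;
                    // reference comparison is intentional: the exact POISON object was enqueued
                    while ((id = queue.take()) != POISON) {
                        System.out.println(Thread.currentThread().getName() + " got " + id);
                    }
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            });
            consumers[c].start();
        }
        producer.start();
        producer.join();
        for (Thread t : consumers) t.join();
    }
}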

@@ -300,7 +300,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen
         QueryResponse rsp;
         int retry = 0;
         Throwable error = null;
-        while (retry++ < 60) {
+        while (retry++ < 10) {
             try {
                 if (q != null) Thread.currentThread().setName("solr query: q = " + q + (fq == null ? "" : ", fq = " + fq) + (sort == null ? "" : ", sort = " + sort) + "; retry = " + retry + "; fl = " + fl); // for debugging in Threaddump
                 rsp = this.server.query(params);
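Lowering the retry bound from 60 to 10 caps the worst-case time a query thread spins against an unresponsive Solr server. The underlying pattern, remembering the last error and rethrowing it once the budget is exhausted, is sketched below with illustrative names (retry, MAX_RETRY, RETRY_DELAY_MS); it is not the YaCy code itself.

import java.io.IOException;
import java.util.concurrent.Callable;

public class BoundedRetry {
    static final int MAX_RETRY = 10;       // the commit lowers YaCy's cap from 60 to 10
    static final long RETRY_DELAY_MS = 100;

    static <T> T retry(Callable<T> action) throws IOException {
        Throwable error = null;
        int retry = 0;
        while (retry++ < MAX_RETRY) {
            try {
                return action.call();
            } catch (Exception e) {
                error = e; // remember the last failure, pause briefly, try again
                try { Thread.sleep(RETRY_DELAY_MS); } catch (InterruptedException ie) {
                    Thread.currentThread().interrupt();
                    break;
                }
            }
        }
        throw new IOException("gave up after " + MAX_RETRY + " attempts", error);
    }

    public static void main(String[] args) throws IOException {
        // usage: wrap a flaky operation
        String result = retry(() -> {
            if (Math.random() < 0.7) throw new IllegalStateException("transient failure");
            return "ok";
        });
        System.out.println(result);
    }
}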

@@ -1252,166 +1252,180 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
         // process all documents in collection
         final Map<String, Long> hostExtentCache = new HashMap<String, Long>(); // a mapping from the host id to the number of documents which contain this host-id
         final Set<String> uniqueURLs = new ConcurrentHashSet<String>(); // will be used in a concurrent environment
+        final Set<String> omitFields = new HashSet<String>();
+        omitFields.add(CollectionSchema.process_sxt.getSolrFieldName());
+        omitFields.add(CollectionSchema.harvestkey_s.getSolrFieldName());
+        final Collection<String> failids = new ArrayList<String>();
+        final AtomicInteger countcheck = new AtomicInteger(0);
+        final AtomicInteger proccount = new AtomicInteger();
+        final AtomicInteger proccount_referencechange = new AtomicInteger();
+        final AtomicInteger proccount_citationchange = new AtomicInteger();
         try {
-            final Set<String> omitFields = new HashSet<String>();
-            omitFields.add(CollectionSchema.process_sxt.getSolrFieldName());
-            omitFields.add(CollectionSchema.harvestkey_s.getSolrFieldName());
+            // partitioning of the index, get a facet for a partitioning key
             final long count = collectionConnector.getCountByQuery(collection1query);
-            final long start = System.currentTimeMillis();
-            final int concurrency = Math.max(1, Math.min((int) (MemoryControl.available() / (100L * 1024L * 1024L)), Runtime.getRuntime().availableProcessors()));
-            //final int concurrency = 1;
-            final boolean reference_computation = this.contains(CollectionSchema.references_i) &&
-                    this.contains(CollectionSchema.references_internal_i) &&
-                    this.contains(CollectionSchema.references_external_i) &&
-                    this.contains(CollectionSchema.references_exthosts_i);
-            postprocessingActivity = "collecting " + count + " documents from the collection for harvestkey " + harvestkey;
-            ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
-            final BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(
-                    collection1query,
-                    (this.contains(CollectionSchema.http_unique_b) || this.contains(CollectionSchema.www_unique_b)) ?
-                        CollectionSchema.host_subdomain_s.getSolrFieldName() + " asc," + // sort on subdomain to get hosts without subdomain first; that gives an opportunity to set www_unique_b flag to false
-                        CollectionSchema.url_protocol_s.getSolrFieldName() + " asc" // sort on protocol to get http before https; that gives an opportunity to set http_unique_b flag to false
-                        : null, // null sort is faster!
-                    0, 100000000, Long.MAX_VALUE, concurrency + 1, concurrency, true,
-                    byPartialUpdate ?
-                    new String[]{
-                    // the following fields are needed to perform the postprocessing
-                    // and should only be used for partial updates; for full updates use a
-                    // full list of fields to avoid LazyInstantiation which has poor performace
-                    CollectionSchema.id.getSolrFieldName(),
-                    CollectionSchema.sku.getSolrFieldName(),
-                    CollectionSchema.harvestkey_s.getSolrFieldName(),
-                    CollectionSchema.process_sxt.getSolrFieldName(),
-                    CollectionSchema.canonical_equal_sku_b.getSolrFieldName(),
-                    CollectionSchema.canonical_s.getSolrFieldName(),
-                    CollectionSchema.exact_signature_l.getSolrFieldName(),
-                    CollectionSchema.fuzzy_signature_l.getSolrFieldName(),
-                    CollectionSchema.title_exact_signature_l.getSolrFieldName(),
-                    CollectionSchema.description_exact_signature_l.getSolrFieldName(),
-                    CollectionSchema.host_id_s.getSolrFieldName(),
-                    CollectionSchema.host_s.getSolrFieldName(),
-                    CollectionSchema.host_subdomain_s.getSolrFieldName(),
-                    CollectionSchema.url_chars_i.getSolrFieldName(),
-                    CollectionSchema.url_protocol_s.getSolrFieldName(),
-                    CollectionSchema.httpstatus_i.getSolrFieldName(),
-                    CollectionSchema.inboundlinkscount_i.getSolrFieldName(),
-                    CollectionSchema.robots_i.getSolrFieldName()} :
-                    this.allFields());
-            final AtomicInteger proccount = new AtomicInteger();
-            final AtomicInteger proccount_referencechange = new AtomicInteger();
-            final AtomicInteger proccount_citationchange = new AtomicInteger();
-            final AtomicInteger countcheck = new AtomicInteger(0);
-            final Collection<String> failids = new ArrayList<String>();
-            final Thread rewriteThread[] = new Thread[concurrency];
-            for (int rewrite_start = 0; rewrite_start < concurrency; rewrite_start++) {
-                rewriteThread[rewrite_start] = new Thread() {
-                    @Override
-                    public void run() {
-                        SolrDocument doc;
-                        try {
-                            while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
-                                // for each to-be-processed entry work on the process tag
-                                Collection<Object> proctags = doc.getFieldValues(CollectionSchema.process_sxt.getSolrFieldName());
-                                final String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
-                                final String i = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
-                                if (proctags == null || proctags.size() == 0) {
-                                    // this should not happen since we collected the documents using a process_sxt:[* TO *] term
-                                    ConcurrentLog.warn("CollectionConfiguration", "no process_sxt entry for url " + u + ", id=" + i);
-                                    continue;
-                                }
-                                try {
-                                    DigestURL url = new DigestURL(u, ASCII.getBytes(i));
-                                    byte[] id = url.hash();
-                                    SolrInputDocument sid = byPartialUpdate ? new SolrInputDocument() : collection.toSolrInputDocument(doc, omitFields);
-                                    sid.setField(CollectionSchema.id.getSolrFieldName(), i);
-                                    for (Object tag: proctags) try {
-                                        // switch over tag types
-                                        ProcessType tagtype = ProcessType.valueOf((String) tag);
-                                        if (tagtype == ProcessType.CITATION &&
-                                            collection.contains(CollectionSchema.cr_host_count_i) &&
-                                            collection.contains(CollectionSchema.cr_host_chance_d) &&
-                                            collection.contains(CollectionSchema.cr_host_norm_i)) {
-                                            CRV crv = rankings.remove(ASCII.String(id)); // instead of 'get'ting the CRV, we also remove it because we will not need it again and free some memory here
-                                            if (crv != null) {
-                                                sid.setField(CollectionSchema.cr_host_count_i.getSolrFieldName(), crv.count);
-                                                sid.setField(CollectionSchema.cr_host_chance_d.getSolrFieldName(), crv.cr);
-                                                sid.setField(CollectionSchema.cr_host_norm_i.getSolrFieldName(), crv.crn);
-                                                proccount_citationchange.incrementAndGet();
-                                            }
-                                        }
-                                        if (tagtype == ProcessType.UNIQUE) {
-                                            postprocessing_http_unique(segment, doc, sid, url);
-                                            postprocessing_www_unique(segment, doc, sid, url);
-                                            postprocessing_doublecontent(segment, uniqueURLs, doc, sid, url);
-                                        }
-                                    } catch (IllegalArgumentException e) {}
-                                    // compute references
-                                    if (reference_computation) {
-                                        String hosthash = url.hosthash();
-                                        if (!hostExtentCache.containsKey(hosthash)) {
-                                            StringBuilder q = new StringBuilder();
-                                            q.append(CollectionSchema.host_id_s.getSolrFieldName()).append(":\"").append(hosthash).append("\" AND ").append(CollectionSchema.httpstatus_i.getSolrFieldName()).append(":200");
-                                            long hostExtentCount = segment.fulltext().getDefaultConnector().getCountByQuery(q.toString());
-                                            hostExtentCache.put(hosthash, hostExtentCount);
-                                        }
-                                        if (postprocessing_references(rrCache, sid, url, hostExtentCache)) proccount_referencechange.incrementAndGet();
-                                    }
-                                    // all processing steps checked, remove the processing and harvesting key
-                                    if (byPartialUpdate) {
-                                        sid.setField(CollectionSchema.process_sxt.getSolrFieldName(), null); // setting this to null will cause a removal when doing a partial update
-                                        sid.setField(CollectionSchema.harvestkey_s.getSolrFieldName(), null);
-                                    } else {
-                                        sid.removeField(CollectionSchema.process_sxt.getSolrFieldName());
-                                        sid.removeField(CollectionSchema.harvestkey_s.getSolrFieldName());
-                                    }
-                                    // with standard solr fields selected, the sid now contains the fields
-                                    // id, http_unique_b, www_unique_b, references_i, references_internal_i, references_external_i, references_exthosts_i, host_extent_i
-                                    // and the value for host_extent_i is by default 2147483647
-                                    // send back to index
-                                    //collectionConnector.deleteById(i);
-                                    if (byPartialUpdate) {
-                                        collectionConnector.update(sid);
-                                    } else {
-                                        collectionConnector.add(sid);
-                                    }
-                                    long thiscount = proccount.incrementAndGet(); allcount.incrementAndGet();
-                                    if (thiscount % 100 == 0) {
-                                        postprocessingActivity = "postprocessed " + thiscount + " from " + count + " collection documents; " +
-                                                (thiscount * 60000L / (System.currentTimeMillis() - start)) + " ppm; " +
-                                                ((System.currentTimeMillis() - start) * (count - thiscount) / thiscount / 60000) + " minutes remaining";
-                                        ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
-                                    }
-                                } catch (final Throwable e1) {
-                                    ConcurrentLog.logException(e1);
-                                    failids.add(i);
-                                }
-                                countcheck.incrementAndGet();
-                            }
-                        } catch (InterruptedException e) {
-                            ConcurrentLog.logException(e);
-                        }
-                    }
-                };
-                rewriteThread[rewrite_start].start();
-            }
-            // wait for termination
-            for (int rewrite_start = 0; rewrite_start < concurrency; rewrite_start++) rewriteThread[rewrite_start].join();
+            String partitioningKey = CollectionSchema.responsetime_i.getSolrFieldName();
+            Map<String, ReversibleScoreMap<String>> partitioningFacet = collectionConnector.getFacets(collection1query, 100000, partitioningKey);
+            ReversibleScoreMap<String> partitioning = partitioningFacet.get(partitioningKey);
+            long emptyCount = collectionConnector.getCountByQuery(partitioningKey + ":\"\" AND (" + collection1query + ")");
+            if (emptyCount > 0) partitioning.inc("", (int) emptyCount);
+            for (String partitioningValue: partitioning) {
+                String partitioningQuery = partitioningKey + ":\"" + partitioningValue + "\" AND (" + collection1query + ")";
+                postprocessingActivity = "collecting " + partitioning.get(partitioningValue) + " documents from partition \"" + partitioningValue + "\" (averall " + count + ") from the collection for harvestkey " + harvestkey + ", partitioned by " + partitioningKey;
+                // start collection of documents
+                final long start = System.currentTimeMillis();
+                final int concurrency = Math.max(1, Math.min((int) (MemoryControl.available() / (100L * 1024L * 1024L)), Runtime.getRuntime().availableProcessors()));
+                //final int concurrency = 1;
+                final boolean reference_computation = this.contains(CollectionSchema.references_i) &&
+                        this.contains(CollectionSchema.references_internal_i) &&
+                        this.contains(CollectionSchema.references_external_i) &&
+                        this.contains(CollectionSchema.references_exthosts_i);
+                ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
+                final BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(
+                        partitioningQuery,
+                        (this.contains(CollectionSchema.http_unique_b) || this.contains(CollectionSchema.www_unique_b)) ?
+                            CollectionSchema.host_subdomain_s.getSolrFieldName() + " asc," + // sort on subdomain to get hosts without subdomain first; that gives an opportunity to set www_unique_b flag to false
+                            CollectionSchema.url_protocol_s.getSolrFieldName() + " asc" // sort on protocol to get http before https; that gives an opportunity to set http_unique_b flag to false
+                            : null, // null sort is faster!
+                        0, 100000000, Long.MAX_VALUE, concurrency + 1, concurrency, true,
+                        byPartialUpdate ?
+                        new String[]{
+                        // the following fields are needed to perform the postprocessing
+                        // and should only be used for partial updates; for full updates use a
+                        // full list of fields to avoid LazyInstantiation which has poor performace
+                        CollectionSchema.id.getSolrFieldName(),
+                        CollectionSchema.sku.getSolrFieldName(),
+                        CollectionSchema.harvestkey_s.getSolrFieldName(),
+                        CollectionSchema.process_sxt.getSolrFieldName(),
+                        CollectionSchema.canonical_equal_sku_b.getSolrFieldName(),
+                        CollectionSchema.canonical_s.getSolrFieldName(),
+                        CollectionSchema.exact_signature_l.getSolrFieldName(),
+                        CollectionSchema.fuzzy_signature_l.getSolrFieldName(),
+                        CollectionSchema.title_exact_signature_l.getSolrFieldName(),
+                        CollectionSchema.description_exact_signature_l.getSolrFieldName(),
+                        CollectionSchema.host_id_s.getSolrFieldName(),
+                        CollectionSchema.host_s.getSolrFieldName(),
+                        CollectionSchema.host_subdomain_s.getSolrFieldName(),
+                        CollectionSchema.url_chars_i.getSolrFieldName(),
+                        CollectionSchema.url_protocol_s.getSolrFieldName(),
+                        CollectionSchema.httpstatus_i.getSolrFieldName(),
+                        CollectionSchema.inboundlinkscount_i.getSolrFieldName(),
+                        CollectionSchema.robots_i.getSolrFieldName()} :
+                        this.allFields());
+                final Thread rewriteThread[] = new Thread[concurrency];
+                for (int rewrite_start = 0; rewrite_start < concurrency; rewrite_start++) {
+                    rewriteThread[rewrite_start] = new Thread() {
+                        @Override
+                        public void run() {
+                            SolrDocument doc;
+                            try {
+                                while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
+                                    // for each to-be-processed entry work on the process tag
+                                    Collection<Object> proctags = doc.getFieldValues(CollectionSchema.process_sxt.getSolrFieldName());
+                                    final String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
+                                    final String i = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
+                                    if (proctags == null || proctags.size() == 0) {
+                                        // this should not happen since we collected the documents using a process_sxt:[* TO *] term
+                                        ConcurrentLog.warn("CollectionConfiguration", "no process_sxt entry for url " + u + ", id=" + i);
+                                        continue;
+                                    }
+                                    try {
+                                        DigestURL url = new DigestURL(u, ASCII.getBytes(i));
+                                        byte[] id = url.hash();
+                                        SolrInputDocument sid = byPartialUpdate ? new SolrInputDocument() : collection.toSolrInputDocument(doc, omitFields);
+                                        sid.setField(CollectionSchema.id.getSolrFieldName(), i);
+                                        for (Object tag: proctags) try {
+                                            // switch over tag types
+                                            ProcessType tagtype = ProcessType.valueOf((String) tag);
+                                            if (tagtype == ProcessType.CITATION &&
+                                                collection.contains(CollectionSchema.cr_host_count_i) &&
+                                                collection.contains(CollectionSchema.cr_host_chance_d) &&
+                                                collection.contains(CollectionSchema.cr_host_norm_i)) {
+                                                CRV crv = rankings.remove(ASCII.String(id)); // instead of 'get'ting the CRV, we also remove it because we will not need it again and free some memory here
+                                                if (crv != null) {
+                                                    sid.setField(CollectionSchema.cr_host_count_i.getSolrFieldName(), crv.count);
+                                                    sid.setField(CollectionSchema.cr_host_chance_d.getSolrFieldName(), crv.cr);
+                                                    sid.setField(CollectionSchema.cr_host_norm_i.getSolrFieldName(), crv.crn);
+                                                    proccount_citationchange.incrementAndGet();
+                                                }
+                                            }
+                                            if (tagtype == ProcessType.UNIQUE) {
+                                                postprocessing_http_unique(segment, doc, sid, url);
+                                                postprocessing_www_unique(segment, doc, sid, url);
+                                                postprocessing_doublecontent(segment, uniqueURLs, doc, sid, url);
+                                            }
+                                        } catch (IllegalArgumentException e) {}
+                                        // compute references
+                                        if (reference_computation) {
+                                            String hosthash = url.hosthash();
+                                            if (!hostExtentCache.containsKey(hosthash)) {
+                                                StringBuilder q = new StringBuilder();
+                                                q.append(CollectionSchema.host_id_s.getSolrFieldName()).append(":\"").append(hosthash).append("\" AND ").append(CollectionSchema.httpstatus_i.getSolrFieldName()).append(":200");
+                                                long hostExtentCount = segment.fulltext().getDefaultConnector().getCountByQuery(q.toString());
+                                                hostExtentCache.put(hosthash, hostExtentCount);
+                                            }
+                                            if (postprocessing_references(rrCache, sid, url, hostExtentCache)) proccount_referencechange.incrementAndGet();
+                                        }
+                                        // all processing steps checked, remove the processing and harvesting key
+                                        if (byPartialUpdate) {
+                                            sid.setField(CollectionSchema.process_sxt.getSolrFieldName(), null); // setting this to null will cause a removal when doing a partial update
+                                            sid.setField(CollectionSchema.harvestkey_s.getSolrFieldName(), null);
+                                        } else {
+                                            sid.removeField(CollectionSchema.process_sxt.getSolrFieldName());
+                                            sid.removeField(CollectionSchema.harvestkey_s.getSolrFieldName());
+                                        }
+                                        // with standard solr fields selected, the sid now contains the fields
+                                        // id, http_unique_b, www_unique_b, references_i, references_internal_i, references_external_i, references_exthosts_i, host_extent_i
+                                        // and the value for host_extent_i is by default 2147483647
+                                        // send back to index
+                                        //collectionConnector.deleteById(i);
+                                        if (byPartialUpdate) {
+                                            collectionConnector.update(sid);
+                                        } else {
+                                            collectionConnector.add(sid);
+                                        }
+                                        long thiscount = proccount.incrementAndGet(); allcount.incrementAndGet();
+                                        if (thiscount % 100 == 0) {
+                                            postprocessingActivity = "postprocessed " + thiscount + " from " + count + " collection documents; " +
+                                                    (thiscount * 60000L / (System.currentTimeMillis() - start)) + " ppm; " +
+                                                    ((System.currentTimeMillis() - start) * (count - thiscount) / thiscount / 60000) + " minutes remaining";
+                                            ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
+                                        }
+                                    } catch (final Throwable e1) {
+                                        ConcurrentLog.logException(e1);
+                                        failids.add(i);
+                                    }
+                                    countcheck.incrementAndGet();
+                                }
+                            } catch (InterruptedException e) {
+                                ConcurrentLog.logException(e);
+                            }
+                        }
+                    };
+                    rewriteThread[rewrite_start].start();
+                }
+                // wait for termination
+                for (int rewrite_start = 0; rewrite_start < concurrency; rewrite_start++) rewriteThread[rewrite_start].join();
+            }
             if (failids.size() > 0) {
                 ConcurrentLog.info("CollectionConfiguration", "cleanup_processing: deleting " + failids.size() + " documents which have permanent execution fails");
                 collectionConnector.deleteByIds(failids);
             }
-            if (count != countcheck.get()) ConcurrentLog.warn("CollectionConfiguration", "ambiguous collection document count for harvestkey " + harvestkey + ": expected=" + count + ", counted=" + countcheck); // big gap for harvestkey = null
+            if (count != countcheck.get()) ConcurrentLog.warn("CollectionConfiguration", "ambiguous collection document count for harvestkey " + harvestkey + ": expected=" + count + ", counted=" + countcheck + "; countquery=" + collection1query); // big gap for harvestkey = null
             ConcurrentLog.info("CollectionConfiguration", "cleanup_processing: re-calculated " + proccount + " new documents, " +
                     proccount_referencechange + " reference-count changes, " +
                     proccount_citationchange + " citation ranking changes.");
         } catch (final InterruptedException e2) {
             ConcurrentLog.warn("CollectionConfiguration", e2.getMessage(), e2);
         } catch (IOException e3) {
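This hunk carries the commit's main idea: rather than streaming every matching document through one huge query, the postprocessing first fetches a facet over a partitioning key (responsetime_i), adds a pseudo-partition for documents where the key is empty, and then runs one smaller concurrentDocumentsByQuery per facet value. Smaller result windows put less pressure on Solr memory and timeouts, and the per-partition queries are disjoint, so together they still cover the whole collection. The sketch below shows the same facet-partitioning idea against stock SolrJ rather than YaCy's SolrConnector wrapper; the class name, core URL, and the missing-value catch-all query are assumptions for illustration.

import java.util.ArrayList;
import java.util.List;

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.response.FacetField;
import org.apache.solr.client.solrj.response.QueryResponse;

public class FacetPartitioning {
    static final String BASE_URL = "http://localhost:8983/solr/collection1"; // assumed core URL

    public static void main(String[] args) throws Exception {
        String query = "process_sxt:[* TO *]";     // the big query to be split
        String partitioningKey = "responsetime_i"; // same key the commit uses

        try (SolrClient client = new HttpSolrClient.Builder(BASE_URL).build()) {
            // 1) one cheap facet query (rows=0) tells us which partition values exist
            SolrQuery facetQuery = new SolrQuery(query);
            facetQuery.setRows(0);
            facetQuery.setFacet(true);
            facetQuery.setFacetMinCount(1);
            facetQuery.setFacetLimit(100000);
            facetQuery.addFacetField(partitioningKey);
            QueryResponse rsp = client.query(facetQuery);
            FacetField facet = rsp.getFacetField(partitioningKey);

            // 2) derive one small sub-query per facet value; the result sets are
            //    disjoint and together cover the original query
            List<String> partitionQueries = new ArrayList<>();
            for (FacetField.Count value : facet.getValues()) {
                partitionQueries.add(partitioningKey + ":\"" + value.getName() + "\" AND (" + query + ")");
            }
            // 3) a catch-all partition for documents without the key, mirroring the
            //    emptyCount handling in the commit (negated range = "field missing")
            partitionQueries.add("-" + partitioningKey + ":[* TO *] AND (" + query + ")");

            // each of these queries can now be processed independently, e.g. with
            // paging or a bounded producer/consumer queue as in the hunks above
            for (String pq : partitionQueries) System.out.println(pq);
        }
    }
}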
