more fixes in postprocessing: partitioning of the complete queue to enable smaller queries
pull/1/head
Michael Peter Christen 11 years ago
parent 2bc6199408
commit 327e83bfe7

@@ -269,24 +269,27 @@ public abstract class AbstractSolrConnector implements SolrConnector {
             public void run() {
                 this.setName("AbstractSolrConnector:concurrentIDsByQuery(" + querystring + ")");
                 int o = offset;
-                while (System.currentTimeMillis() < endtime) {
-                    try {
-                        SolrDocumentList sdl = getDocumentListByQuery(querystring, sort, o, Math.min(maxcount, pagesize_ids), CollectionSchema.id.getSolrFieldName());
-                        int count = 0;
-                        for (SolrDocument d: sdl) {
-                            try {queue.put((String) d.getFieldValue(CollectionSchema.id.getSolrFieldName()));} catch (final InterruptedException e) {break;}
-                            count++;
-                        }
-                        if (count < pagesize_ids) break;
-                        o += pagesize_ids;
-                    } catch (final SolrException e) {
-                        break;
-                    } catch (final IOException e) {
-                        break;
-                    }
-                }
-                for (int i = 0; i < concurrency; i++) {
-                    try {queue.put(AbstractSolrConnector.POISON_ID);} catch (final InterruptedException e1) {}
+                try {
+                    while (System.currentTimeMillis() < endtime) {
+                        try {
+                            SolrDocumentList sdl = getDocumentListByQuery(querystring, sort, o, Math.min(maxcount, pagesize_ids), CollectionSchema.id.getSolrFieldName());
+                            int count = 0;
+                            for (SolrDocument d: sdl) {
+                                try {queue.put((String) d.getFieldValue(CollectionSchema.id.getSolrFieldName()));} catch (final InterruptedException e) {break;}
+                                count++;
+                            }
+                            if (count < pagesize_ids) break;
+                            o += pagesize_ids;
+                        } catch (final SolrException e) {
+                            break;
+                        } catch (final IOException e) {
+                            break;
+                        }
+                    }
+                } catch (Throwable e) {} finally {
+                    for (int i = 0; i < concurrency; i++) {
+                        try {queue.put(AbstractSolrConnector.POISON_ID);} catch (final InterruptedException e1) {}
+                    }
                 }
             }
         };
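The hunk above fixes a consumer-hang hazard: before the change, any throwable other than the caught SolrException/IOException would abort run() before the poison IDs were enqueued, leaving every consumer blocked forever on queue.take(). Moving the poison loop into a finally block guarantees the queue is always terminated. Below is a minimal, self-contained sketch of this producer/consumer poison-pill pattern; all names (PoisonPillDemo, POISON, "id-…") are illustrative, not YaCy API.

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;

public class PoisonPillDemo {
    static final String POISON = "\u0000POISON"; // sentinel object; consumers compare by reference

    public static void main(String[] args) throws InterruptedException {
        final BlockingQueue<String> queue = new ArrayBlockingQueue<>(100);
        final int concurrency = 3;

        Thread producer = new Thread(() -> {
            try {
                for (int i = 0; i < 10; i++) queue.put("id-" + i); // may fail mid-way
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            } finally {
                // the finally block guarantees one poison pill per consumer,
                // even if production aborted, so every consumer can terminate
                for (int i = 0; i < concurrency; i++) {
                    try { queue.put(POISON); } catch (InterruptedException e) { Thread.currentThread().interrupt(); }
                }
            }
        });

        Thread[] consumers = new Thread[concurrency];
        for (int c = 0; c < concurrency; c++) {
            consumers[c] = new Thread(() -> {
                try {
                    String id;
                    // reference comparison is intentional: the exact POISON object was enqueued
                    while ((id = queue.take()) != POISON) {
                        System.out.println(Thread.currentThread().getName() + " got " + id);
                    }
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            });
            consumers[c].start();
        }
        producer.start();
        producer.join();
        for (Thread t : consumers) t.join();
    }
}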

@@ -300,7 +300,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen
         QueryResponse rsp;
         int retry = 0;
         Throwable error = null;
-        while (retry++ < 60) {
+        while (retry++ < 10) {
             try {
                 if (q != null) Thread.currentThread().setName("solr query: q = " + q + (fq == null ? "" : ", fq = " + fq) + (sort == null ? "" : ", sort = " + sort) + "; retry = " + retry + "; fl = " + fl); // for debugging in Threaddump
                 rsp = this.server.query(params);
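Lowering the retry bound from 60 to 10 caps the worst-case time a query thread spins against an unresponsive Solr server. The underlying pattern, remembering the last error and rethrowing it once the budget is exhausted, is sketched below with illustrative names (retry, MAX_RETRY, RETRY_DELAY_MS); it is not the YaCy code itself.

import java.io.IOException;
import java.util.concurrent.Callable;

public class BoundedRetry {
    static final int MAX_RETRY = 10;       // the commit lowers YaCy's cap from 60 to 10
    static final long RETRY_DELAY_MS = 100;

    static <T> T retry(Callable<T> action) throws IOException {
        Throwable error = null;
        int retry = 0;
        while (retry++ < MAX_RETRY) {
            try {
                return action.call();
            } catch (Exception e) {
                error = e; // remember the last failure, pause briefly, try again
                try { Thread.sleep(RETRY_DELAY_MS); } catch (InterruptedException ie) {
                    Thread.currentThread().interrupt();
                    break;
                }
            }
        }
        throw new IOException("gave up after " + MAX_RETRY + " attempts", error);
    }

    public static void main(String[] args) throws IOException {
        // usage: wrap a flaky operation
        String result = retry(() -> {
            if (Math.random() < 0.7) throw new IllegalStateException("transient failure");
            return "ok";
        });
        System.out.println(result);
    }
}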

@@ -1252,166 +1252,180 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
         // process all documents in collection
         final Map<String, Long> hostExtentCache = new HashMap<String, Long>(); // a mapping from the host id to the number of documents which contain this host-id
         final Set<String> uniqueURLs = new ConcurrentHashSet<String>(); // will be used in a concurrent environment
+        final Set<String> omitFields = new HashSet<String>();
+        omitFields.add(CollectionSchema.process_sxt.getSolrFieldName());
+        omitFields.add(CollectionSchema.harvestkey_s.getSolrFieldName());
+        final Collection<String> failids = new ArrayList<String>();
+        final AtomicInteger countcheck = new AtomicInteger(0);
+        final AtomicInteger proccount = new AtomicInteger();
+        final AtomicInteger proccount_referencechange = new AtomicInteger();
+        final AtomicInteger proccount_citationchange = new AtomicInteger();
         try {
-            final Set<String> omitFields = new HashSet<String>();
-            omitFields.add(CollectionSchema.process_sxt.getSolrFieldName());
-            omitFields.add(CollectionSchema.harvestkey_s.getSolrFieldName());
+            // partitioning of the index, get a facet for a partitioning key
             final long count = collectionConnector.getCountByQuery(collection1query);
-            final long start = System.currentTimeMillis();
-            final int concurrency = Math.max(1, Math.min((int) (MemoryControl.available() / (100L * 1024L * 1024L)), Runtime.getRuntime().availableProcessors()));
-            //final int concurrency = 1;
-            final boolean reference_computation = this.contains(CollectionSchema.references_i) &&
-                    this.contains(CollectionSchema.references_internal_i) &&
-                    this.contains(CollectionSchema.references_external_i) &&
-                    this.contains(CollectionSchema.references_exthosts_i);
-            postprocessingActivity = "collecting " + count + " documents from the collection for harvestkey " + harvestkey;
-            ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
-            final BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(
-                    collection1query,
-                    (this.contains(CollectionSchema.http_unique_b) || this.contains(CollectionSchema.www_unique_b)) ?
-                        CollectionSchema.host_subdomain_s.getSolrFieldName() + " asc," + // sort on subdomain to get hosts without subdomain first; that gives an opportunity to set www_unique_b flag to false
-                        CollectionSchema.url_protocol_s.getSolrFieldName() + " asc" // sort on protocol to get http before https; that gives an opportunity to set http_unique_b flag to false
-                        : null, // null sort is faster!
-                    0, 100000000, Long.MAX_VALUE, concurrency + 1, concurrency, true,
-                    byPartialUpdate ?
-                    new String[]{
-                    // the following fields are needed to perform the postprocessing
-                    // and should only be used for partial updates; for full updates use a
-                    // full list of fields to avoid LazyInstantiation which has poor performace
-                    CollectionSchema.id.getSolrFieldName(),
-                    CollectionSchema.sku.getSolrFieldName(),
-                    CollectionSchema.harvestkey_s.getSolrFieldName(),
-                    CollectionSchema.process_sxt.getSolrFieldName(),
-                    CollectionSchema.canonical_equal_sku_b.getSolrFieldName(),
-                    CollectionSchema.canonical_s.getSolrFieldName(),
-                    CollectionSchema.exact_signature_l.getSolrFieldName(),
-                    CollectionSchema.fuzzy_signature_l.getSolrFieldName(),
-                    CollectionSchema.title_exact_signature_l.getSolrFieldName(),
-                    CollectionSchema.description_exact_signature_l.getSolrFieldName(),
-                    CollectionSchema.host_id_s.getSolrFieldName(),
-                    CollectionSchema.host_s.getSolrFieldName(),
-                    CollectionSchema.host_subdomain_s.getSolrFieldName(),
-                    CollectionSchema.url_chars_i.getSolrFieldName(),
-                    CollectionSchema.url_protocol_s.getSolrFieldName(),
-                    CollectionSchema.httpstatus_i.getSolrFieldName(),
-                    CollectionSchema.inboundlinkscount_i.getSolrFieldName(),
-                    CollectionSchema.robots_i.getSolrFieldName()} :
-                    this.allFields());
-            final AtomicInteger proccount = new AtomicInteger();
-            final AtomicInteger proccount_referencechange = new AtomicInteger();
-            final AtomicInteger proccount_citationchange = new AtomicInteger();
-            final AtomicInteger countcheck = new AtomicInteger(0);
-            final Collection<String> failids = new ArrayList<String>();
-            final Thread rewriteThread[] = new Thread[concurrency];
-            for (int rewrite_start = 0; rewrite_start < concurrency; rewrite_start++) {
-                rewriteThread[rewrite_start] = new Thread() {
-                    @Override
-                    public void run() {
-                        SolrDocument doc;
-                        try {
-                            while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
-                                // for each to-be-processed entry work on the process tag
-                                Collection<Object> proctags = doc.getFieldValues(CollectionSchema.process_sxt.getSolrFieldName());
-                                final String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
-                                final String i = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
-                                if (proctags == null || proctags.size() == 0) {
-                                    // this should not happen since we collected the documents using a process_sxt:[* TO *] term
-                                    ConcurrentLog.warn("CollectionConfiguration", "no process_sxt entry for url " + u + ", id=" + i);
-                                    continue;
-                                }
-                                try {
-                                    DigestURL url = new DigestURL(u, ASCII.getBytes(i));
-                                    byte[] id = url.hash();
-                                    SolrInputDocument sid = byPartialUpdate ? new SolrInputDocument() : collection.toSolrInputDocument(doc, omitFields);
-                                    sid.setField(CollectionSchema.id.getSolrFieldName(), i);
-                                    for (Object tag: proctags) try {
-                                        // switch over tag types
-                                        ProcessType tagtype = ProcessType.valueOf((String) tag);
-                                        if (tagtype == ProcessType.CITATION &&
-                                            collection.contains(CollectionSchema.cr_host_count_i) &&
-                                            collection.contains(CollectionSchema.cr_host_chance_d) &&
-                                            collection.contains(CollectionSchema.cr_host_norm_i)) {
-                                            CRV crv = rankings.remove(ASCII.String(id)); // instead of 'get'ting the CRV, we also remove it because we will not need it again and free some memory here
-                                            if (crv != null) {
-                                                sid.setField(CollectionSchema.cr_host_count_i.getSolrFieldName(), crv.count);
-                                                sid.setField(CollectionSchema.cr_host_chance_d.getSolrFieldName(), crv.cr);
-                                                sid.setField(CollectionSchema.cr_host_norm_i.getSolrFieldName(), crv.crn);
-                                                proccount_citationchange.incrementAndGet();
-                                            }
-                                        }
-                                        if (tagtype == ProcessType.UNIQUE) {
-                                            postprocessing_http_unique(segment, doc, sid, url);
-                                            postprocessing_www_unique(segment, doc, sid, url);
-                                            postprocessing_doublecontent(segment, uniqueURLs, doc, sid, url);
-                                        }
-                                    } catch (IllegalArgumentException e) {}
-                                    // compute references
-                                    if (reference_computation) {
-                                        String hosthash = url.hosthash();
-                                        if (!hostExtentCache.containsKey(hosthash)) {
-                                            StringBuilder q = new StringBuilder();
-                                            q.append(CollectionSchema.host_id_s.getSolrFieldName()).append(":\"").append(hosthash).append("\" AND ").append(CollectionSchema.httpstatus_i.getSolrFieldName()).append(":200");
-                                            long hostExtentCount = segment.fulltext().getDefaultConnector().getCountByQuery(q.toString());
-                                            hostExtentCache.put(hosthash, hostExtentCount);
-                                        }
-                                        if (postprocessing_references(rrCache, sid, url, hostExtentCache)) proccount_referencechange.incrementAndGet();
-                                    }
-                                    // all processing steps checked, remove the processing and harvesting key
-                                    if (byPartialUpdate) {
-                                        sid.setField(CollectionSchema.process_sxt.getSolrFieldName(), null); // setting this to null will cause a removal when doing a partial update
-                                        sid.setField(CollectionSchema.harvestkey_s.getSolrFieldName(), null);
-                                    } else {
-                                        sid.removeField(CollectionSchema.process_sxt.getSolrFieldName());
-                                        sid.removeField(CollectionSchema.harvestkey_s.getSolrFieldName());
-                                    }
-                                    // with standard solr fields selected, the sid now contains the fields
-                                    // id, http_unique_b, www_unique_b, references_i, references_internal_i, references_external_i, references_exthosts_i, host_extent_i
-                                    // and the value for host_extent_i is by default 2147483647
-                                    // send back to index
-                                    //collectionConnector.deleteById(i);
-                                    if (byPartialUpdate) {
-                                        collectionConnector.update(sid);
-                                    } else {
-                                        collectionConnector.add(sid);
-                                    }
-                                    long thiscount = proccount.incrementAndGet(); allcount.incrementAndGet();
-                                    if (thiscount % 100 == 0) {
-                                        postprocessingActivity = "postprocessed " + thiscount + " from " + count + " collection documents; " +
-                                                (thiscount * 60000L / (System.currentTimeMillis() - start)) + " ppm; " +
-                                                ((System.currentTimeMillis() - start) * (count - thiscount) / thiscount / 60000) + " minutes remaining";
-                                        ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
-                                    }
-                                } catch (final Throwable e1) {
-                                    ConcurrentLog.logException(e1);
-                                    failids.add(i);
-                                }
-                                countcheck.incrementAndGet();
-                            }
-                        } catch (InterruptedException e) {
-                            ConcurrentLog.logException(e);
-                        }
-                    }
-                };
-                rewriteThread[rewrite_start].start();
-            }
-            // wait for termination
-            for (int rewrite_start = 0; rewrite_start < concurrency; rewrite_start++) rewriteThread[rewrite_start].join();
+            String partitioningKey = CollectionSchema.responsetime_i.getSolrFieldName();
+            Map<String, ReversibleScoreMap<String>> partitioningFacet = collectionConnector.getFacets(collection1query, 100000, partitioningKey);
+            ReversibleScoreMap<String> partitioning = partitioningFacet.get(partitioningKey);
+            long emptyCount = collectionConnector.getCountByQuery(partitioningKey + ":\"\" AND (" + collection1query + ")");
+            if (emptyCount > 0) partitioning.inc("", (int) emptyCount);
+            for (String partitioningValue: partitioning) {
+                String partitioningQuery = partitioningKey + ":\"" + partitioningValue + "\" AND (" + collection1query + ")";
+                postprocessingActivity = "collecting " + partitioning.get(partitioningValue) + " documents from partition \"" + partitioningValue + "\" (averall " + count + ") from the collection for harvestkey " + harvestkey + ", partitioned by " + partitioningKey;
+                // start collection of documents
+                final long start = System.currentTimeMillis();
+                final int concurrency = Math.max(1, Math.min((int) (MemoryControl.available() / (100L * 1024L * 1024L)), Runtime.getRuntime().availableProcessors()));
+                //final int concurrency = 1;
+                final boolean reference_computation = this.contains(CollectionSchema.references_i) &&
+                        this.contains(CollectionSchema.references_internal_i) &&
+                        this.contains(CollectionSchema.references_external_i) &&
+                        this.contains(CollectionSchema.references_exthosts_i);
+                ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
+                final BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(
+                        partitioningQuery,
+                        (this.contains(CollectionSchema.http_unique_b) || this.contains(CollectionSchema.www_unique_b)) ?
+                            CollectionSchema.host_subdomain_s.getSolrFieldName() + " asc," + // sort on subdomain to get hosts without subdomain first; that gives an opportunity to set www_unique_b flag to false
+                            CollectionSchema.url_protocol_s.getSolrFieldName() + " asc" // sort on protocol to get http before https; that gives an opportunity to set http_unique_b flag to false
+                            : null, // null sort is faster!
+                        0, 100000000, Long.MAX_VALUE, concurrency + 1, concurrency, true,
+                        byPartialUpdate ?
+                        new String[]{
+                        // the following fields are needed to perform the postprocessing
+                        // and should only be used for partial updates; for full updates use a
+                        // full list of fields to avoid LazyInstantiation which has poor performace
+                        CollectionSchema.id.getSolrFieldName(),
+                        CollectionSchema.sku.getSolrFieldName(),
+                        CollectionSchema.harvestkey_s.getSolrFieldName(),
+                        CollectionSchema.process_sxt.getSolrFieldName(),
+                        CollectionSchema.canonical_equal_sku_b.getSolrFieldName(),
+                        CollectionSchema.canonical_s.getSolrFieldName(),
+                        CollectionSchema.exact_signature_l.getSolrFieldName(),
+                        CollectionSchema.fuzzy_signature_l.getSolrFieldName(),
+                        CollectionSchema.title_exact_signature_l.getSolrFieldName(),
+                        CollectionSchema.description_exact_signature_l.getSolrFieldName(),
+                        CollectionSchema.host_id_s.getSolrFieldName(),
+                        CollectionSchema.host_s.getSolrFieldName(),
+                        CollectionSchema.host_subdomain_s.getSolrFieldName(),
+                        CollectionSchema.url_chars_i.getSolrFieldName(),
+                        CollectionSchema.url_protocol_s.getSolrFieldName(),
+                        CollectionSchema.httpstatus_i.getSolrFieldName(),
+                        CollectionSchema.inboundlinkscount_i.getSolrFieldName(),
+                        CollectionSchema.robots_i.getSolrFieldName()} :
+                        this.allFields());
+                final Thread rewriteThread[] = new Thread[concurrency];
+                for (int rewrite_start = 0; rewrite_start < concurrency; rewrite_start++) {
+                    rewriteThread[rewrite_start] = new Thread() {
+                        @Override
+                        public void run() {
+                            SolrDocument doc;
+                            try {
+                                while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
+                                    // for each to-be-processed entry work on the process tag
+                                    Collection<Object> proctags = doc.getFieldValues(CollectionSchema.process_sxt.getSolrFieldName());
+                                    final String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
+                                    final String i = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
+                                    if (proctags == null || proctags.size() == 0) {
+                                        // this should not happen since we collected the documents using a process_sxt:[* TO *] term
+                                        ConcurrentLog.warn("CollectionConfiguration", "no process_sxt entry for url " + u + ", id=" + i);
+                                        continue;
+                                    }
+                                    try {
+                                        DigestURL url = new DigestURL(u, ASCII.getBytes(i));
+                                        byte[] id = url.hash();
+                                        SolrInputDocument sid = byPartialUpdate ? new SolrInputDocument() : collection.toSolrInputDocument(doc, omitFields);
+                                        sid.setField(CollectionSchema.id.getSolrFieldName(), i);
+                                        for (Object tag: proctags) try {
+                                            // switch over tag types
+                                            ProcessType tagtype = ProcessType.valueOf((String) tag);
+                                            if (tagtype == ProcessType.CITATION &&
+                                                collection.contains(CollectionSchema.cr_host_count_i) &&
+                                                collection.contains(CollectionSchema.cr_host_chance_d) &&
+                                                collection.contains(CollectionSchema.cr_host_norm_i)) {
+                                                CRV crv = rankings.remove(ASCII.String(id)); // instead of 'get'ting the CRV, we also remove it because we will not need it again and free some memory here
+                                                if (crv != null) {
+                                                    sid.setField(CollectionSchema.cr_host_count_i.getSolrFieldName(), crv.count);
+                                                    sid.setField(CollectionSchema.cr_host_chance_d.getSolrFieldName(), crv.cr);
+                                                    sid.setField(CollectionSchema.cr_host_norm_i.getSolrFieldName(), crv.crn);
+                                                    proccount_citationchange.incrementAndGet();
+                                                }
+                                            }
+                                            if (tagtype == ProcessType.UNIQUE) {
+                                                postprocessing_http_unique(segment, doc, sid, url);
+                                                postprocessing_www_unique(segment, doc, sid, url);
+                                                postprocessing_doublecontent(segment, uniqueURLs, doc, sid, url);
+                                            }
+                                        } catch (IllegalArgumentException e) {}
+                                        // compute references
+                                        if (reference_computation) {
+                                            String hosthash = url.hosthash();
+                                            if (!hostExtentCache.containsKey(hosthash)) {
+                                                StringBuilder q = new StringBuilder();
+                                                q.append(CollectionSchema.host_id_s.getSolrFieldName()).append(":\"").append(hosthash).append("\" AND ").append(CollectionSchema.httpstatus_i.getSolrFieldName()).append(":200");
+                                                long hostExtentCount = segment.fulltext().getDefaultConnector().getCountByQuery(q.toString());
+                                                hostExtentCache.put(hosthash, hostExtentCount);
+                                            }
+                                            if (postprocessing_references(rrCache, sid, url, hostExtentCache)) proccount_referencechange.incrementAndGet();
+                                        }
+                                        // all processing steps checked, remove the processing and harvesting key
+                                        if (byPartialUpdate) {
+                                            sid.setField(CollectionSchema.process_sxt.getSolrFieldName(), null); // setting this to null will cause a removal when doing a partial update
+                                            sid.setField(CollectionSchema.harvestkey_s.getSolrFieldName(), null);
+                                        } else {
+                                            sid.removeField(CollectionSchema.process_sxt.getSolrFieldName());
+                                            sid.removeField(CollectionSchema.harvestkey_s.getSolrFieldName());
+                                        }
+                                        // with standard solr fields selected, the sid now contains the fields
+                                        // id, http_unique_b, www_unique_b, references_i, references_internal_i, references_external_i, references_exthosts_i, host_extent_i
+                                        // and the value for host_extent_i is by default 2147483647
+                                        // send back to index
+                                        //collectionConnector.deleteById(i);
+                                        if (byPartialUpdate) {
+                                            collectionConnector.update(sid);
+                                        } else {
+                                            collectionConnector.add(sid);
+                                        }
+                                        long thiscount = proccount.incrementAndGet(); allcount.incrementAndGet();
+                                        if (thiscount % 100 == 0) {
+                                            postprocessingActivity = "postprocessed " + thiscount + " from " + count + " collection documents; " +
+                                                    (thiscount * 60000L / (System.currentTimeMillis() - start)) + " ppm; " +
+                                                    ((System.currentTimeMillis() - start) * (count - thiscount) / thiscount / 60000) + " minutes remaining";
+                                            ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
+                                        }
+                                    } catch (final Throwable e1) {
+                                        ConcurrentLog.logException(e1);
+                                        failids.add(i);
+                                    }
+                                    countcheck.incrementAndGet();
+                                }
+                            } catch (InterruptedException e) {
+                                ConcurrentLog.logException(e);
+                            }
+                        }
+                    };
+                    rewriteThread[rewrite_start].start();
+                }
+                // wait for termination
+                for (int rewrite_start = 0; rewrite_start < concurrency; rewrite_start++) rewriteThread[rewrite_start].join();
+            }
             if (failids.size() > 0) {
                 ConcurrentLog.info("CollectionConfiguration", "cleanup_processing: deleting " + failids.size() + " documents which have permanent execution fails");
                 collectionConnector.deleteByIds(failids);
             }
-            if (count != countcheck.get()) ConcurrentLog.warn("CollectionConfiguration", "ambiguous collection document count for harvestkey " + harvestkey + ": expected=" + count + ", counted=" + countcheck); // big gap for harvestkey = null
+            if (count != countcheck.get()) ConcurrentLog.warn("CollectionConfiguration", "ambiguous collection document count for harvestkey " + harvestkey + ": expected=" + count + ", counted=" + countcheck + "; countquery=" + collection1query); // big gap for harvestkey = null
             ConcurrentLog.info("CollectionConfiguration", "cleanup_processing: re-calculated " + proccount + " new documents, " +
                     proccount_referencechange + " reference-count changes, " +
                     proccount_citationchange + " citation ranking changes.");
         } catch (final InterruptedException e2) {
             ConcurrentLog.warn("CollectionConfiguration", e2.getMessage(), e2);
         } catch (IOException e3) {
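This hunk carries the commit's main idea: rather than streaming every matching document through one huge query, the postprocessing first fetches a facet over a partitioning key (responsetime_i), adds a pseudo-partition for documents where the key is empty, and then runs one smaller concurrentDocumentsByQuery per facet value. Smaller result windows put less pressure on Solr memory and timeouts, and the per-partition queries are disjoint, so together they still cover the whole collection. The sketch below shows the same facet-partitioning idea against stock SolrJ rather than YaCy's SolrConnector wrapper; the class name, core URL, and the missing-value catch-all query are assumptions for illustration.

import java.util.ArrayList;
import java.util.List;

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.response.FacetField;
import org.apache.solr.client.solrj.response.QueryResponse;

public class FacetPartitioning {
    static final String BASE_URL = "http://localhost:8983/solr/collection1"; // assumed core URL

    public static void main(String[] args) throws Exception {
        String query = "process_sxt:[* TO *]";     // the big query to be split
        String partitioningKey = "responsetime_i"; // same key the commit uses

        try (SolrClient client = new HttpSolrClient.Builder(BASE_URL).build()) {
            // 1) one cheap facet query (rows=0) tells us which partition values exist
            SolrQuery facetQuery = new SolrQuery(query);
            facetQuery.setRows(0);
            facetQuery.setFacet(true);
            facetQuery.setFacetMinCount(1);
            facetQuery.setFacetLimit(100000);
            facetQuery.addFacetField(partitioningKey);
            QueryResponse rsp = client.query(facetQuery);
            FacetField facet = rsp.getFacetField(partitioningKey);

            // 2) derive one small sub-query per facet value; the result sets are
            //    disjoint and together cover the original query
            List<String> partitionQueries = new ArrayList<>();
            for (FacetField.Count value : facet.getValues()) {
                partitionQueries.add(partitioningKey + ":\"" + value.getName() + "\" AND (" + query + ")");
            }
            // 3) a catch-all partition for documents without the key, mirroring the
            //    emptyCount handling in the commit (negated range = "field missing")
            partitionQueries.add("-" + partitioningKey + ":[* TO *] AND (" + query + ")");

            // each of these queries can now be processed independently, e.g. with
            // paging or a bounded producer/consumer queue as in the hunks above
            for (String pq : partitionQueries) System.out.println(pq);
        }
    }
}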
