From 8d2e7262d9568658269ebe16b9cc28ac81dac6af Mon Sep 17 00:00:00 2001 From: sgaebel Date: Fri, 4 Jan 2019 18:46:59 +0100 Subject: [PATCH] Recrawl: - set the chunksize to 100 to meet the max of the embedded solr - re-enable sorting (the case where we switched it off should be gone) - enable recrawling on remote-solr --- htroot/IndexReIndexMonitor_p.java | 100 +++++++++--------- .../net/yacy/crawler/RecrawlBusyThread.java | 5 +- 2 files changed, 50 insertions(+), 55 deletions(-) diff --git a/htroot/IndexReIndexMonitor_p.java b/htroot/IndexReIndexMonitor_p.java index ac7cb6d0f..71c9eceed 100644 --- a/htroot/IndexReIndexMonitor_p.java +++ b/htroot/IndexReIndexMonitor_p.java @@ -140,58 +140,54 @@ public class IndexReIndexMonitor_p { if (recrawlbt == null || recrawlbt.shutdownInProgress()) { prop.put("recrawljobrunning_simulationResult", 0); prop.put("recrawljobrunning_error", 0); - if(!sb.index.fulltext().connectedLocalSolr()) { - prop.put("recrawljobrunning_error", 1); // Re-crawl works only with an embedded local Solr index - } else { - if (post.containsKey("recrawlnow")) { - sb.deployThread(RecrawlBusyThread.THREAD_NAME, "ReCrawl", "recrawl existing documents", null, - new RecrawlBusyThread(Switchboard.getSwitchboard(), recrawlQuery, inclerrdoc), 1000); - recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME); - - /* store this call as an api call for easy scheduling possibility */ - if(sb.tables != null) { - /* We avoid creating a duplicate of any already recorded API call with the same parameters */ - final Row lastExecutedCall = WorkTables - .selectLastExecutedApiCall(IndexReIndexMonitor_p.SERVLET_NAME, post, sb); - if (lastExecutedCall != null && !post.containsKey(WorkTables.TABLE_API_COL_APICALL_PK)) { - byte[] lastExecutedCallPk = lastExecutedCall.getPK(); - if (lastExecutedCallPk != null) { - post.add(WorkTables.TABLE_API_COL_APICALL_PK, UTF8.String(lastExecutedCallPk)); - } - } - sb.tables.recordAPICall(post, IndexReIndexMonitor_p.SERVLET_NAME, 
WorkTables.TABLE_API_TYPE_CRAWLER, - "Recrawl documents matching selection query : " + recrawlQuery); - } - } else if(post.containsKey("simulateRecrawl")) { - final SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector(); - if (solrConnector != null && !solrConnector.isClosed()) { - try { - /* Ensure indexed data is up-to-date */ - solrConnector.commit(true); - // query all or only httpstatus=200 depending on includefailed flag - final String finalQuery = RecrawlBusyThread.buildSelectionQuery(recrawlQuery, inclerrdoc); - final long count = solrConnector.getCountByQuery(finalQuery); - prop.put("recrawljobrunning_simulationResult", 1); - prop.put("recrawljobrunning_simulationResult_docCount", count); - if(count > 0) { - /* Got some results : add a link to the related solr select URL for easily browsing results */ - final int maxRows = 10; - final String solrSelectUrl = genLocalSolrSelectUrl(finalQuery, maxRows); - prop.put("recrawljobrunning_simulationResult_showSelectLink", 1); - prop.put("recrawljobrunning_simulationResult_showSelectLink_rows", maxRows); - prop.put("recrawljobrunning_simulationResult_showSelectLink_browseSelectedUrl", solrSelectUrl); - } else { - prop.put("recrawljobrunning_simulationResult_showSelectLink", 0); - } - } catch (final IOException e) { - prop.put("recrawljobrunning_simulationResult", 2); - ConcurrentLog.logException(e); - } - } else { - prop.put("recrawljobrunning_simulationResult", 3); - } - } - } + if (post.containsKey("recrawlnow")) { + sb.deployThread(RecrawlBusyThread.THREAD_NAME, "ReCrawl", "recrawl existing documents", null, + new RecrawlBusyThread(Switchboard.getSwitchboard(), recrawlQuery, inclerrdoc), 1000); + recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME); + + /* store this call as an api call for easy scheduling possibility */ + if(sb.tables != null) { + /* We avoid creating a duplicate of any already recorded API call with the same parameters */ + final Row lastExecutedCall = WorkTables + 
.selectLastExecutedApiCall(IndexReIndexMonitor_p.SERVLET_NAME, post, sb); + if (lastExecutedCall != null && !post.containsKey(WorkTables.TABLE_API_COL_APICALL_PK)) { + byte[] lastExecutedCallPk = lastExecutedCall.getPK(); + if (lastExecutedCallPk != null) { + post.add(WorkTables.TABLE_API_COL_APICALL_PK, UTF8.String(lastExecutedCallPk)); + } + } + sb.tables.recordAPICall(post, IndexReIndexMonitor_p.SERVLET_NAME, WorkTables.TABLE_API_TYPE_CRAWLER, + "Recrawl documents matching selection query : " + recrawlQuery); + } + } else if(post.containsKey("simulateRecrawl")) { + final SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector(); + if (solrConnector != null && !solrConnector.isClosed()) { + try { + /* Ensure indexed data is up-to-date */ + solrConnector.commit(true); + // query all or only httpstatus=200 depending on includefailed flag + final String finalQuery = RecrawlBusyThread.buildSelectionQuery(recrawlQuery, inclerrdoc); + final long count = solrConnector.getCountByQuery(finalQuery); + prop.put("recrawljobrunning_simulationResult", 1); + prop.put("recrawljobrunning_simulationResult_docCount", count); + if(count > 0) { + /* Got some results : add a link to the related solr select URL for easily browsing results */ + final int maxRows = 10; + final String solrSelectUrl = genLocalSolrSelectUrl(finalQuery, maxRows); + prop.put("recrawljobrunning_simulationResult_showSelectLink", 1); + prop.put("recrawljobrunning_simulationResult_showSelectLink_rows", maxRows); + prop.put("recrawljobrunning_simulationResult_showSelectLink_browseSelectedUrl", solrSelectUrl); + } else { + prop.put("recrawljobrunning_simulationResult_showSelectLink", 0); + } + } catch (final IOException e) { + prop.put("recrawljobrunning_simulationResult", 2); + ConcurrentLog.logException(e); + } + } else { + prop.put("recrawljobrunning_simulationResult", 3); + } + } if(post.containsKey("recrawlDefaults")) { recrawlQuery = RecrawlBusyThread.DEFAULT_QUERY; diff --git 
a/source/net/yacy/crawler/RecrawlBusyThread.java b/source/net/yacy/crawler/RecrawlBusyThread.java index df7b87f0c..c658834ab 100644 --- a/source/net/yacy/crawler/RecrawlBusyThread.java +++ b/source/net/yacy/crawler/RecrawlBusyThread.java @@ -72,7 +72,7 @@ public class RecrawlBusyThread extends AbstractBusyThread { private boolean includefailed; private int chunkstart = 0; - private final int chunksize; + private final int chunksize = 100; private final Switchboard sb; /** buffer of urls to recrawl */ @@ -129,8 +129,7 @@ public class RecrawlBusyThread extends AbstractBusyThread { this.urlstack = new HashSet(); // workaround to prevent solr exception on existing index (not fully reindexed) since intro of schema with docvalues // org.apache.solr.core.SolrCore java.lang.IllegalStateException: unexpected docvalues type NONE for field 'load_date_dt' (expected=NUMERIC). Use UninvertingReader or index with docvalues. - this.solrSortBy = null; // CollectionSchema.load_date_dt.getSolrFieldName() + " asc"; - this.chunksize = sb.getConfigInt(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 200); + solrSortBy = CollectionSchema.load_date_dt.getSolrFieldName() + " asc"; final SolrConnector solrConnector = this.sb.index.fulltext().getDefaultConnector(); if (solrConnector != null && !solrConnector.isClosed()) {