adds deleting during recrawl

pull/389/head
sgaebel 5 years ago
parent e0ad8ca9da
commit 80785b785e

@ -73,6 +73,9 @@
<div class="form-group"> <div class="form-group">
<label>Include failed URLs <input type="checkbox" name="includefailedurls" #(includefailedurls)#::checked="checked"#(/includefailedurls)# /></label> <label>Include failed URLs <input type="checkbox" name="includefailedurls" #(includefailedurls)#::checked="checked"#(/includefailedurls)# /></label>
</div> </div>
<div class="form-group">
<label>Delete URLs <input type="checkbox" name="deleteOnRecrawl" #(deleteOnRecrawl)#::checked="checked"#(/deleteOnRecrawl)# /></label>
</div>
<input type="submit" name="recrawlDefaults" value="Set defaults" class="btn btn-default" title="Reset to default values"/> <input type="submit" name="recrawlDefaults" value="Set defaults" class="btn btn-default" title="Reset to default values"/>
<input type="submit" name="recrawlnow" value="start recrawl job now" class="btn btn-primary"/> <input type="submit" name="recrawlnow" value="start recrawl job now" class="btn btn-primary"/>
to re-crawl documents selected with the given query. to re-crawl documents selected with the given query.
@ -91,6 +94,9 @@
<div class="form-group"> <div class="form-group">
<label>Include failed urls <input type="checkbox" name="includefailedurls" onchange="this.form.submit()" #(includefailedurls)#::checked="checked"#(/includefailedurls)# /></label> <label>Include failed urls <input type="checkbox" name="includefailedurls" onchange="this.form.submit()" #(includefailedurls)#::checked="checked"#(/includefailedurls)# /></label>
</div> </div>
<div class="form-group">
<label>Delete urls <input type="checkbox" name="deleteOnRecrawl" onchange="this.form.submit()" #(deleteOnRecrawl)#::checked="checked"#(/deleteOnRecrawl)# /></label>
</div>
<input type="submit" name="stoprecrawl" value="stop recrawl job" class="btn btn-danger"/> <input type="submit" name="stoprecrawl" value="stop recrawl job" class="btn btn-danger"/>
</fieldset> </fieldset>
#(/recrawljobrunning)# #(/recrawljobrunning)#

@ -123,6 +123,7 @@ public class IndexReIndexMonitor_p {
String recrawlQuery = RecrawlBusyThread.DEFAULT_QUERY; String recrawlQuery = RecrawlBusyThread.DEFAULT_QUERY;
boolean inclerrdoc = RecrawlBusyThread.DEFAULT_INCLUDE_FAILED; boolean inclerrdoc = RecrawlBusyThread.DEFAULT_INCLUDE_FAILED;
boolean deleteOnRecrawl = RecrawlBusyThread.DEFAULT_DELETE_ON_RECRAWL;
// to signal that a setting shall change the form provides a fixed parameter setup=recrawljob, if not present return status only // to signal that a setting shall change the form provides a fixed parameter setup=recrawljob, if not present return status only
if (post != null && "recrawljob".equals(post.get("setup"))) { // it's a command to recrawlThread if (post != null && "recrawljob".equals(post.get("setup"))) { // it's a command to recrawlThread
@ -137,12 +138,16 @@ public class IndexReIndexMonitor_p {
inclerrdoc = post.getBoolean("includefailedurls"); inclerrdoc = post.getBoolean("includefailedurls");
} }
if (post.containsKey("deleteOnRecrawl")) {
deleteOnRecrawl = post.getBoolean("deleteOnRecrawl");
}
if (recrawlbt == null || recrawlbt.shutdownInProgress()) { if (recrawlbt == null || recrawlbt.shutdownInProgress()) {
prop.put("recrawljobrunning_simulationResult", 0); prop.put("recrawljobrunning_simulationResult", 0);
prop.put("recrawljobrunning_error", 0); prop.put("recrawljobrunning_error", 0);
if (post.containsKey("recrawlnow")) { if (post.containsKey("recrawlnow")) {
sb.deployThread(RecrawlBusyThread.THREAD_NAME, "ReCrawl", "recrawl existing documents", null, sb.deployThread(RecrawlBusyThread.THREAD_NAME, "ReCrawl", "recrawl existing documents", null,
new RecrawlBusyThread(Switchboard.getSwitchboard(), recrawlQuery, inclerrdoc), 1000); new RecrawlBusyThread(Switchboard.getSwitchboard(), recrawlQuery, inclerrdoc, deleteOnRecrawl), 1000);
recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME); recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME);
/* store this call as an api call for easy scheduling possibility */ /* store this call as an api call for easy scheduling possibility */
@ -192,6 +197,7 @@ public class IndexReIndexMonitor_p {
if(post.containsKey("recrawlDefaults")) { if(post.containsKey("recrawlDefaults")) {
recrawlQuery = RecrawlBusyThread.DEFAULT_QUERY; recrawlQuery = RecrawlBusyThread.DEFAULT_QUERY;
inclerrdoc = RecrawlBusyThread.DEFAULT_INCLUDE_FAILED; inclerrdoc = RecrawlBusyThread.DEFAULT_INCLUDE_FAILED;
deleteOnRecrawl = RecrawlBusyThread.DEFAULT_DELETE_ON_RECRAWL;
} }
} else { } else {
if (post.containsKey("stoprecrawl")) { if (post.containsKey("stoprecrawl")) {
@ -204,9 +210,10 @@ public class IndexReIndexMonitor_p {
if (recrawlbt != null && !recrawlbt.shutdownInProgress()) { if (recrawlbt != null && !recrawlbt.shutdownInProgress()) {
if (post.containsKey("updquery") && post.containsKey("recrawlquerytext")) { if (post.containsKey("updquery") && post.containsKey("recrawlquerytext")) {
((RecrawlBusyThread) recrawlbt).setQuery(recrawlQuery, inclerrdoc); ((RecrawlBusyThread) recrawlbt).setQuery(recrawlQuery, inclerrdoc, deleteOnRecrawl);
} else { } else {
((RecrawlBusyThread) recrawlbt).setIncludeFailed(inclerrdoc); ((RecrawlBusyThread) recrawlbt).setIncludeFailed(inclerrdoc);
((RecrawlBusyThread) recrawlbt).setDeleteOnRecrawl(deleteOnRecrawl);
} }
} }
} }
@ -219,10 +226,12 @@ public class IndexReIndexMonitor_p {
prop.put("recrawljobrunning_docCount", ((RecrawlBusyThread) recrawlbt).getUrlsToRecrawl()); prop.put("recrawljobrunning_docCount", ((RecrawlBusyThread) recrawlbt).getUrlsToRecrawl());
prop.put("recrawljobrunning_recrawlquerytext", ((RecrawlBusyThread) recrawlbt).getQuery()); prop.put("recrawljobrunning_recrawlquerytext", ((RecrawlBusyThread) recrawlbt).getQuery());
prop.put("recrawljobrunning_includefailedurls", ((RecrawlBusyThread) recrawlbt).getIncludeFailed()); prop.put("recrawljobrunning_includefailedurls", ((RecrawlBusyThread) recrawlbt).getIncludeFailed());
prop.put("recrawljobrunning_deleteOnRecrawl", ((RecrawlBusyThread) recrawlbt).getDeleteOnRecrawl());
} else { } else {
prop.put("recrawljobrunning", 0); prop.put("recrawljobrunning", 0);
prop.put("recrawljobrunning_recrawlquerytext", recrawlQuery); prop.put("recrawljobrunning_recrawlquerytext", recrawlQuery);
prop.put("recrawljobrunning_includefailedurls", inclerrdoc); prop.put("recrawljobrunning_includefailedurls", inclerrdoc);
prop.put("recrawljobrunning_deleteOnRecrawl", deleteOnRecrawl);
} }
// return rewrite properties // return rewrite properties

@ -26,8 +26,10 @@ package net.yacy.crawler;
import java.io.IOException; import java.io.IOException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.time.LocalDateTime; import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.HashSet; import java.util.HashSet;
import java.util.List;
import java.util.Set; import java.util.Set;
import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocument;
@ -44,7 +46,6 @@ import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.workflow.AbstractBusyThread; import net.yacy.kelondro.workflow.AbstractBusyThread;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.schema.CollectionSchema; import net.yacy.search.schema.CollectionSchema;
/** /**
@ -65,12 +66,18 @@ public class RecrawlBusyThread extends AbstractBusyThread {
/** Default value for inclusion or not of documents with an HTTP status different from 200 (success) */ /** Default value for inclusion or not of documents with an HTTP status different from 200 (success) */
public static final boolean DEFAULT_INCLUDE_FAILED = false; public static final boolean DEFAULT_INCLUDE_FAILED = false;
/** Default value for deleting the selected documents from the index when they are queued for recrawl */
public static final boolean DEFAULT_DELETE_ON_RECRAWL = false;
/** The current query selecting documents to recrawl */ /** The current query selecting documents to recrawl */
private String currentQuery; private String currentQuery;
/** flag if docs with httpstatus_i <> 200 shall be recrawled */ /** flag if docs with httpstatus_i <> 200 shall be recrawled */
private boolean includefailed; private boolean includefailed;
/** flag if the selected documents shall be deleted from the index when they are queued for recrawl */
private boolean deleteOnRecrawl;
private int chunkstart = 0; private int chunkstart = 0;
private final int chunksize = 100; private final int chunksize = 100;
private final Switchboard sb; private final Switchboard sb;
@ -116,16 +123,17 @@ public class RecrawlBusyThread extends AbstractBusyThread {
* set to true when documents with a https status different from 200 * set to true when documents with a https status different from 200
* (success) must be included * (success) must be included
*/ */
public RecrawlBusyThread(final Switchboard xsb, final String query, final boolean includeFailed) { public RecrawlBusyThread(final Switchboard xsb, final String query, final boolean includeFailed, final boolean deleteOnRecrawl) {
super(3000, 1000); // set lower limits of cycle delay super(3000, 1000); // set lower limits of cycle delay
setName(THREAD_NAME); setName(THREAD_NAME);
this.setIdleSleep(10*60000); // set actual cycle delays this.setIdleSleep(10*60000); // set actual cycle delays
this.setBusySleep(2*60000); this.setBusySleep(2*60000);
this.setPriority(Thread.MIN_PRIORITY); this.setPriority(Thread.MIN_PRIORITY);
this.setLoadPreReqisite(1);
this.sb = xsb; this.sb = xsb;
this.currentQuery = query; this.currentQuery = query;
this.includefailed = includeFailed; this.includefailed = includeFailed;
this.deleteOnRecrawl = deleteOnRecrawl;
this.urlstack = new HashSet<DigestURL>(); this.urlstack = new HashSet<DigestURL>();
// workaround to prevent solr exception on existing index (not fully reindexed) since intro of schema with docvalues // workaround to prevent solr exception on existing index (not fully reindexed) since intro of schema with docvalues
// org.apache.solr.core.SolrCore java.lang.IllegalStateException: unexpected docvalues type NONE for field 'load_date_dt' (expected=NUMERIC). Use UninvertingReader or index with docvalues. // org.apache.solr.core.SolrCore java.lang.IllegalStateException: unexpected docvalues type NONE for field 'load_date_dt' (expected=NUMERIC). Use UninvertingReader or index with docvalues.
@ -143,10 +151,12 @@ public class RecrawlBusyThread extends AbstractBusyThread {
* and resets the counter to start a fresh query loop * and resets the counter to start a fresh query loop
* @param q select query * @param q select query
* @param includefailedurls true=all http status docs are recrawled, false=httpstatus=200 docs are recrawled * @param includefailedurls true=all http status docs are recrawled, false=httpstatus=200 docs are recrawled
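* @param deleteOnRecrawl true=selected documents are deleted from the index when queued for recrawl, false=documents are kept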
*/ */
public void setQuery(String q, boolean includefailedurls) { public void setQuery(String q, boolean includefailedurls, final boolean deleteOnRecrawl) {
this.currentQuery = q; this.currentQuery = q;
this.includefailed = includefailedurls; this.includefailed = includefailedurls;
this.deleteOnRecrawl = deleteOnRecrawl;
this.chunkstart = 0; this.chunkstart = 0;
} }
@ -181,6 +191,14 @@ public class RecrawlBusyThread extends AbstractBusyThread {
return this.includefailed; return this.includefailed;
} }
/**
 * Sets the flag that controls deletion on recrawl.
 * Only the flag is assigned here; the chunk start counter is not modified by this setter.
 * @param deleteOnRecrawl true=the ids of the documents selected for recrawl are collected and deleted from the index, false=only documents with a malformed url are deleted
 */
public void setDeleteOnRecrawl(final boolean deleteOnRecrawl) {
this.deleteOnRecrawl = deleteOnRecrawl;
}
/**
 * @return the current value of the flag that controls deletion on recrawl
 */
public boolean getDeleteOnRecrawl() {
return this.deleteOnRecrawl;
}
/** /**
* feed urls to the local crawler * feed urls to the local crawler
* (Switchboard.addToCrawler() is not used here, as there existing urls are always skipped) * (Switchboard.addToCrawler() is not used here, as there existing urls are always skipped)
@ -290,21 +308,27 @@ public class RecrawlBusyThread extends AbstractBusyThread {
} }
if (docList != null) { if (docList != null) {
List<String> tobedeletedIDs = new ArrayList<>();
for (final SolrDocument doc : docList) { for (final SolrDocument doc : docList) {
try { try {
this.urlstack.add(new DigestURL((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()))); this.urlstack.add(new DigestURL((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())));
if (deleteOnRecrawl) tobedeletedIDs.add((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()));
} catch (final MalformedURLException ex) { } catch (final MalformedURLException ex) {
this.malformedUrlsCount++; this.malformedUrlsCount++;
try { // if index entry hasn't a valid url (useless), delete it // if index entry hasn't a valid url (useless), delete it
solrConnector.deleteById((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName())); tobedeletedIDs.add((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()));
this.malformedUrlsDeletedCount++; this.malformedUrlsDeletedCount++;
ConcurrentLog.severe(THREAD_NAME, "deleted index document with invalid url " + (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())); ConcurrentLog.severe(THREAD_NAME, "deleted index document with invalid url " + (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
} catch (final IOException ex1) {
ConcurrentLog.severe(THREAD_NAME, ex1.getMessage());
}
} }
} }
this.chunkstart = this.chunkstart + this.chunksize;
if (!tobedeletedIDs.isEmpty()) try {
solrConnector.deleteByIds(tobedeletedIDs);
} catch (IOException e) {
ConcurrentLog.severe(THREAD_NAME, "error deleting IDs ", e);
}
this.chunkstart = deleteOnRecrawl? 0 : this.chunkstart + this.chunksize;
} }
if (docList == null || docList.size() < this.chunksize) { if (docList == null || docList.size() < this.chunksize) {

Loading…
Cancel
Save