diff --git a/defaults/yacy.init b/defaults/yacy.init index 2bcc13a11..354963e5c 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -873,6 +873,8 @@ search.verify.delete = true remotesearch.maxcount = 10 remotesearch.maxtime = 3000 remotesearch.result.store=true +# Maximum size allowed (in kbytes) for a remote document result to be stored to local index. Defaults to -1, which means no limit. +remotesearch.result.store.maxsize=-1 remotesearch.maxload.rwi=8.0 remotesearch.maxload.solr=4.0 diff --git a/htroot/ConfigPortal.html b/htroot/ConfigPortal.html index 26d183beb..f608f1249 100644 --- a/htroot/ConfigPortal.html +++ b/htroot/ConfigPortal.html @@ -66,7 +66,11 @@
Index remote results
- add remote search results to the local index ( default=on, it is recommended to enable this option ! ) + add remote search results to the local index ( default=on, it is recommended to enable this option ! ) +
+
Limit size of indexed remote results
+
+ maximum allowed size in kbytes for each remote search result to be added to the local index (for example, a 1000kbytes limit might be useful if you are running YaCy with a low memory setup)
Default Pop-Up Page
diff --git a/htroot/ConfigPortal.java b/htroot/ConfigPortal.java index 003404d0d..547310157 100644 --- a/htroot/ConfigPortal.java +++ b/htroot/ConfigPortal.java @@ -93,6 +93,7 @@ public class ConfigPortal { final boolean storeresult = post.getBoolean(SwitchboardConstants.REMOTESEARCH_RESULT_STORE); sb.setConfig(SwitchboardConstants.REMOTESEARCH_RESULT_STORE, storeresult); + sb.setConfig(SwitchboardConstants.REMOTESEARCH_RESULT_STORE_MAXSIZE, post.getLong(SwitchboardConstants.REMOTESEARCH_RESULT_STORE_MAXSIZE, -1)); sb.setConfig(SwitchboardConstants.SEARCH_VERIFY, post.get("search.verify", "ifexist")); sb.setConfig(SwitchboardConstants.SEARCH_VERIFY_DELETE, post.getBoolean("search.verify.delete")); @@ -148,6 +149,7 @@ public class ConfigPortal { sb.setConfig("search.options", config.getProperty("search.options","true")); sb.setConfig(SwitchboardConstants.GREEDYLEARNING_ACTIVE, config.getProperty(SwitchboardConstants.GREEDYLEARNING_ACTIVE)); sb.setConfig(SwitchboardConstants.REMOTESEARCH_RESULT_STORE, config.getProperty(SwitchboardConstants.REMOTESEARCH_RESULT_STORE)); + sb.setConfig(SwitchboardConstants.REMOTESEARCH_RESULT_STORE_MAXSIZE, config.getProperty(SwitchboardConstants.REMOTESEARCH_RESULT_STORE_MAXSIZE)); sb.setConfig(SwitchboardConstants.SEARCH_VERIFY, config.getProperty(SwitchboardConstants.SEARCH_VERIFY,"iffresh")); sb.setConfig(SwitchboardConstants.SEARCH_VERIFY_DELETE, config.getProperty(SwitchboardConstants.SEARCH_VERIFY_DELETE,"true")); sb.setConfig("about.headline", config.getProperty("about.headline","")); @@ -170,6 +172,12 @@ public class ConfigPortal { prop.put(SwitchboardConstants.GREEDYLEARNING_LIMIT_DOCCOUNT, sb.getConfig(SwitchboardConstants.GREEDYLEARNING_LIMIT_DOCCOUNT, "0")); prop.put(SwitchboardConstants.REMOTESEARCH_RESULT_STORE, sb.getConfigBool(SwitchboardConstants.REMOTESEARCH_RESULT_STORE, true) ? 
1 : 0); + long resultStoredMaxSize = sb.getConfigLong(SwitchboardConstants.REMOTESEARCH_RESULT_STORE_MAXSIZE, -1); + if(resultStoredMaxSize > 0) { + prop.put(SwitchboardConstants.REMOTESEARCH_RESULT_STORE_MAXSIZE, resultStoredMaxSize); + } else { + prop.put(SwitchboardConstants.REMOTESEARCH_RESULT_STORE_MAXSIZE, ""); + } prop.put("search.verify.nocache", sb.getConfig("search.verify", "").equals("nocache") ? 1 : 0); prop.put("search.verify.iffresh", sb.getConfig("search.verify", "").equals("iffresh") ? 1 : 0); diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java index c5c173189..f51a933cb 100644 --- a/source/net/yacy/peers/Protocol.java +++ b/source/net/yacy/peers/Protocol.java @@ -62,6 +62,15 @@ import java.util.Set; import java.util.TreeMap; import java.util.concurrent.atomic.AtomicInteger; +import org.apache.http.entity.mime.content.ContentBody; +import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.response.FacetField; +import org.apache.solr.client.solrj.response.FacetField.Count; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; +import org.apache.solr.common.SolrInputDocument; + import net.yacy.migration; import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.analysis.Classification; @@ -120,15 +129,6 @@ import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; import net.yacy.utils.crypt; -import org.apache.http.entity.mime.content.ContentBody; -import org.apache.solr.client.solrj.SolrQuery; -import org.apache.solr.client.solrj.response.FacetField; -import org.apache.solr.client.solrj.response.QueryResponse; -import org.apache.solr.client.solrj.response.FacetField.Count; -import org.apache.solr.common.SolrDocument; -import org.apache.solr.common.SolrDocumentList; -import org.apache.solr.common.SolrInputDocument; - public final class Protocol { @@ -929,6 
+929,18 @@ public final class Protocol { private final static CollectionSchema[] snippetFields = new CollectionSchema[]{CollectionSchema.description_txt, CollectionSchema.h4_txt, CollectionSchema.h3_txt, CollectionSchema.h2_txt, CollectionSchema.h1_txt, CollectionSchema.text_t}; + /** + * Execute solr query against specified target. + * @param event search event to feed with results + * @param solrQuery solr query + * @param offset pagination start index + * @param count expected maximum results + * @param target target peer to query. May be null : in that case, local peer is queried. + * @param partitions + * @param blacklist url list to exclude from results + * @return the size of results list + * @throws InterruptedException when interrupt status on calling thread is detected while processing + */ protected static int solrQuery( final SearchEvent event, final SolrQuery solrQuery, @@ -1125,12 +1137,17 @@ public final class Protocol { // put the remote documents to the local index. We must convert the solr document to a solr input document: if (event.addResultsToLocalIndex) { - final SolrInputDocument sid = event.query.getSegment().fulltext().getDefaultConfiguration().toSolrInputDocument(doc); - - // the input document stays untouched because it contains top-level cloned objects - docs.add(sid); - // will be stored to index, and is a full solr document, can be added to firstseen - event.query.getSegment().setFirstSeenTime(urlEntry.hash(), Math.min(urlEntry.moddate().getTime(), System.currentTimeMillis())); + /* Check document size, only if a limit is set on remote documents size allowed to be stored to local index */ + if(checkDocumentSize(doc, event.getRemoteDocStoredMaxSize() * 1024)) { + final SolrInputDocument sid = event.query.getSegment().fulltext().getDefaultConfiguration().toSolrInputDocument(doc); + + // the input document stays untouched because it contains top-level cloned objects + docs.add(sid); + // will be stored to index, and is a full solr 
document, can be added to firstseen + event.query.getSegment().setFirstSeenTime(urlEntry.hash(), Math.min(urlEntry.moddate().getTime(), System.currentTimeMillis())); + } else { + Network.log.info("Document size greater than " + event.getRemoteDocStoredMaxSize() + " kbytes, excludes it from being stored to local index. Url : " + urlEntry.urlstring()); + } } // after this conversion we can remove the largest and not used field text_t and synonyms_sxt from the document @@ -1172,6 +1189,33 @@ public final class Protocol { } return dls; } + + /** + * Only when maxSize is greater than zero, check that doc size is lower. To + * process in a reasonable amount of time, document size is not evaluated + * summing all fields sizes, but only against text_t field which is quite representative and might weigh + * some MB. + * + * @param doc + * document to verify. Must not be null. + * @param maxSize + * maximum allowed size in bytes + * @return true when document evaluated size is lower or equal than maxSize, or when + * maxSize is lower or equal than zero. 
+ */ + protected static boolean checkDocumentSize(SolrDocument doc, long maxSize) { + if (maxSize > 0) { + /* All text field is often the largest */ + Object value = doc.getFieldValue(CollectionSchema.text_t.getSolrFieldName()); + if(value instanceof String) { + /* Each char uses 2 bytes */ + if(((String)value).length() > (maxSize /2)) { + return false; + } + } + } + return true; + } public static Map permissionMessage(final String targetAddress, final String targetHash) { // ask for allowed message size and attachment size diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java index 32cc6c916..3b8c3e19b 100644 --- a/source/net/yacy/search/SwitchboardConstants.java +++ b/source/net/yacy/search/SwitchboardConstants.java @@ -332,6 +332,8 @@ public final class SwitchboardConstants { public static final String REMOTESEARCH_MAXCOUNT_USER = "remotesearch.maxcount"; public static final String REMOTESEARCH_MAXTIME_USER = "remotesearch.maxtime"; public static final String REMOTESEARCH_RESULT_STORE = "remotesearch.result.store"; // add remote results to local index + /** Maximum size allowed (in kbytes) for a remote document result to be stored to local index */ + public static final String REMOTESEARCH_RESULT_STORE_MAXSIZE= "remotesearch.result.store.maxsize"; public static final String REMOTESEARCH_MAXLOAD_RWI = "remotesearch.maxload.rwi"; public static final String REMOTESEARCH_MAXLOAD_SOLR = "remotesearch.maxload.solr"; diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 83485c1a4..994902d9e 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -161,6 +161,8 @@ public final class SearchEvent { private ConcurrentHashMap> snippets; private final boolean remote; public final boolean addResultsToLocalIndex; // add received results to local index (defult=true) + /** Maximum size allowed (in kbytes) 
for a remote document result to be stored to local index */ + private long remoteStoredDocMaxSize; private SortedMap> localSearchInclusion; private final ScoreMap ref; // reference score computation for the commonSense heuristic private final long maxtime; @@ -198,6 +200,22 @@ public final class SearchEvent { ); } + /** + * Set maximum size allowed (in kbytes) for a remote document result to be stored to local index. + * @param maxSize document content max size in kbytes. Zero or negative value means no limit. + */ + public void setRemoteDocStoredMaxSize(long maxSize) { + this.remoteStoredDocMaxSize = maxSize; + } + + /** + * @return maximum size allowed (in kbytes) for a remote document result to be stored to local index. + * Zero or negative value means no limit. + */ + public long getRemoteDocStoredMaxSize() { + return this.remoteStoredDocMaxSize; + } + protected SearchEvent( final QueryParams query, final SeedDB peers, @@ -261,6 +279,8 @@ public final class SearchEvent { this.IAneardhthash = null; this.remote = (peers != null && peers.sizeConnected() > 0) && (this.query.domType == QueryParams.Searchdom.CLUSTER || (this.query.domType == QueryParams.Searchdom.GLOBAL && Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.INDEX_RECEIVE_ALLOW_SEARCH, false))); this.addResultsToLocalIndex = addResultsToLocalIdx; + /* Default: no size limit on remote result documents stored to the local index. Use the setter to change it if needed. 
*/ + this.remoteStoredDocMaxSize = -1; this.local_rwi_available = new AtomicInteger(0); // the number of results in the local peer after filtering this.local_rwi_stored = new AtomicInteger(0); this.local_solr_available = new AtomicInteger(0); diff --git a/source/net/yacy/search/query/SearchEventCache.java b/source/net/yacy/search/query/SearchEventCache.java index 5fb5ff671..0f8bb73fc 100644 --- a/source/net/yacy/search/query/SearchEventCache.java +++ b/source/net/yacy/search/query/SearchEventCache.java @@ -174,6 +174,10 @@ public class SearchEventCache { || (sb.getConfigBool(SwitchboardConstants.NETWORK_SEARCHVERIFY, false) && sb.peers.mySeed().getFlagAcceptRemoteIndex()); final boolean addToLocalIdx = sb == null || Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.REMOTESEARCH_RESULT_STORE, true); event = new SearchEvent(query, peers, workTables, preselectedPeerHashes, generateAbstracts, loader, remote_maxcount, remote_maxtime, delete, addToLocalIdx); + /* Optional config option may be valued to limit size of remote documents added to local index */ + if(sb != null) { + event.setRemoteDocStoredMaxSize(sb.getConfigLong(SwitchboardConstants.REMOTESEARCH_RESULT_STORE_MAXSIZE, -1)); + } MemoryControl.request(100 * 1024 * 1024, false); // this may trigger a short memory status which causes a reducing of cache space of other threads }