diff --git a/defaults/yacy.init b/defaults/yacy.init
index 2bcc13a11..354963e5c 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -873,6 +873,8 @@ search.verify.delete = true
remotesearch.maxcount = 10
remotesearch.maxtime = 3000
remotesearch.result.store=true
+# Maximum size allowed (in kbytes) for a remote document result to be stored to local index. Defaults to -1, which means no limit.
+remotesearch.result.store.maxsize=-1
remotesearch.maxload.rwi=8.0
remotesearch.maxload.solr=4.0
diff --git a/htroot/ConfigPortal.html b/htroot/ConfigPortal.html
index 26d183beb..f608f1249 100644
--- a/htroot/ConfigPortal.html
+++ b/htroot/ConfigPortal.html
@@ -66,7 +66,11 @@
Index remote results
- add remote search results to the local index ( default=on, it is recommended to enable this option ! )
+ add remote search results to the local index ( default=on, it is recommended to enable this option ! )
+
+ Limit size of indexed remote results
+
+ maximum allowed size in kbytes for each remote search result to be added to the local index (for example, a 1000kbytes limit might be useful if you are running YaCy with a low memory setup)
Default Pop-Up Page
diff --git a/htroot/ConfigPortal.java b/htroot/ConfigPortal.java
index 003404d0d..547310157 100644
--- a/htroot/ConfigPortal.java
+++ b/htroot/ConfigPortal.java
@@ -93,6 +93,7 @@ public class ConfigPortal {
final boolean storeresult = post.getBoolean(SwitchboardConstants.REMOTESEARCH_RESULT_STORE);
sb.setConfig(SwitchboardConstants.REMOTESEARCH_RESULT_STORE, storeresult);
+ sb.setConfig(SwitchboardConstants.REMOTESEARCH_RESULT_STORE_MAXSIZE, post.getLong(SwitchboardConstants.REMOTESEARCH_RESULT_STORE_MAXSIZE, -1));
sb.setConfig(SwitchboardConstants.SEARCH_VERIFY, post.get("search.verify", "ifexist"));
sb.setConfig(SwitchboardConstants.SEARCH_VERIFY_DELETE, post.getBoolean("search.verify.delete"));
@@ -148,6 +149,7 @@ public class ConfigPortal {
sb.setConfig("search.options", config.getProperty("search.options","true"));
sb.setConfig(SwitchboardConstants.GREEDYLEARNING_ACTIVE, config.getProperty(SwitchboardConstants.GREEDYLEARNING_ACTIVE));
sb.setConfig(SwitchboardConstants.REMOTESEARCH_RESULT_STORE, config.getProperty(SwitchboardConstants.REMOTESEARCH_RESULT_STORE));
+ sb.setConfig(SwitchboardConstants.REMOTESEARCH_RESULT_STORE_MAXSIZE, config.getProperty(SwitchboardConstants.REMOTESEARCH_RESULT_STORE_MAXSIZE));
sb.setConfig(SwitchboardConstants.SEARCH_VERIFY, config.getProperty(SwitchboardConstants.SEARCH_VERIFY,"iffresh"));
sb.setConfig(SwitchboardConstants.SEARCH_VERIFY_DELETE, config.getProperty(SwitchboardConstants.SEARCH_VERIFY_DELETE,"true"));
sb.setConfig("about.headline", config.getProperty("about.headline",""));
@@ -170,6 +172,12 @@ public class ConfigPortal {
prop.put(SwitchboardConstants.GREEDYLEARNING_LIMIT_DOCCOUNT, sb.getConfig(SwitchboardConstants.GREEDYLEARNING_LIMIT_DOCCOUNT, "0"));
prop.put(SwitchboardConstants.REMOTESEARCH_RESULT_STORE, sb.getConfigBool(SwitchboardConstants.REMOTESEARCH_RESULT_STORE, true) ? 1 : 0);
+ long resultStoredMaxSize = sb.getConfigLong(SwitchboardConstants.REMOTESEARCH_RESULT_STORE_MAXSIZE, -1);
+ if(resultStoredMaxSize > 0) {
+ prop.put(SwitchboardConstants.REMOTESEARCH_RESULT_STORE_MAXSIZE, resultStoredMaxSize);
+ } else {
+ prop.put(SwitchboardConstants.REMOTESEARCH_RESULT_STORE_MAXSIZE, "");
+ }
prop.put("search.verify.nocache", sb.getConfig("search.verify", "").equals("nocache") ? 1 : 0);
prop.put("search.verify.iffresh", sb.getConfig("search.verify", "").equals("iffresh") ? 1 : 0);
diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java
index c5c173189..f51a933cb 100644
--- a/source/net/yacy/peers/Protocol.java
+++ b/source/net/yacy/peers/Protocol.java
@@ -62,6 +62,15 @@ import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicInteger;
+import org.apache.http.entity.mime.content.ContentBody;
+import org.apache.solr.client.solrj.SolrQuery;
+import org.apache.solr.client.solrj.response.FacetField;
+import org.apache.solr.client.solrj.response.FacetField.Count;
+import org.apache.solr.client.solrj.response.QueryResponse;
+import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.SolrDocumentList;
+import org.apache.solr.common.SolrInputDocument;
+
import net.yacy.migration;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.analysis.Classification;
@@ -120,15 +129,6 @@ import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
import net.yacy.utils.crypt;
-import org.apache.http.entity.mime.content.ContentBody;
-import org.apache.solr.client.solrj.SolrQuery;
-import org.apache.solr.client.solrj.response.FacetField;
-import org.apache.solr.client.solrj.response.QueryResponse;
-import org.apache.solr.client.solrj.response.FacetField.Count;
-import org.apache.solr.common.SolrDocument;
-import org.apache.solr.common.SolrDocumentList;
-import org.apache.solr.common.SolrInputDocument;
-
public final class Protocol {
@@ -929,6 +929,18 @@ public final class Protocol {
private final static CollectionSchema[] snippetFields = new CollectionSchema[]{CollectionSchema.description_txt, CollectionSchema.h4_txt, CollectionSchema.h3_txt, CollectionSchema.h2_txt, CollectionSchema.h1_txt, CollectionSchema.text_t};
+ /**
+ * Execute solr query against specified target.
+ * @param event search event to feed with results
+ * @param solrQuery solr query
+ * @param offset pagination start index
+ * @param count expected maximum results
+ * @param target target peer to query. May be null : in that case, local peer is queried.
+ * @param partitions
+ * @param blacklist url list to exclude from results
+ * @return the size of results list
+ * @throws InterruptedException when interrupt status on calling thread is detected while processing
+ */
protected static int solrQuery(
final SearchEvent event,
final SolrQuery solrQuery,
@@ -1125,12 +1137,17 @@ public final class Protocol {
// put the remote documents to the local index. We must convert the solr document to a solr input document:
if (event.addResultsToLocalIndex) {
- final SolrInputDocument sid = event.query.getSegment().fulltext().getDefaultConfiguration().toSolrInputDocument(doc);
-
- // the input document stays untouched because it contains top-level cloned objects
- docs.add(sid);
- // will be stored to index, and is a full solr document, can be added to firstseen
- event.query.getSegment().setFirstSeenTime(urlEntry.hash(), Math.min(urlEntry.moddate().getTime(), System.currentTimeMillis()));
+ /* Check document size, only if a limit is set on remote documents size allowed to be stored to local index */
+ if(checkDocumentSize(doc, event.getRemoteDocStoredMaxSize() * 1024)) {
+ final SolrInputDocument sid = event.query.getSegment().fulltext().getDefaultConfiguration().toSolrInputDocument(doc);
+
+ // the input document stays untouched because it contains top-level cloned objects
+ docs.add(sid);
+ // will be stored to index, and is a full solr document, can be added to firstseen
+ event.query.getSegment().setFirstSeenTime(urlEntry.hash(), Math.min(urlEntry.moddate().getTime(), System.currentTimeMillis()));
+ } else {
+ Network.log.info("Document size greater than " + event.getRemoteDocStoredMaxSize() + " kbytes, excludes it from being stored to local index. Url : " + urlEntry.urlstring());
+ }
}
// after this conversion we can remove the largest and not used field text_t and synonyms_sxt from the document
@@ -1172,6 +1189,33 @@ public final class Protocol {
}
return dls;
}
+
+    /**
+     * Tell whether a document is small enough with regard to maxSize. The check
+     * is only performed when maxSize is greater than zero. To keep processing
+     * time reasonable, the size is not computed by summing all field sizes :
+     * only the text_t field is measured, counting two bytes per character.
+     *
+     * @param doc
+     *            document to verify. Must not be null.
+     * @param maxSize
+     *            maximum allowed size in bytes
+     * @return false only when maxSize is greater than zero and the text_t field
+     *         is a String whose estimated size exceeds maxSize, true otherwise.
+     */
+    protected static boolean checkDocumentSize(SolrDocument doc, long maxSize) {
+        if (maxSize <= 0) {
+            /* No limit to enforce */
+            return true;
+        }
+        final Object fullText = doc.getFieldValue(CollectionSchema.text_t.getSolrFieldName());
+        if (!(fullText instanceof String)) {
+            /* Nothing measurable in the text field : the document is accepted */
+            return true;
+        }
+        /* Each char uses 2 bytes, so the limit in chars is half the limit in bytes */
+        return ((String) fullText).length() <= (maxSize / 2);
+    }
public static Map permissionMessage(final String targetAddress, final String targetHash) {
// ask for allowed message size and attachment size
diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java
index 32cc6c916..3b8c3e19b 100644
--- a/source/net/yacy/search/SwitchboardConstants.java
+++ b/source/net/yacy/search/SwitchboardConstants.java
@@ -332,6 +332,8 @@ public final class SwitchboardConstants {
public static final String REMOTESEARCH_MAXCOUNT_USER = "remotesearch.maxcount";
public static final String REMOTESEARCH_MAXTIME_USER = "remotesearch.maxtime";
public static final String REMOTESEARCH_RESULT_STORE = "remotesearch.result.store"; // add remote results to local index
+ /** Maximum size allowed (in kbytes) for a remote document result to be stored to local index */
+ public static final String REMOTESEARCH_RESULT_STORE_MAXSIZE= "remotesearch.result.store.maxsize";
public static final String REMOTESEARCH_MAXLOAD_RWI = "remotesearch.maxload.rwi";
public static final String REMOTESEARCH_MAXLOAD_SOLR = "remotesearch.maxload.solr";
diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java
index 83485c1a4..994902d9e 100644
--- a/source/net/yacy/search/query/SearchEvent.java
+++ b/source/net/yacy/search/query/SearchEvent.java
@@ -161,6 +161,8 @@ public final class SearchEvent {
private ConcurrentHashMap> snippets;
private final boolean remote;
public final boolean addResultsToLocalIndex; // add received results to local index (defult=true)
+ /** Maximum size allowed (in kbytes) for a remote document result to be stored to local index */
+ private long remoteStoredDocMaxSize;
private SortedMap> localSearchInclusion;
private final ScoreMap ref; // reference score computation for the commonSense heuristic
private final long maxtime;
@@ -198,6 +200,22 @@ public final class SearchEvent {
);
}
+    /**
+     * Define the maximum size (in kbytes) a remote document result may have to be stored to local index.
+     * @param maxSize maximum document content size in kbytes. A zero or negative value means no limit.
+     */
+    public void setRemoteDocStoredMaxSize(long maxSize) {
+        remoteStoredDocMaxSize = maxSize;
+    }
+
+    /**
+     * @return the maximum size (in kbytes) a remote document result may have to be stored to local index.
+     *         A zero or negative value means no limit.
+     */
+    public long getRemoteDocStoredMaxSize() {
+        return remoteStoredDocMaxSize;
+    }
+
protected SearchEvent(
final QueryParams query,
final SeedDB peers,
@@ -261,6 +279,8 @@ public final class SearchEvent {
this.IAneardhthash = null;
this.remote = (peers != null && peers.sizeConnected() > 0) && (this.query.domType == QueryParams.Searchdom.CLUSTER || (this.query.domType == QueryParams.Searchdom.GLOBAL && Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.INDEX_RECEIVE_ALLOW_SEARCH, false)));
this.addResultsToLocalIndex = addResultsToLocalIdx;
+ /* Default: no size limit to store remote result documents to local index. Use the setter to change it if needed. */
+ this.remoteStoredDocMaxSize = -1;
this.local_rwi_available = new AtomicInteger(0); // the number of results in the local peer after filtering
this.local_rwi_stored = new AtomicInteger(0);
this.local_solr_available = new AtomicInteger(0);
diff --git a/source/net/yacy/search/query/SearchEventCache.java b/source/net/yacy/search/query/SearchEventCache.java
index 5fb5ff671..0f8bb73fc 100644
--- a/source/net/yacy/search/query/SearchEventCache.java
+++ b/source/net/yacy/search/query/SearchEventCache.java
@@ -174,6 +174,10 @@ public class SearchEventCache {
|| (sb.getConfigBool(SwitchboardConstants.NETWORK_SEARCHVERIFY, false) && sb.peers.mySeed().getFlagAcceptRemoteIndex());
final boolean addToLocalIdx = sb == null || Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.REMOTESEARCH_RESULT_STORE, true);
event = new SearchEvent(query, peers, workTables, preselectedPeerHashes, generateAbstracts, loader, remote_maxcount, remote_maxtime, delete, addToLocalIdx);
+ /* An optional configuration setting may limit the size of remote documents added to the local index */
+ if(sb != null) {
+ event.setRemoteDocStoredMaxSize(sb.getConfigLong(SwitchboardConstants.REMOTESEARCH_RESULT_STORE_MAXSIZE, -1));
+ }
MemoryControl.request(100 * 1024 * 1024, false); // this may trigger a short memory status which causes a reducing of cache space of other threads
}