From dcad393fe592170b681958d9abe946ec3d0c1556 Mon Sep 17 00:00:00 2001
From: luccioman <luccioman@users.noreply.github.com>
Date: Wed, 11 Jul 2018 08:13:29 +0200
Subject: [PATCH] Fixed exceeding max size of failreason_s Solr field on large
 link list

When using 'From Link-List of URL' as a crawl start with a list of a
thousand or more links, the string representation of the URL must-match
filter exceeded the maximum size (32kb) of the failreason_s Solr field
whenever a crawl URL was rejected for not matching that filter.
---
 source/net/yacy/crawler/CrawlStacker.java      |  7 +++++--
 source/net/yacy/crawler/data/CrawlProfile.java | 18 ++++++++++++++++++
 source/net/yacy/crawler/data/CrawlQueues.java  |  2 +-
 source/net/yacy/search/Switchboard.java        |  2 +-
 4 files changed, 25 insertions(+), 4 deletions(-)
diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java
index 42018e7ce..97d35e005 100644
--- a/source/net/yacy/crawler/CrawlStacker.java
+++ b/source/net/yacy/crawler/CrawlStacker.java
@@ -496,8 +496,11 @@ public final class CrawlStacker implements WorkflowTask<Request>{
 
         // filter with must-match for URLs
         if ((depth > 0) && !profile.urlMustMatchPattern().matcher(urlstring).matches()) {
-            if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL '" + urlstring + "' does not match must-match crawling filter '" + profile.urlMustMatchPattern().toString() + "'.");
-            return ERROR_NO_MATCH_MUST_MATCH_FILTER + profile.urlMustMatchPattern().toString();
+        	final String patternStr = profile.formattedUrlMustMatchPattern();
+            if (CrawlStacker.log.isFine()) {
+            	CrawlStacker.log.fine("URL '" + urlstring + "' does not match must-match crawling filter '" + patternStr + "'.");
+            }
+            return ERROR_NO_MATCH_MUST_MATCH_FILTER + patternStr;
         }
 
         // filter with must-not-match for URLs
diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java
index c3973afd4..488c24de4 100644
--- a/source/net/yacy/crawler/data/CrawlProfile.java
+++ b/source/net/yacy/crawler/data/CrawlProfile.java
@@ -467,6 +467,24 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         }
         return this.crawlerurlmustmatch;
     }
+    
+	/**
+	 * Render the urlMustMatchPattern as a String of limited size: when longer than
+	 * 1000 characters, it is cut to its first 1000 characters and suffixed with
+	 * "...". Used to prevent unnecessary growth of the logs, and to prevent
+	 * exceeding the field size limit for CollectionSchema.failreason_s (32k) when
+	 * the pattern is present in a fail doc added to the Solr index.
+	 * 
+	 * @return the urlMustMatchPattern as a String of at most 1003 characters
+	 */
+    public String formattedUrlMustMatchPattern() {
+    	String patternStr = urlMustMatchPattern().toString();
+    	if(patternStr.length() > 1000) {
+    		/* The pattern may be quite large when using the 'From Link-List of URL' crawl start point. */
+    		patternStr = patternStr.substring(0, Math.min(patternStr.length(), 1000)) + "...";
+    	}
+    	return patternStr;
+    }
 
     /**
      * Gets the regex which must not be matched by URLs in order to be crawled.
diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java
index 8c0fb2d20..77c983ef0 100644
--- a/source/net/yacy/crawler/data/CrawlQueues.java
+++ b/source/net/yacy/crawler/data/CrawlQueues.java
@@ -371,7 +371,7 @@ public class CrawlQueues {
                             + ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false")
                             + ", depth=" + urlEntry.depth()
                             + ", crawlDepth=" + profile.depth()
-                            + ", must-match=" + profile.urlMustMatchPattern().toString()
+                            + ", must-match=" + profile.formattedUrlMustMatchPattern()
                             + ", must-not-match=" + profile.urlMustNotMatchPattern().toString()
                             + ", permission=" + ((this.sb.peers == null) ? "undefined" : (((this.sb.peers.mySeed().isSenior()) || (this.sb.peers.mySeed().isPrincipal())) ? "true" : "false")));
                 }
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 957289dee..3a33a5f87 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -2992,7 +2992,7 @@ public final class Switchboard extends serverSwitch {
                 "processResourceStack processCase=" + processCase
                 + ", depth=" + response.depth()
                 + ", maxDepth=" + ((response.profile() == null) ? "null" : Integer.toString(response.profile().depth()))
-                + ", must-match=" + ((response.profile() == null) ? "null" : response.profile().urlMustMatchPattern().toString())
+                + ", must-match=" + ((response.profile() == null) ? "null" : response.profile().formattedUrlMustMatchPattern())
                 + ", must-not-match=" + ((response.profile() == null) ? "null" : response.profile().urlMustNotMatchPattern().toString())
                 + ", initiatorHash=" + ((response.initiator() == null) ? "null" : ASCII.String(response.initiator()))
                 + ", url=" + response.url()); // DEBUG