From dcad393fe592170b681958d9abe946ec3d0c1556 Mon Sep 17 00:00:00 2001 From: luccioman Date: Wed, 11 Jul 2018 08:13:29 +0200 Subject: [PATCH] Fixed exceeding max size of failreason_s Solr field on large link list When using the 'From Link-List of URL' as a crawl start, with lists on the order of a thousand or more links, the failreason_s Solr field maximum size (32kb) was exceeded by the string representation of the URL must-match filter when a crawl URL was rejected because it did not match. --- source/net/yacy/crawler/CrawlStacker.java | 7 +++++-- source/net/yacy/crawler/data/CrawlProfile.java | 18 ++++++++++++++++++ source/net/yacy/crawler/data/CrawlQueues.java | 2 +- source/net/yacy/search/Switchboard.java | 2 +- 4 files changed, 25 insertions(+), 4 deletions(-) diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java index 42018e7ce..97d35e005 100644 --- a/source/net/yacy/crawler/CrawlStacker.java +++ b/source/net/yacy/crawler/CrawlStacker.java @@ -496,8 +496,11 @@ public final class CrawlStacker implements WorkflowTask{ // filter with must-match for URLs if ((depth > 0) && !profile.urlMustMatchPattern().matcher(urlstring).matches()) { - if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL '" + urlstring + "' does not match must-match crawling filter '" + profile.urlMustMatchPattern().toString() + "'."); - return ERROR_NO_MATCH_MUST_MATCH_FILTER + profile.urlMustMatchPattern().toString(); + final String patternStr = profile.formattedUrlMustMatchPattern(); + if (CrawlStacker.log.isFine()) { + CrawlStacker.log.fine("URL '" + urlstring + "' does not match must-match crawling filter '" + patternStr + "'."); + } + return ERROR_NO_MATCH_MUST_MATCH_FILTER + patternStr; } // filter with must-not-match for URLs diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java index c3973afd4..488c24de4 100644 --- a/source/net/yacy/crawler/data/CrawlProfile.java +++ 
b/source/net/yacy/crawler/data/CrawlProfile.java @@ -467,6 +467,24 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M } return this.crawlerurlmustmatch; } + + /** + * Render the urlMustMatchPattern as a String of limited size, suffixing it with + * "..." when it is truncated. Used to prevent unnecessary growth of the logs, + * and to prevent exceeding the field size limit for + * CollectionSchema.failreason_s (32k) when the pattern is present in a fail doc + * added to the Solr index. + * + * @return the urlMustMatchPattern formatted as a String of limited size + */ + public String formattedUrlMustMatchPattern() { + String patternStr = urlMustMatchPattern().toString(); + if(patternStr.length() > 1000) { + /* The pattern may be quite large when using the 'From Link-List of URL' crawl start point. */ + patternStr = patternStr.substring(0, Math.min(patternStr.length(), 1000)) + "..."; + } + return patternStr; + } /** * Gets the regex which must not be matched by URLs in order to be crawled. diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java index 8c0fb2d20..77c983ef0 100644 --- a/source/net/yacy/crawler/data/CrawlQueues.java +++ b/source/net/yacy/crawler/data/CrawlQueues.java @@ -371,7 +371,7 @@ public class CrawlQueues { + ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.depth() - + ", must-match=" + profile.urlMustMatchPattern().toString() + + ", must-match=" + profile.formattedUrlMustMatchPattern() + ", must-not-match=" + profile.urlMustNotMatchPattern().toString() + ", permission=" + ((this.sb.peers == null) ? "undefined" : (((this.sb.peers.mySeed().isSenior()) || (this.sb.peers.mySeed().isPrincipal())) ? 
"true" : "false"))); } diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 957289dee..3a33a5f87 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2992,7 +2992,7 @@ public final class Switchboard extends serverSwitch { "processResourceStack processCase=" + processCase + ", depth=" + response.depth() + ", maxDepth=" + ((response.profile() == null) ? "null" : Integer.toString(response.profile().depth())) - + ", must-match=" + ((response.profile() == null) ? "null" : response.profile().urlMustMatchPattern().toString()) + + ", must-match=" + ((response.profile() == null) ? "null" : response.profile().formattedUrlMustMatchPattern()) + ", must-not-match=" + ((response.profile() == null) ? "null" : response.profile().urlMustNotMatchPattern().toString()) + ", initiatorHash=" + ((response.initiator() == null) ? "null" : ASCII.String(response.initiator())) + ", url=" + response.url()); // DEBUG