fixed default must-match filter for full domain crawls - the old filter

was too restrictive and did not allow intranet crawls
pull/1/head
Michael Christen 13 years ago
parent 3e61287326
commit 22f05c83ff

@ -482,16 +482,16 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (crawlingStartURL.isFile()) { if (crawlingStartURL.isFile()) {
return "file://" + crawlingStartURL.getPath() + ".*"; return "file://" + crawlingStartURL.getPath() + ".*";
} else if (crawlingStartURL.isSMB()) { } else if (crawlingStartURL.isSMB()) {
return "smb://" + crawlingStartURL.getHost() + "(?:/|$)+.*"; return "smb://" + crawlingStartURL.getHost() + ".*";
} else if (crawlingStartURL.isFTP()) { } else if (crawlingStartURL.isFTP()) {
return "ftp://" + crawlingStartURL.getHost() + "(?:/|$)+.*"; return "ftp://" + crawlingStartURL.getHost() + ".*";
} else { } else {
final String host = crawlingStartURL.getHost(); final String host = crawlingStartURL.getHost();
if (host.startsWith("www.")) { if (host.startsWith("www.")) {
return "https?://" + crawlingStartURL.getHost() + "(?:/|$)+.*"; return "https?://" + crawlingStartURL.getHost() + ".*";
} else { } else {
// if the www is not given we accept that also // if the www is not given we accept that also
return "https?://(?:www.)?" + crawlingStartURL.getHost() + "(?:/|$)+.*"; return "https?://(?:www.)?" + crawlingStartURL.getHost() + ".*";
} }
} }
} }

@ -414,13 +414,13 @@ public final class CrawlStacker {
// filter with must-match for URLs // filter with must-match for URLs
if ((depth > 0) && !profile.urlMustMatchPattern().matcher(urlstring).matches()) { if ((depth > 0) && !profile.urlMustMatchPattern().matcher(urlstring).matches()) {
if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' does not match must-match crawling filter '" + profile.urlMustMatchPattern().toString() + "'."); if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' does not match must-match crawling filter '" + profile.urlMustMatchPattern().toString() + "'.");
return "url does not match must-match filter"; return "url does not match must-match filter " + profile.urlMustMatchPattern().toString();
} }
// filter with must-not-match for URLs // filter with must-not-match for URLs
if ((depth > 0) && profile.urlMustNotMatchPattern().matcher(urlstring).matches()) { if ((depth > 0) && profile.urlMustNotMatchPattern().matcher(urlstring).matches()) {
if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.urlMustNotMatchPattern().toString() + "'."); if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.urlMustNotMatchPattern().toString() + "'.");
return "url matches must-not-match filter"; return "url matches must-not-match filter " + profile.urlMustNotMatchPattern().toString();
} }
// deny cgi // deny cgi

Loading…
Cancel
Save