Adjusted crawl depth control for FTP crawl start URLs.

pull/101/merge
luccioman 8 years ago
parent 68d4dc5cc5
commit c1401d821e

@@ -267,6 +267,7 @@ public final class CrawlStacker {
         final String pw = userInfo == null || p == -1 ? "anomic" : userInfo.substring(p + 1);
         final String host = ftpURL.getHost();
         final int port = ftpURL.getPort();
+        final int pathParts = ftpURL.getPaths().length;
         new Thread() {
             @Override
             public void run() {
@@ -290,6 +291,10 @@ public final class CrawlStacker {
                             cq.noticeURL.removeByURLHash(urlhash);
                         }
+                        /* Each entry is a child resource of the starting FTP URL:
+                         * take the sub-folder depth into account in the crawl depth control */
+                        int nextDepth = Math.max(0, url.getPaths().length - pathParts);
+
                         // put entry on crawl stack
                         enqueueEntry(new Request(
                                 initiator,
@@ -298,7 +303,7 @@ public final class CrawlStacker {
                                 MultiProtocolURL.unescape(entry.name),
                                 entry.date,
                                 profile.handle(),
-                                0,
+                                nextDepth,
                                 timezoneOffset));
                     }
                 } catch (final IOException e1) {
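
For context, a minimal sketch of the relative-depth computation this commit introduces, assuming a URL's "path parts" are simply its slash-separated path segments; the splitPath() helper below is a hypothetical stand-in for YaCy's MultiProtocolURL.getPaths():

import java.util.Arrays;

public class FtpDepthSketch {

    // Hypothetical stand-in for MultiProtocolURL.getPaths():
    // strips the scheme and host, then returns the non-empty path segments.
    static String[] splitPath(String url) {
        String path = url.replaceFirst("^[a-z]+://[^/]+", "");
        return Arrays.stream(path.split("/"))
                .filter(s -> !s.isEmpty())
                .toArray(String[]::new);
    }

    public static void main(String[] args) {
        String startUrl = "ftp://example.org/pub";           // crawl start URL
        String entryUrl = "ftp://example.org/pub/sub/file";  // a listed child entry

        int pathParts = splitPath(startUrl).length;                          // 1
        int nextDepth = Math.max(0, splitPath(entryUrl).length - pathParts); // 2

        // Before this commit every queued FTP entry got depth 0; now each
        // entry carries its depth relative to the crawl start folder, so the
        // configured crawl depth limit applies to FTP sub-folders as well.
        System.out.println("nextDepth = " + nextDepth);
    }
}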
