@@ -60,10 +60,6 @@ import de.anomic.server.serverSwitch;
 import de.anomic.yacy.yacyNewsPool;
 public class Crawler_p {
-public static final String CRAWLING_MODE_URL = "url";
-public static final String CRAWLING_MODE_FILE = "file";
-public static final String CRAWLING_MODE_SITEMAP = "sitemap";
 // this servlet does NOT create the Crawler servlet page content!
 // this servlet starts a web crawl. The interface for entering the web crawl parameters is in IndexCreate_p.html
@@ -102,10 +98,8 @@ public class Crawler_p {
 }
 prop.put("info", "0");
-if (post != null) {
-// a crawl start
-if (post.containsKey("continue")) {
+if (post != null && post.containsKey("continue")) {
 // continue queue
 final String queue = post.get("continue", "");
 if (queue.equals("localcrawler")) {
@@ -115,7 +109,7 @@
 }
 }
-if (post.containsKey("pause")) {
+if (post != null && post.containsKey("pause")) {
 // pause queue
 final String queue = post.get("pause", "");
 if (queue.equals("localcrawler")) {
@@ -125,7 +119,7 @@
 }
 }
-if (post.containsKey("crawlingstart")) {
+if (post != null && post.containsKey("crawlingstart")) {
 // init crawl
 if (sb.peers == null) {
 prop.put("info", "3");
@@ -135,7 +129,7 @@
 int pos = crawlingStart.indexOf("://");
 if (pos == -1) crawlingStart = "http://" + crawlingStart;
-// normalizing URL
+// normalize URL
 DigestURI crawlingStartURL = null;
 try { crawlingStartURL = new DigestURI(crawlingStart, null); } catch (final MalformedURLException e1) {}
 crawlingStart = (crawlingStartURL == null) ? null : crawlingStartURL.toNormalform(true, true);
@@ -145,7 +139,7 @@
 final boolean subPath = post.get("range", "wide").equals("subpath"); // special property in simple crawl start
-// set the crawling filter
+// set the crawl filter
 String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
 String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
 if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted
@@ -195,9 +189,6 @@
 // store just a protocol
 sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart);
 }
-final boolean crawlingDomFilterCheck = post.get("crawlingDomFilterCheck", "off").equals("on");
-final int crawlingDomFilterDepth = (crawlingDomFilterCheck) ? Integer.parseInt(post.get("crawlingDomFilterDepth", "-1")) : -1;
-env.setConfig("crawlingDomFilterDepth", Integer.toString(crawlingDomFilterDepth));
 final boolean crawlingDomMaxCheck = post.get("crawlingDomMaxCheck", "off").equals("on");
 final int crawlingDomMaxPages = (crawlingDomMaxCheck) ? Integer.parseInt(post.get("crawlingDomMaxPages", "-1")) : -1;
@@ -232,7 +223,7 @@
 env.setConfig("xpstopw", (xpstopw) ? "true" : "false");
 final String crawlingMode = post.get("crawlingMode", "url");
-if (crawlingMode.equals(CRAWLING_MODE_URL)) {
+if (crawlingMode.equals("url")) {
 // check if pattern matches
 if ((crawlingStart == null || crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
@@ -261,7 +252,7 @@
 newcrawlingMustMatch,
 newcrawlingMustNotMatch,
 newcrawlingdepth,
-crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
+crawlingIfOlder, crawlingDomMaxPages,
 crawlingQ,
 indexText, indexMedia,
 storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy);
@@ -346,31 +337,19 @@
 Log.logException(e);
 }
-} else if (crawlingMode.equals(CRAWLING_MODE_FILE)) {
+} else if (crawlingMode.equals("file")) {
 if (post.containsKey("crawlingFile")) {
-// getting the name of the uploaded file
 final String fileName = post.get("crawlingFile");
 try {
-// check if the crawl filter works correctly
 Pattern.compile(newcrawlingMustMatch);
-// loading the file content
 final File file = new File(fileName);
-// getting the content of the bookmark file
 final String fileString = post.get("crawlingFile$file");
-// parsing the bookmark file and fetching the headline and contained links
 final ContentScraper scraper = new ContentScraper(new DigestURI(file));
-//OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
 final Writer writer = new TransformerWriter(null, null, scraper, null, false);
 FileUtils.copy(fileString, writer);
 writer.close();
-//String headline = scraper.getHeadline();
 final Map<MultiProtocolURI, String> hyperlinks = scraper.getAnchors();
-// creating a crawler profile
 final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null);
 final CrawlProfile profile = new CrawlProfile(
 fileName, crawlURL,
@@ -378,7 +357,6 @@
 CrawlProfile.MATCH_NEVER,
 newcrawlingdepth,
 crawlingIfOlder,
-crawlingDomFilterDepth,
 crawlingDomMaxPages,
 crawlingQ,
 indexText,
@@ -389,19 +367,13 @@
 xsstopw, xdstopw, xpstopw,
 cachePolicy);
 sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
-// pause local crawl here
 sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
-// loop through the contained links
 final Iterator<Map.Entry<MultiProtocolURI, String>> linkiterator = hyperlinks.entrySet().iterator();
 DigestURI nexturl;
 while (linkiterator.hasNext()) {
 final Map.Entry<MultiProtocolURI, String> e = linkiterator.next();
 if (e.getKey() == null) continue;
 nexturl = new DigestURI(e.getKey());
-// enqueuing the url for crawling
 sb.crawlStacker.enqueueEntry(new Request(
 sb.peers.mySeed().hash.getBytes(),
 nexturl,
@@ -416,7 +388,6 @@
 }
 } catch (final PatternSyntaxException e) {
-// print error message
 prop.put("info", "4"); //crawlfilter does not match url
 prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
 prop.putHTML("info_error", e.getMessage());
@@ -429,31 +400,24 @@
 }
 sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
 }
-} else if (crawlingMode.equals(CRAWLING_MODE_SITEMAP)) {
-String sitemapURLStr = null;
+} else if (crawlingMode.equals("sitemap")) {
+String sitemapURLStr = post.get("sitemapURL", "");
 try {
-// getting the sitemap URL
-sitemapURLStr = post.get("sitemapURL", "");
 final DigestURI sitemapURL = new DigestURI(sitemapURLStr, null);
 // create a new profile
 final CrawlProfile pe = new CrawlProfile(
 sitemapURLStr, sitemapURL,
 newcrawlingMustMatch,
 CrawlProfile.MATCH_NEVER,
 newcrawlingdepth,
-crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
+crawlingIfOlder, crawlingDomMaxPages,
 crawlingQ,
 indexText, indexMedia,
 storeHTCache, true, crawlOrder,
 xsstopw, xdstopw, xpstopw,
 cachePolicy);
 sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
-// create a new sitemap importer
-final SitemapImporter importer = new SitemapImporter(sb, new DigestURI(sitemapURLStr, null), pe);
+final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, pe);
 importer.start();
 } catch (final Exception e) {
 // mist
 prop.put("info", "6"); //Error with url
@@ -461,14 +425,79 @@
 prop.putHTML("info_error", e.getMessage());
 Log.logException(e);
 }
+} else if (crawlingMode.equals("sitelist")) {
+try {
+final DigestURI sitelistURL = new DigestURI(crawlingStart, null);
+// download document
+ContentScraper scraper = null;
+scraper = sb.loader.parseResource(sitelistURL, CrawlProfile.CacheStrategy.IFFRESH);
+String title = scraper.getTitle();
+// String description = scraper.getDescription();
+// get links and generate filter
+StringBuilder filter = new StringBuilder();
+final Map<MultiProtocolURI, String> hyperlinks = scraper.getAnchors();
+for (MultiProtocolURI uri: hyperlinks.keySet()) {
+filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*");
+}
+newcrawlingMustMatch = filter.length() > 0 ? filter.substring(1) : "";
+// put links onto crawl queue
+final CrawlProfile profile = new CrawlProfile(
+title == null || title.length() == 0 ? sitelistURL.getHost() : title,
+sitelistURL,
+newcrawlingMustMatch,
+CrawlProfile.MATCH_NEVER,
+newcrawlingdepth,
+crawlingIfOlder,
+crawlingDomMaxPages,
+crawlingQ,
+indexText,
+indexMedia,
+storeHTCache,
+true,
+crawlOrder,
+xsstopw, xdstopw, xpstopw,
+cachePolicy);
+sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
+sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
+final Iterator<Map.Entry<MultiProtocolURI, String>> linkiterator = hyperlinks.entrySet().iterator();
+DigestURI nexturl;
+while (linkiterator.hasNext()) {
+final Map.Entry<MultiProtocolURI, String> e = linkiterator.next();
+if (e.getKey() == null) continue;
+nexturl = new DigestURI(e.getKey());
+// remove the url from the database to be prepared to crawl them again
+final byte[] urlhash = nexturl.hash();
+indexSegment.urlMetadata().remove(urlhash);
+sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
+sb.crawlQueues.errorURL.remove(urlhash);
+sb.crawlStacker.enqueueEntry(new Request(
+sb.peers.mySeed().hash.getBytes(),
+nexturl,
+null,
+e.getValue(),
+new Date(),
+profile.handle(),
+0,
+0,
+0
+));
+}
+} catch (final Exception e) {
+// mist
+prop.put("info", "6"); //Error with url
+prop.putHTML("info_crawlingStart", crawlingStart);
+prop.putHTML("info_error", e.getMessage());
+Log.logException(e);
+}
 }
 }
 }
-if (post.containsKey("crawlingPerformance")) {
+if (post != null && post.containsKey("crawlingPerformance")) {
 setPerformance(sb, post);
 }
-}
 // performance settings
 final long LCbusySleep = Integer.parseInt(env.getConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, "1000"));