@@ -60,10 +60,6 @@ import de.anomic.server.serverSwitch;
 import de.anomic.yacy.yacyNewsPool;
 public class Crawler_p {
-public static final String CRAWLING_MODE_URL = "url";
-public static final String CRAWLING_MODE_FILE = "file";
-public static final String CRAWLING_MODE_SITEMAP = "sitemap";
 // this servlet does NOT create the Crawler servlet page content!
 // this servlet starts a web crawl. The interface for entering the web crawl parameters is in IndexCreate_p.html
@@ -102,10 +98,8 @@ public class Crawler_p {
 }
 prop.put("info", "0");
-if (post != null) {
-// a crawl start
-if (post.containsKey("continue")) {
+if (post != null && post.containsKey("continue")) {
 // continue queue
 final String queue = post.get("continue", "");
 if (queue.equals("localcrawler")) {
@@ -115,7 +109,7 @@
 }
 }
-if (post.containsKey("pause")) {
+if (post != null && post.containsKey("pause")) {
 // pause queue
 final String queue = post.get("pause", "");
 if (queue.equals("localcrawler")) {
@@ -125,7 +119,7 @@
 }
 }
-if (post.containsKey("crawlingstart")) {
+if (post != null && post.containsKey("crawlingstart")) {
 // init crawl
 if (sb.peers == null) {
 prop.put("info", "3");
@@ -135,7 +129,7 @@
 int pos = crawlingStart.indexOf("://");
 if (pos == -1) crawlingStart = "http://" + crawlingStart;
-// normalizing URL
+// normalize URL
 DigestURI crawlingStartURL = null;
 try { crawlingStartURL = new DigestURI(crawlingStart, null); } catch (final MalformedURLException e1) {}
 crawlingStart = (crawlingStartURL == null) ? null : crawlingStartURL.toNormalform(true, true);
@@ -145,7 +139,7 @@
 final boolean subPath = post.get("range", "wide").equals("subpath"); // special property in simple crawl start
-// set the crawling filter
+// set the crawl filter
 String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
 String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
 if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted
@@ -195,9 +189,6 @@
 // store just a protocol
 sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart);
 }
-final boolean crawlingDomFilterCheck = post.get("crawlingDomFilterCheck", "off").equals("on");
-final int crawlingDomFilterDepth = (crawlingDomFilterCheck) ? Integer.parseInt(post.get("crawlingDomFilterDepth", "-1")) : -1;
-env.setConfig("crawlingDomFilterDepth", Integer.toString(crawlingDomFilterDepth));
 final boolean crawlingDomMaxCheck = post.get("crawlingDomMaxCheck", "off").equals("on");
 final int crawlingDomMaxPages = (crawlingDomMaxCheck) ? Integer.parseInt(post.get("crawlingDomMaxPages", "-1")) : -1;
@@ -232,7 +223,7 @@
 env.setConfig("xpstopw", (xpstopw) ? "true" : "false");
 final String crawlingMode = post.get("crawlingMode", "url");
-if (crawlingMode.equals(CRAWLING_MODE_URL)) {
+if (crawlingMode.equals("url")) {
 // check if pattern matches
 if ((crawlingStart == null || crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
@@ -261,7 +252,7 @@
 newcrawlingMustMatch,
 newcrawlingMustNotMatch,
 newcrawlingdepth,
-crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
+crawlingIfOlder, crawlingDomMaxPages,
 crawlingQ,
 indexText, indexMedia,
 storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy);
@@ -346,31 +337,19 @@
 Log.logException(e);
 }
-} else if (crawlingMode.equals(CRAWLING_MODE_FILE)) {
+} else if (crawlingMode.equals("file")) {
 if (post.containsKey("crawlingFile")) {
-// getting the name of the uploaded file
 final String fileName = post.get("crawlingFile");
 try {
-// check if the crawl filter works correctly
 Pattern.compile(newcrawlingMustMatch);
-// loading the file content
 final File file = new File(fileName);
-// getting the content of the bookmark file
 final String fileString = post.get("crawlingFile$file");
-// parsing the bookmark file and fetching the headline and contained links
 final ContentScraper scraper = new ContentScraper(new DigestURI(file));
-//OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
 final Writer writer = new TransformerWriter(null, null, scraper, null, false);
 FileUtils.copy(fileString, writer);
 writer.close();
-//String headline = scraper.getHeadline();
 final Map<MultiProtocolURI, String> hyperlinks = scraper.getAnchors();
-// creating a crawler profile
 final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null);
 final CrawlProfile profile = new CrawlProfile(
 fileName, crawlURL,
@@ -378,7 +357,6 @@
 CrawlProfile.MATCH_NEVER,
 newcrawlingdepth,
 crawlingIfOlder,
-crawlingDomFilterDepth,
 crawlingDomMaxPages,
 crawlingQ,
 indexText,
@@ -389,19 +367,13 @@
 xsstopw, xdstopw, xpstopw,
 cachePolicy);
 sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
-// pause local crawl here
 sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
-// loop through the contained links
 final Iterator<Map.Entry<MultiProtocolURI, String>> linkiterator = hyperlinks.entrySet().iterator();
 DigestURI nexturl;
 while (linkiterator.hasNext()) {
 final Map.Entry<MultiProtocolURI, String> e = linkiterator.next();
 if (e.getKey() == null) continue;
 nexturl = new DigestURI(e.getKey());
-// enqueuing the url for crawling
 sb.crawlStacker.enqueueEntry(new Request(
 sb.peers.mySeed().hash.getBytes(),
 nexturl,
@@ -416,7 +388,6 @@
 }
 } catch (final PatternSyntaxException e) {
-// print error message
 prop.put("info", "4"); //crawlfilter does not match url
 prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
 prop.putHTML("info_error", e.getMessage());
@@ -429,31 +400,24 @@
 }
 sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
 }
-} else if (crawlingMode.equals(CRAWLING_MODE_SITEMAP)) {
-String sitemapURLStr = null;
+} else if (crawlingMode.equals("sitemap")) {
+String sitemapURLStr = post.get("sitemapURL", "");
 try {
-// getting the sitemap URL
-sitemapURLStr = post.get("sitemapURL", "");
 final DigestURI sitemapURL = new DigestURI(sitemapURLStr, null);
 // create a new profile
 final CrawlProfile pe = new CrawlProfile(
 sitemapURLStr, sitemapURL,
 newcrawlingMustMatch,
 CrawlProfile.MATCH_NEVER,
 newcrawlingdepth,
-crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
+crawlingIfOlder, crawlingDomMaxPages,
 crawlingQ,
 indexText, indexMedia,
 storeHTCache, true, crawlOrder,
 xsstopw, xdstopw, xpstopw,
 cachePolicy);
 sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
-// create a new sitemap importer
-final SitemapImporter importer = new SitemapImporter(sb, new DigestURI(sitemapURLStr, null), pe);
+final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, pe);
 importer.start();
 } catch (final Exception e) {
 // mist
 prop.put("info", "6"); //Error with url
@@ -461,14 +425,79 @@
 prop.putHTML("info_error", e.getMessage());
 Log.logException(e);
 }
+} else if (crawlingMode.equals("sitelist")) {
+try {
+final DigestURI sitelistURL = new DigestURI(crawlingStart, null);
+// download document
+ContentScraper scraper = null;
+scraper = sb.loader.parseResource(sitelistURL, CrawlProfile.CacheStrategy.IFFRESH);
+String title = scraper.getTitle();
+// String description = scraper.getDescription();
+// get links and generate filter
+StringBuilder filter = new StringBuilder();
+final Map<MultiProtocolURI, String> hyperlinks = scraper.getAnchors();
+for (MultiProtocolURI uri: hyperlinks.keySet()) {
+filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*");
+}
+newcrawlingMustMatch = filter.length() > 0 ? filter.substring(1) : "";
+// put links onto crawl queue
+final CrawlProfile profile = new CrawlProfile(
+title == null || title.length() == 0 ? sitelistURL.getHost() : title,
+sitelistURL,
+newcrawlingMustMatch,
+CrawlProfile.MATCH_NEVER,
+newcrawlingdepth,
+crawlingIfOlder,
+crawlingDomMaxPages,
+crawlingQ,
+indexText,
+indexMedia,
+storeHTCache,
+true,
+crawlOrder,
+xsstopw, xdstopw, xpstopw,
+cachePolicy);
+sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
+sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
+final Iterator<Map.Entry<MultiProtocolURI, String>> linkiterator = hyperlinks.entrySet().iterator();
+DigestURI nexturl;
+while (linkiterator.hasNext()) {
+final Map.Entry<MultiProtocolURI, String> e = linkiterator.next();
+if (e.getKey() == null) continue;
+nexturl = new DigestURI(e.getKey());
+// remove the url from the database to be prepared to crawl them again
+final byte[] urlhash = nexturl.hash();
+indexSegment.urlMetadata().remove(urlhash);
+sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
+sb.crawlQueues.errorURL.remove(urlhash);
+sb.crawlStacker.enqueueEntry(new Request(
+sb.peers.mySeed().hash.getBytes(),
+nexturl,
+null,
+e.getValue(),
+new Date(),
+profile.handle(),
+0,
+0,
+0
+));
+}
+} catch (final Exception e) {
+// mist
+prop.put("info", "6"); //Error with url
+prop.putHTML("info_crawlingStart", crawlingStart);
+prop.putHTML("info_error", e.getMessage());
+Log.logException(e);
+}
 }
 }
 }
-if (post.containsKey("crawlingPerformance")) {
+if (post != null && post.containsKey("crawlingPerformance")) {
 setPerformance(sb, post);
 }
-}
 // performance settings
 final long LCbusySleep = Integer.parseInt(env.getConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, "1000"));