diff --git a/build.properties b/build.properties
index 9f79b035f..25b7fe2d2 100644
--- a/build.properties
+++ b/build.properties
@@ -3,11 +3,11 @@ javacSource=1.4
 javacTarget=1.4
 
 # Release Configuration
-releaseVersion=0.391
-#releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
-releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
-#releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}
-releaseDir=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}
+releaseVersion=0.392
+releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
+#releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
+releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}
+#releaseDir=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}
 releaseNr=$Revision$
 
 # defining some file/directory access rights
diff --git a/htroot/IndexCreateWWWGlobalQueue_p.java b/htroot/IndexCreateWWWGlobalQueue_p.java
index e174e3dd2..477cff470 100644
--- a/htroot/IndexCreateWWWGlobalQueue_p.java
+++ b/htroot/IndexCreateWWWGlobalQueue_p.java
@@ -70,11 +70,15 @@ public class IndexCreateWWWGlobalQueue_p {
         if (post != null) {
             if (post.containsKey("clearcrawlqueue")) {
                 String urlHash;
+                int c = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT);
+                switchboard.urlPool.noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_LIMIT);
+                /*
                 int c = 0;
                 while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) > 0) {
                     urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT).hash();
                     if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; }
                 }
+                */
                 prop.put("info", 3);//crawling queue cleared
                 prop.put("info_numEntries", c);
             }
diff --git a/htroot/IndexCreateWWWLocalQueue_p.java b/htroot/IndexCreateWWWLocalQueue_p.java
index 7605bb8b3..f0d2d3dad 100644
--- a/htroot/IndexCreateWWWLocalQueue_p.java
+++ b/htroot/IndexCreateWWWLocalQueue_p.java
@@ -70,11 +70,8 @@ public class IndexCreateWWWLocalQueue_p {
         if (post != null) {
             if (post.containsKey("clearcrawlqueue")) {
                 String urlHash;
-                int c = 0;
-                while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) > 0) {
-                    urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_CORE).hash();
-                    if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; }
-                }
+                int c = switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE);
+                switchboard.urlPool.noticeURL.clear(plasmaCrawlNURL.STACK_TYPE_CORE);
                 prop.put("info", 3);//crawling queue cleared
                 prop.put("info_numEntries", c);
             }
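The two IndexCreateWWW*Queue_p servlets above now read the stack size once and empty the whole backing stack in a single call instead of popping and removing entry by entry. A minimal standalone sketch of that count-then-clear pattern, with a java.util.ArrayDeque standing in for the kelondro-backed crawl stack (class and method names here are illustrative, not YaCy's):

    import java.util.ArrayDeque;
    import java.util.Deque;

    // Illustrative stand-in for a crawl queue; not the YaCy implementation.
    public class CrawlQueueSketch {
        private final Deque<String> stack = new ArrayDeque<>();

        public void push(String urlHash) { stack.push(urlHash); }

        // Old style: pop every entry just to count and discard it (one pop per entry).
        public int clearByPopping() {
            int c = 0;
            while (!stack.isEmpty()) { stack.pop(); c++; }
            return c;
        }

        // New style: remember the size, then drop everything at once.
        public int clearAtOnce() {
            int c = stack.size();
            stack.clear();
            return c;
        }

        public static void main(String[] args) {
            CrawlQueueSketch q = new CrawlQueueSketch();
            q.push("hashA"); q.push("hashB"); q.push("hashC");
            System.out.println("cleared " + q.clearAtOnce() + " entries");
        }
    }

The servlet still reports the number of cleared entries, so the previous stack size is captured before the clear call.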
diff --git a/htroot/ProxyIndexingMonitor_p.html b/htroot/ProxyIndexingMonitor_p.html
index d461d3a47..e61c318ac 100644
--- a/htroot/ProxyIndexingMonitor_p.html
+++ b/htroot/ProxyIndexingMonitor_p.html
@@ -12,7 +12,8 @@
 This is the control page for web pages that your peer has indexed during the current application run-time
 as result of proxy fetch/prefetch.
 No personal or protected page is indexed;
-those pages are detected by Cookie-Use or POST-Parameters (either in URL or as HTTP protocol)
+those pages are detected by properties in the HTTP header (like Cookie-Use, or HTTP Authorization)
+or by POST-Parameters (either in URL or as HTTP protocol)
 and automatically excluded from indexing.
@@ -54,46 +55,8 @@ Please delete that file and restart.
 An error has occurred: #[error]#.
 #(/info)#
-Snapshot of recently indexed web pages that passed the proxy:
-#(table4)#
-The stack is empty.
-::
-#(size)#
-Showing all #[all]# entries in this stack.
-::
-Showing latest #[count]# lines from a stack of #[all]# entries.
-#(/size)#
-#(showInit)#::#(/showInit)#
-#(showExec)#::#(/showExec)#
-Initiator Executor Modified Date #Words Title URL
-#{indexed}#
-#(showInit)#::#(/showInit)#
-#(showExec)#::#(/showExec)#
-#[initiatorSeed]# #[executorSeed]# #[moddate]# #[wordcount]# #[urldescr]# #[url]#
-#{/indexed}#
-#(/table4)#
+You can see a snapshot of recently indexed pages
+on the Proxy Index Monitor Page.
 #[footer]#
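The page text above states that personal or protected pages are recognized from HTTP header properties (cookie use, HTTP authorization) or from POST/URL parameters and are skipped by the indexer. A rough, self-contained sketch of such a test; the method, its inputs and the exact rules are assumptions for illustration, not the actual proxy-handler logic:

    import java.util.Map;

    // Illustrative only: a simplified "is this page personal/protected?" check.
    public class IndexingExclusionSketch {

        // requestHeaders: lower-cased header names -> values; query: URL query string or null.
        public static boolean excludeFromIndexing(String method, Map<String, String> requestHeaders, String query) {
            if (requestHeaders.containsKey("cookie")) return true;          // cookie in use
            if (requestHeaders.containsKey("authorization")) return true;   // HTTP auth in use
            if ("POST".equalsIgnoreCase(method)) return true;               // POST parameters
            if (query != null && query.length() > 0) return true;           // parameters in the URL
            return false;
        }

        public static void main(String[] args) {
            System.out.println(excludeFromIndexing("GET", Map.of("cookie", "sid=42"), null)); // true
            System.out.println(excludeFromIndexing("GET", Map.of(), null));                   // false
        }
    }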
diff --git a/htroot/ProxyIndexingMonitor_p.java b/htroot/ProxyIndexingMonitor_p.java
index fb9b7fcca..6ea858a14 100644
--- a/htroot/ProxyIndexingMonitor_p.java
+++ b/htroot/ProxyIndexingMonitor_p.java
@@ -74,21 +74,7 @@ public class ProxyIndexingMonitor_p {
         prop.put("info_message", "");
 
         if (post != null) {
-            if (post.containsKey("clearlist4")) switchboard.urlPool.loadedURL.clearStack(4); // local: by proxy crawl
-            if (post.containsKey("deleteentry")) {
-                String hash = post.get("hash", null);
-                if (hash != null) {
-                    // delete from database
-                    switchboard.urlPool.loadedURL.remove(hash);
-                }
-            }
-
-            if (post.containsKey("moreIndexed")) {
-                showIndexedCount = Integer.parseInt(post.get("showIndexed", "40"));
-            }
-            if (post.get("se") != null) se = true;
-
             if (post.containsKey("proxyprofileset")) try {
                 // read values and put them in global settings
                 int newProxyPrefetchDepth = Integer.parseInt((String) post.get("proxyPrefetchDepth", "0"));
@@ -121,10 +107,6 @@ public class ProxyIndexingMonitor_p {
             }
         }
 
-        // create tables
-        String myname = yacyCore.seedDB.mySeed.getName();
-        prop.putAll(switchboard.urlPool.loadedURL.genTableProps(4, showIndexedCount, false, false, "proxy", null, "ProxyIndexingMonitor_p.html", true));
-
         prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
         prop.put("proxyStoreHTCacheChecked", env.getConfig("proxyStoreHTCache", "").equals("true") ? 1 : 0);
         // return rewrite properties
diff --git a/htroot/index.rss b/htroot/index.rss
index 225fc8e3c..a390d0ba1 100644
--- a/htroot/index.rss
+++ b/htroot/index.rss
@@ -12,6 +12,7 @@
       #[description]#
       #[url]#
+      #(snippet)#::#[text]##(/snippet)#
       #[date]#
     #{/results}#
diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java
index 07f49a3ef..6dc9be0e6 100644
--- a/source/de/anomic/http/httpdProxyHandler.java
+++ b/source/de/anomic/http/httpdProxyHandler.java
@@ -563,6 +563,7 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
                     res.writeContent(hfos, cacheFile);
                     if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
                     this.theLogger.logDebug("for write-file of " + url + ": contentLength = " + contentLength + ", sizeBeforeDelete = " + sizeBeforeDelete);
+                    cacheManager.writeFileAnnouncement(cacheFile);
                     if (sizeBeforeDelete == -1) {
                         // totally fresh file
                         //cacheEntry.status = plasmaHTCache.CACHE_FILL; // it's an insert
diff --git a/source/de/anomic/kelondro/kelondroMergeIterator.java b/source/de/anomic/kelondro/kelondroMergeIterator.java
index c2abb8f90..e3b69844b 100644
--- a/source/de/anomic/kelondro/kelondroMergeIterator.java
+++ b/source/de/anomic/kelondro/kelondroMergeIterator.java
@@ -44,6 +44,7 @@ package de.anomic.kelondro;
 import java.util.Comparator;
 import java.util.Iterator;
 import java.util.Set;
+import java.util.ConcurrentModificationException;
 
 public class kelondroMergeIterator implements Iterator {
 
@@ -72,10 +73,18 @@ public class kelondroMergeIterator implements Iterator {
     }
 
     private void nexta() {
-        if (a.hasNext()) na = (String) a.next(); else na = null;
+        try {
+            if (a.hasNext()) na = (String) a.next(); else na = null;
+        } catch (ConcurrentModificationException e) {
+            na = null;
+        }
     }
 
     private void nextb() {
-        if (b.hasNext()) nb = (String) b.next(); else nb = null;
+        try {
+            if (b.hasNext()) nb = (String) b.next(); else nb = null;
+        } catch (ConcurrentModificationException e) {
+            nb = null;
+        }
     }
 
     public boolean hasNext() {
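The kelondroMergeIterator hunk above wraps each advance of the two source iterators in a try/catch so that a ConcurrentModificationException simply ends that source instead of aborting the merge. A self-contained sketch of the same idea for two sorted String iterators (a generic re-implementation, not the kelondro class):

    import java.util.ConcurrentModificationException;
    import java.util.Iterator;
    import java.util.List;

    // Merges two sorted iterators; a ConcurrentModificationException just ends that branch.
    public class MergeIteratorSketch implements Iterator<String> {
        private final Iterator<String> a, b;
        private String na, nb;

        public MergeIteratorSketch(Iterator<String> a, Iterator<String> b) {
            this.a = a; this.b = b;
            nexta(); nextb();
        }

        private void nexta() {
            try { na = a.hasNext() ? a.next() : null; }
            catch (ConcurrentModificationException e) { na = null; } // treat as exhausted
        }

        private void nextb() {
            try { nb = b.hasNext() ? b.next() : null; }
            catch (ConcurrentModificationException e) { nb = null; }
        }

        public boolean hasNext() { return na != null || nb != null; }

        public String next() {
            String s;
            if (nb == null || (na != null && na.compareTo(nb) <= 0)) {
                s = na;
                if (na != null && na.equals(nb)) nextb(); // skip duplicates
                nexta();
            } else {
                s = nb;
                nextb();
            }
            return s;
        }

        public static void main(String[] args) {
            Iterator<String> m = new MergeIteratorSketch(List.of("a", "c", "e").iterator(), List.of("b", "c", "d").iterator());
            while (m.hasNext()) System.out.print(m.next() + " "); // a b c d e
        }
    }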
diff --git a/source/de/anomic/kelondro/kelondroRecords.java b/source/de/anomic/kelondro/kelondroRecords.java
index 9b956cd85..bf4ec92f5 100644
--- a/source/de/anomic/kelondro/kelondroRecords.java
+++ b/source/de/anomic/kelondro/kelondroRecords.java
@@ -239,6 +239,17 @@ public class kelondroRecords {
         // thats it!
     }
 
+    public void clear() throws IOException {
+        // Removes all mappings from this map
+        //throw new UnsupportedOperationException("clear not supported");
+        USEDC = 0;
+        FREEC = 0;
+        FREEH = new Handle(NUL);
+        entryFile.seek(POS_USEDC); entryFile.writeInt(this.USEDC);
+        entryFile.seek(POS_FREEC); entryFile.writeInt(this.FREEC);
+        entryFile.seek(POS_FREEH); entryFile.writeInt(this.FREEH.index);
+    }
+
     public kelondroRecords(File file, long buffersize) throws IOException{
         // opens an existing tree
         if (!file.exists()) throw new IOException("kelondroRecords: file " + file.getAbsoluteFile().toString() + " does not exist");
@@ -776,11 +787,6 @@ public class kelondroRecords {
         return TXTPROPS[pos];
     }
 
-    // Removes all mappings from this map (optional operation).
-    public void clear() {
-        throw new UnsupportedOperationException("clear not supported");
-    }
-
     // Returns true if this map contains no key-value mappings.
     public boolean isEmpty() {
         return (USEDC == 0);
diff --git a/source/de/anomic/kelondro/kelondroStack.java b/source/de/anomic/kelondro/kelondroStack.java
index aaa3b2ec4..6860026d3 100644
--- a/source/de/anomic/kelondro/kelondroStack.java
+++ b/source/de/anomic/kelondro/kelondroStack.java
@@ -82,6 +82,12 @@ public class kelondroStack extends kelondroRecords {
         super(file, buffersize);
     }
 
+    public void clear() throws IOException {
+        super.clear();
+        setHandle(root, null); // reset the root value
+        setHandle(toor, null); // reset the toor value
+    }
+
     public class Counter implements Iterator {
         Handle nextHandle = null;
         public Counter() throws IOException {
diff --git a/source/de/anomic/kelondro/kelondroTree.java b/source/de/anomic/kelondro/kelondroTree.java
index fdb4c1719..76e0050c3 100644
--- a/source/de/anomic/kelondro/kelondroTree.java
+++ b/source/de/anomic/kelondro/kelondroTree.java
@@ -119,6 +119,11 @@ public class kelondroTree extends kelondroRecords implements Comparator {
         super(ra, buffersize);
     }
 
+    public void clear() throws IOException {
+        super.clear();
+        setHandle(root, null); // reset the root value
+    }
+
     // Returns the value to which this map maps the specified key.
     public synchronized byte[][] get(byte[] key) throws IOException {
         //System.out.println("kelondroTree.get " + new String(key) + " in " + filename);
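kelondroRecords.clear() above resets the used/free counters in the file header instead of deleting records one by one, and kelondroStack/kelondroTree override it to also reset their root handles. A toy sketch of that header-reset idea with RandomAccessFile; the file layout and constants below are invented for illustration:

    import java.io.File;
    import java.io.IOException;
    import java.io.RandomAccessFile;

    // Toy record store: header = [usedCount:int][freeHead:int], followed by fixed-size records.
    public class RecordFileSketch {
        private static final long POS_USEDC = 0, POS_FREEH = 4;
        private final RandomAccessFile entryFile;

        public RecordFileSketch(File f) throws IOException {
            entryFile = new RandomAccessFile(f, "rw");
            if (entryFile.length() < 8) { entryFile.writeInt(0); entryFile.writeInt(-1); } // fresh header
        }

        // "clear" = declare all records unused by rewriting the header; record bytes stay in place.
        public void clear() throws IOException {
            entryFile.seek(POS_USEDC); entryFile.writeInt(0);
            entryFile.seek(POS_FREEH); entryFile.writeInt(-1);
        }

        public int size() throws IOException {
            entryFile.seek(POS_USEDC);
            return entryFile.readInt();
        }

        public static void main(String[] args) throws IOException {
            RecordFileSketch r = new RecordFileSketch(new File("records.sketch"));
            r.clear();
            System.out.println("records after clear: " + r.size()); // 0
        }
    }

A stack or tree built on such a store would override clear() to call super.clear() and then reset its own root/top handles, which is what the kelondroStack and kelondroTree hunks do.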
diff --git a/source/de/anomic/plasma/plasmaCrawlNURL.java b/source/de/anomic/plasma/plasmaCrawlNURL.java
index 852850ff7..4ca9b5ddb 100644
--- a/source/de/anomic/plasma/plasmaCrawlNURL.java
+++ b/source/de/anomic/plasma/plasmaCrawlNURL.java
@@ -250,6 +250,21 @@ public class plasmaCrawlNURL extends plasmaURL {
         }
     }
 
+    public void clear(int stackType) {
+        try {
+            switch (stackType) {
+                case STACK_TYPE_CORE:     coreStack.clear(); break;
+                case STACK_TYPE_LIMIT:    limitStack.clear(); break;
+                case STACK_TYPE_OVERHANG: overhangStack.clear(); break;
+                case STACK_TYPE_REMOTE:   remoteStack.clear(); break;
+                case STACK_TYPE_IMAGE:    imageStack.clear(); break;
+                case STACK_TYPE_MOVIE:    movieStack.clear(); break;
+                case STACK_TYPE_MUSIC:    musicStack.clear(); break;
+                default: return;
+            }
+        } catch (IOException e) {}
+    }
+
     private Entry pop(kelondroStack stack) {
         // this is a filo - pop
         try {
diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java
index 49fa12efe..47fa84db4 100644
--- a/source/de/anomic/plasma/plasmaHTCache.java
+++ b/source/de/anomic/plasma/plasmaHTCache.java
@@ -254,7 +254,8 @@ public final class plasmaHTCache {
                        " FILES = " + currCacheSize/1048576 + "MB, OLDEST IS " + ((ageHours < 24) ? (ageHours + " HOURS") : ((ageHours / 24) + " DAYS")) + " OLD");
-
+        cleanup();
+
         // start to prefetch ip's from dns
         String dom;
         long start = System.currentTimeMillis();
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 78360dcf0..9cfaccccb 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -380,9 +380,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
         indexDistribution = new plasmaWordIndexDistribution(urlPool, wordIndex, log,
                 getConfig("allowDistributeIndex", "false").equals("true"));
-        indexDistribution.setCounts(100, 1, 8000);
-        deployThread("20_dhtdistribution", "DHT Distribution (currently by juniors only)", "selection, transfer and deletion of index entries that are not searched on your peer, but on others", null,
-                new serverInstantThread(indexDistribution, "job", null), 120000);
+        indexDistribution.setCounts(100, 1, 3, 8000);
+        deployThread("20_dhtdistribution", "DHT Distribution", "selection, transfer and deletion of index entries that are not searched on your peer, but on others", null,
+                new serverInstantThread(indexDistribution, "job", null), 12000);
 
         // init migratiion from 0.37 -> 0.38
         classicCache = new plasmaWordIndexClassicCacheMigration(plasmaPath, wordIndex);
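The plasmaCrawlNURL hunk above adds clear(int stackType), which dispatches on a stack-type constant and empties the matching backing stack. A generic sketch of that dispatch using in-memory deques keyed by type (the constants and the registry class are illustrative, not YaCy's):

    import java.util.ArrayDeque;
    import java.util.Deque;
    import java.util.HashMap;
    import java.util.Map;

    // Dispatches clear() to one of several queues identified by an int constant.
    public class StackRegistrySketch {
        public static final int STACK_TYPE_CORE = 0, STACK_TYPE_LIMIT = 1, STACK_TYPE_REMOTE = 2;

        private final Map<Integer, Deque<String>> stacks = new HashMap<>();

        public StackRegistrySketch() {
            stacks.put(STACK_TYPE_CORE, new ArrayDeque<>());
            stacks.put(STACK_TYPE_LIMIT, new ArrayDeque<>());
            stacks.put(STACK_TYPE_REMOTE, new ArrayDeque<>());
        }

        public void push(int stackType, String urlHash) {
            Deque<String> s = stacks.get(stackType);
            if (s != null) s.push(urlHash);
        }

        public int stackSize(int stackType) {
            Deque<String> s = stacks.get(stackType);
            return (s == null) ? 0 : s.size();
        }

        // Unknown types are ignored, mirroring the 'default: return;' branch in the patch.
        public void clear(int stackType) {
            Deque<String> s = stacks.get(stackType);
            if (s != null) s.clear();
        }

        public static void main(String[] args) {
            StackRegistrySketch r = new StackRegistrySketch();
            r.push(STACK_TYPE_LIMIT, "hashA");
            r.clear(STACK_TYPE_LIMIT);
            System.out.println(r.stackSize(STACK_TYPE_LIMIT)); // 0
        }
    }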
diff --git a/source/de/anomic/plasma/plasmaWordIndexDistribution.java b/source/de/anomic/plasma/plasmaWordIndexDistribution.java
index 31202c413..45ecb9535 100644
--- a/source/de/anomic/plasma/plasmaWordIndexDistribution.java
+++ b/source/de/anomic/plasma/plasmaWordIndexDistribution.java
@@ -17,75 +17,99 @@ import de.anomic.kelondro.kelondroException;
 
 public class plasmaWordIndexDistribution {
 
-    // distributes parts of the index to other peers
-    // stops as soon as an error occurrs
-
-    private int indexCount;
-    private int peerCount;
-    private long maxTime;
-
-    private plasmaURLPool urlPool;
-    private plasmaWordIndex wordIndex;
-    private serverLog log;
-    private boolean enabled;
-
-    public plasmaWordIndexDistribution(plasmaURLPool urlPool, plasmaWordIndex wordIndex, serverLog log,
-                                       boolean enable) {
-        this.urlPool = urlPool;
-        this.wordIndex = wordIndex;
-        setCounts(100 /*indexCount*/, 1 /*peerCount*/, 8000);
-    }
+    // distributes parts of the index to other peers
+    // stops as soon as an error occurrs
+
+    private int indexCount;
+    private int juniorPeerCount, seniorPeerCount;
+    private long maxTime;
+
+    private plasmaURLPool urlPool;
+    private plasmaWordIndex wordIndex;
+    private serverLog log;
+    private boolean enabled;
+
+    public plasmaWordIndexDistribution(plasmaURLPool urlPool, plasmaWordIndex wordIndex, serverLog log,
+                                       boolean enable) {
+        this.urlPool = urlPool;
+        this.wordIndex = wordIndex;
+        this.enabled = enable;
+        this.log = log;
+        setCounts(100 /*indexCount*/, 1 /*juniorPeerCount*/, 3 /*seniorPeerCount*/, 8000);
+    }
+
+    public void enable() {
+        enabled = true;
+    }
+
+    public void disable() {
+        enabled = false;
+    }
+
+    public boolean job() {
 
-    public void enable() {
-        enabled = true;
+        if (yacyCore.seedDB == null) {
+            log.logDebug("no word distribution: seedDB == null");
+            return false;
         }
-
-    public void disable() {
-        enabled = false;
+        if (yacyCore.seedDB.mySeed == null) {
+            log.logDebug("no word distribution: mySeed == null");
+            return false;
+        }
+        if (yacyCore.seedDB.mySeed.isVirgin()) {
+            log.logDebug("no word distribution: status is virgin");
+            return false;
+        }
+        if (!(enabled)) {
+            log.logDebug("no word distribution: not enabled");
+            return false;
+        }
+        if (urlPool.loadedURL.size() < 10) {
+            log.logDebug("no word distribution: loadedURL.size() = " + urlPool.loadedURL.size());
+            return false;
+        }
+        if (wordIndex.size() < 100) {
+            log.logDebug("no word distribution: not enough words - wordIndex.size() = " + wordIndex.size());
+            return false;
+        }
+        if (urlPool.noticeURL.stackSize() > 0) {
+            log.logDebug("no word distribution: crawl in progress - noticeURL.stackSize() = " + urlPool.noticeURL.stackSize());
+            return false;
         }
 
-    public boolean job() {
-        if ((yacyCore.seedDB == null) ||
-            (yacyCore.seedDB.mySeed == null) ||
-            (yacyCore.seedDB.mySeed.isVirgin()) ||
-            (urlPool.loadedURL.size() < 10) ||
-            (wordIndex.size() < 100) ||
-            (!(yacyCore.seedDB.mySeed.isJunior()))) return false;
+        // do the transfer
+        int peerCount = (yacyCore.seedDB.mySeed.isJunior()) ? juniorPeerCount : seniorPeerCount;
+        long starttime = System.currentTimeMillis();
+        int transferred = performTransferIndex(indexCount, peerCount, true);
+
+        if (transferred <= 0) {
+            log.logDebug("no word distribution: transfer failed");
+            return false;
+        }
 
-        int transferred;
-        long starttime = System.currentTimeMillis();
-        try {
-            if (
-                (urlPool.noticeURL.stackSize() == 0) &&
-                (enabled) &&
-                ((transferred = performTransferIndex(indexCount, peerCount, true)) > 0)) {
-                indexCount = transferred;
-                if ((System.currentTimeMillis() - starttime) > (maxTime * peerCount)) indexCount--; else indexCount++;
-                if (indexCount < 30) indexCount = 30;
-                return true;
-            } else {
-                // make a long pause
-                return false;
-            }
-        } catch (IllegalArgumentException ee) {
-            // this is a bug that occurres if a not-fixeable data-inconsistency in the table structure was detected
-            // make a long pause
-            log.logError("very bad data inconsistency: " + ee.getMessage());
-            //ee.printStackTrace();
-            return false;
-        }
-    }
+        // adopt transfer count
+        if ((System.currentTimeMillis() - starttime) > (maxTime * peerCount))
+            indexCount--;
+        else
+            indexCount++;
+        if (indexCount < 30) indexCount = 30;
 
-    public void setCounts(int indexCount, int peerCount, long maxTimePerTransfer) {
-        this.maxTime = maxTimePerTransfer;
-        this.indexCount = indexCount;
-        if (indexCount < 30) indexCount = 30;
-        this.peerCount = peerCount;
-    }
+        // show success
+        return true;
+
+    }
+
+    public void setCounts(int indexCount, int juniorPeerCount, int seniorPeerCount, long maxTimePerTransfer) {
+        this.maxTime = maxTimePerTransfer;
+        this.indexCount = indexCount;
+        if (indexCount < 30) indexCount = 30;
+        this.juniorPeerCount = juniorPeerCount;
+        this.seniorPeerCount = seniorPeerCount;
+    }
+
+    public int performTransferIndex(int indexCount, int peerCount, boolean delete) {
+        if ((yacyCore.seedDB == null) || (yacyCore.seedDB.sizeConnected() == 0)) return -1;
 
-    public int performTransferIndex(int indexCount, int peerCount, boolean delete) {
-        if ((yacyCore.seedDB == null) || (yacyCore.seedDB.sizeConnected() == 0)) return -1;
-
         // collect index
         //String startPointHash = yacyCore.seedCache.mySeed.hash;
         String startPointHash = serverCodings.encodeMD5B64("" + System.currentTimeMillis(), true).substring(0, yacySeedDB.commonHashLength);
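The rewritten job() above trades one large boolean condition for a chain of guard clauses, each of which logs the concrete reason why no distribution takes place before returning false. A compact standalone sketch of that structure; the fields, thresholds and logger are stand-ins, not the YaCy classes:

    import java.util.logging.Logger;

    // Guard-clause style: every early return names its reason in the log.
    public class DistributionJobSketch {
        private static final Logger log = Logger.getLogger("DIST");

        private final boolean enabled;
        private final int loadedUrls, wordCount, crawlQueueSize;

        public DistributionJobSketch(boolean enabled, int loadedUrls, int wordCount, int crawlQueueSize) {
            this.enabled = enabled;
            this.loadedUrls = loadedUrls;
            this.wordCount = wordCount;
            this.crawlQueueSize = crawlQueueSize;
        }

        public boolean job() {
            if (!enabled) { log.fine("no word distribution: not enabled"); return false; }
            if (loadedUrls < 10) { log.fine("no word distribution: loadedURL.size() = " + loadedUrls); return false; }
            if (wordCount < 100) { log.fine("no word distribution: not enough words - " + wordCount); return false; }
            if (crawlQueueSize > 0) { log.fine("no word distribution: crawl in progress - " + crawlQueueSize); return false; }
            // ... perform the transfer here ...
            return true;
        }

        public static void main(String[] args) {
            System.out.println(new DistributionJobSketch(true, 50, 500, 0).job()); // true
        }
    }

The per-condition log lines pair with the yacy.logging change at the end of the patch, which raises YACY.level to FINEST so these debug messages actually show up.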
@@ -140,18 +164,18 @@ public class plasmaWordIndexDistribution {
                     return -1;
                 }
             } else {
-                // simply close the indexEntities
-                for (int i = 0; i < indexEntities.length; i++) try {
-                    indexEntities[i].close();
-                } catch (IOException ee) {}
-            }
+                // simply close the indexEntities
+                for (int i = 0; i < indexEntities.length; i++) try {
+                    indexEntities[i].close();
+                } catch (IOException ee) {}
+            }
             return indexCount;
         } else {
             log.logError("Index distribution failed. Too less peers (" + hc + ") received the index, not deleted locally.");
             return -1;
         }
     }
-
+
     private plasmaWordIndexEntity[] selectTransferIndexes(String hash, int count) {
         Vector tmpEntities = new Vector();
         String nexthash = "";
@@ -161,7 +185,7 @@ public class plasmaWordIndexDistribution {
         Enumeration urlEnum;
         plasmaWordIndexEntry indexEntry;
         while ((count > 0) && (wordHashIterator.hasNext()) &&
-               ((nexthash = (String) wordHashIterator.next()) != null) && (nexthash.trim().length() > 0)) {
+               ((nexthash = (String) wordHashIterator.next()) != null) && (nexthash.trim().length() > 0)) {
             indexEntity = wordIndex.getEntity(nexthash, true);
             if (indexEntity.size() == 0) {
                 indexEntity.deleteComplete();
@@ -229,7 +253,7 @@ public class plasmaWordIndexDistribution {
                 /*
                 if (wordIndex.getEntity(indexEntities[i].wordHash()).deleteComplete())
                     System.out.println("DEBUG: trial delete of partial word index " + indexEntities[i].wordHash() + " SUCCESSFULL");
-                else
+                else
                     System.out.println("DEBUG: trial delete of partial word index " + indexEntities[i].wordHash() + " FAILED");
                 */
                 // end debug
@@ -238,7 +262,7 @@ public class plasmaWordIndexDistribution {
                 // delete complete file
                 if (indexEntities[i].deleteComplete()) {
                     indexEntities[i].close();
-                } else {
+                } else {
                     indexEntities[i].close();
                     // have another try...
                     if (!(plasmaWordIndexEntity.wordHash2path(wordIndex.getRoot() /*PLASMADB*/, indexEntities[i].wordHash()).delete())) {
@@ -247,7 +271,7 @@ public class plasmaWordIndexDistribution {
                     }
                 }
             }
-            indexEntities[i] = null;
+            indexEntities[i] = null;
         }
         return success;
     }
diff --git a/yacy.logging b/yacy.logging
index d6be2159f..08a96ec74 100644
--- a/yacy.logging
+++ b/yacy.logging
@@ -12,7 +12,7 @@
 # INFO    regular action information (i.e. any httpd request URL)
 # FINEST  in-function status debug output
 PARSER.level = INFO
-YACY.level = INFO
+YACY.level = FINEST
 HTCACHE.level = INFO
 PLASMA.level = FINEST
 SERVER.level = INFO
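The successful path of job() also tunes the batch size: if the last transfer took longer than maxTime per contacted peer, indexCount is decreased, otherwise increased, and it never drops below 30. A standalone sketch of that feedback rule (the field names follow the patch; the enclosing class and defaults are invented):

    // Adjusts the number of index entries sent per round based on how long the last round took.
    public class TransferBatchSizeSketch {
        private int indexCount = 100;       // entries per transfer round
        private final long maxTime = 8000;  // allowed milliseconds per peer

        public int adjust(long elapsedMillis, int peerCount) {
            if (elapsedMillis > maxTime * peerCount) indexCount--; else indexCount++;
            if (indexCount < 30) indexCount = 30;
            return indexCount;
        }

        public static void main(String[] args) {
            TransferBatchSizeSketch s = new TransferBatchSizeSketch();
            System.out.println(s.adjust(5000, 3));  // fast round: grows to 101
            System.out.println(s.adjust(30000, 3)); // slow round: shrinks back to 100
        }
    }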