diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java index 7575fc545..e7b879580 100644 --- a/htroot/yacy/transferRWI.java +++ b/htroot/yacy/transferRWI.java @@ -126,19 +126,18 @@ public final class transferRWI { wordhashes[received] = wordHash; entry = new plasmaWordIndexEntry(estring.substring(p)); sb.wordIndex.addEntry(wordHash, entry, System.currentTimeMillis(), true); - //sb.wordIndex.addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry), System.currentTimeMillis(), true); serverCore.checkInterruption(); urlHash = entry.getUrlHash(); try { - if ( - (!(unknownURL.contains(urlHash))) && - (!(sb.urlPool.loadedURL.exists(urlHash))) - ) { + if ((!(unknownURL.contains(urlHash))) && + (!(sb.urlPool.loadedURL.exists(urlHash)))) { unknownURL.add(urlHash); } } catch (Exception ex) { - sb.getLog().logWarning("transferRWI: DB-Error while trying to determine if URL with hash '" + urlHash + "' is known.",ex); + sb.getLog().logWarning( + "transferRWI: DB-Error while trying to determine if URL with hash '" + + urlHash + "' is known.", ex); unknownURL.add(urlHash); } received++; diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index 5b1c6d8ab..11cf19e07 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -95,16 +95,17 @@ public final class transferURL { } else { lEntry = sb.urlPool.loadedURL.newEntry(urls, true); if ((lEntry != null) && (lEntry.url() != null)) { - if ( - (blockBlacklist) && - (plasmaSwitchboard.urlBlacklist.isListed(lEntry.url().getHost().toLowerCase(), lEntry.url().getPath())) - ){ - yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + lEntry.url() + "' from peer " + otherPeerName); + if ((blockBlacklist) && + (plasmaSwitchboard.urlBlacklist.isListed( lEntry.url().getHost().toLowerCase(), lEntry.url().getPath()))) { + int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash()); + yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + lEntry.url() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs"); lEntry = null; } else { sb.urlPool.loadedURL.addEntry(lEntry, iam, iam, 3); - yacyCore.log.logFine("transferURL: received URL '" + lEntry.url() + "' from peer " + otherPeerName); - received++; + yacyCore.log.logFine("transferURL: received URL '" + + lEntry.url() + "' from peer " + + otherPeerName); + received++; } } else { yacyCore.log.logWarning("transferURL: received invalid URL from peer " + otherPeerName + diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 2008e6248..3dad23f5b 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -358,6 +358,14 @@ public final class plasmaWordIndex { return removed; } + public synchronized int tryRemoveURLs(String urlHash) { + // this tries to delete an index from the cache that has this + // urlHash assigned. This can only work if the entry is really fresh + // and can be found in the RAM cache + // this returns the number of deletion that had been possible + return ramCache.tryRemoveURLs(urlHash); + } + public static final int RL_RAMCACHE = 0; public static final int RL_FILECACHE = 1; public static final int RL_ASSORTMENTS = 2; diff --git a/source/de/anomic/plasma/plasmaWordIndexCache.java b/source/de/anomic/plasma/plasmaWordIndexCache.java index fc225a76e..3a4f98306 100644 --- a/source/de/anomic/plasma/plasmaWordIndexCache.java +++ b/source/de/anomic/plasma/plasmaWordIndexCache.java @@ -61,8 +61,9 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { // environment constants private static final String indexArrayFileName = "indexDump1.array"; public static final int ramCacheReferenceLimit = 50; - public static final long ramCacheAgeLimit = 60 * 60 * 2 * 1000; // milliseconds; 2 Hours - + public static final long ramCacheMaxAge = 1000 * 60 * 60 * 2; // milliseconds; 2 Hours + public static final long ramCacheMinAge = 1000 * 60 * 2; // milliseconds; 2 Minutes (Karenz for DHT Receive) + // class variables private final File databaseRoot; private final TreeMap cache; @@ -257,12 +258,13 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { String hash = null; int count = hashScore.getMaxScore(); if ((count > ramCacheReferenceLimit) && - ((hash = (String) hashScore.getMaxObject()) != null)) { - // flush high-score entries + ((hash = (String) hashScore.getMaxObject()) != null) && + (System.currentTimeMillis() - longEmit(hashDate.getScore(hash)) > ramCacheMinAge)) { + // flush high-score entries, but not if they are too 'young' return hash; } long oldestTime = longEmit(hashDate.getMinScore()); - if (((System.currentTimeMillis() - oldestTime) > ramCacheAgeLimit) && + if (((System.currentTimeMillis() - oldestTime) > ramCacheMaxAge) && ((hash = (String) hashDate.getMinObject()) != null)) { // flush out-dated entries return hash; @@ -271,6 +273,10 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { if (Runtime.getRuntime().freeMemory() < 10000000) { // low-memory case hash = (String) hashScore.getMaxObject(); // flush high-score entries (saves RAM) + if (System.currentTimeMillis() - longEmit(hashDate.getScore(hash)) < ramCacheMinAge) { + // to young, take it from the oldest entries + hash = (String) hashDate.getMinObject(); + } } else { // not-efficient-so-far case hash = (String) hashDate.getMinObject(); // flush oldest entries (makes indexing faster) @@ -335,6 +341,30 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { return count; } + public synchronized int tryRemoveURLs(String urlHash) { + // this tries to delete an index from the cache that has this + // urlHash assigned. This can only work if the entry is really fresh + // Such entries must be searched in the latest entries + Iterator i = hashDate.scores(false); + String wordHash; + long t; + plasmaWordIndexEntryContainer c; + int delCount = 0; + while (i.hasNext()) { + wordHash = (String) i.next(); + // check time + t = longEmit(hashDate.getScore(wordHash)); + if (System.currentTimeMillis() - t > ramCacheMinAge) return delCount; + // get container + c = (plasmaWordIndexEntryContainer) cache.get(wordHash); + if (c.remove(urlHash) != null) { + cache.put(wordHash, c); + delCount++; + } + } + return delCount; + } + public int addEntries(plasmaWordIndexEntryContainer container, long updateTime, boolean highPriority) { // this puts the entries into the cache, not into the assortment directly