diff --git a/source/de/anomic/kelondro/kelondroCollectionIndex.java b/source/de/anomic/kelondro/kelondroCollectionIndex.java index 6406d1178..27f7918e5 100644 --- a/source/de/anomic/kelondro/kelondroCollectionIndex.java +++ b/source/de/anomic/kelondro/kelondroCollectionIndex.java @@ -599,7 +599,6 @@ public class kelondroCollectionIndex { int newPartitionNumber; while ((newPartitionNumber = arrayIndex(oldcollection.size())) > maxPartitions) { kelondroRowSet newcollection = shrinkCollection(key, oldcollection, arrayCapacity(maxPartitions)); - saveCommons(key, oldcollection); oldcollection = newcollection; } @@ -714,7 +713,6 @@ public class kelondroCollectionIndex { int newPartitionNumber; while ((newPartitionNumber = arrayIndex(oldcollection.size())) > maxPartitions) { kelondroRowSet newcollection = shrinkCollection(key, oldcollection, arrayCapacity(maxPartitions)); - saveCommons(key, oldcollection); oldcollection = newcollection; } @@ -747,6 +745,7 @@ public class kelondroCollectionIndex { } private kelondroRowSet shrinkCollection(byte[] key, kelondroRowSet collection, int targetSize) { + //TODO Remove timing before release // removes entries from collection // the removed entries are stored in a 'commons' dump file @@ -754,23 +753,32 @@ public class kelondroCollectionIndex { int oldsize = collection.size(); kelondroRowSet survival = new kelondroRowSet(collection.rowdef, 0); if (oldsize <= targetSize) return survival; + long sadd1 = 0, srem1 = 0, sadd2 = 0, srem2 = 0, tot1 = 0, tot2 = 0; + long t1 = 0, t2 = 0; // delete some entries, which are bad rated Iterator i = collection.rows(); kelondroRow.Entry entry; byte[] ref; + t1 = System.currentTimeMillis(); while (i.hasNext()) { entry = (kelondroRow.Entry) i.next(); ref = entry.getColBytes(0); if ((ref.length == 12) && (yacyURL.probablyRootURL(new String(ref)))) { + t2 = System.currentTimeMillis(); survival.addUnique(entry); + sadd1 += System.currentTimeMillis() - t2; + t2 = System.currentTimeMillis(); i.remove(); + 
srem1 += System.currentTimeMillis() - t2; } } int firstSurvival = survival.size(); + tot1 = System.currentTimeMillis() - t1; // check if we shrinked enough Random rand = new Random(System.currentTimeMillis()); + t1 = System.currentTimeMillis(); while (survival.size() > targetSize) { // now delete randomly more entries from the survival collection i = survival.rows(); @@ -778,13 +786,22 @@ public class kelondroCollectionIndex { entry = (kelondroRow.Entry) i.next(); ref = entry.getColBytes(0); if (rand.nextInt() % 4 != 0) { + t2 = System.currentTimeMillis(); collection.addUnique(entry); + sadd2 += System.currentTimeMillis() - t2; + t2 = System.currentTimeMillis(); i.remove(); + srem2 += System.currentTimeMillis() - t2; } } } + tot2 = System.currentTimeMillis() - t1; + serverLog.logFine("kelondroCollectionIndex", "tot= "+tot1+'/'+tot2+" # add/rem(1)= "+sadd1+'/'+srem1+" # add/rem(2)= "+sadd2+'/'+srem2); serverLog.logInfo("kelondroCollectionIndex", "shrinked common word " + new String(key) + "; old size = " + oldsize + ", new size = " + collection.size() + ", maximum size = " + targetSize + ", survival size = " + survival.size() + ", first survival = " + firstSurvival); + + //finally dump the removed entries to a file + saveCommons(key, collection); return survival; } diff --git a/source/de/anomic/kelondro/kelondroRowCollection.java b/source/de/anomic/kelondro/kelondroRowCollection.java index 5445e71fa..86ec84448 100644 --- a/source/de/anomic/kelondro/kelondroRowCollection.java +++ b/source/de/anomic/kelondro/kelondroRowCollection.java @@ -363,26 +363,39 @@ public class kelondroRowCollection { chunkcount += c.size(); } - protected synchronized final void removeRow(int p) { + /** + * This method removes the entry at position p ensuring the order of the remaining + * entries if specified by keepOrder. + * Note: Keeping the order is expensive. 
If you want to remove more than one element in + * a batch with this method, it'd be better to do the removes without order keeping and do + * the sort after all the removes are done. + * + * @param p element at this position will be removed + * @param keepOrder keep the order of remaining entries + */ + protected synchronized final void removeRow(int p, boolean keepOrder) { assert p >= 0 : "p = " + p; assert p < chunkcount : "p = " + p + ", chunkcount = " + chunkcount; assert chunkcount > 0 : "chunkcount = " + chunkcount; assert sortBound <= chunkcount : "sortBound = " + sortBound + ", chunkcount = " + chunkcount; - if (p < sortBound) { - // remove by shift - System.arraycopy( - chunkcache, (p + 1) * this.rowdef.objectsize(), + if (keepOrder && (p < sortBound)) { + // remove by shift (quite expensive for big collections) + System.arraycopy( + chunkcache, (p + 1) * this.rowdef.objectsize(), chunkcache, p * this.rowdef.objectsize(), (chunkcount - p - 1) * this.rowdef.objectsize()); sortBound--; } else { - // remove by copying the top-element to the remove position - if (p != chunkcount - 1) { - System.arraycopy( - chunkcache, (chunkcount - 1) * this.rowdef.objectsize(), - chunkcache, p * this.rowdef.objectsize(), - this.rowdef.objectsize()); - } + // remove by copying the top-element to the remove position + if (p != chunkcount - 1) { + System.arraycopy( + chunkcache, (chunkcount - 1) * this.rowdef.objectsize(), + chunkcache, p * this.rowdef.objectsize(), + this.rowdef.objectsize()); + } + // we moved the last element to the remove position: (p+1)st element + // only the first p elements keep their order + if (sortBound > p) sortBound = p; } chunkcount--; this.lastTimeWrote = System.currentTimeMillis(); @@ -414,6 +427,12 @@ public class kelondroRowCollection { return new rowIterator(); } + /** + * Iterator for kelondroRowCollection. + * It supports remove() though it doesn't preserve the order of the underlying + * collection during removes. 
+ * + */ public class rowIterator implements Iterator { private int p; @@ -432,7 +451,7 @@ public class kelondroRowCollection { public void remove() { p--; - removeRow(p); + removeRow(p, false); } } @@ -562,7 +581,7 @@ public class kelondroRowCollection { //System.out.println("ENTRY0: " + serverLog.arrayList(chunkcache, rowdef.objectsize*i, rowdef.objectsize)); //System.out.println("ENTRY1: " + serverLog.arrayList(chunkcache, rowdef.objectsize*(i+1), rowdef.objectsize)); if (compare(i, i + 1) == 0) { - removeRow(i); // this decreases the chunkcount + removeRow(i, true); // this decreases the chunkcount } else { i++; } diff --git a/source/de/anomic/kelondro/kelondroRowSet.java b/source/de/anomic/kelondro/kelondroRowSet.java index 838839e29..487b32314 100644 --- a/source/de/anomic/kelondro/kelondroRowSet.java +++ b/source/de/anomic/kelondro/kelondroRowSet.java @@ -124,7 +124,7 @@ public class kelondroRowSet extends kelondroRowCollection implements kelondroInd if (index < 0) return null; //System.out.println("remove: chunk found at index position (before remove) " + index + ", inset=" + serverLog.arrayList(super.chunkcache, super.rowdef.objectsize() * index, length + 10) + ", searchkey=" + serverLog.arrayList(a, start, length)); kelondroRow.Entry entry = super.get(index); - super.removeRow(index); + super.removeRow(index, false); //System.out.println("remove: chunk found at index position (after remove) " + index + ", inset=" + serverLog.arrayList(super.chunkcache, super.rowdef.objectsize() * index, length) + ", searchkey=" + serverLog.arrayList(a, start, length)); int findagainindex = find(a, start, length); //System.out.println("kelondroRowSet.remove");