removed assortments from indexing data structures

removed options to switch on assortments

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3041 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 19 years ago
parent 2372b4fe0c
commit 052f28312a
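In short: this commit hard-wires the new collection index. The experimental `useCollectionIndex` switch is deleted, `indexCollectionRI` becomes the only RWI backend, and the assortment cluster plus the old word-file cluster disappear, so a family of constructors loses boolean flags and checked exceptions (see the hunks below). A minimal sketch of the calling-convention change, using hypothetical stand-in types rather than the real YaCy classes:

```java
import java.io.File;

// Hypothetical stand-in for plasmaWordIndex after this commit: no
// useCollectionIndex parameter and no checked IOException at construction.
class WordIndex {
    WordIndex(File oldDatabaseRoot, File newIndexRoot, int bufferkb, long preloadTime) {
        // always opens the collection index; the assortment fallback is gone
    }
}

public class Caller {
    public static void main(String[] args) {
        // before: new plasmaWordIndex(db, idx, true, ram, time, log, useCollectionIndex)  // threw IOException
        // after:  new plasmaWordIndex(db, idx, true, ram, time, log)
        WordIndex wi = new WindexOrNull(); // see note below
    }
    // kept trivial on purpose: the point is only the removed flag and exception
    static WordIndex WindexOrNull() {
        return new WordIndex(new File("DATA/PLASMADB"), new File("DATA/INDEX"), 20000, 10000);
    }
}
```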

@@ -3,11 +3,11 @@ javacSource=1.4
javacTarget=1.4
# Release Configuration
releaseVersion=0.49
#releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
#releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}
releaseDir=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}
releaseVersion=0.491
releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
#releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}
#releaseDir=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}
releaseFileParentDir=yacy
releaseNr=$Revision$

@@ -49,7 +49,6 @@
import java.io.File;
import java.lang.reflect.Method;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import de.anomic.data.translator;

@@ -307,7 +307,6 @@ public class IndexControl_p {
// generate list
if (post.containsKey("keyhashsimilar")) {
try {
final Iterator containerIt = switchboard.wordIndex.indexContainerSet(keyhash, plasmaWordIndex.RL_WORDFILES, true, 256).iterator();
indexContainer container;
int i = 0;
@@ -327,9 +326,6 @@ public class IndexControl_p {
prop.put("keyhashsimilar_rows_"+rows+"_cols", cols);
prop.put("keyhashsimilar_rows", rows + 1);
prop.put("result", "");
} catch (IOException e) {
prop.put("result", "unknown keys: " + e.getMessage());
}
}
if (post.containsKey("urlstringsearch")) {

@@ -166,17 +166,7 @@ public class PerformanceMemory_p {
dfltTotal = 0;
bestTotal = 0;
if (sb.wordIndex.useCollectionIndex) {
prop.put("useRWICache", 0);
} else {
prop.put("useRWICache", 1);
req = sb.wordIndex.size();
chk = sb.wordIndex.assortmentsCacheChunkSizeAvg();
obj = sb.wordIndex.assortmentsCacheObjectSizeAvg();
slt = sb.wordIndex.assortmentsCacheNodeStatus();
ost = sb.wordIndex.assortmentsCacheObjectStatus();
putprop(prop, env, "useRWICache", "RWI", set);
}
prop.put("useRWICache", 0);
req = sb.cacheManager.dbSize();
chk = sb.cacheManager.cacheNodeChunkSize();

@@ -205,25 +205,6 @@
</table>
</form>
<p>
<strong>Index Assortments:</strong>
</p>
<table border="0" cellpadding="5" cellspacing="1">
#{assortmentCluster}#
<tr valign="top" class="TableCellDark">
<td>Assortments #[assortmentSlots]#:</td>
<td align="right">#[assortmentSizeA]#</td>
<td align="right">#[assortmentSizeB]#</td>
<td align="right">#[assortmentSizeC]#</td>
<td align="right">#[assortmentSizeD]#</td>
<td align="right">#[assortmentSizeE]#</td>
<td align="right">#[assortmentSizeF]#</td>
<td align="right">#[assortmentSizeG]#</td>
<td align="right">#[assortmentSizeH]#</td>
</tr>
#{/assortmentCluster}#
</table>
<p>
<strong>Proxy Performance Settings:</strong>
</p>

@@ -280,24 +280,6 @@ public class PerformanceQueues_p {
prop.put("onlineCautionDelay", switchboard.getConfig("onlineCautionDelay", "30000"));
prop.put("onlineCautionDelayCurrent", System.currentTimeMillis() - switchboard.proxyLastAccess);
int[] asizes = switchboard.wordIndex.assortmentsSizes();
if (asizes != null) {
for (int i = 0; i < asizes.length; i += 8) {
prop.put("assortmentCluster_" + (i/8) + "_assortmentSlots", (i + 1) + "-" + (i + 8));
prop.put("assortmentCluster_" + (i/8) + "_assortmentSizeA", asizes[i]);
prop.put("assortmentCluster_" + (i/8) + "_assortmentSizeB", asizes[i + 1]);
prop.put("assortmentCluster_" + (i/8) + "_assortmentSizeC", asizes[i + 2]);
prop.put("assortmentCluster_" + (i/8) + "_assortmentSizeD", asizes[i + 3]);
prop.put("assortmentCluster_" + (i/8) + "_assortmentSizeE", asizes[i + 4]);
prop.put("assortmentCluster_" + (i/8) + "_assortmentSizeF", asizes[i + 5]);
prop.put("assortmentCluster_" + (i/8) + "_assortmentSizeG", asizes[i + 6]);
prop.put("assortmentCluster_" + (i/8) + "_assortmentSizeH", asizes[i + 7]);
}
prop.put("assortmentCluster", asizes.length / 8);
} else {
prop.put("assortmentCluster", 0);
}
// table thread pool settings
GenericKeyedObjectPool.Config crawlerPoolConfig = switchboard.cacheLoader.getPoolConfig();
prop.put("pool_0_name","Crawler Pool");

@@ -448,7 +448,7 @@ public final class indexRAMRI implements indexRI {
public synchronized indexContainer addEntry(String wordHash, indexRWIEntry newEntry, long updateTime, boolean dhtCase) {
indexContainer container = (indexContainer) cache.get(wordHash);
if (container == null) container = new indexContainer(wordHash, this.payloadrow, newEntry instanceof indexRWIEntryNew);
if (container == null) container = new indexContainer(wordHash, this.payloadrow, true);
indexRWIEntry[] entries = new indexRWIEntry[] { newEntry };
if (container.add(entries, updateTime) > 0) {
cache.put(wordHash, container);

@@ -80,6 +80,7 @@ import java.util.TreeSet;
import java.util.logging.Logger;
import de.anomic.server.serverMemory;
import de.anomic.server.logging.serverLog;
public class kelondroRecords {
@@ -1175,7 +1176,13 @@ public class kelondroRecords {
public Node next0() {
// read Objects until a non-deleted Node appears
while (hasNext0()) {
Node nn = next00();
Node nn;
try {
nn = next00();
} catch (IOException e) {
serverLog.logSevere("kelondroRecords", filename + " failed with " + e.getMessage());
return null;
}
byte[] key = nn.getKey();
if ((key == null) ||
((key.length == 1) && (key[0] == (byte) 0x80)) || // the NUL pointer ('lost' chain terminator)
@@ -1193,24 +1200,19 @@ public class kelondroRecords {
return null;
}
public Node next00() {
try {
// see if the next record is in the bulk, and if not re-fill the bulk
if ((pos.index - bulkstart) >= bulksize) {
bulkstart = pos.index;
int maxlength = Math.min(USAGE.allCount() - bulkstart, bulksize);
entryFile.readFully(POS_NODES + bulkstart * recordsize, bulk, 0, maxlength * recordsize);
}
// read node from bulk
Node n = new Node(new Handle(pos.index), bulk, (pos.index - bulkstart) * recordsize);
pos.index++;
while ((markedDeleted.contains(pos)) && (pos.index < USAGE.allCount())) pos.index++;
return n;
} catch (IOException e) {
e.printStackTrace();
throw new kelondroException(filename, e.getMessage());
public Node next00() throws IOException {
// see if the next record is in the bulk, and if not re-fill the bulk
if ((pos.index - bulkstart) >= bulksize) {
bulkstart = pos.index;
int maxlength = Math.min(USAGE.allCount() - bulkstart, bulksize);
entryFile.readFully(POS_NODES + bulkstart * recordsize, bulk, 0, maxlength * recordsize);
}
// read node from bulk
Node n = new Node(new Handle(pos.index), bulk, (pos.index - bulkstart) * recordsize);
pos.index++;
while ((markedDeleted.contains(pos)) && (pos.index < USAGE.allCount())) pos.index++;
return n;
}
public void remove() {
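The kelondroRecords hunk above inverts the exception flow: `next00()` used to swallow the `IOException` and rethrow an unchecked `kelondroException`, while now it declares `throws IOException` and the caller `next0()` logs the failure and ends iteration by returning null. A self-contained toy of that pattern (hypothetical names, not the kelondro API):

```java
import java.io.IOException;

// Toy reduction: next00() now declares IOException instead of wrapping it,
// and next0() logs and returns null so iteration ends gracefully.
public class NodeIteration {
    private int index = 0;

    String next00() throws IOException {
        if (index == 3) throw new IOException("simulated read failure");
        return "node-" + index++;
    }

    String next0() {
        try {
            return next00();
        } catch (IOException e) {
            // serverLog.logSevere(...) in the real code
            System.err.println("iteration aborted: " + e.getMessage());
            return null;
        }
    }

    public static void main(String[] args) {
        NodeIteration it = new NodeIteration();
        String n;
        while ((n = it.next0()) != null) System.out.println(n); // node-0, node-1, node-2
    }
}
```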

@@ -88,7 +88,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
// init noticeUrlDB
this.log.logInfo("Initializing the source noticeUrlDB");
this.importNurlDB = new plasmaCrawlNURL(this.importPath, ((this.cacheSize*3)/4)/1024, preloadTime, false);
this.importNurlDB = new plasmaCrawlNURL(this.importPath, ((this.cacheSize*3)/4)/1024, preloadTime);
this.importStartSize = this.importNurlDB.size();
//int stackSize = this.importNurlDB.stackSize();

@@ -1,7 +1,6 @@
package de.anomic.plasma.dbImport;
import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.TreeSet;
@@ -76,14 +75,10 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
}
this.log.logFine("Initializing source word index db.");
try {
this.importWordIndex = new plasmaWordIndex(this.importPath, this.indexPath, true, (this.cacheSize/2)/1024, preloadTime / 2, this.log, sb.getConfigBool("useCollectionIndex", false));
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
}
this.importWordIndex = new plasmaWordIndex(this.importPath, this.indexPath, true, (this.cacheSize/2)/1024, preloadTime / 2, this.log);
this.log.logFine("Initializing import URL db.");
this.importUrlDB = new plasmaCrawlLURL(this.importPath, this.indexPath, (this.cacheSize/2)/1024, preloadTime / 2, false);
this.importUrlDB = new plasmaCrawlLURL(this.importPath, this.indexPath, (this.cacheSize/2)/1024, preloadTime / 2);
this.importStartSize = this.importWordIndex.size();
}

@@ -46,7 +46,6 @@ package de.anomic.plasma.parser.swf;
import java.io.InputStream;
import de.anomic.net.URL;
import java.util.Hashtable;
import java.util.TreeSet;
import java.util.HashMap;
import pt.tumba.parser.swf.*;
@@ -102,7 +101,7 @@ public class swfParser extends AbstractParser implements Parser {
String longTitle = null;
String[] sections = null;
String abstrct = null;
TreeSet images = null;
//TreeSet images = null;
HashMap anchors = new HashMap();
int urls = 0;
int urlStart = -1;

@@ -150,23 +150,15 @@ public class plasmaCrawlEURL {
// the class object
private kelondroIndex urlIndexFile = null;
public plasmaCrawlEURL(File cachePath, int bufferkb, long preloadTime, boolean newdb) {
public plasmaCrawlEURL(File cachePath, int bufferkb, long preloadTime) {
super();
if (newdb) {
String newCacheName = "urlErr3.table";
cachePath.mkdirs();
try {
urlIndexFile = new kelondroFlexTable(cachePath, newCacheName, bufferkb * 0x400, preloadTime, rowdef, kelondroBase64Order.enhancedCoder);
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
}
} else {
File oldCacheFile = new File(cachePath, "urlErr0.db");
oldCacheFile.getParentFile().mkdirs();
urlIndexFile = kelondroTree.open(oldCacheFile, bufferkb * 0x400, preloadTime, rowdef);
String newCacheName = "urlErr3.table";
cachePath.mkdirs();
try {
urlIndexFile = new kelondroFlexTable(cachePath, newCacheName, bufferkb * 0x400, preloadTime, rowdef, kelondroBase64Order.enhancedCoder);
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
}
}

@@ -66,7 +66,6 @@ import de.anomic.index.indexRWIEntry;
import de.anomic.plasma.plasmaURL;
import de.anomic.index.indexURLEntry;
import de.anomic.index.indexURLEntryNew;
import de.anomic.index.indexURLEntryOld;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroFlexSplitTable;
@@ -93,23 +92,14 @@ public final class plasmaCrawlLURL {
private final LinkedList lcrawlResultStack; // 5 - local index: result of local crawling
private final LinkedList gcrawlResultStack; // 6 - local index: triggered external
private boolean newdb;
// the class object
private kelondroIndex urlIndexFile = null;
public plasmaCrawlLURL(File plasmaPath, File indexPath, int bufferkb, long preloadTime, boolean newdb) {
public plasmaCrawlLURL(File plasmaPath, File indexPath, int bufferkb, long preloadTime) {
super();
this.newdb = newdb;
try {
if (newdb) {
urlIndexFile = new kelondroFlexSplitTable(new File(indexPath, "PUBLIC/TEXT"), "urls", bufferkb * 0x400, preloadTime, indexURLEntryNew.rowdef, kelondroBase64Order.enhancedCoder);
} else {
File oldLURLDB = new File(plasmaPath, "urlHash.db");
oldLURLDB.getParentFile().mkdirs();
urlIndexFile = new kelondroCache(new kelondroTree(oldLURLDB, bufferkb / 2 * 0x400, preloadTime, indexURLEntryOld.rowdef), bufferkb / 2 * 0x400, true, false);
}
urlIndexFile = new kelondroFlexSplitTable(new File(indexPath, "PUBLIC/TEXT"), "urls", bufferkb * 0x400, preloadTime, indexURLEntryNew.rowdef, kelondroBase64Order.enhancedCoder);
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
@@ -216,10 +206,7 @@ public final class plasmaCrawlLURL {
try {
kelondroRow.Entry entry = urlIndexFile.get(urlHash.getBytes());
if (entry == null) return null;
if (newdb)
return new indexURLEntryNew(entry, searchedWord);
else
return new indexURLEntryOld(entry, searchedWord);
return new indexURLEntryNew(entry, searchedWord);
} catch (IOException e) {
return null;
}
@@ -250,10 +237,7 @@ public final class plasmaCrawlLURL {
public synchronized indexURLEntry newEntry(String propStr) {
if (propStr.startsWith("{") && propStr.endsWith("}")) {
if (newdb)
return new indexURLEntryNew(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)));
else
return new indexURLEntryOld(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)));
return new indexURLEntryNew(serverCodings.s2p(propStr.substring(1, propStr.length() - 1)));
} else {
return null;
}
@@ -281,11 +265,7 @@ public final class plasmaCrawlLURL {
int limage,
int lvideo,
int lapp) {
if (newdb)
return new indexURLEntryNew(url, descr, author, tags, ETag, mod, load, fresh, referrer, md5,
size, wc, dt, flags, lang, llocal, lother, laudio, limage, lvideo, lapp);
else
return new indexURLEntryOld(url, descr, author, tags, ETag, mod, load, fresh, referrer, md5,
return new indexURLEntryNew(url, descr, author, tags, ETag, mod, load, fresh, referrer, md5,
size, wc, dt, flags, lang, llocal, lother, laudio, limage, lvideo, lapp);
}
@@ -411,14 +391,7 @@ public final class plasmaCrawlLURL {
public Object next() throws RuntimeException {
kelondroRow.Entry e = (kelondroRow.Entry) i.next();
if (e == null) return null;
try {
if (newdb)
return new indexURLEntryNew(e, null);
else
return new indexURLEntryOld(e, null);
} catch (IOException ex) {
throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + e.getColString(0, null));
}
return new indexURLEntryNew(e, null);
}
public void remove() {
@@ -610,7 +583,7 @@ public final class plasmaCrawlLURL {
} catch (MalformedURLException e) {}
if (args[0].equals("-l")) try {
// arg 1 is path to URLCache
final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), new File(args[2]), 1, 0, false);
final plasmaCrawlLURL urls = new plasmaCrawlLURL(new File(args[1]), new File(args[2]), 1, 0);
final Iterator enu = urls.entries(true, false, null);
while (enu.hasNext()) {
System.out.println(((indexURLEntry) enu.next()).toString());
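All of the plasmaCrawlLURL factories above follow one recipe: drop the `newdb` field and construct `indexURLEntryNew` unconditionally, which also lets the row iterator shed its IOException wrapper. A toy reduction of that branch removal, with hypothetical types standing in for the real entry classes:

```java
// Hypothetical reduction: the newdb branch is gone, only the new entry
// type is ever constructed.
interface UrlEntry {}

class NewUrlEntry implements UrlEntry {
    private final String props;
    NewUrlEntry(String props) { this.props = props; }
    @Override public String toString() { return "entry(" + props + ")"; }
}

public class EntryFactory {
    // before: if (newdb) return new NewUrlEntry(p); else return new OldUrlEntry(p);
    static UrlEntry newEntry(String propStr) {
        if (propStr.startsWith("{") && propStr.endsWith("}")) {
            return new NewUrlEntry(propStr.substring(1, propStr.length() - 1));
        }
        return null;
    }

    public static void main(String[] args) {
        System.out.println(newEntry("{url=example}")); // entry(url=example)
    }
}
```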

@@ -106,13 +106,12 @@ public class plasmaCrawlNURL {
private File cacheStacksPath;
private int bufferkb;
private long preloadTime;
private boolean newdb;
initStackIndex initThead;
// the class object
private kelondroIndex urlIndexFile = null;
public plasmaCrawlNURL(File cachePath, int bufferkb, long preloadTime, boolean newdb) {
public plasmaCrawlNURL(File cachePath, int bufferkb, long preloadTime) {
super();
this.cacheStacksPath = cachePath;
this.bufferkb = bufferkb;
@@ -120,7 +119,6 @@ public class plasmaCrawlNURL {
// create a stack for newly entered entries
if (!(cachePath.exists())) cachePath.mkdir(); // make the path
this.newdb = newdb;
openHashCache();
File coreStackFile = new File(cachePath, "urlNoticeLocal0.stack");
@@ -195,24 +193,13 @@ public class plasmaCrawlNURL {
}
private void openHashCache() {
if (newdb) {
String newCacheName = "urlNotice5.table";
cacheStacksPath.mkdirs();
try {
urlIndexFile = new kelondroCache(new kelondroFlexTable(cacheStacksPath, newCacheName, bufferkb / 2 * 0x400, preloadTime, rowdef, kelondroBase64Order.enhancedCoder), bufferkb / 2 * 0x400, true, false);
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
}
} else {
File oldCacheFile = new File(cacheStacksPath, "urlNotice2.db");
oldCacheFile.getParentFile().mkdirs();
try {
urlIndexFile = new kelondroCache(kelondroTree.open(oldCacheFile, bufferkb / 2 * 0x400, preloadTime, rowdef), bufferkb / 2 * 0x400, true, true);
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
}
String newCacheName = "urlNotice5.table";
cacheStacksPath.mkdirs();
try {
urlIndexFile = new kelondroCache(new kelondroFlexTable(cacheStacksPath, newCacheName, bufferkb / 2 * 0x400, preloadTime, rowdef, kelondroBase64Order.enhancedCoder), bufferkb / 2 * 0x400, true, false);
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
}
}

@@ -41,7 +41,6 @@
package de.anomic.plasma;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
@@ -278,12 +277,6 @@ public class plasmaDHTChunk {
urlCache = new HashMap();
this.status = chunkStatus_FAILED;
return 0;
} catch (IOException e) {
log.logSevere("selectTransferIndexes database corrupted: " + e.getMessage(), e);
indexContainers = new indexContainer[0];
urlCache = new HashMap();
this.status = chunkStatus_FAILED;
return 0;
}
}

@@ -237,7 +237,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public dbImportManager dbImportManager;
public plasmaDHTFlush transferIdxThread = null;
private plasmaDHTChunk dhtTransferChunk = null;
private boolean newIndex;
/*
* Remote Proxy configuration
@@ -431,17 +430,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// start indexing management
log.logConfig("Starting Indexing Management");
urlPool = new plasmaURLPool(plasmaPath, indexPath,
ramLURL, getConfigBool("useFlexTableForLURL", false),
ramNURL, getConfigBool("useFlexTableForNURL", false),
ramEURL, getConfigBool("useFlexTableForEURL", true),
ramLURL,
ramNURL,
ramEURL,
ramLURL_time);
newIndex = getConfigBool("useCollectionIndex", false);
try {
wordIndex = new plasmaWordIndex(plasmaPath, indexPath, true, ramRWI, ramRWI_time, log, newIndex);
} catch (IOException e1) {
e1.printStackTrace();
System.exit(-1);
}
wordIndex = new plasmaWordIndex(plasmaPath, indexPath, true, ramRWI, ramRWI_time, log);
// set a high maximum cache size to current size; this is adopted later automatically
int wordCacheMaxCount = Math.max((int) getConfigLong("wordCacheInitCount", 30000),

@@ -59,13 +59,13 @@ public class plasmaURLPool {
public final plasmaCrawlEURL errorURL;
public plasmaURLPool(File plasmaPath, File indexPath,
int ramLURL, boolean newLURL,
int ramNURL, boolean newNURL,
int ramEURL, boolean newEURL,
int ramLURL,
int ramNURL,
int ramEURL,
long preloadTime) {
loadedURL = new plasmaCrawlLURL(plasmaPath, indexPath, ramLURL, preloadTime, newLURL);
noticeURL = new plasmaCrawlNURL(plasmaPath, ramNURL, -1, newNURL);
errorURL = new plasmaCrawlEURL(plasmaPath, ramEURL, -1, newEURL);
loadedURL = new plasmaCrawlLURL(plasmaPath, indexPath, ramLURL, preloadTime);
noticeURL = new plasmaCrawlNURL(plasmaPath, ramNURL, -1);
errorURL = new plasmaCrawlEURL(plasmaPath, ramEURL, -1);
}
public String exists(String hash) {
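plasmaURLPool shrinks accordingly: the three `new*` booleans vanish from the constructor, and the yacy.java call sites near the end of this diff lose three arguments each. A compile-checkable toy mirroring the new shape (stand-in class, not the real pool):

```java
import java.io.File;

// Toy pool mirroring the new plasmaURLPool signature (flag parameters removed).
public class PoolCaller {
    static class UrlPool {
        UrlPool(File plasmaPath, File indexPath, int ramLURL, int ramNURL, int ramEURL, long preloadTime) {
            // would open loadedURL, noticeURL and errorURL here
        }
    }

    public static void main(String[] args) {
        File root = new File(".");
        // before: new plasmaURLPool(plasma, index, 16000, false, 1000, false, 1000, false, 10000)
        UrlPool pool = new UrlPool(new File(root, "DATA/PLASMADB"),
                new File(root, "DATA/INDEX"), 16000, 1000, 1000, 10000);
        System.out.println("pool created: " + pool);
    }
}
```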

@@ -59,8 +59,6 @@ import de.anomic.yacy.yacyDHTAction;
public final class plasmaWordIndex implements indexRI {
private static final String indexAssortmentClusterPath = "ACLUSTER";
private static final int assortmentCount = 64;
private static final kelondroRow payloadrowold = indexRWIEntryOld.urlEntryRow;
private static final kelondroRow payloadrownew = indexRWIEntryNew.urlEntryRow;
@@ -68,50 +66,28 @@
private final kelondroOrder indexOrder = kelondroBase64Order.enhancedCoder;
private final indexRAMRI dhtOutCache, dhtInCache;
private final indexCollectionRI collections; // new database structure to replace AssortmentCluster and FileCluster
private int assortmentBufferSize; // kb
private final plasmaWordIndexAssortmentCluster assortmentCluster; // old database structure, to be replaced by CollectionRI
private final plasmaWordIndexFileCluster backend; // old database structure, to be replaced by CollectionRI
public boolean busyCacheFlush; // shows if a cache flush is currently performed
public boolean useCollectionIndex; // flag for usage of new collectionIndex db
private int idleDivisor, busyDivisor;
public plasmaWordIndex(File oldDatabaseRoot, File newIndexRoot, boolean dummy, int bufferkb, long preloadTime, serverLog log, boolean useCollectionIndex) throws IOException {
public plasmaWordIndex(File oldDatabaseRoot, File newIndexRoot, boolean dummy, int bufferkb, long preloadTime, serverLog log) {
this.oldDatabaseRoot = oldDatabaseRoot;
this.backend = new plasmaWordIndexFileCluster(oldDatabaseRoot, payloadrowold, log);
File textindexcache = new File(newIndexRoot, "PUBLIC/TEXT/RICACHE");
if (!(textindexcache.exists())) textindexcache.mkdirs();
if (useCollectionIndex) {
this.dhtOutCache = new indexRAMRI(textindexcache, payloadrownew, 1024, "dump1.array", log, true);
this.dhtInCache = new indexRAMRI(textindexcache, payloadrownew, 1024, "dump2.array", log, true);
} else {
this.dhtOutCache = new indexRAMRI(oldDatabaseRoot, payloadrowold, 64, "indexDump1.array", log, false);
this.dhtInCache = new indexRAMRI(oldDatabaseRoot, payloadrowold, 64, "indexDump2.array", log, false);
}
// create assortment cluster path
File assortmentClusterPath = new File(oldDatabaseRoot, indexAssortmentClusterPath);
this.assortmentBufferSize = bufferkb;
this.dhtOutCache = new indexRAMRI(textindexcache, payloadrownew, 1024, "dump1.array", log, true);
this.dhtInCache = new indexRAMRI(textindexcache, payloadrownew, 1024, "dump2.array", log, true);
// create collections storage path
File textindexcollections = new File(newIndexRoot, "PUBLIC/TEXT/RICOLLECTION");
if (!(textindexcollections.exists())) textindexcollections.mkdirs();
if (useCollectionIndex) {
this.collections = new indexCollectionRI(textindexcollections, "collection", bufferkb * 1024, preloadTime, payloadrownew);
this.assortmentCluster = null;
} else {
this.collections = null;
if (!(assortmentClusterPath.exists())) assortmentClusterPath.mkdirs();
this.assortmentCluster = new plasmaWordIndexAssortmentCluster(assortmentClusterPath, assortmentCount, payloadrowold, assortmentBufferSize, preloadTime, log);
}
this.collections = new indexCollectionRI(textindexcollections, "collection", bufferkb * 1024, preloadTime, payloadrownew);
busyCacheFlush = false;
this.useCollectionIndex = useCollectionIndex;
this.busyDivisor = 5000;
this.idleDivisor = 420;
}
public kelondroRow payloadrow() {
if (useCollectionIndex) return payloadrownew; else return payloadrowold;
return payloadrownew;
}
public indexRWIEntry newRWIEntry(
@@ -135,14 +111,9 @@ public final class plasmaWordIndex implements indexRI {
int outlinksSame,
int outlinksOther,
kelondroBitfield flags ) {
if (useCollectionIndex)
return new indexRWIEntryNew(urlHash, urlLength, urlComps, titleLength, hitcount, wordcount, phrasecount,
return new indexRWIEntryNew(urlHash, urlLength, urlComps, titleLength, hitcount, wordcount, phrasecount,
posintext, posinphrase, posofphrase, worddistance, sizeOfPage, lastmodified, updatetime, quality, language, doctype,
outlinksSame, outlinksOther, flags);
else
return new indexRWIEntryOld(urlHash, urlLength, urlComps, titleLength, hitcount, wordcount, phrasecount,
posintext, posinphrase, posofphrase, worddistance, sizeOfPage, lastmodified, updatetime, quality, language, doctype,
outlinksSame, outlinksOther, false);
}
public File getRoot() {
@@ -181,28 +152,6 @@ public final class plasmaWordIndex implements indexRI {
return dhtInCache.size();
}
public int[] assortmentsSizes() {
return (assortmentCluster == null) ? null : assortmentCluster.sizes();
}
public int assortmentsCacheChunkSizeAvg() {
return (assortmentCluster == null) ? 0 : assortmentCluster.cacheChunkSizeAvg();
}
public int assortmentsCacheObjectSizeAvg() {
return (assortmentCluster == null) ? 0 : assortmentCluster.cacheObjectSizeAvg();
}
public int[] assortmentsCacheNodeStatus() {
if (assortmentCluster != null) return assortmentCluster.cacheNodeStatus();
return new int[]{0,0,0,0,0,0,0,0,0,0};
}
public long[] assortmentsCacheObjectStatus() {
if (assortmentCluster != null) return assortmentCluster.cacheObjectStatus();
return new long[]{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
}
public void setMaxWordCount(int maxWords) {
dhtOutCache.setMaxWordCount(maxWords);
}
@@ -235,11 +184,11 @@ public final class plasmaWordIndex implements indexRI {
}
public indexContainer emptyContainer(String wordHash) {
return new indexContainer(wordHash, payloadrow(), useCollectionIndex);
return new indexContainer(wordHash, payloadrow(), true);
}
public indexContainer addEntry(String wordHash, indexRWIEntry entry, long updateTime, boolean dhtInCase) {
if ((useCollectionIndex) && (entry instanceof indexRWIEntryOld)) {
if (entry instanceof indexRWIEntryOld) {
if (entry.urlHash() == null) return null;
entry = new indexRWIEntryNew((indexRWIEntryOld) entry);
}
@@ -259,7 +208,7 @@ public final class plasmaWordIndex implements indexRI {
private indexContainer convertOld2New(indexContainer entries) {
// convert old entries to new entries
indexContainer newentries = new indexContainer(entries.getWordHash(), payloadrownew, useCollectionIndex);
indexContainer newentries = new indexContainer(entries.getWordHash(), payloadrownew, true);
Iterator i = entries.entries();
indexRWIEntryOld old;
while (i.hasNext()) {
@@ -272,7 +221,7 @@ public final class plasmaWordIndex implements indexRI {
}
public indexContainer addEntries(indexContainer entries, long updateTime, boolean dhtInCase) {
if ((useCollectionIndex) && (entries.row().objectsize() == payloadrowold.objectsize())) entries = convertOld2New(entries);
if (entries.row().objectsize() == payloadrowold.objectsize()) entries = convertOld2New(entries);
// set dhtInCase depending on wordHash
if ((!dhtInCase) && (yacyDHTAction.shallBeOwnWord(entries.getWordHash()))) dhtInCase = true;
@@ -297,7 +246,7 @@ public final class plasmaWordIndex implements indexRI {
if (flushCount > 100) flushCount = 100;
if (flushCount < 1) flushCount = Math.min(1, ram.size());
flushCache(ram, flushCount);
while (ram.maxURLinCache() > ((useCollectionIndex) ? 1024 : 64)) flushCache(ram, 1);
while (ram.maxURLinCache() >= 2040) flushCache(ram, 1);
}
private void flushCache(indexRAMRI ram, int count) {
@@ -315,16 +264,9 @@ public final class plasmaWordIndex implements indexRI {
// flush the wordHash
indexContainer c = ram.deleteContainer(wordHash);
if (c != null) {
if (useCollectionIndex) {
indexContainer feedback = collections.addEntries(c, c.updated(), false);
if (feedback != null) {
throw new RuntimeException("indexCollectionRI shall not return feedback entries; feedback = " + feedback.toString());
}
} else {
indexContainer feedback = assortmentCluster.addEntries(c, c.updated(), false);
if (feedback != null) {
backend.addEntries(feedback, System.currentTimeMillis(), true);
}
indexContainer feedback = collections.addEntries(c, c.updated(), false);
if (feedback != null) {
throw new RuntimeException("indexCollectionRI shall not return feedback entries; feedback = " + feedback.toString());
}
}
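With only one backend left, the flush path above simplifies to: flush a bounded batch into `collections`, treat any feedback container as a hard error, and keep flushing single containers while the largest per-word cache entry stays at or above the fixed bound shown in the hunk (replacing the flag-dependent 1024/64 bound). A toy version of that threshold loop, with a stand-in cache holding the largest entry first:

```java
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.List;

// Toy version of the flush-threshold loop; the control flow mirrors the
// hunk above, the cache itself is a stand-in.
public class FlushDemo {
    static Deque<Integer> cache = new ArrayDeque<>(List.of(3000, 2500, 1900, 800));

    static int maxURLinCache() { return cache.isEmpty() ? 0 : cache.peek(); }

    static void flushCache(int count) {
        for (int i = 0; i < count && !cache.isEmpty(); i++) cache.poll();
    }

    public static void main(String[] args) {
        int flushCount = cache.size() / 10;
        if (flushCount > 100) flushCount = 100;
        if (flushCount < 1) flushCount = Math.min(1, cache.size());
        flushCache(flushCount);
        while (maxURLinCache() >= 2040) flushCache(1); // fixed bound, no useCollectionIndex branch
        System.out.println("largest remaining entry: " + maxURLinCache()); // 1900
    }
}
```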
@@ -413,7 +355,6 @@ public final class plasmaWordIndex implements indexRI {
}
public indexContainer getContainer(String wordHash, Set urlselection, boolean deleteIfEmpty, long maxTime) {
long start = System.currentTimeMillis();
// get from cache
indexContainer container = dhtOutCache.getContainer(wordHash, urlselection, true, -1);
@@ -424,33 +365,10 @@ public final class plasmaWordIndex implements indexRI {
}
// get from collection index
if (useCollectionIndex) {
if (container == null) {
container = collections.getContainer(wordHash, urlselection, true, (maxTime < 0) ? -1 : maxTime);
} else {
container.add(collections.getContainer(wordHash, urlselection, true, (maxTime < 0) ? -1 : maxTime), -1);
}
if (container == null) {
container = collections.getContainer(wordHash, urlselection, true, (maxTime < 0) ? -1 : maxTime);
} else {
// get from assortments
if (assortmentCluster != null) {
if (container == null) {
container = assortmentCluster.getContainer(wordHash, urlselection, true, (maxTime < 0) ? -1 : maxTime);
} else {
// add containers from assortment cluster
container.add(assortmentCluster.getContainer(wordHash, urlselection, true, (maxTime < 0) ? -1 : maxTime), -1);
}
}
// get from backend
if (maxTime > 0) {
maxTime = maxTime - (System.currentTimeMillis() - start);
if (maxTime < 0) maxTime = 100;
}
if (container == null) {
container = backend.getContainer(wordHash, urlselection, deleteIfEmpty, (maxTime < 0) ? -1 : maxTime);
} else {
container.add(backend.getContainer(wordHash, urlselection, deleteIfEmpty, (maxTime < 0) ? -1 : maxTime), -1);
}
container.add(collections.getContainer(wordHash, urlselection, true, (maxTime < 0) ? -1 : maxTime), -1);
}
return container;
}
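After this change, `getContainer` merges the RAM-cache hit with the collections backend only; the old cascade through assortments and the file backend (with its shrinking time budget) is gone. A toy null-safe merge mirroring the surviving logic:

```java
import java.util.List;
import java.util.Set;
import java.util.TreeSet;

// Toy null-safe merge mirroring the simplified getContainer(): adopt the
// collections result if the RAM cache produced nothing, otherwise merge.
public class MergeDemo {
    public static void main(String[] args) {
        Set<String> container = new TreeSet<>(List.of("urlA", "urlB"));       // from dhtOutCache; may be null in the real code
        Set<String> fromCollections = new TreeSet<>(List.of("urlB", "urlC")); // from collections
        if (container == null) {
            container = fromCollections;
        } else {
            container.addAll(fromCollections); // indexContainer.add(..., -1) in the real code
        }
        System.out.println(container); // [urlA, urlB, urlC]
    }
}
```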
@@ -486,28 +404,14 @@ public final class plasmaWordIndex implements indexRI {
}
public int size() {
if (useCollectionIndex)
return java.lang.Math.max(collections.size(), java.lang.Math.max(dhtInCache.size(), dhtOutCache.size()));
else
return java.lang.Math.max((assortmentCluster == null) ? 0 : assortmentCluster.size(),
java.lang.Math.max(backend.size(),
java.lang.Math.max(dhtInCache.size(), dhtOutCache.size())));
return java.lang.Math.max(collections.size(), java.lang.Math.max(dhtInCache.size(), dhtOutCache.size()));
}
public int indexSize(String wordHash) {
int size = 0;
size += dhtInCache.indexSize(wordHash);
size += dhtOutCache.indexSize(wordHash);
if (useCollectionIndex) {
size += collections.indexSize(wordHash);
} else try {
size += (assortmentCluster == null) ? 0 : assortmentCluster.indexSize(wordHash);
plasmaWordIndexFile entity = backend.getEntity(wordHash, true, -1);
if (entity != null) {
size += entity.size();
entity.close();
}
} catch (IOException e) {}
size += collections.indexSize(wordHash);
return size;
}
@@ -515,25 +419,15 @@ public final class plasmaWordIndex implements indexRI {
synchronized (this) {
dhtInCache.close(waitingBoundSeconds);
dhtOutCache.close(waitingBoundSeconds);
if (useCollectionIndex) {
collections.close(-1);
} else {
if (assortmentCluster != null) assortmentCluster.close(-1);
backend.close(10);
}
collections.close(-1);
}
}
public indexContainer deleteContainer(String wordHash) {
indexContainer c = new indexContainer(wordHash, payloadrow(), useCollectionIndex);
indexContainer c = new indexContainer(wordHash, payloadrow(), true);
c.add(dhtInCache.deleteContainer(wordHash), -1);
c.add(dhtOutCache.deleteContainer(wordHash), -1);
if (useCollectionIndex) {
c.add(collections.deleteContainer(wordHash), -1);
} else {
if (assortmentCluster != null) c.add(assortmentCluster.deleteContainer(wordHash), -1);
c.add(backend.deleteContainer(wordHash), -1);
}
c.add(collections.deleteContainer(wordHash), -1);
return c;
}
@@ -541,12 +435,7 @@ public final class plasmaWordIndex implements indexRI {
boolean removed = false;
removed = removed | (dhtInCache.removeEntry(wordHash, urlHash, deleteComplete));
removed = removed | (dhtOutCache.removeEntry(wordHash, urlHash, deleteComplete));
if (useCollectionIndex) {
removed = removed | (collections.removeEntry(wordHash, urlHash, deleteComplete));
} else {
if (assortmentCluster != null) removed = removed | (assortmentCluster.removeEntry(wordHash, urlHash, deleteComplete));
removed = removed | backend.removeEntry(wordHash, urlHash, deleteComplete);
}
removed = removed | (collections.removeEntry(wordHash, urlHash, deleteComplete));
return removed;
}
@@ -554,12 +443,7 @@ public final class plasmaWordIndex implements indexRI {
int removed = 0;
removed += dhtInCache.removeEntries(wordHash, urlHashes, deleteComplete);
removed += dhtOutCache.removeEntries(wordHash, urlHashes, deleteComplete);
if (useCollectionIndex) {
removed += collections.removeEntries(wordHash, urlHashes, deleteComplete);
} else if (assortmentCluster != null) {
removed += assortmentCluster.removeEntries(wordHash, urlHashes, deleteComplete);
removed += backend.removeEntries(wordHash, urlHashes, deleteComplete);
}
removed += collections.removeEntries(wordHash, urlHashes, deleteComplete);
return removed;
}
@@ -567,12 +451,7 @@ public final class plasmaWordIndex implements indexRI {
String removed = "";
removed += dhtInCache.removeEntries(wordHash, urlHashes, deleteComplete) + ", ";
removed += dhtOutCache.removeEntries(wordHash, urlHashes, deleteComplete) + ", ";
if (useCollectionIndex) {
removed += collections.removeEntries(wordHash, urlHashes, deleteComplete);
} else {
if (assortmentCluster != null) removed += assortmentCluster.removeEntries(wordHash, urlHashes, deleteComplete) + ", ";
removed += backend.removeEntries(wordHash, urlHashes, deleteComplete);
}
removed += collections.removeEntries(wordHash, urlHashes, deleteComplete);
return removed;
}
@@ -589,7 +468,7 @@ public final class plasmaWordIndex implements indexRI {
return dhtInCache.tryRemoveURLs(urlHash) | dhtOutCache.tryRemoveURLs(urlHash);
}
public TreeSet indexContainerSet(String startHash, int resourceLevel, boolean rot, int count) throws IOException {
public TreeSet indexContainerSet(String startHash, int resourceLevel, boolean rot, int count) {
// creates a set of indexContainers
// this does not use the dhtInCache
kelondroOrder containerOrder = new indexContainerOrder((kelondroOrder) indexOrder.clone());
@@ -610,62 +489,33 @@ public final class plasmaWordIndex implements indexRI {
public Iterator wordContainers(String startHash, boolean rot) {
// returns an iteration of indexContainers
try {
return wordContainers(startHash, RL_WORDFILES, rot);
} catch (IOException e) {
return new HashSet().iterator();
}
return wordContainers(startHash, RL_WORDFILES, rot);
}
public Iterator wordContainers(String startHash, int resourceLevel, boolean rot) throws IOException {
public Iterator wordContainers(String startHash, int resourceLevel, boolean rot) {
if (rot) return new rotatingContainerIterator(startHash, resourceLevel);
else return wordContainers(startHash, resourceLevel);
}
private Iterator wordContainers(String startWordHash, int resourceLevel) throws IOException {
private Iterator wordContainers(String startWordHash, int resourceLevel) {
kelondroOrder containerOrder = new indexContainerOrder((kelondroOrder) indexOrder.clone());
containerOrder.rotate(startWordHash.getBytes());
if (resourceLevel == plasmaWordIndex.RL_RAMCACHE) {
return dhtOutCache.wordContainers(startWordHash, false);
}
if (useCollectionIndex) {
return new kelondroMergeIterator(
return new kelondroMergeIterator(
dhtOutCache.wordContainers(startWordHash, false),
collections.wordContainers(startWordHash, false),
containerOrder,
indexContainer.containerMergeMethod,
true);
} else {
if (resourceLevel == plasmaWordIndex.RL_ASSORTMENTS) {
return new kelondroMergeIterator(
dhtOutCache.wordContainers(startWordHash, false),
(assortmentCluster == null) ? null : assortmentCluster.wordContainers(startWordHash, true, false),
containerOrder,
indexContainer.containerMergeMethod,
true);
}
if (resourceLevel == plasmaWordIndex.RL_WORDFILES) {
return new kelondroMergeIterator(
new kelondroMergeIterator(
dhtOutCache.wordContainers(startWordHash, false),
(assortmentCluster == null) ? null : assortmentCluster.wordContainers(startWordHash, true, false),
containerOrder,
indexContainer.containerMergeMethod,
true),
backend.wordContainers(startWordHash, false),
containerOrder,
indexContainer.containerMergeMethod,
true);
}
}
return null;
}
public class rotatingContainerIterator implements Iterator {
Iterator i;
int resourceLevel;
public rotatingContainerIterator(String startWordHash, int resourceLevel) throws IOException {
public rotatingContainerIterator(String startWordHash, int resourceLevel) {
this.resourceLevel = resourceLevel;
i = wordContainers(startWordHash, resourceLevel);
}
@@ -676,11 +526,9 @@ public final class plasmaWordIndex implements indexRI {
public boolean hasNext() {
if (i.hasNext()) return true;
else try {
else {
i = wordContainers("------------", resourceLevel);
return i.hasNext();
} catch (IOException e) {
return false;
}
}
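Since `wordContainers` no longer throws IOException, the rotating iterator's `hasNext()` above loses its try/catch and simply restarts the iteration from the lowest hash. A self-contained toy with the same wrap-around behaviour:

```java
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;

// Toy rotating iterator: when the underlying iteration is exhausted it
// restarts from the lowest key, like rotatingContainerIterator after the
// try/catch was dropped.
public class RotatingIterator implements Iterator<String> {
    private final TreeSet<String> hashes;
    private Iterator<String> i;

    RotatingIterator(TreeSet<String> hashes, String startHash) {
        this.hashes = hashes;
        this.i = hashes.tailSet(startHash).iterator();
    }

    @Override public boolean hasNext() {
        if (i.hasNext()) return true;
        i = hashes.iterator(); // restart at "------------" in the original
        return i.hasNext();
    }

    @Override public String next() { return i.next(); }

    public static void main(String[] args) {
        TreeSet<String> t = new TreeSet<>(List.of("aaa", "mmm", "zzz"));
        Iterator<String> it = new RotatingIterator(t, "nnn");
        for (int n = 0; n < 4 && it.hasNext(); n++) System.out.println(it.next()); // zzz, aaa, mmm, zzz
    }
}
```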
@@ -693,57 +541,6 @@ public final class plasmaWordIndex implements indexRI {
}
} // class rotatingContainerIterator
public Object migrateWords2Assortment(String wordhash) throws IOException {
// returns the number of entries that had been added to the assortments
// can be negative if some assortments have been moved to the backend
File db = plasmaWordIndexFile.wordHash2path(oldDatabaseRoot, wordhash);
if (!(db.exists())) return "not available";
plasmaWordIndexFile entity = null;
try {
entity = new plasmaWordIndexFile(oldDatabaseRoot, wordhash, true);
int size = entity.size();
if (size > assortmentCluster.clusterCapacity) {
// this will be too big to integrate it
entity.close(); entity = null;
return "too big";
} else {
// take out all words from the assortment to see if it fits
// together with the extracted assortment
indexContainer container = assortmentCluster.deleteContainer(wordhash, -1);
if (size + container.size() > assortmentCluster.clusterCapacity) {
// this will also be too big to integrate, add to entity
entity.addEntries(container);
entity.close(); entity = null;
return new Integer(-container.size());
} else {
// the combined container will fit, read the container
try {
Iterator entries = entity.elements(true);
indexRWIEntry entry;
while (entries.hasNext()) {
entry = (indexRWIEntry) entries.next();
// System.out.println("ENTRY = " + entry.getUrlHash());
container.add(new indexRWIEntry[]{entry}, System.currentTimeMillis());
}
// we have read all elements, now delete the entity
entity.deleteComplete();
entity.close(); entity = null;
// integrate the container into the assortments; this will work
assortmentCluster.addEntries(container, container.updated(), false);
return new Integer(size);
} catch (kelondroException e) {
// database corrupted, we simply give up the database and delete it
try {entity.close();} catch (Exception ee) {} entity = null;
try {db.delete();} catch (Exception ee) {}
return "database corrupted; deleted";
}
}
}
} finally {
if (entity != null) try {entity.close();}catch(Exception e){}
}
}
public Object migrateWords2index(String wordhash) throws IOException {
// returns the number of entries that had been added to the assortments
// can be negative if some assortments have been moved to the backend
@@ -753,7 +550,7 @@ public final class plasmaWordIndex implements indexRI {
try {
entity = new plasmaWordIndexFile(oldDatabaseRoot, wordhash, true);
int size = entity.size();
indexContainer container = new indexContainer(wordhash, payloadrow(), useCollectionIndex);
indexContainer container = new indexContainer(wordhash, payloadrow(), true);
try {
Iterator entries = entity.elements(true);
@@ -812,48 +609,43 @@ public final class plasmaWordIndex implements indexRI {
indexRWIEntry entry = null;
URL url = null;
HashSet urlHashs = new HashSet();
try {
Iterator indexContainerIterator = indexContainerSet(startHash, plasmaWordIndex.RL_WORDFILES, false, 100).iterator();
while (indexContainerIterator.hasNext() && run) {
Iterator indexContainerIterator = indexContainerSet(startHash, plasmaWordIndex.RL_WORDFILES, false, 100).iterator();
while (indexContainerIterator.hasNext() && run) {
waiter();
container = (indexContainer) indexContainerIterator.next();
Iterator containerIterator = container.entries();
wordHashNow = container.getWordHash();
while (containerIterator.hasNext() && run) {
waiter();
container = (indexContainer) indexContainerIterator.next();
Iterator containerIterator = container.entries();
wordHashNow = container.getWordHash();
while (containerIterator.hasNext() && run) {
waiter();
entry = (indexRWIEntry) containerIterator.next();
// System.out.println("Wordhash: "+wordHash+" UrlHash: "+entry.getUrlHash());
indexURLEntry ue = lurl.load(entry.urlHash(), null);
if (ue == null) {
entry = (indexRWIEntry) containerIterator.next();
// System.out.println("Wordhash: "+wordHash+" UrlHash:
// "+entry.getUrlHash());
indexURLEntry ue = lurl.load(entry.urlHash(), null);
if (ue == null) {
urlHashs.add(entry.urlHash());
} else {
url = ue.comp().url();
if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, url) == true)) {
urlHashs.add(entry.urlHash());
} else {
url = ue.comp().url();
if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, url) == true)) {
urlHashs.add(entry.urlHash());
}
}
}
if (urlHashs.size() > 0) {
int removed = removeEntries(container.getWordHash(), urlHashs, true);
serverLog.logFine("INDEXCLEANER", container.getWordHash() + ": " + removed + " of " + container.size() + " URL-entries deleted");
lastWordHash = container.getWordHash();
lastDeletionCounter = urlHashs.size();
urlHashs.clear();
}
if (!containerIterator.hasNext()) {
// We may not be finished yet, try to get the next chunk of wordHashes
TreeSet containers = indexContainerSet(container.getWordHash(), plasmaWordIndex.RL_WORDFILES, false, 100);
}
if (urlHashs.size() > 0) {
int removed = removeEntries(container.getWordHash(), urlHashs, true);
serverLog.logFine("INDEXCLEANER", container.getWordHash() + ": " + removed + " of " + container.size() + " URL-entries deleted");
lastWordHash = container.getWordHash();
lastDeletionCounter = urlHashs.size();
urlHashs.clear();
}
if (!containerIterator.hasNext()) {
// We may not be finished yet, try to get the next chunk of wordHashes
TreeSet containers = indexContainerSet(container.getWordHash(), plasmaWordIndex.RL_WORDFILES, false, 100);
indexContainerIterator = containers.iterator();
// Make sure we don't get the same wordhash twice, but don't skip a word
if ((indexContainerIterator.hasNext()) && (!container.getWordHash().equals(((indexContainer) indexContainerIterator.next()).getWordHash()))) {
indexContainerIterator = containers.iterator();
// Make sure we don't get the same wordhash twice, but don't skip a word
if ((indexContainerIterator.hasNext())&&(!container.getWordHash().equals(((indexContainer) indexContainerIterator.next()).getWordHash()))) {
indexContainerIterator = containers.iterator();
}
}
}
} catch (IOException e) {
serverLog.logSevere("INDEXCLEANER",
"IndexCleaner-Thread: unable to start: "
+ e.getMessage());
}
serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread stopped");
}
@@ -903,16 +695,11 @@ public final class plasmaWordIndex implements indexRI {
// System.out.println(new Date(reverseMicroDateDays(microDateDays(System.currentTimeMillis()))));
File plasmadb = new File("D:\\dev\\proxy\\DATA\\PLASMADB");
File indexdb = new File("D:\\dev\\proxy\\DATA\\INDEX");
try {
plasmaWordIndex index = new plasmaWordIndex(plasmadb, indexdb, true, 555, 1000, new serverLog("TESTAPP"), false);
Iterator containerIter = index.wordContainers("5A8yhZMh_Kmv", plasmaWordIndex.RL_WORDFILES, true);
while (containerIter.hasNext()) {
System.out.println("File: " + (indexContainer) containerIter.next());
}
} catch (IOException e) {
e.printStackTrace();
plasmaWordIndex index = new plasmaWordIndex(plasmadb, indexdb, true, 555, 1000, new serverLog("TESTAPP"));
Iterator containerIter = index.wordContainers("5A8yhZMh_Kmv", plasmaWordIndex.RL_WORDFILES, true);
while (containerIter.hasNext()) {
System.out.println("File: " + (indexContainer) containerIter.next());
}
}
}

@@ -68,7 +68,6 @@ import org.apache.axis.attachments.Attachments;
import org.w3c.dom.Document;
import de.anomic.data.listManager;
import de.anomic.http.httpd;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
@@ -501,11 +500,13 @@ public class BlacklistService extends AbstractService {
};
}
/* not used
private String[] getSharedBlacklistArray() {
String sharedBlacklists = this.switchboard.getConfig(BLACKLIST_SHARED, "");
String[] supportedBlacklistTypeArray = sharedBlacklists.split(",");
return supportedBlacklistTypeArray;
}
*/
private File getBlacklistFile(String blacklistName) {
File blacklistFile = new File(listManager.listsPath, blacklistName);
@@ -517,11 +518,13 @@ public class BlacklistService extends AbstractService {
return blacklistFile.exists();
}
/* not used
private HashSet getSharedBlacklistSet() {
HashSet supportedTypesSet = new HashSet(Arrays.asList(getSharedBlacklistArray()));
return supportedTypesSet;
}
*/
private String[] getSupportedBlacklistTypeArray() {
String supportedBlacklistTypesStr = this.switchboard.getConfig(BLACKLISTS_TYPES, "");
String[] supportedBlacklistTypeArray = supportedBlacklistTypesStr.split(",");
@@ -555,10 +558,12 @@ public class BlacklistService extends AbstractService {
listManager.listsPath = new File(listManager.switchboard.getRootPath(),listManager.switchboard.getConfig(LIST_MANAGER_LISTS_PATH, "DATA/LISTS"));
}
/* not used
private void ativateBlacklistForAllTypes(String blacklistName) {
String[] supportedBlacklistTypes = getSupportedBlacklistTypeArray();
this.activateBlacklistForTypes(blacklistName,supportedBlacklistTypes);
}
*/
private void activateBlacklistForTypes(String blacklistName, String[] activateForBlacklistTypes) {
if (activateForBlacklistTypes == null) return;

@@ -552,7 +552,7 @@ public final class yacyClient {
}
// add the url entry to the word indexes
for (int m = 0; m < words; m++) {
if ((wordIndex.useCollectionIndex) && (entry instanceof indexRWIEntryOld)) {
if (entry instanceof indexRWIEntryOld) {
if (entry.urlHash() == null) continue;
entry = new indexRWIEntryNew((indexRWIEntryOld) entry);
}

@@ -95,8 +95,6 @@ import de.anomic.server.serverCore;
import de.anomic.server.serverDate;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverMemory;
import de.anomic.server.serverPlainSwitch;
import de.anomic.server.serverSwitch;
import de.anomic.server.serverSystem;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.enumerateFiles;
@@ -652,20 +650,13 @@
*/
public static void migrateWords(String homePath) {
// run with "java -classpath classes yacy -migratewords"
final serverSwitch sps = new serverPlainSwitch(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf");
try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {}
File dbroot = new File(new File(homePath), "DATA/PLASMADB");
File indexRoot = new File(new File(homePath), "DATA/INDEX");
serverLog log = new serverLog("WORDMIGRATION");
log.logInfo("STARTING MIGRATION");
boolean useCollectionIndex = sps.getConfigBool("useCollectionIndex", false);
plasmaWordIndex wordIndexCache = null;
try {
wordIndexCache = new plasmaWordIndex(dbroot, indexRoot, true, 20000, 10000, log, useCollectionIndex);
} catch (IOException e1) {
e1.printStackTrace();
System.exit(-1);
}
wordIndexCache = new plasmaWordIndex(dbroot, indexRoot, true, 20000, 10000, log);
enumerateFiles words = new enumerateFiles(new File(dbroot, "WORDS"), true, false, true, true);
String wordhash;
File wordfile;
@@ -675,10 +666,7 @@ public final class yacy {
wordfile = (File) words.nextElement();
wordhash = wordfile.getName().substring(0, 12);
// System.out.println("NOW: " + wordhash);
if (useCollectionIndex)
migrationStatus = wordIndexCache.migrateWords2index(wordhash);
else
migrationStatus = wordIndexCache.migrateWords2Assortment(wordhash);
migrationStatus = wordIndexCache.migrateWords2index(wordhash);
if (migrationStatus instanceof Integer) {
int migrationCount = ((Integer) migrationStatus).intValue();
if (migrationCount == 0)
@@ -704,7 +692,6 @@
*/
public static void minimizeUrlDB(String homePath, int dbcache) {
// run with "java -classpath classes yacy -minimizeUrlDB"
final serverSwitch sps = new serverPlainSwitch(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf");
try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {}
File plasmaroot = new File(new File(homePath), "DATA/PLASMADB");
File indexRoot = new File(new File(homePath), "DATA/INDEX");
@@ -715,16 +702,16 @@
// db containing all currently loades urls
int cache = dbcache * 1024; // in KB
log.logFine("URLDB-Caches: "+cache+" bytes");
plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(plasmaroot, indexRoot, cache, 10000, false);
plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(plasmaroot, indexRoot, cache, 10000);
// db used to hold all neede urls
plasmaCrawlLURL minimizedUrlDB = new plasmaCrawlLURL(new File(plasmaroot, "minimized"), indexRoot, cache, 10000, false);
plasmaCrawlLURL minimizedUrlDB = new plasmaCrawlLURL(new File(plasmaroot, "minimized"), indexRoot, cache, 10000);
Runtime rt = Runtime.getRuntime();
int cacheMem = (int)((serverMemory.max-rt.totalMemory())/1024)-(2*cache + 8*1024);
if (cacheMem < 2048) throw new OutOfMemoryError("Not enough memory available to start clean up.");
plasmaWordIndex wordIndex = new plasmaWordIndex(plasmaroot, indexRoot, true, cacheMem, 10000, log, sps.getConfigBool("useCollectionIndex", false));
plasmaWordIndex wordIndex = new plasmaWordIndex(plasmaroot, indexRoot, true, cacheMem, 10000, log);
Iterator indexContainerIterator = wordIndex.wordContainers("------------", plasmaWordIndex.RL_WORDFILES, false);
long urlCounter = 0, wordCounter = 0;
@@ -954,7 +941,7 @@ public final class yacy {
File root = new File(homePath);
try {
plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), new File(root, "DATA/INDEX"), 16000, false, 1000, false, 1000, false, 10000);
plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), new File(root, "DATA/INDEX"), 16000, 1000, 1000, 10000);
HashMap doms = new HashMap();
System.out.println("Started domain list extraction from " + pool.loadedURL.size() + " url entries.");
System.out.println("a dump will be written after double-check of all extracted domains.");
@@ -1070,7 +1057,7 @@ public final class yacy {
private static void urllist(String homePath, String source, boolean html, String targetName) {
File root = new File(homePath);
try {
plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), new File(root, "DATA/INDEX"), 16000, false, 1000, false, 1000, false, 10000);
plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), new File(root, "DATA/INDEX"), 16000, 1000, 1000, 10000);
File file = new File(root, targetName);
BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file));
@@ -1131,7 +1118,7 @@ public final class yacy {
}
private static void migratelurls(File root, File urlHash) {
plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), new File(root, "DATA/INDEX"), 16000, true, 1000, true, 1000, true, 10000);
plasmaURLPool pool = new plasmaURLPool(new File(root, "DATA/PLASMADB"), new File(root, "DATA/INDEX"), 16000, 1000, 1000, 10000);
kelondroTree oldindex = null;
try {
oldindex = new kelondroTree(urlHash, 1000, -1, indexURLEntryOld.rowdef);
@@ -1211,7 +1198,7 @@ public final class yacy {
serverLog log = new serverLog("URLDBCLEANUP");
try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {}
try {
plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(plasmaroot, indexroot, 4194304, 10000, false);
plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(plasmaroot, indexroot, 4194304, 10000);
currentUrlDB.urldbcleanup();
currentUrlDB.close();
} catch (IOException e) {
@@ -1222,7 +1209,6 @@ public final class yacy {
private static void RWIHashList(String homePath, String targetName, String resource, String format) {
plasmaWordIndex WordIndex = null;
serverLog log = new serverLog("HASHLIST");
final serverSwitch sps = new serverPlainSwitch(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf");
File homeDBroot = new File(new File(homePath), "DATA/PLASMADB");
File indexRoot = new File(new File(homePath), "DATA/INDEX");
String wordChunkStartHash = "------------";
@@ -1232,7 +1218,7 @@
try {
Iterator indexContainerIterator = null;
if (resource.equals("all")) {
WordIndex = new plasmaWordIndex(homeDBroot, indexRoot, true, 8*1024*1024, 3000, log, sps.getConfigBool("useCollectionIndex", false));
WordIndex = new plasmaWordIndex(homeDBroot, indexRoot, true, 8*1024*1024, 3000, log);
indexContainerIterator = WordIndex.wordContainers(wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false);
} else if (resource.equals("assortments")) {
plasmaWordIndexAssortmentCluster assortmentCluster = new plasmaWordIndexAssortmentCluster(new File(homeDBroot, "ACLUSTER"), 64, indexRWIEntryOld.urlEntryRow, 16*1024*1024, 3000, log);

@@ -818,10 +818,6 @@ currentSkin=
# temporary flag for new database structure. set only true for testing
# ALL DATA THAT IS CREATED WITH THIS FLAG ON WILL BE VOID IN A FINAL VERSION
# table-types: RAM = 0, TREE = 1, FLEX = 2;
useCollectionIndex=true
useFlexTableForNURL=true
useFlexTableForEURL=true
useFlexTableForLURL=true
tableTypeForPreNURL=2
# flag to show surftipps on index.html page
