diff --git a/htroot/IndexImport_p.java b/htroot/IndexImport_p.java index 1f6737588..0b5551d4e 100644 --- a/htroot/IndexImport_p.java +++ b/htroot/IndexImport_p.java @@ -97,7 +97,7 @@ public final class IndexImport_p { if (startImport) { dbImporter importerThread = switchboard.dbImportManager.getNewImporter(importType); if (importerThread != null) { - importerThread.init(new File(importPath), cacheSize, 100); + importerThread.init(new File(importPath), switchboard.indexPublicTextPath, cacheSize, 100); importerThread.startIt(); } prop.put("LOCATION",""); diff --git a/source/de/anomic/index/indexCollectionRI.java b/source/de/anomic/index/indexCollectionRI.java index 4616997c4..e28b6ff9f 100644 --- a/source/de/anomic/index/indexCollectionRI.java +++ b/source/de/anomic/index/indexCollectionRI.java @@ -38,18 +38,27 @@ import de.anomic.kelondro.kelondroOutOfLimitsException; import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRowCollection; import de.anomic.kelondro.kelondroRowSet; +import de.anomic.server.logging.serverLog; public class indexCollectionRI extends indexAbstractRI implements indexRI { kelondroCollectionIndex collectionIndex; - public indexCollectionRI(File path, String filenameStub, long buffersize, long preloadTime) throws IOException { - kelondroRow rowdef = new kelondroRow(new int[]{}); - - collectionIndex = new kelondroCollectionIndex( - path, filenameStub, 9 /*keyLength*/, - kelondroNaturalOrder.naturalOrder, buffersize, preloadTime, - 4 /*loadfactor*/, rowdef); + public indexCollectionRI(File path, String filenameStub, long buffersize, long preloadTime) { + kelondroRow rowdef = indexURLEntry.urlEntryRow; + try { + collectionIndex = new kelondroCollectionIndex( + path, + filenameStub, + 12 /*keyLength*/, + kelondroNaturalOrder.naturalOrder, + buffersize, + preloadTime, + 4 /*loadfactor*/, + rowdef); + } catch (IOException e) { + serverLog.logSevere("PLASMA", "unable to open collection index at " + path.toString() + ":" + e.getMessage()); + } } public int size() { @@ -133,7 +142,7 @@ public class indexCollectionRI extends indexAbstractRI implements indexRI { String wordHash = newEntries.getWordHash(); try { collectionIndex.merge(wordHash.getBytes(), (kelondroRowCollection) newEntries); - return getContainer(wordHash, true, -1); // FIXME: this is not optimal + return null; // merge does allways 'eat' up all entries unlike the assortments; they may return an overflow container } catch (kelondroOutOfLimitsException e) { e.printStackTrace(); return null; diff --git a/source/de/anomic/kelondro/kelondroCollectionIndex.java b/source/de/anomic/kelondro/kelondroCollectionIndex.java index 27c7c65e3..3ca5f3993 100644 --- a/source/de/anomic/kelondro/kelondroCollectionIndex.java +++ b/source/de/anomic/kelondro/kelondroCollectionIndex.java @@ -55,7 +55,7 @@ public class kelondroCollectionIndex { "int chunksize-4 {b256}," + "int chunkcount-4 {b256}," + "int indexpos-4 {b256}," + - "short lastread-2 {b256}" + + "short lastread-2 {b256}, " + "short lastwrote-2 {b256}" ); } @@ -157,7 +157,7 @@ public class kelondroCollectionIndex { private int putmergeremove(byte[] key, kelondroRowCollection collection, boolean merge, Set removekeys, boolean deletecomplete) throws IOException, kelondroOutOfLimitsException { //if (collection.size() > maxChunks) throw new kelondroOutOfLimitsException(maxChunks, collection.size()); - if ((!merge) && (collection.size() == 0)) { + if ((!merge) && (removekeys != null) && (collection != null) && (collection.size() == 0)) { // this is not a replacement, it is a deletion delete(key); return 0; diff --git a/source/de/anomic/plasma/dbImport/AbstractImporter.java b/source/de/anomic/plasma/dbImport/AbstractImporter.java index ccbac802a..823f6642a 100644 --- a/source/de/anomic/plasma/dbImport/AbstractImporter.java +++ b/source/de/anomic/plasma/dbImport/AbstractImporter.java @@ -14,7 +14,7 @@ public abstract class AbstractImporter extends Thread implements dbImporter{ protected boolean paused = false; protected plasmaSwitchboard sb; - protected File importPath; + protected File importPath, indexPath; protected int cacheSize; protected long preloadTime; @@ -33,9 +33,10 @@ public abstract class AbstractImporter extends Thread implements dbImporter{ return this.error; } - public void init(File theImportPath) { + public void init(File theImportPath, File theIndexPath) { if (theImportPath == null) throw new NullPointerException("The Import path must not be null."); - this.importPath = theImportPath; + this.importPath = theImportPath; + this.indexPath = theIndexPath; // getting a job id from the import manager this.jobID = this.sb.dbImportManager.getJobID(); diff --git a/source/de/anomic/plasma/dbImport/AssortmentImporter.java b/source/de/anomic/plasma/dbImport/AssortmentImporter.java index 54f1b8fc7..1969b5e46 100644 --- a/source/de/anomic/plasma/dbImport/AssortmentImporter.java +++ b/source/de/anomic/plasma/dbImport/AssortmentImporter.java @@ -21,8 +21,8 @@ public class AssortmentImporter extends AbstractImporter implements dbImporter{ this.jobType = "ASSORTMENT"; } - public void init(File theImportAssortmentFile, int theCacheSize, long preloadTime) { - super.init(theImportAssortmentFile); + public void init(File theImportAssortmentFile, File theIndexFile, int theCacheSize, long preloadTime) { + super.init(theImportAssortmentFile, theIndexFile); this.importAssortmentFile = theImportAssortmentFile; this.cacheSize = theCacheSize; if (this.cacheSize < 2*1024*1024) this.cacheSize = 2*1024*1024; diff --git a/source/de/anomic/plasma/dbImport/dbImporter.java b/source/de/anomic/plasma/dbImport/dbImporter.java index 0002b2780..81fe9de94 100644 --- a/source/de/anomic/plasma/dbImport/dbImporter.java +++ b/source/de/anomic/plasma/dbImport/dbImporter.java @@ -24,6 +24,6 @@ public interface dbImporter { public String getError(); public String getStatus(); - public void init(File importPath, int cacheSize, long preloadTime); + public void init(File importPath, File indexPath, int cacheSize, long preloadTime); public void startIt(); } diff --git a/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java b/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java index e3368dc6f..258956b47 100644 --- a/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java +++ b/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java @@ -45,8 +45,8 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor return theStatus.toString(); } - public void init(File theImportPath, int theCacheSize, long preloadTime) { - super.init(theImportPath); + public void init(File theImportPath, File theIndexPath, int theCacheSize, long preloadTime) { + super.init(theImportPath, theIndexPath); this.cacheSize = theCacheSize; this.preloadTime = preloadTime; diff --git a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java index f5b3394f4..36819d94d 100644 --- a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java +++ b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java @@ -51,9 +51,9 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter { return theStatus.toString(); } - public void init(File theImportPath, int theCacheSize, long preloadTime) { - super.init(theImportPath); - + public void init(File theImportPath, File theIndexPath, int theCacheSize, long preloadTime) { + super.init(theImportPath, theIndexPath); + this.homeWordIndex = this.sb.wordIndex; this.homeUrlDB = this.sb.urlPool.loadedURL; this.cacheSize = theCacheSize; @@ -75,7 +75,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter { } this.log.logFine("Initializing source word index db."); - this.importWordIndex = new plasmaWordIndex(this.importPath, (this.cacheSize/2)/1024, preloadTime / 2, this.log); + this.importWordIndex = new plasmaWordIndex(this.importPath, this.indexPath, (this.cacheSize/2)/1024, preloadTime / 2, this.log); this.log.logFine("Initializing import URL db."); this.importUrlDB = new plasmaCrawlLURL(new File(this.importPath, "urlHash.db"), (this.cacheSize/2)/1024, preloadTime / 2); this.importStartSize = this.importWordIndex.size(); diff --git a/source/de/anomic/plasma/plasmaDHTChunk.java b/source/de/anomic/plasma/plasmaDHTChunk.java index 68fcbfafc..d32357d5f 100644 --- a/source/de/anomic/plasma/plasmaDHTChunk.java +++ b/source/de/anomic/plasma/plasmaDHTChunk.java @@ -243,7 +243,7 @@ public class plasmaDHTChunk { } // create result indexContainers = (indexContainer[]) tmpContainers.toArray(new indexContainer[tmpContainers.size()]); - +//[C[16GwGuFzwffp] has 1 entries, C[16hGKMAl0w97] has 9 entries, C[17A8cDPF6SfG] has 9 entries, C[17Kdj__WWnUy] has 1 entries, C[1 if ((indexContainers == null) || (indexContainers.length == 0)) { log.logFine("No index available for index transfer, hash start-point " + startPointHash); this.status = chunkStatus_FAILED; @@ -269,7 +269,7 @@ public class plasmaDHTChunk { } - public int deleteTransferIndexes() { + public synchronized int deleteTransferIndexes() { Iterator urlIter; indexEntry iEntry; HashSet urlHashes; @@ -277,6 +277,10 @@ public class plasmaDHTChunk { for (int i = 0; i < this.indexContainers.length; i++) { // delete entries separately + if (this.indexContainers[i] == null) { + log.logFine("Deletion of partial index #" + i + " not possible, entry is null"); + continue; + } int c = this.indexContainers[i].size(); urlHashes = new HashSet(this.indexContainers[i].size()); urlIter = this.indexContainers[i].entries(); diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 565fe6dfa..c58fa3144 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -180,6 +180,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // storage management public File htCachePath; private File plasmaPath; + public File indexPublicTextPath; public File listsPath; public File htDocsPath; public File rankingPath; @@ -260,6 +261,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // load values from configs this.plasmaPath = new File(rootPath, getConfig("dbPath", "DATA/PLASMADB")); this.log.logConfig("Plasma DB Path: " + this.plasmaPath.toString()); + this.indexPublicTextPath = new File(rootPath, getConfig("indexPublicTextPath", "DATA/INDEX/PUBLIC/TEXT")); + this.log.logConfig("Index Path: " + this.indexPublicTextPath.toString()); this.listsPath = new File(rootPath, getConfig("listsPath", "DATA/LISTS")); this.log.logConfig("Lists Path: " + this.listsPath.toString()); this.htDocsPath = new File(rootPath, getConfig("htDocsPath", "DATA/HTDOCS")); @@ -386,7 +389,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser log.logConfig("Starting Indexing Management"); urlPool = new plasmaURLPool(plasmaPath, ramLURL, ramNURL, ramEURL, ramLURL_time); - wordIndex = new plasmaWordIndex(plasmaPath, ramRWI, ramRWI_time, log); + wordIndex = new plasmaWordIndex(plasmaPath, indexPublicTextPath, ramRWI, ramRWI_time, log); int wordCacheMaxCount = (int) getConfigLong("wordCacheMaxCount", 10000); wordIndex.setMaxWordCount(wordCacheMaxCount); diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 3a61ee628..96f2c021d 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -58,6 +58,7 @@ import java.util.TreeSet; import de.anomic.net.URL; import de.anomic.htmlFilter.htmlFilterContentScraper; +import de.anomic.index.indexCollectionRI; import de.anomic.index.indexContainer; import de.anomic.index.indexContainerOrder; import de.anomic.index.indexEntry; @@ -78,28 +79,37 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { private static final String indexAssortmentClusterPath = "ACLUSTER"; private static final int assortmentCount = 64; + private static final boolean useCollectionIndex = false; - private final File databaseRoot; - private final indexRAMCacheRI ramCache; - private final plasmaWordIndexAssortmentCluster assortmentCluster; - private int assortmentBufferSize; //kb - private final plasmaWordIndexFileCluster backend; - private final kelondroOrder indexOrder = new kelondroNaturalOrder(true); + private final File oldDatabaseRoot; + private final kelondroOrder indexOrder = new kelondroNaturalOrder(true); + private final indexRAMCacheRI ramCache; + private final indexCollectionRI collections; // new database structure to replace AssortmentCluster and FileCluster + private int assortmentBufferSize; // kb + private final plasmaWordIndexAssortmentCluster assortmentCluster; // old database structure, to be replaced by CollectionRI + private final plasmaWordIndexFileCluster backend; // old database structure, to be replaced by CollectionRI - public plasmaWordIndex(File databaseRoot, int bufferkb, long preloadTime, serverLog log) { - this.databaseRoot = databaseRoot; - this.backend = new plasmaWordIndexFileCluster(databaseRoot, log); - this.ramCache = new indexRAMCacheRI(databaseRoot, log); + public plasmaWordIndex(File oldDatabaseRoot, File newIndexRoot, int bufferkb, long preloadTime, serverLog log) { + this.oldDatabaseRoot = oldDatabaseRoot; + this.backend = new plasmaWordIndexFileCluster(oldDatabaseRoot, log); + this.ramCache = new indexRAMCacheRI(oldDatabaseRoot, log); - // create new assortment cluster path - File assortmentClusterPath = new File(databaseRoot, indexAssortmentClusterPath); + // create assortment cluster path + File assortmentClusterPath = new File(oldDatabaseRoot, indexAssortmentClusterPath); if (!(assortmentClusterPath.exists())) assortmentClusterPath.mkdirs(); this.assortmentBufferSize = bufferkb; this.assortmentCluster = new plasmaWordIndexAssortmentCluster(assortmentClusterPath, assortmentCount, assortmentBufferSize, preloadTime, log); + + // create collections storage path + if (!(newIndexRoot.exists())) newIndexRoot.mkdirs(); + if (useCollectionIndex) + collections = new indexCollectionRI(newIndexRoot, "test_generation0", bufferkb * 1024, preloadTime); + else + collections = null; } public File getRoot() { - return databaseRoot; + return oldDatabaseRoot; } public int maxURLinWCache() { @@ -203,9 +213,16 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { private synchronized void flushCache(String wordHash) { indexContainer c = ramCache.deleteContainer(wordHash); if (c != null) { - indexContainer feedback = assortmentCluster.addEntries(c, c.updated(), false); - if (feedback != null) { - backend.addEntries(feedback, System.currentTimeMillis(), true); + if (useCollectionIndex) { + indexContainer feedback = collections.addEntries(c, c.updated(), false); + if (feedback != null) { + throw new RuntimeException("indexCollectionRI shall not return feedback entries; feedback = " + feedback.toString()); + } + } else { + indexContainer feedback = assortmentCluster.addEntries(c, c.updated(), false); + if (feedback != null) { + backend.addEntries(feedback, System.currentTimeMillis(), true); + } } } } @@ -292,15 +309,25 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { // get from cache indexContainer container = ramCache.getContainer(wordHash, true, -1); + // We must not use the container from cache to store everything we find, + // as that container remains linked to in the cache and might be changed later + // while the returned container is still in use. + // create a clone from the container + if (container != null) container = container.topLevelClone(); + + // get from collection index + if (useCollectionIndex) { + if (container == null) { + container = collections.getContainer(wordHash, true, (maxTime < 0) ? -1 : maxTime); + } else { + container.add(collections.getContainer(wordHash, true, (maxTime < 0) ? -1 : maxTime), -1); + } + } + // get from assortments if (container == null) { container = assortmentCluster.getContainer(wordHash, true, (maxTime < 0) ? -1 : maxTime); } else { - // We must not use the container from cache to store everything we find, - // as that container remains linked to in the cache and might be changed later - // while the returned container is still in use. - // create a clone from the container - container = container.topLevelClone(); // add containers from assortment cluster container.add(assortmentCluster.getContainer(wordHash, true, (maxTime < 0) ? -1 : maxTime), -1); } @@ -357,6 +384,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { entity.close(); } } catch (IOException e) {} + if (useCollectionIndex) size += collections.size(); size += assortmentCluster.indexSize(wordHash); size += ramCache.indexSize(wordHash); return size; @@ -364,6 +392,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { public synchronized void close(int waitingBoundSeconds) { ramCache.close(waitingBoundSeconds); + if (useCollectionIndex) collections.close(-1); assortmentCluster.close(-1); backend.close(10); } @@ -371,7 +400,8 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { public synchronized indexContainer deleteContainer(String wordHash) { indexContainer c = ramCache.deleteContainer(wordHash); if (c == null) c = new indexRowSetContainer(wordHash); - c.add(assortmentCluster.deleteContainer(wordHash, -1), -1); + if (useCollectionIndex) c.add(collections.deleteContainer(wordHash), -1); + c.add(assortmentCluster.deleteContainer(wordHash), -1); c.add(backend.deleteContainer(wordHash), -1); return c; } @@ -379,6 +409,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete) { synchronized (this) { if (ramCache.removeEntry(wordHash, urlHash, deleteComplete)) return true; + if (useCollectionIndex) {if (collections.removeEntry(wordHash, urlHash, deleteComplete)) return true;} if (assortmentCluster.removeEntry(wordHash, urlHash, deleteComplete)) return true; return backend.removeEntry(wordHash, urlHash, deleteComplete); } @@ -389,6 +420,10 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { synchronized (this) { removed += ramCache.removeEntries(wordHash, urlHashes, deleteComplete); if (removed == urlHashes.size()) return removed; + if (useCollectionIndex) { + removed += collections.removeEntries(wordHash, urlHashes, deleteComplete); + if (removed == urlHashes.size()) return removed; + } removed += assortmentCluster.removeEntries(wordHash, urlHashes, deleteComplete); if (removed == urlHashes.size()) return removed; removed += backend.removeEntries(wordHash, urlHashes, deleteComplete); @@ -405,9 +440,9 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { } public static final int RL_RAMCACHE = 0; - public static final int RL_COLLECTIONS = 1; // the 'new' index structure - public static final int RL_ASSORTMENTS = 2; - public static final int RL_WORDFILES = 3; + public static final int RL_COLLECTIONS = 1; // the new index structure + public static final int RL_ASSORTMENTS = 2; // (to be) outdated structure + public static final int RL_WORDFILES = 3; // (to be) outdated structure public synchronized TreeSet indexContainerSet(String startHash, int resourceLevel, boolean rot, int count) throws IOException { @@ -446,16 +481,56 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { if (resourceLevel == plasmaWordIndex.RL_RAMCACHE) { return ramCache.wordContainers(startWordHash, false); } - if (resourceLevel == plasmaWordIndex.RL_ASSORTMENTS) { + if ((resourceLevel == plasmaWordIndex.RL_COLLECTIONS) && (useCollectionIndex)) { return new kelondroMergeIterator( + ramCache.wordContainers(startWordHash, false), + collections.wordContainers(startWordHash, false), + new indexContainerOrder(kelondroNaturalOrder.naturalOrder), + indexRowSetContainer.containerMergeMethod, + true); + } + if (resourceLevel == plasmaWordIndex.RL_ASSORTMENTS) { + if (useCollectionIndex) { + return new kelondroMergeIterator( + new kelondroMergeIterator( + ramCache.wordContainers(startWordHash, false), + collections.wordContainers(startWordHash, false), + new indexContainerOrder(kelondroNaturalOrder.naturalOrder), + indexRowSetContainer.containerMergeMethod, + true), + assortmentCluster.wordContainers(startWordHash, true, false), + new indexContainerOrder(kelondroNaturalOrder.naturalOrder), + indexRowSetContainer.containerMergeMethod, + true); + } else { + return new kelondroMergeIterator( ramCache.wordContainers(startWordHash, false), assortmentCluster.wordContainers(startWordHash, true, false), new indexContainerOrder(kelondroNaturalOrder.naturalOrder), indexRowSetContainer.containerMergeMethod, true); + } } if (resourceLevel == plasmaWordIndex.RL_WORDFILES) { - return new kelondroMergeIterator( + if (useCollectionIndex) { + return new kelondroMergeIterator( + new kelondroMergeIterator( + new kelondroMergeIterator( + ramCache.wordContainers(startWordHash, false), + collections.wordContainers(startWordHash, false), + new indexContainerOrder(kelondroNaturalOrder.naturalOrder), + indexRowSetContainer.containerMergeMethod, + true), + assortmentCluster.wordContainers(startWordHash, true, false), + new indexContainerOrder(kelondroNaturalOrder.naturalOrder), + indexRowSetContainer.containerMergeMethod, + true), + backend.wordContainers(startWordHash, false), + new indexContainerOrder(kelondroNaturalOrder.naturalOrder), + indexRowSetContainer.containerMergeMethod, + true); + } else { + return new kelondroMergeIterator( new kelondroMergeIterator( ramCache.wordContainers(startWordHash, false), assortmentCluster.wordContainers(startWordHash, true, false), @@ -466,6 +541,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { new indexContainerOrder(kelondroNaturalOrder.naturalOrder), indexRowSetContainer.containerMergeMethod, true); + } } return null; } @@ -505,11 +581,11 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { public Object migrateWords2Assortment(String wordhash) throws IOException { // returns the number of entries that had been added to the assortments // can be negative if some assortments have been moved to the backend - File db = plasmaWordIndexFile.wordHash2path(databaseRoot, wordhash); + File db = plasmaWordIndexFile.wordHash2path(oldDatabaseRoot, wordhash); if (!(db.exists())) return "not available"; plasmaWordIndexFile entity = null; try { - entity = new plasmaWordIndexFile(databaseRoot, wordhash, true); + entity = new plasmaWordIndexFile(oldDatabaseRoot, wordhash, true); int size = entity.size(); if (size > assortmentCluster.clusterCapacity) { // this will be too big to integrate it @@ -671,8 +747,9 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { public static void main(String[] args) { // System.out.println(kelondroMSetTools.fastStringComparator(true).compare("RwGeoUdyDQ0Y", "rwGeoUdyDQ0Y")); // System.out.println(new Date(reverseMicroDateDays(microDateDays(System.currentTimeMillis())))); - - plasmaWordIndex index = new plasmaWordIndex(new File("D:\\dev\\proxy\\DATA\\PLASMADB"), 555, 1000, new serverLog("TESTAPP")); + File plasmadb = new File("D:\\dev\\proxy\\DATA\\PLASMADB"); + File indexdb = new File("D:\\dev\\proxy\\DATA\\INDEX\\PRIVATE\\TEXT"); + plasmaWordIndex index = new plasmaWordIndex(plasmadb, indexdb, 555, 1000, new serverLog("TESTAPP")); try { Iterator containerIter = index.wordContainers("5A8yhZMh_Kmv", plasmaWordIndex.RL_WORDFILES, true); while (containerIter.hasNext()) { diff --git a/source/yacy.java b/source/yacy.java index 38427cf2b..f53a0daa1 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -646,9 +646,10 @@ public final class yacy { // run with "java -classpath classes yacy -migratewords" try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {} File dbroot = new File(new File(homePath), "DATA/PLASMADB"); + File indexRoot = new File(new File(homePath), "DATA/INDEX/PUBLIC/TEXT"); serverLog log = new serverLog("WORDMIGRATION"); log.logInfo("STARTING MIGRATION"); - plasmaWordIndex wordIndexCache = new plasmaWordIndex(dbroot, 20000, 10000, log); + plasmaWordIndex wordIndexCache = new plasmaWordIndex(dbroot, indexRoot, 20000, 10000, log); enumerateFiles words = new enumerateFiles(new File(dbroot, "WORDS"), true, false, true, true); String wordhash; File wordfile; @@ -686,6 +687,7 @@ public final class yacy { // run with "java -classpath classes yacy -minimizeUrlDB" try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {} File dbroot = new File(new File(homePath), "DATA/PLASMADB"); + File indexRoot = new File(new File(homePath), "DATA/INDEX/PUBLIC/TEXT"); serverLog log = new serverLog("URL-CLEANUP"); try { log.logInfo("STARTING URL CLEANUP"); @@ -702,7 +704,7 @@ public final class yacy { int cacheMem = (int)((rt.maxMemory()-rt.totalMemory())/1024)-(2*cache + 8*1024); if (cacheMem < 2048) throw new OutOfMemoryError("Not enough memory available to start clean up."); - plasmaWordIndex wordIndex = new plasmaWordIndex(dbroot, cacheMem, 10000, log); + plasmaWordIndex wordIndex = new plasmaWordIndex(dbroot, indexRoot, cacheMem, 10000, log); Iterator indexContainerIterator = wordIndex.wordContainers("------------", plasmaWordIndex.RL_WORDFILES, false); long urlCounter = 0, wordCounter = 0; @@ -1137,6 +1139,7 @@ public final class yacy { plasmaWordIndex WordIndex = null; serverLog log = new serverLog("HASHLIST"); File homeDBroot = new File(new File(homePath), "DATA/PLASMADB"); + File indexRoot = new File(new File(homePath), "DATA/INDEX/PUBLIC/TEXT"); String wordChunkStartHash = "------------"; try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {} log.logInfo("STARTING CREATION OF RWI-HASHLIST"); @@ -1144,7 +1147,7 @@ public final class yacy { try { Iterator indexContainerIterator = null; if (resource.equals("all")) { - WordIndex = new plasmaWordIndex(homeDBroot, 8*1024*1024, 3000, log); + WordIndex = new plasmaWordIndex(homeDBroot, indexRoot, 8*1024*1024, 3000, log); indexContainerIterator = WordIndex.wordContainers(wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false); } else if (resource.equals("assortments")) { plasmaWordIndexAssortmentCluster assortmentCluster = new plasmaWordIndexAssortmentCluster(new File(homeDBroot, "ACLUSTER"), 64, 16*1024*1024, 3000, log); diff --git a/yacy.init b/yacy.init index b8c386bc9..0eab6f472 100644 --- a/yacy.init +++ b/yacy.init @@ -185,9 +185,12 @@ parseableExt=html,htm,txt,php,shtml,asp,aspx,jsp # other peer users promoteSearchPageGreeting = -# the path to the PLASMA database, especially the reverse word index +# the path to the PLASMA database of the web spider dbPath=DATA/PLASMADB +# the path to the public reverse word index for text files (web pages) +indexPublicTextPath=DATA/INDEX/PUBLIC/TEXT + # the path to the LISTS files. Most lists are used to filter web content listsPath=DATA/LISTS