diff --git a/htroot/IndexImport_p.html b/htroot/IndexImport_p.html index 00c82f040..d5fc5227d 100644 --- a/htroot/IndexImport_p.html +++ b/htroot/IndexImport_p.html @@ -48,8 +48,18 @@ Import Path: - - + + + + + Import Path: + + + + + Import Path: + +

Attention:
Always do a backup of your source and destination database before starting to use this import function.

diff --git a/htroot/IndexImport_p.java b/htroot/IndexImport_p.java index 1045675ce..28405ab3c 100644 --- a/htroot/IndexImport_p.java +++ b/htroot/IndexImport_p.java @@ -73,7 +73,9 @@ public final class IndexImport_p { if (post.containsKey("startIndexDbImport")) { try { // getting the import path - String importPath = (String) post.get("importPath"); + String importPlasmaPath = (String) post.get("importPlasmaPath"); + String importIndexPrimaryPath = (String) post.get("importIndexPrimaryPath"); + String importIndexSecondaryPath = (String) post.get("importIndexSecondaryPath"); String importType = (String) post.get("importType"); String cacheSizeStr = (String) post.get("cacheSize"); int cacheSize = 8*1024*1024; @@ -98,7 +100,7 @@ public final class IndexImport_p { if (startImport) { dbImporter importerThread = switchboard.dbImportManager.getNewImporter(importType); if (importerThread != null) { - importerThread.init(new File(importPath), cacheSize, 100); + importerThread.init(new File(importPlasmaPath), new File(importIndexPrimaryPath), new File(importIndexSecondaryPath), cacheSize, 100); importerThread.startIt(); } prop.put("LOCATION",""); diff --git a/source/de/anomic/plasma/dbImport/AbstractImporter.java b/source/de/anomic/plasma/dbImport/AbstractImporter.java index 4dcdd8798..c0a07bbe1 100644 --- a/source/de/anomic/plasma/dbImport/AbstractImporter.java +++ b/source/de/anomic/plasma/dbImport/AbstractImporter.java @@ -13,7 +13,7 @@ public abstract class AbstractImporter extends Thread implements dbImporter{ protected boolean stopped = false; protected boolean paused = false; - protected File importPath; + protected File importPrimaryPath, importSecondaryPath; protected int cacheSize; protected long preloadTime; @@ -33,9 +33,11 @@ public abstract class AbstractImporter extends Thread implements dbImporter{ return this.error; } - public void init(File theImportPath) { - if (theImportPath == null) throw new NullPointerException("The Import path must not be null."); - 
this.importPath = theImportPath; + public void init(File thePrimaryPath, File theSecondaryPath) { + if (thePrimaryPath == null) throw new NullPointerException("The Primary Import path must not be null."); + if (theSecondaryPath == null) throw new NullPointerException("The Secondary Import path must not be null."); + this.importPrimaryPath = thePrimaryPath; + this.importSecondaryPath = theSecondaryPath; // getting a job id from the import manager //this.jobID = this.sb.dbImportManager.getJobID(); @@ -115,8 +117,11 @@ public abstract class AbstractImporter extends Thread implements dbImporter{ return this.jobType; } - public File getImportPath() { - return this.importPath; + public File getPrimaryImportPath() { + return this.importPrimaryPath; + } + public File getSecondaryImportPath() { + return this.importSecondaryPath; } public abstract long getEstimatedTime(); diff --git a/source/de/anomic/plasma/dbImport/dbImporter.java b/source/de/anomic/plasma/dbImport/dbImporter.java index c141f68fc..fd551bffa 100644 --- a/source/de/anomic/plasma/dbImport/dbImporter.java +++ b/source/de/anomic/plasma/dbImport/dbImporter.java @@ -20,10 +20,11 @@ public interface dbImporter { public int getJobID(); public String getJobName(); public String getJobType(); - public File getImportPath(); + public File getPrimaryImportPath(); + public File getSecondaryImportPath(); public String getError(); public String getStatus(); - public void init(File indexPath, int cacheSize, long preloadTime); + public void init(File plasmaPath, File indexPrimaryPath, File indexSecondaryPath, int cacheSize, long preloadTime); public void startIt(); } diff --git a/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java b/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java index 998c2aa0d..57c403b60 100644 --- a/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java +++ b/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java @@ -31,7 +31,7 @@ public class plasmaCrawlNURLImporter 
extends AbstractImporter implements dbImpor } public String getJobName() { - return this.importPath.toString(); + return this.importPrimaryPath.toString(); } public int getProcessingStatusPercent() { @@ -47,23 +47,23 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor return theStatus.toString(); } - public void init(File theImportPath, int theCacheSize, long preloadTime) { - super.init(theImportPath); + public void init(File plasmaPath, File indexPrimary, File indexSecondary, int theCacheSize, long preloadTime) { + super.init(indexPrimary, indexSecondary); this.cacheSize = theCacheSize; this.preloadTime = preloadTime; - File noticeUrlDbFile = new File(this.importPath,"urlNotice1.db"); - File profileDbFile = new File(this.importPath, "crawlProfiles0.db"); + File noticeUrlDbFile = new File(plasmaPath,"urlNotice1.db"); + File profileDbFile = new File(plasmaPath, "crawlProfiles0.db"); String errorMsg = null; - if (!this.importPath.exists()) - errorMsg = "The import path '" + this.importPath + "' does not exist."; - else if (!this.importPath.isDirectory()) - errorMsg = "The import path '" + this.importPath + "' is not a directory."; - else if (!this.importPath.canRead()) - errorMsg = "The import path '" + this.importPath + "' is not readable."; - else if (!this.importPath.canWrite()) - errorMsg = "The import path '" + this.importPath + "' is not writeable."; + if (!plasmaPath.exists()) + errorMsg = "The import path '" + plasmaPath+ "' does not exist."; + else if (!plasmaPath.isDirectory()) + errorMsg = "The import path '" + plasmaPath + "' is not a directory."; + else if (!plasmaPath.canRead()) + errorMsg = "The import path '" + plasmaPath + "' is not readable."; + else if (!plasmaPath.canWrite()) + errorMsg = "The import path '" + plasmaPath + "' is not writeable."; else if (!noticeUrlDbFile.exists()) errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' does not exist."; @@ -90,7 +90,7 @@ public class plasmaCrawlNURLImporter 
extends AbstractImporter implements dbImpor // init noticeUrlDB this.log.logInfo("Initializing the source noticeUrlDB"); - this.importNurlDB = new plasmaCrawlNURL(this.importPath); + this.importNurlDB = new plasmaCrawlNURL(plasmaPath); this.importStartSize = this.importNurlDB.size(); //int stackSize = this.importNurlDB.stackSize(); diff --git a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java index 64564b7e3..28fbdd8f4 100644 --- a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java +++ b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java @@ -31,7 +31,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter { } public String getJobName() { - return this.importPath.toString(); + return this.importPrimaryPath.toString(); } public String getStatus() { @@ -46,25 +46,33 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter { return theStatus.toString(); } - public void init(File theImportPath, int theCacheSize, long preloadTime) { - super.init(theImportPath); + public void init(File plasmaPath, File thePrimaryPath, File theSecondaryPath, int theCacheSize, long preloadTime) { + super.init(thePrimaryPath, theSecondaryPath); this.cacheSize = theCacheSize; if (this.cacheSize < 2*1024*1024) this.cacheSize = 8*1024*1024; // configure import DB String errorMsg = null; - if (!this.importPath.exists()) errorMsg = "Import directory does not exist."; - if (!this.importPath.canRead()) errorMsg = "Import directory is not readable."; - if (!this.importPath.canWrite()) errorMsg = "Import directory is not writeable"; - if (!this.importPath.isDirectory()) errorMsg = "ImportDirectory is not a directory."; + if (!this.importPrimaryPath.exists()) errorMsg = "Primary Import directory does not exist."; + if (!this.importPrimaryPath.canRead()) errorMsg = "Primary Import directory is not readable."; + if (!this.importPrimaryPath.canWrite()) errorMsg = "Primary Import 
directory is not writeable"; + if (!this.importPrimaryPath.isDirectory()) errorMsg = "Primary Import directory is not a directory."; if (errorMsg != null) { - this.log.logSevere(errorMsg + "\nName: " + this.importPath.getAbsolutePath()); + this.log.logSevere(errorMsg + "\nName: " + this.importPrimaryPath.getAbsolutePath()); throw new IllegalArgumentException(errorMsg); - } + } + if (!this.importSecondaryPath.exists()) errorMsg = "Secondary Import directory does not exist."; + if (!this.importSecondaryPath.canRead()) errorMsg = "Secondary Import directory is not readable."; + if (!this.importSecondaryPath.canWrite()) errorMsg = "Secondary Import directory is not writeable"; + if (!this.importSecondaryPath.isDirectory()) errorMsg = "Secondary Import directory is not a directory."; + if (errorMsg != null) { + this.log.logSevere(errorMsg + "\nName: " + this.importSecondaryPath.getAbsolutePath()); + throw new IllegalArgumentException(errorMsg); + } this.log.logFine("Initializing source word index db."); - this.importWordIndex = new plasmaWordIndex(this.importPath, preloadTime / 2, this.log); + this.importWordIndex = new plasmaWordIndex(this.importPrimaryPath, importSecondaryPath, preloadTime / 2, this.log); this.importStartSize = this.importWordIndex.size(); } @@ -93,8 +101,8 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter { public void importWordsDB() { this.log.logInfo("STARTING DB-IMPORT"); - try { - this.log.logInfo("Importing DB from '" + this.importPath.getAbsolutePath() + "'"); + try { + this.log.logInfo("Importing DB from '" + this.importPrimaryPath.getAbsolutePath() + "'/'" + this.importSecondaryPath.getAbsolutePath() + "'"); this.log.logInfo("Home word index contains " + wi.size() + " words and " + wi.loadedURL.size() + " URLs."); this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importWordIndex.loadedURL.size() + " URLs."); diff --git 
a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index a503425d3..9d815bdd1 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -201,7 +201,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // storage management public File htCachePath; private File plasmaPath; - public File indexPath; + public File indexPrimaryPath, indexSecondaryPath; public File listsPath; public File htDocsPath; public File rankingPath; @@ -728,7 +728,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser *

Name of the setting specifying the folder beginning from the YaCy-installation's top-folder, where the * whole database of known RWIs and URLs as well as dumps of the DHT-In and DHT-Out caches are stored

*/ - public static final String INDEX_PATH = "indexPath"; + public static final String INDEX_PRIMARY_PATH = "indexPrimaryPath"; // this is a relative path to the data root + public static final String INDEX_SECONDARY_PATH = "indexSecondaryPath"; // this is an absolute path to any location public static final String INDEX_PATH_DEFAULT = "DATA/INDEX"; /** *

public static final String LISTS_PATH = "listsPath"

@@ -868,8 +869,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // load values from configs this.plasmaPath = new File(rootPath, getConfig(DBPATH, DBPATH_DEFAULT)); this.log.logConfig("Plasma DB Path: " + this.plasmaPath.toString()); - this.indexPath = new File(rootPath, getConfig(INDEX_PATH, INDEX_PATH_DEFAULT)); - this.log.logConfig("Index Path: " + this.indexPath.toString()); + this.indexPrimaryPath = new File(rootPath, getConfig(INDEX_PRIMARY_PATH, INDEX_PATH_DEFAULT)); + this.log.logConfig("Index Primary Path: " + this.indexPrimaryPath.toString()); + this.indexSecondaryPath = (getConfig(INDEX_SECONDARY_PATH, "").length() == 0) ? indexPrimaryPath : new File(getConfig(INDEX_SECONDARY_PATH, "")); + this.log.logConfig("Index Secondary Path: " + this.indexSecondaryPath.toString()); this.listsPath = new File(rootPath, getConfig(LISTS_PATH, LISTS_PATH_DEFAULT)); this.log.logConfig("Lists Path: " + this.listsPath.toString()); this.htDocsPath = new File(rootPath, getConfig(HTDOCS_PATH, HTDOCS_PATH_DEFAULT)); @@ -1040,7 +1043,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser noticeURL = new plasmaCrawlNURL(plasmaPath); errorURL = new plasmaCrawlZURL(plasmaPath, "urlError.db"); delegatedURL = new plasmaCrawlZURL(plasmaPath, "urlDelegated.db"); - wordIndex = new plasmaWordIndex(indexPath, ramRWI_time, log); + wordIndex = new plasmaWordIndex(indexPrimaryPath, indexSecondaryPath, ramRWI_time, log); // set a high maximum cache size to current size; this is adopted later automatically int wordCacheMaxCount = Math.max((int) getConfigLong(WORDCACHE_INIT_COUNT, 30000), diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 94fe8bb6f..75492950d 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -70,19 +70,19 @@ public final class plasmaWordIndex implements indexRI { private int flushsize; 
public final plasmaCrawlLURL loadedURL; - public plasmaWordIndex(File indexRoot, long preloadTime, serverLog log) { - File textindexcache = new File(indexRoot, "PUBLIC/TEXT/RICACHE"); + public plasmaWordIndex(File indexPrimaryRoot, File indexSecondaryRoot, long preloadTime, serverLog log) { + File textindexcache = new File(indexPrimaryRoot, "PUBLIC/TEXT/RICACHE"); if (!(textindexcache.exists())) textindexcache.mkdirs(); this.dhtOutCache = new indexRAMRI(textindexcache, indexRWIEntry.urlEntryRow, wCacheMaxChunk, wCacheMaxAge, "dump1.array", log); this.dhtInCache = new indexRAMRI(textindexcache, indexRWIEntry.urlEntryRow, wCacheMaxChunk, wCacheMaxAge, "dump2.array", log); // create collections storage path - File textindexcollections = new File(indexRoot, "PUBLIC/TEXT/RICOLLECTION"); + File textindexcollections = new File(indexPrimaryRoot, "PUBLIC/TEXT/RICOLLECTION"); if (!(textindexcollections.exists())) textindexcollections.mkdirs(); this.collections = new indexCollectionRI(textindexcollections, "collection", preloadTime, maxCollectionPartition, indexRWIEntry.urlEntryRow); // create LURL-db - loadedURL = new plasmaCrawlLURL(indexRoot, preloadTime); + loadedURL = new plasmaCrawlLURL(indexSecondaryRoot, preloadTime); // performance settings busyCacheFlush = false; diff --git a/source/yacy.java b/source/yacy.java index 19b3aaf58..e1b0f31ca 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -616,14 +616,15 @@ public final class yacy { public static void minimizeUrlDB(String homePath) { // run with "java -classpath classes yacy -minimizeUrlDB" try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {} - File indexRoot = new File(new File(homePath), "DATA/INDEX"); + File indexPrimaryRoot = new File(new File(homePath), "DATA/INDEX"); + File indexSecondaryRoot = new File(new File(homePath), "DATA/INDEX"); File indexRoot2 = new File(new File(homePath), "DATA/INDEX2"); serverLog log = new serverLog("URL-CLEANUP"); try { 
log.logInfo("STARTING URL CLEANUP"); // db containing all currently loades urls - plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(indexRoot, 10000); + plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(indexSecondaryRoot, 10000); // db used to hold all neede urls plasmaCrawlLURL minimizedUrlDB = new plasmaCrawlLURL(indexRoot2, 10000); @@ -632,7 +633,7 @@ public final class yacy { int cacheMem = (int)(serverMemory.max-rt.totalMemory()); if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up."); - plasmaWordIndex wordIndex = new plasmaWordIndex(indexRoot, 10000, log); + plasmaWordIndex wordIndex = new plasmaWordIndex(indexPrimaryRoot, indexSecondaryRoot, 10000, log); Iterator indexContainerIterator = wordIndex.wordContainers("AAAAAAAAAAAA", false, false); long urlCounter = 0, wordCounter = 0; @@ -1000,7 +1001,8 @@ public final class yacy { private static void RWIHashList(String homePath, String targetName, String resource, String format) { plasmaWordIndex WordIndex = null; serverLog log = new serverLog("HASHLIST"); - File indexRoot = new File(new File(homePath), "DATA/INDEX"); + File indexPrimaryRoot = new File(new File(homePath), "DATA/INDEX"); + File indexSecondaryRoot = new File(new File(homePath), "DATA/INDEX"); String wordChunkStartHash = "AAAAAAAAAAAA"; try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {} log.logInfo("STARTING CREATION OF RWI-HASHLIST"); @@ -1008,7 +1010,7 @@ public final class yacy { try { Iterator indexContainerIterator = null; if (resource.equals("all")) { - WordIndex = new plasmaWordIndex(indexRoot, 3000, log); + WordIndex = new plasmaWordIndex(indexPrimaryRoot, indexSecondaryRoot, 3000, log); indexContainerIterator = WordIndex.wordContainers(wordChunkStartHash, false, false); } int counter = 0; diff --git a/yacy.init b/yacy.init index 1426e8fc8..8ab2b8b89 100644 --- a/yacy.init +++ b/yacy.init @@ -198,7 +198,10 @@ promoteSearchPageGreeting = 
dbPath=DATA/PLASMADB # the path to the public reverse word index for text files (web pages) -indexPath=DATA/INDEX +# the primary path is relative to the data root, the secondary path is an absolute path +# when the secondary path should be equal to the primary, it must be declared empty +indexPrimaryPath=DATA/INDEX +indexSecondaryPath= # the path to the LISTS files. Most lists are used to filter web content listsPath=DATA/LISTS