Attention: Always do a backup of your source and destination database before starting to use this import function.
diff --git a/htroot/IndexImport_p.java b/htroot/IndexImport_p.java
index 1045675ce..28405ab3c 100644
--- a/htroot/IndexImport_p.java
+++ b/htroot/IndexImport_p.java
@@ -73,7 +73,9 @@ public final class IndexImport_p {
if (post.containsKey("startIndexDbImport")) {
try {
// getting the import path
- String importPath = (String) post.get("importPath");
+ String importPlasmaPath = (String) post.get("importPlasmaPath");
+ String importIndexPrimaryPath = (String) post.get("importIndexPrimaryPath");
+ String importIndexSecondaryPath = (String) post.get("importIndexSecondaryPath");
String importType = (String) post.get("importType");
String cacheSizeStr = (String) post.get("cacheSize");
int cacheSize = 8*1024*1024;
@@ -98,7 +100,7 @@ public final class IndexImport_p {
if (startImport) {
dbImporter importerThread = switchboard.dbImportManager.getNewImporter(importType);
if (importerThread != null) {
- importerThread.init(new File(importPath), cacheSize, 100);
+ importerThread.init(new File(importPlasmaPath), new File(importIndexPrimaryPath), new File(importIndexSecondaryPath), cacheSize, 100);
importerThread.startIt();
}
prop.put("LOCATION","");
diff --git a/source/de/anomic/plasma/dbImport/AbstractImporter.java b/source/de/anomic/plasma/dbImport/AbstractImporter.java
index 4dcdd8798..c0a07bbe1 100644
--- a/source/de/anomic/plasma/dbImport/AbstractImporter.java
+++ b/source/de/anomic/plasma/dbImport/AbstractImporter.java
@@ -13,7 +13,7 @@ public abstract class AbstractImporter extends Thread implements dbImporter{
protected boolean stopped = false;
protected boolean paused = false;
- protected File importPath;
+ protected File importPrimaryPath, importSecondaryPath;
protected int cacheSize;
protected long preloadTime;
@@ -33,9 +33,11 @@ public abstract class AbstractImporter extends Thread implements dbImporter{
return this.error;
}
- public void init(File theImportPath) {
- if (theImportPath == null) throw new NullPointerException("The Import path must not be null.");
- this.importPath = theImportPath;
+ public void init(File thePrimaryPath, File theSecondaryPath) {
+ if (thePrimaryPath == null) throw new NullPointerException("The Primary Import path must not be null.");
+ if (theSecondaryPath == null) throw new NullPointerException("The Secondary Import path must not be null.");
+ this.importPrimaryPath = thePrimaryPath;
+ this.importSecondaryPath = theSecondaryPath;
// getting a job id from the import manager
//this.jobID = this.sb.dbImportManager.getJobID();
@@ -115,8 +117,11 @@ public abstract class AbstractImporter extends Thread implements dbImporter{
return this.jobType;
}
- public File getImportPath() {
- return this.importPath;
+ public File getPrimaryImportPath() {
+ return this.importPrimaryPath;
+ }
+ public File getSecondaryImportPath() {
+ return this.importSecondaryPath;
}
public abstract long getEstimatedTime();
diff --git a/source/de/anomic/plasma/dbImport/dbImporter.java b/source/de/anomic/plasma/dbImport/dbImporter.java
index c141f68fc..fd551bffa 100644
--- a/source/de/anomic/plasma/dbImport/dbImporter.java
+++ b/source/de/anomic/plasma/dbImport/dbImporter.java
@@ -20,10 +20,11 @@ public interface dbImporter {
public int getJobID();
public String getJobName();
public String getJobType();
- public File getImportPath();
+ public File getPrimaryImportPath();
+ public File getSecondaryImportPath();
public String getError();
public String getStatus();
- public void init(File indexPath, int cacheSize, long preloadTime);
+ public void init(File plasmaPath, File indexPrimaryPath, File indexSecondaryPath, int cacheSize, long preloadTime);
public void startIt();
}
diff --git a/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java b/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java
index 998c2aa0d..57c403b60 100644
--- a/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java
+++ b/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java
@@ -31,7 +31,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
}
public String getJobName() {
- return this.importPath.toString();
+ return this.importPrimaryPath.toString();
}
public int getProcessingStatusPercent() {
@@ -47,23 +47,23 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
return theStatus.toString();
}
- public void init(File theImportPath, int theCacheSize, long preloadTime) {
- super.init(theImportPath);
+ public void init(File plasmaPath, File indexPrimary, File indexSecondary, int theCacheSize, long preloadTime) {
+ super.init(indexPrimary, indexSecondary);
this.cacheSize = theCacheSize;
this.preloadTime = preloadTime;
- File noticeUrlDbFile = new File(this.importPath,"urlNotice1.db");
- File profileDbFile = new File(this.importPath, "crawlProfiles0.db");
+ File noticeUrlDbFile = new File(plasmaPath,"urlNotice1.db");
+ File profileDbFile = new File(plasmaPath, "crawlProfiles0.db");
String errorMsg = null;
- if (!this.importPath.exists())
- errorMsg = "The import path '" + this.importPath + "' does not exist.";
- else if (!this.importPath.isDirectory())
- errorMsg = "The import path '" + this.importPath + "' is not a directory.";
- else if (!this.importPath.canRead())
- errorMsg = "The import path '" + this.importPath + "' is not readable.";
- else if (!this.importPath.canWrite())
- errorMsg = "The import path '" + this.importPath + "' is not writeable.";
+ if (!plasmaPath.exists())
+ errorMsg = "The import path '" + plasmaPath+ "' does not exist.";
+ else if (!plasmaPath.isDirectory())
+ errorMsg = "The import path '" + plasmaPath + "' is not a directory.";
+ else if (!plasmaPath.canRead())
+ errorMsg = "The import path '" + plasmaPath + "' is not readable.";
+ else if (!plasmaPath.canWrite())
+ errorMsg = "The import path '" + plasmaPath + "' is not writeable.";
else if (!noticeUrlDbFile.exists())
errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' does not exist.";
@@ -90,7 +90,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
// init noticeUrlDB
this.log.logInfo("Initializing the source noticeUrlDB");
- this.importNurlDB = new plasmaCrawlNURL(this.importPath);
+ this.importNurlDB = new plasmaCrawlNURL(plasmaPath);
this.importStartSize = this.importNurlDB.size();
//int stackSize = this.importNurlDB.stackSize();
diff --git a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java
index 64564b7e3..28fbdd8f4 100644
--- a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java
+++ b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java
@@ -31,7 +31,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
}
public String getJobName() {
- return this.importPath.toString();
+ return this.importPrimaryPath.toString();
}
public String getStatus() {
@@ -46,25 +46,33 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
return theStatus.toString();
}
- public void init(File theImportPath, int theCacheSize, long preloadTime) {
- super.init(theImportPath);
+ public void init(File plasmaPath, File thePrimaryPath, File theSecondaryPath, int theCacheSize, long preloadTime) {
+ super.init(thePrimaryPath, theSecondaryPath);
this.cacheSize = theCacheSize;
if (this.cacheSize < 2*1024*1024) this.cacheSize = 8*1024*1024;
// configure import DB
String errorMsg = null;
- if (!this.importPath.exists()) errorMsg = "Import directory does not exist.";
- if (!this.importPath.canRead()) errorMsg = "Import directory is not readable.";
- if (!this.importPath.canWrite()) errorMsg = "Import directory is not writeable";
- if (!this.importPath.isDirectory()) errorMsg = "ImportDirectory is not a directory.";
+ if (!this.importPrimaryPath.exists()) errorMsg = "Primary Import directory does not exist.";
+ if (!this.importPrimaryPath.canRead()) errorMsg = "Primary Import directory is not readable.";
+ if (!this.importPrimaryPath.canWrite()) errorMsg = "Primary Import directory is not writeable";
+ if (!this.importPrimaryPath.isDirectory()) errorMsg = "Primary Import directory is not a directory.";
if (errorMsg != null) {
- this.log.logSevere(errorMsg + "\nName: " + this.importPath.getAbsolutePath());
+ this.log.logSevere(errorMsg + "\nName: " + this.importPrimaryPath.getAbsolutePath());
throw new IllegalArgumentException(errorMsg);
- }
+ }
+ if (!this.importSecondaryPath.exists()) errorMsg = "Secondary Import directory does not exist.";
+ if (!this.importSecondaryPath.canRead()) errorMsg = "Secondary Import directory is not readable.";
+ if (!this.importSecondaryPath.canWrite()) errorMsg = "Secondary Import directory is not writeable";
+ if (!this.importSecondaryPath.isDirectory()) errorMsg = "Secondary Import directory is not a directory.";
+ if (errorMsg != null) {
+ this.log.logSevere(errorMsg + "\nName: " + this.importSecondaryPath.getAbsolutePath());
+ throw new IllegalArgumentException(errorMsg);
+ }
this.log.logFine("Initializing source word index db.");
- this.importWordIndex = new plasmaWordIndex(this.importPath, preloadTime / 2, this.log);
+ this.importWordIndex = new plasmaWordIndex(this.importPrimaryPath, importSecondaryPath, preloadTime / 2, this.log);
this.importStartSize = this.importWordIndex.size();
}
@@ -93,8 +101,8 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
public void importWordsDB() {
this.log.logInfo("STARTING DB-IMPORT");
- try {
- this.log.logInfo("Importing DB from '" + this.importPath.getAbsolutePath() + "'");
+ try {
+ this.log.logInfo("Importing DB from '" + this.importPrimaryPath.getAbsolutePath() + "'/'" + this.importSecondaryPath.getAbsolutePath() + "'");
this.log.logInfo("Home word index contains " + wi.size() + " words and " + wi.loadedURL.size() + " URLs.");
this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importWordIndex.loadedURL.size() + " URLs.");
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index a503425d3..9d815bdd1 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -201,7 +201,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// storage management
public File htCachePath;
private File plasmaPath;
- public File indexPath;
+ public File indexPrimaryPath, indexSecondaryPath;
public File listsPath;
public File htDocsPath;
public File rankingPath;
@@ -728,7 +728,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
*
Name of the setting specifying the folder beginning from the YaCy-installation's top-folder, where the
* whole database of known RWIs and URLs as well as dumps of the DHT-In and DHT-Out caches are stored
*/
- public static final String INDEX_PATH = "indexPath";
+ public static final String INDEX_PRIMARY_PATH = "indexPrimaryPath"; // this is a relative path to the data root
+ public static final String INDEX_SECONDARY_PATH = "indexSecondaryPath"; // this is an absolute path to any location
public static final String INDEX_PATH_DEFAULT = "DATA/INDEX";
/**
*
public static final String LISTS_PATH = "listsPath"
@@ -868,8 +869,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// load values from configs
this.plasmaPath = new File(rootPath, getConfig(DBPATH, DBPATH_DEFAULT));
this.log.logConfig("Plasma DB Path: " + this.plasmaPath.toString());
- this.indexPath = new File(rootPath, getConfig(INDEX_PATH, INDEX_PATH_DEFAULT));
- this.log.logConfig("Index Path: " + this.indexPath.toString());
+ this.indexPrimaryPath = new File(rootPath, getConfig(INDEX_PRIMARY_PATH, INDEX_PATH_DEFAULT));
+ this.log.logConfig("Index Primary Path: " + this.indexPrimaryPath.toString());
+ this.indexSecondaryPath = (getConfig(INDEX_SECONDARY_PATH, "").length() == 0) ? indexPrimaryPath : new File(getConfig(INDEX_SECONDARY_PATH, ""));
+ this.log.logConfig("Index Secondary Path: " + this.indexSecondaryPath.toString());
this.listsPath = new File(rootPath, getConfig(LISTS_PATH, LISTS_PATH_DEFAULT));
this.log.logConfig("Lists Path: " + this.listsPath.toString());
this.htDocsPath = new File(rootPath, getConfig(HTDOCS_PATH, HTDOCS_PATH_DEFAULT));
@@ -1040,7 +1043,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
noticeURL = new plasmaCrawlNURL(plasmaPath);
errorURL = new plasmaCrawlZURL(plasmaPath, "urlError.db");
delegatedURL = new plasmaCrawlZURL(plasmaPath, "urlDelegated.db");
- wordIndex = new plasmaWordIndex(indexPath, ramRWI_time, log);
+ wordIndex = new plasmaWordIndex(indexPrimaryPath, indexSecondaryPath, ramRWI_time, log);
// set a high maximum cache size to current size; this is adopted later automatically
int wordCacheMaxCount = Math.max((int) getConfigLong(WORDCACHE_INIT_COUNT, 30000),
diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java
index 94fe8bb6f..75492950d 100644
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@@ -70,19 +70,19 @@ public final class plasmaWordIndex implements indexRI {
private int flushsize;
public final plasmaCrawlLURL loadedURL;
- public plasmaWordIndex(File indexRoot, long preloadTime, serverLog log) {
- File textindexcache = new File(indexRoot, "PUBLIC/TEXT/RICACHE");
+ public plasmaWordIndex(File indexPrimaryRoot, File indexSecondaryRoot, long preloadTime, serverLog log) {
+ File textindexcache = new File(indexPrimaryRoot, "PUBLIC/TEXT/RICACHE");
if (!(textindexcache.exists())) textindexcache.mkdirs();
this.dhtOutCache = new indexRAMRI(textindexcache, indexRWIEntry.urlEntryRow, wCacheMaxChunk, wCacheMaxAge, "dump1.array", log);
this.dhtInCache = new indexRAMRI(textindexcache, indexRWIEntry.urlEntryRow, wCacheMaxChunk, wCacheMaxAge, "dump2.array", log);
// create collections storage path
- File textindexcollections = new File(indexRoot, "PUBLIC/TEXT/RICOLLECTION");
+ File textindexcollections = new File(indexPrimaryRoot, "PUBLIC/TEXT/RICOLLECTION");
if (!(textindexcollections.exists())) textindexcollections.mkdirs();
this.collections = new indexCollectionRI(textindexcollections, "collection", preloadTime, maxCollectionPartition, indexRWIEntry.urlEntryRow);
// create LURL-db
- loadedURL = new plasmaCrawlLURL(indexRoot, preloadTime);
+ loadedURL = new plasmaCrawlLURL(indexSecondaryRoot, preloadTime);
// performance settings
busyCacheFlush = false;
diff --git a/source/yacy.java b/source/yacy.java
index 19b3aaf58..e1b0f31ca 100644
--- a/source/yacy.java
+++ b/source/yacy.java
@@ -616,14 +616,15 @@ public final class yacy {
public static void minimizeUrlDB(String homePath) {
// run with "java -classpath classes yacy -minimizeUrlDB"
try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {}
- File indexRoot = new File(new File(homePath), "DATA/INDEX");
+ File indexPrimaryRoot = new File(new File(homePath), "DATA/INDEX");
+ File indexSecondaryRoot = new File(new File(homePath), "DATA/INDEX");
File indexRoot2 = new File(new File(homePath), "DATA/INDEX2");
serverLog log = new serverLog("URL-CLEANUP");
try {
log.logInfo("STARTING URL CLEANUP");
// db containing all currently loades urls
- plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(indexRoot, 10000);
+ plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(indexSecondaryRoot, 10000);
// db used to hold all neede urls
plasmaCrawlLURL minimizedUrlDB = new plasmaCrawlLURL(indexRoot2, 10000);
@@ -632,7 +633,7 @@ public final class yacy {
int cacheMem = (int)(serverMemory.max-rt.totalMemory());
if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");
- plasmaWordIndex wordIndex = new plasmaWordIndex(indexRoot, 10000, log);
+ plasmaWordIndex wordIndex = new plasmaWordIndex(indexPrimaryRoot, indexSecondaryRoot, 10000, log);
Iterator indexContainerIterator = wordIndex.wordContainers("AAAAAAAAAAAA", false, false);
long urlCounter = 0, wordCounter = 0;
@@ -1000,7 +1001,8 @@ public final class yacy {
private static void RWIHashList(String homePath, String targetName, String resource, String format) {
plasmaWordIndex WordIndex = null;
serverLog log = new serverLog("HASHLIST");
- File indexRoot = new File(new File(homePath), "DATA/INDEX");
+ File indexPrimaryRoot = new File(new File(homePath), "DATA/INDEX");
+ File indexSecondaryRoot = new File(new File(homePath), "DATA/INDEX");
String wordChunkStartHash = "AAAAAAAAAAAA";
try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {}
log.logInfo("STARTING CREATION OF RWI-HASHLIST");
@@ -1008,7 +1010,7 @@ public final class yacy {
try {
Iterator indexContainerIterator = null;
if (resource.equals("all")) {
- WordIndex = new plasmaWordIndex(indexRoot, 3000, log);
+ WordIndex = new plasmaWordIndex(indexPrimaryRoot, indexSecondaryRoot, 3000, log);
indexContainerIterator = WordIndex.wordContainers(wordChunkStartHash, false, false);
}
int counter = 0;
diff --git a/yacy.init b/yacy.init
index 1426e8fc8..8ab2b8b89 100644
--- a/yacy.init
+++ b/yacy.init
@@ -198,7 +198,10 @@ promoteSearchPageGreeting =
dbPath=DATA/PLASMADB
# the path to the public reverse word index for text files (web pages)
-indexPath=DATA/INDEX
+# the primary path is relative to the data root; the secondary path is an absolute path.
+# to use the primary path as the secondary path as well, leave indexSecondaryPath empty.
+indexPrimaryPath=DATA/INDEX
+indexSecondaryPath=
# the path to the LISTS files. Most lists are used to filter web content
listsPath=DATA/LISTS