added option to configure a path to a secondary index location.

This can be used to store a fragment of the index on another physical device,
to split IO load and improve access speed. The index is split in such a way
that the LURLs are stored in the secondary location and the RWIs in the primary
location. This is especially useful in environments where symbolic links are
not possible, or where they may cause IO access even if there is no write access
to the device which hosts the symbolic link.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3519 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 18 years ago
parent 07cd30cf9b
commit 5c3afb3202

@ -48,7 +48,17 @@
</tr> </tr>
<tr class="TableCellLight"> <tr class="TableCellLight">
<td title="Path to the PLASMADB directory of the foreign peer">Import&nbsp;Path:</td> <td title="Path to the PLASMADB directory of the foreign peer">Import&nbsp;Path:</td>
<td colspan="3"><input name="importPath" type="text" size="50" value="" /></td> <td colspan="3"><input name="importPlasmaPath" type="text" size="50" value="" /></td>
<td></td>
</tr>
<tr class="TableCellLight">
<td title="Path to the primary Index directory of the foreign peer">Import&nbsp;Path:</td>
<td colspan="3"><input name="importIndexPrimaryPath" type="text" size="50" value="" /></td>
<td></td>
</tr>
<tr class="TableCellLight">
<td title="Path to the secondary Index directory of the foreign peer">Import&nbsp;Path:</td>
<td colspan="3"><input name="importIndexSecondaryPath" type="text" size="50" value="" /></td>
<td><input type="submit" name="startIndexDbImport" value="Start Import" /></td> <td><input type="submit" name="startIndexDbImport" value="Start Import" /></td>
</tr> </tr>
</table> </table>

@ -73,7 +73,9 @@ public final class IndexImport_p {
if (post.containsKey("startIndexDbImport")) { if (post.containsKey("startIndexDbImport")) {
try { try {
// getting the import path // getting the import path
String importPath = (String) post.get("importPath"); String importPlasmaPath = (String) post.get("importPlasmaPath");
String importIndexPrimaryPath = (String) post.get("importIndexPrimaryPath");
String importIndexSecondaryPath = (String) post.get("importIndexSecondaryPath");
String importType = (String) post.get("importType"); String importType = (String) post.get("importType");
String cacheSizeStr = (String) post.get("cacheSize"); String cacheSizeStr = (String) post.get("cacheSize");
int cacheSize = 8*1024*1024; int cacheSize = 8*1024*1024;
@ -98,7 +100,7 @@ public final class IndexImport_p {
if (startImport) { if (startImport) {
dbImporter importerThread = switchboard.dbImportManager.getNewImporter(importType); dbImporter importerThread = switchboard.dbImportManager.getNewImporter(importType);
if (importerThread != null) { if (importerThread != null) {
importerThread.init(new File(importPath), cacheSize, 100); importerThread.init(new File(importPlasmaPath), new File(importIndexPrimaryPath), new File(importIndexSecondaryPath), cacheSize, 100);
importerThread.startIt(); importerThread.startIt();
} }
prop.put("LOCATION",""); prop.put("LOCATION","");

@ -13,7 +13,7 @@ public abstract class AbstractImporter extends Thread implements dbImporter{
protected boolean stopped = false; protected boolean stopped = false;
protected boolean paused = false; protected boolean paused = false;
protected File importPath; protected File importPrimaryPath, importSecondaryPath;
protected int cacheSize; protected int cacheSize;
protected long preloadTime; protected long preloadTime;
@ -33,9 +33,11 @@ public abstract class AbstractImporter extends Thread implements dbImporter{
return this.error; return this.error;
} }
public void init(File theImportPath) { public void init(File thePrimaryPath, File theSecondaryPath) {
if (theImportPath == null) throw new NullPointerException("The Import path must not be null."); if (thePrimaryPath == null) throw new NullPointerException("The Primary Import path must not be null.");
this.importPath = theImportPath; if (theSecondaryPath == null) throw new NullPointerException("The Secondary Import path must not be null.");
this.importPrimaryPath = thePrimaryPath;
this.importSecondaryPath = theSecondaryPath;
// getting a job id from the import manager // getting a job id from the import manager
//this.jobID = this.sb.dbImportManager.getJobID(); //this.jobID = this.sb.dbImportManager.getJobID();
@ -115,8 +117,11 @@ public abstract class AbstractImporter extends Thread implements dbImporter{
return this.jobType; return this.jobType;
} }
public File getImportPath() { public File getPrimaryImportPath() {
return this.importPath; return this.importPrimaryPath;
}
public File getSecondaryImportPath() {
return this.importSecondaryPath;
} }
public abstract long getEstimatedTime(); public abstract long getEstimatedTime();

@ -20,10 +20,11 @@ public interface dbImporter {
public int getJobID(); public int getJobID();
public String getJobName(); public String getJobName();
public String getJobType(); public String getJobType();
public File getImportPath(); public File getPrimaryImportPath();
public File getSecondaryImportPath();
public String getError(); public String getError();
public String getStatus(); public String getStatus();
public void init(File indexPath, int cacheSize, long preloadTime); public void init(File plasmaPath, File indexPrimaryPath, File indexSecondaryPath, int cacheSize, long preloadTime);
public void startIt(); public void startIt();
} }

@ -31,7 +31,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
} }
public String getJobName() { public String getJobName() {
return this.importPath.toString(); return this.importPrimaryPath.toString();
} }
public int getProcessingStatusPercent() { public int getProcessingStatusPercent() {
@ -47,23 +47,23 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
return theStatus.toString(); return theStatus.toString();
} }
public void init(File theImportPath, int theCacheSize, long preloadTime) { public void init(File plasmaPath, File indexPrimary, File indexSecondary, int theCacheSize, long preloadTime) {
super.init(theImportPath); super.init(indexPrimary, indexSecondary);
this.cacheSize = theCacheSize; this.cacheSize = theCacheSize;
this.preloadTime = preloadTime; this.preloadTime = preloadTime;
File noticeUrlDbFile = new File(this.importPath,"urlNotice1.db"); File noticeUrlDbFile = new File(plasmaPath,"urlNotice1.db");
File profileDbFile = new File(this.importPath, "crawlProfiles0.db"); File profileDbFile = new File(plasmaPath, "crawlProfiles0.db");
String errorMsg = null; String errorMsg = null;
if (!this.importPath.exists()) if (!plasmaPath.exists())
errorMsg = "The import path '" + this.importPath + "' does not exist."; errorMsg = "The import path '" + plasmaPath+ "' does not exist.";
else if (!this.importPath.isDirectory()) else if (!plasmaPath.isDirectory())
errorMsg = "The import path '" + this.importPath + "' is not a directory."; errorMsg = "The import path '" + plasmaPath + "' is not a directory.";
else if (!this.importPath.canRead()) else if (!plasmaPath.canRead())
errorMsg = "The import path '" + this.importPath + "' is not readable."; errorMsg = "The import path '" + plasmaPath + "' is not readable.";
else if (!this.importPath.canWrite()) else if (!plasmaPath.canWrite())
errorMsg = "The import path '" + this.importPath + "' is not writeable."; errorMsg = "The import path '" + plasmaPath + "' is not writeable.";
else if (!noticeUrlDbFile.exists()) else if (!noticeUrlDbFile.exists())
errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' does not exist."; errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' does not exist.";
@ -90,7 +90,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
// init noticeUrlDB // init noticeUrlDB
this.log.logInfo("Initializing the source noticeUrlDB"); this.log.logInfo("Initializing the source noticeUrlDB");
this.importNurlDB = new plasmaCrawlNURL(this.importPath); this.importNurlDB = new plasmaCrawlNURL(plasmaPath);
this.importStartSize = this.importNurlDB.size(); this.importStartSize = this.importNurlDB.size();
//int stackSize = this.importNurlDB.stackSize(); //int stackSize = this.importNurlDB.stackSize();

@ -31,7 +31,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
} }
public String getJobName() { public String getJobName() {
return this.importPath.toString(); return this.importPrimaryPath.toString();
} }
public String getStatus() { public String getStatus() {
@ -46,25 +46,33 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
return theStatus.toString(); return theStatus.toString();
} }
public void init(File theImportPath, int theCacheSize, long preloadTime) { public void init(File plasmaPath, File thePrimaryPath, File theSecondaryPath, int theCacheSize, long preloadTime) {
super.init(theImportPath); super.init(thePrimaryPath, theSecondaryPath);
this.cacheSize = theCacheSize; this.cacheSize = theCacheSize;
if (this.cacheSize < 2*1024*1024) this.cacheSize = 8*1024*1024; if (this.cacheSize < 2*1024*1024) this.cacheSize = 8*1024*1024;
// configure import DB // configure import DB
String errorMsg = null; String errorMsg = null;
if (!this.importPath.exists()) errorMsg = "Import directory does not exist."; if (!this.importPrimaryPath.exists()) errorMsg = "Primary Import directory does not exist.";
if (!this.importPath.canRead()) errorMsg = "Import directory is not readable."; if (!this.importPrimaryPath.canRead()) errorMsg = "Primary Import directory is not readable.";
if (!this.importPath.canWrite()) errorMsg = "Import directory is not writeable"; if (!this.importPrimaryPath.canWrite()) errorMsg = "Primary Import directory is not writeable";
if (!this.importPath.isDirectory()) errorMsg = "ImportDirectory is not a directory."; if (!this.importPrimaryPath.isDirectory()) errorMsg = "Primary Import directory is not a directory.";
if (errorMsg != null) { if (errorMsg != null) {
this.log.logSevere(errorMsg + "\nName: " + this.importPath.getAbsolutePath()); this.log.logSevere(errorMsg + "\nName: " + this.importPrimaryPath.getAbsolutePath());
throw new IllegalArgumentException(errorMsg);
}
if (!this.importSecondaryPath.exists()) errorMsg = "Secondary Import directory does not exist.";
if (!this.importSecondaryPath.canRead()) errorMsg = "Secondary Import directory is not readable.";
if (!this.importSecondaryPath.canWrite()) errorMsg = "Secondary Import directory is not writeable";
if (!this.importSecondaryPath.isDirectory()) errorMsg = "Secondary Import directory is not a directory.";
if (errorMsg != null) {
this.log.logSevere(errorMsg + "\nName: " + this.importSecondaryPath.getAbsolutePath());
throw new IllegalArgumentException(errorMsg); throw new IllegalArgumentException(errorMsg);
} }
this.log.logFine("Initializing source word index db."); this.log.logFine("Initializing source word index db.");
this.importWordIndex = new plasmaWordIndex(this.importPath, preloadTime / 2, this.log); this.importWordIndex = new plasmaWordIndex(this.importPrimaryPath, importSecondaryPath, preloadTime / 2, this.log);
this.importStartSize = this.importWordIndex.size(); this.importStartSize = this.importWordIndex.size();
} }
@ -94,7 +102,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
this.log.logInfo("STARTING DB-IMPORT"); this.log.logInfo("STARTING DB-IMPORT");
try { try {
this.log.logInfo("Importing DB from '" + this.importPath.getAbsolutePath() + "'"); this.log.logInfo("Importing DB from '" + this.importPrimaryPath.getAbsolutePath() + "'/'" + this.importSecondaryPath.getAbsolutePath() + "'");
this.log.logInfo("Home word index contains " + wi.size() + " words and " + wi.loadedURL.size() + " URLs."); this.log.logInfo("Home word index contains " + wi.size() + " words and " + wi.loadedURL.size() + " URLs.");
this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importWordIndex.loadedURL.size() + " URLs."); this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importWordIndex.loadedURL.size() + " URLs.");

@ -201,7 +201,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// storage management // storage management
public File htCachePath; public File htCachePath;
private File plasmaPath; private File plasmaPath;
public File indexPath; public File indexPrimaryPath, indexSecondaryPath;
public File listsPath; public File listsPath;
public File htDocsPath; public File htDocsPath;
public File rankingPath; public File rankingPath;
@ -728,7 +728,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
* <p>Name of the setting specifying the folder beginning from the YaCy-installation's top-folder, where the * <p>Name of the setting specifying the folder beginning from the YaCy-installation's top-folder, where the
* whole database of known RWIs and URLs as well as dumps of the DHT-In and DHT-Out caches are stored</p> * whole database of known RWIs and URLs as well as dumps of the DHT-In and DHT-Out caches are stored</p>
*/ */
public static final String INDEX_PATH = "indexPath"; public static final String INDEX_PRIMARY_PATH = "indexPrimaryPath"; // this is a relative path to the data root
public static final String INDEX_SECONDARY_PATH = "indexSecondaryPath"; // this is a absolute path to any location
public static final String INDEX_PATH_DEFAULT = "DATA/INDEX"; public static final String INDEX_PATH_DEFAULT = "DATA/INDEX";
/** /**
* <p><code>public static final String <strong>LISTS_PATH</strong> = "listsPath"</code></p> * <p><code>public static final String <strong>LISTS_PATH</strong> = "listsPath"</code></p>
@ -868,8 +869,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// load values from configs // load values from configs
this.plasmaPath = new File(rootPath, getConfig(DBPATH, DBPATH_DEFAULT)); this.plasmaPath = new File(rootPath, getConfig(DBPATH, DBPATH_DEFAULT));
this.log.logConfig("Plasma DB Path: " + this.plasmaPath.toString()); this.log.logConfig("Plasma DB Path: " + this.plasmaPath.toString());
this.indexPath = new File(rootPath, getConfig(INDEX_PATH, INDEX_PATH_DEFAULT)); this.indexPrimaryPath = new File(rootPath, getConfig(INDEX_PRIMARY_PATH, INDEX_PATH_DEFAULT));
this.log.logConfig("Index Path: " + this.indexPath.toString()); this.log.logConfig("Index Primary Path: " + this.indexPrimaryPath.toString());
this.indexSecondaryPath = (getConfig(INDEX_SECONDARY_PATH, "").length() == 0) ? indexPrimaryPath : new File(getConfig(INDEX_SECONDARY_PATH, ""));
this.log.logConfig("Index Secondary Path: " + this.indexSecondaryPath.toString());
this.listsPath = new File(rootPath, getConfig(LISTS_PATH, LISTS_PATH_DEFAULT)); this.listsPath = new File(rootPath, getConfig(LISTS_PATH, LISTS_PATH_DEFAULT));
this.log.logConfig("Lists Path: " + this.listsPath.toString()); this.log.logConfig("Lists Path: " + this.listsPath.toString());
this.htDocsPath = new File(rootPath, getConfig(HTDOCS_PATH, HTDOCS_PATH_DEFAULT)); this.htDocsPath = new File(rootPath, getConfig(HTDOCS_PATH, HTDOCS_PATH_DEFAULT));
@ -1040,7 +1043,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
noticeURL = new plasmaCrawlNURL(plasmaPath); noticeURL = new plasmaCrawlNURL(plasmaPath);
errorURL = new plasmaCrawlZURL(plasmaPath, "urlError.db"); errorURL = new plasmaCrawlZURL(plasmaPath, "urlError.db");
delegatedURL = new plasmaCrawlZURL(plasmaPath, "urlDelegated.db"); delegatedURL = new plasmaCrawlZURL(plasmaPath, "urlDelegated.db");
wordIndex = new plasmaWordIndex(indexPath, ramRWI_time, log); wordIndex = new plasmaWordIndex(indexPrimaryPath, indexSecondaryPath, ramRWI_time, log);
// set a high maximum cache size to current size; this is adopted later automatically // set a high maximum cache size to current size; this is adopted later automatically
int wordCacheMaxCount = Math.max((int) getConfigLong(WORDCACHE_INIT_COUNT, 30000), int wordCacheMaxCount = Math.max((int) getConfigLong(WORDCACHE_INIT_COUNT, 30000),

@ -70,19 +70,19 @@ public final class plasmaWordIndex implements indexRI {
private int flushsize; private int flushsize;
public final plasmaCrawlLURL loadedURL; public final plasmaCrawlLURL loadedURL;
public plasmaWordIndex(File indexRoot, long preloadTime, serverLog log) { public plasmaWordIndex(File indexPrimaryRoot, File indexSecondaryRoot, long preloadTime, serverLog log) {
File textindexcache = new File(indexRoot, "PUBLIC/TEXT/RICACHE"); File textindexcache = new File(indexPrimaryRoot, "PUBLIC/TEXT/RICACHE");
if (!(textindexcache.exists())) textindexcache.mkdirs(); if (!(textindexcache.exists())) textindexcache.mkdirs();
this.dhtOutCache = new indexRAMRI(textindexcache, indexRWIEntry.urlEntryRow, wCacheMaxChunk, wCacheMaxAge, "dump1.array", log); this.dhtOutCache = new indexRAMRI(textindexcache, indexRWIEntry.urlEntryRow, wCacheMaxChunk, wCacheMaxAge, "dump1.array", log);
this.dhtInCache = new indexRAMRI(textindexcache, indexRWIEntry.urlEntryRow, wCacheMaxChunk, wCacheMaxAge, "dump2.array", log); this.dhtInCache = new indexRAMRI(textindexcache, indexRWIEntry.urlEntryRow, wCacheMaxChunk, wCacheMaxAge, "dump2.array", log);
// create collections storage path // create collections storage path
File textindexcollections = new File(indexRoot, "PUBLIC/TEXT/RICOLLECTION"); File textindexcollections = new File(indexPrimaryRoot, "PUBLIC/TEXT/RICOLLECTION");
if (!(textindexcollections.exists())) textindexcollections.mkdirs(); if (!(textindexcollections.exists())) textindexcollections.mkdirs();
this.collections = new indexCollectionRI(textindexcollections, "collection", preloadTime, maxCollectionPartition, indexRWIEntry.urlEntryRow); this.collections = new indexCollectionRI(textindexcollections, "collection", preloadTime, maxCollectionPartition, indexRWIEntry.urlEntryRow);
// create LURL-db // create LURL-db
loadedURL = new plasmaCrawlLURL(indexRoot, preloadTime); loadedURL = new plasmaCrawlLURL(indexSecondaryRoot, preloadTime);
// performance settings // performance settings
busyCacheFlush = false; busyCacheFlush = false;

@ -616,14 +616,15 @@ public final class yacy {
public static void minimizeUrlDB(String homePath) { public static void minimizeUrlDB(String homePath) {
// run with "java -classpath classes yacy -minimizeUrlDB" // run with "java -classpath classes yacy -minimizeUrlDB"
try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {} try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {}
File indexRoot = new File(new File(homePath), "DATA/INDEX"); File indexPrimaryRoot = new File(new File(homePath), "DATA/INDEX");
File indexSecondaryRoot = new File(new File(homePath), "DATA/INDEX");
File indexRoot2 = new File(new File(homePath), "DATA/INDEX2"); File indexRoot2 = new File(new File(homePath), "DATA/INDEX2");
serverLog log = new serverLog("URL-CLEANUP"); serverLog log = new serverLog("URL-CLEANUP");
try { try {
log.logInfo("STARTING URL CLEANUP"); log.logInfo("STARTING URL CLEANUP");
// db containing all currently loades urls // db containing all currently loades urls
plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(indexRoot, 10000); plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(indexSecondaryRoot, 10000);
// db used to hold all neede urls // db used to hold all neede urls
plasmaCrawlLURL minimizedUrlDB = new plasmaCrawlLURL(indexRoot2, 10000); plasmaCrawlLURL minimizedUrlDB = new plasmaCrawlLURL(indexRoot2, 10000);
@ -632,7 +633,7 @@ public final class yacy {
int cacheMem = (int)(serverMemory.max-rt.totalMemory()); int cacheMem = (int)(serverMemory.max-rt.totalMemory());
if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up."); if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");
plasmaWordIndex wordIndex = new plasmaWordIndex(indexRoot, 10000, log); plasmaWordIndex wordIndex = new plasmaWordIndex(indexPrimaryRoot, indexSecondaryRoot, 10000, log);
Iterator indexContainerIterator = wordIndex.wordContainers("AAAAAAAAAAAA", false, false); Iterator indexContainerIterator = wordIndex.wordContainers("AAAAAAAAAAAA", false, false);
long urlCounter = 0, wordCounter = 0; long urlCounter = 0, wordCounter = 0;
@ -1000,7 +1001,8 @@ public final class yacy {
private static void RWIHashList(String homePath, String targetName, String resource, String format) { private static void RWIHashList(String homePath, String targetName, String resource, String format) {
plasmaWordIndex WordIndex = null; plasmaWordIndex WordIndex = null;
serverLog log = new serverLog("HASHLIST"); serverLog log = new serverLog("HASHLIST");
File indexRoot = new File(new File(homePath), "DATA/INDEX"); File indexPrimaryRoot = new File(new File(homePath), "DATA/INDEX");
File indexSecondaryRoot = new File(new File(homePath), "DATA/INDEX");
String wordChunkStartHash = "AAAAAAAAAAAA"; String wordChunkStartHash = "AAAAAAAAAAAA";
try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {} try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {}
log.logInfo("STARTING CREATION OF RWI-HASHLIST"); log.logInfo("STARTING CREATION OF RWI-HASHLIST");
@ -1008,7 +1010,7 @@ public final class yacy {
try { try {
Iterator indexContainerIterator = null; Iterator indexContainerIterator = null;
if (resource.equals("all")) { if (resource.equals("all")) {
WordIndex = new plasmaWordIndex(indexRoot, 3000, log); WordIndex = new plasmaWordIndex(indexPrimaryRoot, indexSecondaryRoot, 3000, log);
indexContainerIterator = WordIndex.wordContainers(wordChunkStartHash, false, false); indexContainerIterator = WordIndex.wordContainers(wordChunkStartHash, false, false);
} }
int counter = 0; int counter = 0;

@ -198,7 +198,10 @@ promoteSearchPageGreeting =
dbPath=DATA/PLASMADB dbPath=DATA/PLASMADB
# the path to the public reverse word index for text files (web pages) # the path to the public reverse word index for text files (web pages)
indexPath=DATA/INDEX # the primary path is relative to the data root, the secondary path is an absolute path
# when the secondary path should be equal to the primary, it must be declared empty
indexPrimaryPath=DATA/INDEX
indexSecondaryPath=
# the path to the LISTS files. Most lists are used to filter web content # the path to the LISTS files. Most lists are used to filter web content
listsPath=DATA/LISTS listsPath=DATA/LISTS

Loading…
Cancel
Save