From 6f1ddb2519fa0fde0041ec7061fe68e116468441 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 25 Jul 2012 01:53:47 +0200 Subject: [PATCH] Moved solr index-add method to the same method where the YaCy index is written. Also done some code-cleanup. --- htroot/IndexFederated_p.java | 6 +- htroot/api/schema_p.java | 4 +- source/de/anomic/crawler/CrawlQueues.java | 8 +- .../yacy/cora/protocol/ResponseHeader.java | 8 + source/net/yacy/peers/Protocol.java | 3 +- .../net/yacy/search/IndexingQueueEntry.java | 41 +++ source/net/yacy/search/Shutdown.java | 47 ++++ source/net/yacy/search/Switchboard.java | 83 ++---- .../net/yacy/search/index/DocumentIndex.java | 9 +- source/net/yacy/search/index/Segment.java | 255 +++++++++--------- .../yacy/search/index/SolrConfiguration.java | 20 +- source/net/yacy/search/query/RWIProcess.java | 12 +- .../net/yacy/search/query/SnippetProcess.java | 2 +- source/net/yacy/yacy.java | 4 +- 14 files changed, 275 insertions(+), 227 deletions(-) create mode 100644 source/net/yacy/search/IndexingQueueEntry.java create mode 100644 source/net/yacy/search/Shutdown.java diff --git a/htroot/IndexFederated_p.java b/htroot/IndexFederated_p.java index 2d43aff42..08b842bd4 100644 --- a/htroot/IndexFederated_p.java +++ b/htroot/IndexFederated_p.java @@ -137,7 +137,7 @@ public class IndexFederated_p { } // read index scheme table flags - final Iterator i = sb.solrScheme.entryIterator(); + final Iterator i = sb.index.getSolrScheme().entryIterator(); ConfigurationSet.Entry entry; boolean modified = false; // flag to remember changes while (i.hasNext()) { @@ -160,7 +160,7 @@ public class IndexFederated_p { } if (modified) { // save settings to config file if modified try { - sb.solrScheme.commit(); + sb.index.getSolrScheme().commit(); modified = false; } catch (IOException ex) {} } @@ -191,7 +191,7 @@ public class IndexFederated_p { // use enum SolrField to keep defined order for(SolrField field : SolrField.values()) { prop.put("scheme_" + c + 
"_dark", dark ? 1 : 0); dark = !dark; - prop.put("scheme_" + c + "_checked", sb.solrScheme.contains(field.name()) ? 1 : 0); + prop.put("scheme_" + c + "_checked", sb.index.getSolrScheme().contains(field.name()) ? 1 : 0); prop.putHTML("scheme_" + c + "_key", field.name()); prop.putHTML("scheme_" + c + "_solrfieldname",field.name().equalsIgnoreCase(field.getSolrFieldName()) ? "" : field.getSolrFieldName()); if (field.getComment() != null) prop.putHTML("scheme_" + c + "_comment",field.getComment()); diff --git a/htroot/api/schema_p.java b/htroot/api/schema_p.java index 69ba5525f..5a224eb27 100644 --- a/htroot/api/schema_p.java +++ b/htroot/api/schema_p.java @@ -24,6 +24,7 @@ import net.yacy.cora.protocol.RequestHeader; import net.yacy.search.Switchboard; +import net.yacy.search.index.SolrConfiguration; import net.yacy.search.index.SolrField; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -37,8 +38,9 @@ public class schema_p { // write scheme int c = 0; + SolrConfiguration solrScheme = sb.index.getSolrScheme(); for (SolrField field : SolrField.values()) { - if (sb.solrScheme.contains(field.name())) { + if (solrScheme.contains(field.name())) { prop.put("fields_" + c + "_solrname", field.getSolrFieldName()); prop.put("fields_" + c + "_type", field.getType().printName()); prop.put("fields_" + c + "_comment", field.getComment()); diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index c0241818c..88037837d 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -81,8 +81,8 @@ public class CrawlQueues { this.log.logConfig("Starting Crawling Management"); this.noticeURL = new NoticedURL(queuePath, sb.peers.myBotIDs(), sb.useTailCache, sb.exceed134217727); FileUtils.deletedelete(new File(queuePath, ERROR_DB_FILENAME)); - this.errorURL = new ZURL(sb.index.getSolr(), sb.solrScheme, queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, 
sb.exceed134217727); - this.delegatedURL = new ZURL(sb.index.getSolr(), sb.solrScheme, queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727); + this.errorURL = new ZURL(sb.index.getSolr(), sb.index.getSolrScheme(), queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727); + this.delegatedURL = new ZURL(sb.index.getSolr(), sb.index.getSolrScheme(), queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727); } public void relocate(final File newQueuePath) { @@ -93,8 +93,8 @@ public class CrawlQueues { this.noticeURL = new NoticedURL(newQueuePath, this.sb.peers.myBotIDs(), this.sb.useTailCache, this.sb.exceed134217727); FileUtils.deletedelete(new File(newQueuePath, ERROR_DB_FILENAME)); - this.errorURL = new ZURL(this.sb.index.getSolr(), this.sb.solrScheme, newQueuePath, ERROR_DB_FILENAME, false, this.sb.useTailCache, this.sb.exceed134217727); - this.delegatedURL = new ZURL(this.sb.index.getSolr(), this.sb.solrScheme, newQueuePath, DELEGATED_DB_FILENAME, true, this.sb.useTailCache, this.sb.exceed134217727); + this.errorURL = new ZURL(this.sb.index.getSolr(), this.sb.index.getSolrScheme(), newQueuePath, ERROR_DB_FILENAME, false, this.sb.useTailCache, this.sb.exceed134217727); + this.delegatedURL = new ZURL(this.sb.index.getSolr(), this.sb.index.getSolrScheme(), newQueuePath, DELEGATED_DB_FILENAME, true, this.sb.useTailCache, this.sb.exceed134217727); } public synchronized void close() { diff --git a/source/net/yacy/cora/protocol/ResponseHeader.java b/source/net/yacy/cora/protocol/ResponseHeader.java index 328c10722..e4185ad4a 100644 --- a/source/net/yacy/cora/protocol/ResponseHeader.java +++ b/source/net/yacy/cora/protocol/ResponseHeader.java @@ -159,4 +159,12 @@ public class ResponseHeader extends HeaderFramework { } return Charset.forName(charSetName); } + + public String getXRobotsTag() { + String x_robots_tag = this.get(HeaderFramework.X_ROBOTS_TAG, ""); + if (x_robots_tag.isEmpty()) { + x_robots_tag = 
this.get(HeaderFramework.X_ROBOTS, ""); + } + return x_robots_tag; + } } diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java index 11d195525..b5902dc54 100644 --- a/source/net/yacy/peers/Protocol.java +++ b/source/net/yacy/peers/Protocol.java @@ -786,7 +786,8 @@ public final class Protocol // store remote result to local result container // insert one container into the search result buffer // one is enough, only the references are used, not the word - containerCache.add(container.get(0), false, target.getName() + "/" + target.hash, result.joincount, true, time); + containerCache.add(container.get(0), false, target.getName() + "/" + target.hash, result.joincount, time); + containerCache.addFinalize(); containerCache.addExpectedRemoteReferences(-count); // insert the containers to the index diff --git a/source/net/yacy/search/IndexingQueueEntry.java b/source/net/yacy/search/IndexingQueueEntry.java new file mode 100644 index 000000000..250921688 --- /dev/null +++ b/source/net/yacy/search/IndexingQueueEntry.java @@ -0,0 +1,41 @@ +/** + * IndexingQueueEntry + * Copyright 2012 by Michael Peter Christen + * First released 24.07.2012 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see <http://www.gnu.org/licenses/>.
+ */ + + +package net.yacy.search; + +import net.yacy.document.Condenser; +import net.yacy.document.Document; +import net.yacy.kelondro.workflow.WorkflowJob; +import de.anomic.crawler.retrieval.Response; + +public class IndexingQueueEntry extends WorkflowJob { + + public Response queueEntry; + public Document[] documents; + public Condenser[] condenser; + + public IndexingQueueEntry(final Response queueEntry, final Document[] documents, final Condenser[] condenser) { + super(); + this.queueEntry = queueEntry; + this.documents = documents; + this.condenser = condenser; + } +} diff --git a/source/net/yacy/search/Shutdown.java b/source/net/yacy/search/Shutdown.java new file mode 100644 index 000000000..b139f398f --- /dev/null +++ b/source/net/yacy/search/Shutdown.java @@ -0,0 +1,47 @@ +/** + * Shutdown + * Copyright 2012 by Michael Peter Christen + * First released 24.07.2012 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see <http://www.gnu.org/licenses/>.
+ */ + +package net.yacy.search; + +import net.yacy.kelondro.logging.Log; + +public class Shutdown extends Thread { + private final Switchboard sb; + private final long delay; + private final String reason; + + public Shutdown(final Switchboard sb, final long delay, final String reason) { + this.sb = sb; + this.delay = delay; + this.reason = reason; + } + + @Override + public void run() { + try { + Thread.sleep(this.delay); + } catch ( final InterruptedException e ) { + this.sb.getLog().logInfo("interrupted delayed shutdown"); + } catch ( final Exception e ) { + Log.logException(e); + } + this.sb.terminate(this.reason); + } +} diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 245b4da89..8a29de350 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -97,7 +97,6 @@ import net.yacy.cora.protocol.http.ProxySettings; import net.yacy.cora.services.federated.solr.ShardSelection; import net.yacy.cora.services.federated.solr.ShardSolrConnector; import net.yacy.cora.services.federated.solr.SolrConnector; -import net.yacy.cora.services.federated.solr.SolrDoc; import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.document.Condenser; import net.yacy.document.Document; @@ -251,7 +250,6 @@ public final class Switchboard extends serverSwitch public SeedDB peers; public WorkTables tables; public Tray tray; - public SolrConfiguration solrScheme; public WorkflowProcessor indexingDocumentProcessor; public WorkflowProcessor indexingCondensementProcessor; @@ -376,16 +374,6 @@ public final class Switchboard extends serverSwitch this.networkRoot.mkdirs(); this.queuesRoot.mkdirs(); - // initialize index - ReferenceContainer.maxReferences = getConfigInt("index.maxReferences", 0); - final File segmentsPath = new File(new File(indexPath, networkName), "SEGMENTS"); - this.index = new Segment(this.log, new File(segmentsPath, "default")); - final int connectWithinMs = 
this.getConfigInt(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_COMMITWITHINMS, 180000); - if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_RWI, true)) this.index.connectRWI(wordCacheMaxCount, fileSizeMax); - if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_CITATION, true)) this.index.connectCitation(wordCacheMaxCount, fileSizeMax); - if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_URLDB, true)) this.index.connectUrlDb(this.useTailCache, this.exceed134217727); - if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_SOLR, true)) this.index.connectLocalSolr(connectWithinMs); - // prepare a solr index profile switch list final File solrBackupProfile = new File("defaults/solr.keys.list"); final String schemename = getConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_SCHEMEFILE, "solr.keys.default.list"); @@ -395,11 +383,21 @@ public final class Switchboard extends serverSwitch } final boolean solrlazy = getConfigBool(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_LAZY, true); final SolrConfiguration backupScheme = new SolrConfiguration(solrBackupProfile, solrlazy); - this.solrScheme = new SolrConfiguration(solrWorkProfile, solrlazy); - + final SolrConfiguration solrScheme = new SolrConfiguration(solrWorkProfile, solrlazy); // update the working scheme with the backup scheme. This is necessary to include new features. 
// new features are always activated by default (if activated in input-backupScheme) - this.solrScheme.fill(backupScheme, true); + solrScheme.fill(backupScheme, true); + + // initialize index + ReferenceContainer.maxReferences = getConfigInt("index.maxReferences", 0); + final File segmentsPath = new File(new File(indexPath, networkName), "SEGMENTS"); + this.index = new Segment(this.log, new File(segmentsPath, "default"), solrScheme); + final int connectWithinMs = this.getConfigInt(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_COMMITWITHINMS, 180000); + if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_RWI, true)) this.index.connectRWI(wordCacheMaxCount, fileSizeMax); + if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_CITATION, true)) this.index.connectCitation(wordCacheMaxCount, fileSizeMax); + if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_URLDB, true)) this.index.connectUrlDb(this.useTailCache, this.exceed134217727); + if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_SOLR, true)) this.index.connectLocalSolr(connectWithinMs); + // set up the solr interface final String solrurls = getConfig(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_URL, "http://127.0.0.1:8983/solr"); @@ -1133,6 +1131,9 @@ public final class Switchboard extends serverSwitch // switch the networks synchronized ( this ) { + // remember the solr scheme + SolrConfiguration solrScheme = this.index.getSolrScheme(); + // shut down this.crawler.close(); if ( this.dhtDispatcher != null ) { @@ -1179,7 +1180,7 @@ public final class Switchboard extends serverSwitch partitionExponent, this.useTailCache, this.exceed134217727); - this.index = new Segment(this.log, new File(new File(new File(indexPrimaryPath, networkName), "SEGMENTS"), "default")); + this.index = new Segment(this.log, new File(new File(new File(indexPrimaryPath, networkName), "SEGMENTS"), "default"), solrScheme); final int connectWithinMs = 
this.getConfigInt(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_COMMITWITHINMS, 180000); if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_RWI, true)) this.index.connectRWI(wordCacheMaxCount, fileSizeMax); if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_CITATION, true)) this.index.connectCitation(wordCacheMaxCount, fileSizeMax); @@ -2395,55 +2396,8 @@ public final class Switchboard extends serverSwitch return new IndexingQueueEntry(in.queueEntry, in.documents, null); } - boolean localSolr = this.index.connectedLocalSolr(); - boolean remoteSolr = this.index.connectedRemoteSolr(); - if (localSolr || remoteSolr) { - // send the documents to solr - for ( final Document doc : in.documents ) { - try { - final String id = UTF8.String(new DigestURI(doc.dc_identifier()).hash()); - final String iquh = UTF8.String(in.queueEntry.url().hash()); - if ( !id.equals(iquh) ) { - this.log.logWarning("condenseDocument consistency check doc=" - + id - + ":" - + doc.dc_identifier() - + ", query=" - + iquh - + ":" - + in.queueEntry.url()); - // in case that this happens it appears that the doc id is the right one - } - try { - SolrDoc solrDoc = this.solrScheme.yacy2solr(id, in.queueEntry.getResponseHeader(), doc); - this.index.getSolr().add(solrDoc); - } catch ( final IOException e ) { - Log.logWarning( - "SOLR", - "failed to send " - + in.queueEntry.url().toNormalform(true, false) - + " to solr: " - + e.getMessage()); - } - } catch ( final MalformedURLException e ) { - Log.logException(e); - continue; - } - } - } - - // check if we should accept the document for our index - if (!this.getConfigBool(SwitchboardConstants.CORE_SERVICE_RWI, true)) { - if ( this.log.isInfo() ) { - this.log.logInfo("Not Condensed Resource '" - + in.queueEntry.url().toNormalform(false, true) - + "': indexing not wanted by federated rule for YaCy"); - } - return new IndexingQueueEntry(in.queueEntry, in.documents, null); - } - final List doclist = new ArrayList(); - // check which files may 
take part in the indexing process + final List doclist = new ArrayList(); for ( final Document document : in.documents ) { if ( document.indexingDenied() ) { if ( this.log.isInfo() ) { @@ -2569,6 +2523,7 @@ public final class Switchboard extends serverSwitch queueEntry.lastModified(), new Date(), queueEntry.size(), + queueEntry.getResponseHeader(), document, condenser, searchEvent, diff --git a/source/net/yacy/search/index/DocumentIndex.java b/source/net/yacy/search/index/DocumentIndex.java index 21ebf5878..518d2c08f 100644 --- a/source/net/yacy/search/index/DocumentIndex.java +++ b/source/net/yacy/search/index/DocumentIndex.java @@ -73,9 +73,9 @@ public class DocumentIndex extends Segment static final ThreadGroup workerThreadGroup = new ThreadGroup("workerThreadGroup"); - public DocumentIndex(final File segmentPath, final CallbackListener callback, final int cachesize) + public DocumentIndex(final File segmentPath, final File schemePath, final CallbackListener callback, final int cachesize) throws IOException { - super(new Log("DocumentIndex"), segmentPath); + super(new Log("DocumentIndex"), segmentPath, schemePath == null ? 
null : new SolrConfiguration(schemePath, true)); super.connectRWI(cachesize, targetFileSize * 4 - 1); super.connectCitation(cachesize, targetFileSize * 4 - 1); super.connectUrlDb( @@ -174,6 +174,7 @@ public class DocumentIndex extends Segment new Date(url.lastModified()), new Date(), url.length(), + null, document, condenser, null, @@ -306,7 +307,7 @@ public class DocumentIndex extends Segment try { if ( args[1].equals("add") ) { final DigestURI f = new DigestURI(args[2]); - final DocumentIndex di = new DocumentIndex(segmentPath, callback, 100000); + final DocumentIndex di = new DocumentIndex(segmentPath, null, callback, 100000); di.addConcurrent(f); di.close(); } else { @@ -315,7 +316,7 @@ public class DocumentIndex extends Segment query += args[i]; } query.trim(); - final DocumentIndex di = new DocumentIndex(segmentPath, callback, 100000); + final DocumentIndex di = new DocumentIndex(segmentPath, null, callback, 100000); final ArrayList results = di.find(query, 100); for ( final DigestURI f : results ) { if ( f != null ) { diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 2faf13d25..f3c16058d 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -39,7 +39,9 @@ import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.cora.order.ByteOrder; +import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.services.federated.solr.SolrConnector; +import net.yacy.cora.services.federated.solr.SolrDoc; import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.document.Condenser; import net.yacy.document.Document; @@ -100,15 +102,16 @@ public class Segment { private final Log log; private final File segmentPath; + private final SolrConfiguration solrScheme; protected final MetadataRepository urlMetadata; protected IndexCell termIndex; protected IndexCell 
urlCitationIndex; - public Segment(final Log log, final File segmentPath) { - + public Segment(final Log log, final File segmentPath, final SolrConfiguration solrScheme) { log.logInfo("Initializing Segment '" + segmentPath + "."); this.log = log; this.segmentPath = segmentPath; + this.solrScheme = solrScheme; // create LURL-db this.urlMetadata = new MetadataRepository(segmentPath); @@ -197,10 +200,15 @@ public class Segment { public void disconnectLocalSolr() { this.urlMetadata.disconnectLocalSolr(); } + public SolrConnector getSolr() { return this.urlMetadata.getSolr(); } + public SolrConfiguration getSolrScheme() { + return this.solrScheme; + } + public SolrConnector getRemoteSolr() { return this.urlMetadata.getRemoteSolr(); } @@ -318,94 +326,6 @@ public class Segment { return this.segmentPath; } - /** - * this is called by the switchboard to put in a new page into the index - * use all the words in one condenser object to simultanous create index entries - * - * @param url - * @param urlModified - * @param document - * @param condenser - * @param language - * @param doctype - * @param outlinksSame - * @param outlinksOther - * @return - */ - private int addPageIndex( - final DigestURI url, - final Date urlModified, - final Document document, - final Condenser condenser, - final String language, - final char doctype, - final int outlinksSame, - final int outlinksOther, - final SearchEvent searchEvent, - final String sourceName) { - final RWIProcess rankingProcess = (searchEvent == null) ? null : searchEvent.getRankingResult(); - int wordCount = 0; - final int urlLength = url.toNormalform(true, true).length(); - final int urlComps = MultiProtocolURI.urlComps(url.toString()).length; - - // iterate over all words of content text - final Iterator> i = condenser.words().entrySet().iterator(); - Map.Entry wentry; - String word; - final int len = (document == null) ? 
urlLength : document.dc_title().length(); - final WordReferenceRow ientry = new WordReferenceRow(url.hash(), - urlLength, urlComps, len, - condenser.RESULT_NUMB_WORDS, - condenser.RESULT_NUMB_SENTENCES, - urlModified.getTime(), - System.currentTimeMillis(), - UTF8.getBytes(language), - doctype, - outlinksSame, outlinksOther); - Word wprop = null; - byte[] wordhash; - while (i.hasNext()) { - wentry = i.next(); - word = wentry.getKey(); - wprop = wentry.getValue(); - assert (wprop.flags != null); - ientry.setWord(wprop); - wordhash = Word.word2hash(word); - if (this.termIndex != null) try { - this.termIndex.add(wordhash, ientry); - } catch (final Exception e) { - Log.logException(e); - } - wordCount++; - - // during a search event it is possible that a heuristic is used which aquires index - // data during search-time. To transfer indexed data directly to the search process - // the following lines push the index data additionally to the search process - // this is done only for searched words - if (searchEvent != null && !searchEvent.getQuery().query_exclude_hashes.has(wordhash) && searchEvent.getQuery().query_include_hashes.has(wordhash)) { - // if the page was added in the context of a heuristic this shall ensure that findings will fire directly into the search result - ReferenceContainer container; - try { - container = ReferenceContainer.emptyContainer(Segment.wordReferenceFactory, wordhash, 1); - container.add(ientry); - rankingProcess.add(container, true, sourceName, -1, !i.hasNext(), 5000); - } catch (final RowSpaceExceededException e) { - continue; - } - } - } - - // assign the catchall word - ientry.setWord(wprop == null ? 
catchallWord : wprop); // we use one of the word properties as template to get the document characteristics - if (this.termIndex != null) try { - this.termIndex.add(catchallHash, ientry); - } catch (final Exception e) { - Log.logException(e); - } - - return wordCount; - } - private int addCitationIndex(final DigestURI url, final Date urlModified, final Map anchors) { if (anchors == null) return 0; int refCount = 0; @@ -433,25 +353,12 @@ public class Segment { if (this.urlCitationIndex != null) this.urlCitationIndex.close(); } - public URIMetadataRow storeDocument( - final DigestURI url, - final DigestURI referrerURL, - Date modDate, - final Date loadDate, - final long sourcesize, - final Document document, - final Condenser condenser, - final SearchEvent searchEvent, - final String sourceName - ) throws IOException { - final long startTime = System.currentTimeMillis(); - - // CREATE INDEX - - // load some document metadata - final String dc_title = document.dc_title(); - - // do a identification of the language + private String votedLanguage( + final DigestURI url, + final String urlNormalform, + final Document document, + final Condenser condenser) { + // do a identification of the language String language = condenser.language(); // this is a statistical analysation of the content: will be compared with other attributes final String bymetadata = document.dc_language(); // the languageByMetadata may return null if there was no declaration if (language == null) { @@ -466,7 +373,7 @@ public class Segment { else { final String error = "LANGUAGE-BY-STATISTICS: " + url + " CONFLICTING: " + language + " (the language given by the TLD is " + url.language() + ")"; // see if we have a hint in the url that the statistic was right - final String u = url.toNormalform(true, false).toLowerCase(); + final String u = urlNormalform.toLowerCase(); if (!u.contains("/" + language + "/") && !u.contains("/" + ISO639.country(language).toLowerCase() + "/")) { // no confirmation using the 
url, use the TLD language = url.language(); @@ -491,9 +398,46 @@ public class Segment { } } } + return language; + } - // create a new loaded URL db entry - if (modDate.getTime() > loadDate.getTime()) modDate = loadDate; + public URIMetadataRow storeDocument( + final DigestURI url, + final DigestURI referrerURL, + Date modDate, + final Date loadDate, + final long sourcesize, + final ResponseHeader responseHeader, + final Document document, + final Condenser condenser, + final SearchEvent searchEvent, + final String sourceName + ) throws IOException { + final long startTime = System.currentTimeMillis(); + + // CREATE INDEX + + // load some document metadata + final String id = ASCII.String(url.hash()); + final String dc_title = document.dc_title(); + final String urlNormalform = url.toNormalform(true, false); + final String language = votedLanguage(url, urlNormalform, document, condenser); // identification of the language + + // STORE TO SOLR + boolean localSolr = this.connectedLocalSolr(); + boolean remoteSolr = this.connectedRemoteSolr(); + if (localSolr || remoteSolr) { + try { + SolrDoc solrDoc = this.solrScheme.yacy2solr(id, responseHeader, document); + this.getSolr().add(solrDoc); + } catch ( final IOException e ) { + Log.logWarning("SOLR", "failed to send " + urlNormalform + " to solr: " + e.getMessage()); + } + } + + // STORE URL TO LOADED-URL-DB + if (modDate.getTime() > loadDate.getTime()) modDate = loadDate; // TODO: compare with modTime from responseHeader + char docType = Response.docType(document.dc_format()); final URIMetadataRow newEntry = new URIMetadataRow( url, // URL dc_title, // document description @@ -509,7 +453,7 @@ public class Segment { new byte[0], // md5 (int) sourcesize, // size condenser.RESULT_NUMB_WORDS, // word count - Response.docType(document.dc_format()), // doctype + docType, // doctype condenser.RESULT_FLAGS, // flags UTF8.getBytes(language), // language document.inboundLinks().size(), // inbound links @@ -519,25 +463,72 @@ 
public class Segment { document.getVideolinks().size(), // lvideo document.getApplinks().size() // lapp ); - - // STORE URL TO LOADED-URL-DB - this.urlMetadata.store(newEntry); // TODO: should be serialized; integrated in IODispatcher - + this.urlMetadata.store(newEntry); final long storageEndTime = System.currentTimeMillis(); // STORE PAGE INDEX INTO WORD INDEX DB - final int words = addPageIndex( - url, // document url - modDate, // document mod date - document, // document content - condenser, // document condenser - language, // document language - Response.docType(document.dc_format()), // document type - document.inboundLinks().size(), // inbound links - document.outboundLinks().size(), // outbound links - searchEvent, // a search event that can have results directly - sourceName // the name of the source where the index was created - ); + int outlinksSame = document.inboundLinks().size(); + int outlinksOther = document.outboundLinks().size(); + final RWIProcess rankingProcess = (searchEvent == null) ? null : searchEvent.getRankingResult(); + int wordCount = 0; + final int urlLength = urlNormalform.length(); + final int urlComps = MultiProtocolURI.urlComps(url.toString()).length; + + // create a word prototype which is re-used for all entries + final int len = (document == null) ? 
urlLength : document.dc_title().length(); + final WordReferenceRow ientry = new WordReferenceRow( + url.hash(), + urlLength, urlComps, len, + condenser.RESULT_NUMB_WORDS, + condenser.RESULT_NUMB_SENTENCES, + modDate.getTime(), + System.currentTimeMillis(), + UTF8.getBytes(language), + docType, + outlinksSame, outlinksOther); + + // iterate over all words of content text + Word wprop = null; + byte[] wordhash; + String word; + for (Map.Entry wentry: condenser.words().entrySet()) { + word = wentry.getKey(); + wprop = wentry.getValue(); + assert (wprop.flags != null); + ientry.setWord(wprop); + wordhash = Word.word2hash(word); + if (this.termIndex != null) try { + this.termIndex.add(wordhash, ientry); + } catch (final Exception e) { + Log.logException(e); + } + wordCount++; + + // during a search event it is possible that a heuristic is used which acquires index + // data during search-time. To transfer indexed data directly to the search process + // the following lines push the index data additionally to the search process + // this is done only for searched words + if (searchEvent != null && !searchEvent.getQuery().query_exclude_hashes.has(wordhash) && searchEvent.getQuery().query_include_hashes.has(wordhash)) { + // if the page was added in the context of a heuristic this shall ensure that findings will fire directly into the search result + ReferenceContainer container; + try { + container = ReferenceContainer.emptyContainer(Segment.wordReferenceFactory, wordhash, 1); + container.add(ientry); + rankingProcess.add(container, true, sourceName, -1, 5000); + } catch (final RowSpaceExceededException e) { + continue; + } + } + } + if (rankingProcess != null) rankingProcess.addFinalize(); + + // assign the catchall word + ientry.setWord(wprop == null ?
catchallWord : wprop); // we use one of the word properties as template to get the document characteristics + if (this.termIndex != null) try { + this.termIndex.add(catchallHash, ientry); + } catch (final Exception e) { + Log.logException(e); + } // STORE PAGE REFERENCES INTO CITATION INDEX final int refs = addCitationIndex(url, modDate, document.getAnchors()); @@ -546,10 +537,8 @@ public class Segment { final long indexingEndTime = System.currentTimeMillis(); if (this.log.isInfo()) { - // TODO: UTF-8 docDescription seems not to be displayed correctly because - // of string concatenation - this.log.logInfo("*Indexed " + words + " words in URL " + url + - " [" + ASCII.String(url.hash()) + "]" + + this.log.logInfo("*Indexed " + wordCount + " words in URL " + url + + " [" + id + "]" + "\n\tDescription: " + dc_title + "\n\tMimeType: " + document.dc_format() + " | Charset: " + document.getCharset() + " | " + "Size: " + document.getTextLength() + " bytes | " + diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java index 5fc5efe4f..ef7a3b093 100644 --- a/source/net/yacy/search/index/SolrConfiguration.java +++ b/source/net/yacy/search/index/SolrConfiguration.java @@ -106,7 +106,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable protected void addSolr(final SolrDoc solrdoc, final SolrField key, final String[] value) { if ((isEmpty() || contains(key.name())) && (!this.lazy || (value != null && value.length > 0))) solrdoc.addSolr(key, value); } - + protected void addSolr(final SolrDoc solrdoc, final SolrField key, final List value) { if ((isEmpty() || contains(key.name())) && (!this.lazy || (value != null && !value.isEmpty()))) solrdoc.addSolr(key, value); } @@ -163,7 +163,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable addSolr(solrdoc, SolrField.author, yacydoc.dc_creator()); addSolr(solrdoc, SolrField.description, yacydoc.dc_description()); 
addSolr(solrdoc, SolrField.content_type, yacydoc.dc_format()); - addSolr(solrdoc, SolrField.last_modified, header.lastModified()); + addSolr(solrdoc, SolrField.last_modified, header == null ? new Date() : header.lastModified()); addSolr(solrdoc, SolrField.keywords, yacydoc.dc_subject(' ')); final String content = yacydoc.getTextString(); addSolr(solrdoc, SolrField.text_t, content); @@ -224,10 +224,14 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable if (robots_meta.indexOf("noindex",0) >= 0) b += 4; // set bit 2 if (robots_meta.indexOf("nofollow",0) >= 0) b += 8; // set bit 3 } - String x_robots_tag = header.get(HeaderFramework.X_ROBOTS_TAG, ""); - if (x_robots_tag.isEmpty()) { - x_robots_tag = header.get(HeaderFramework.X_ROBOTS, ""); - } else { + String x_robots_tag = ""; + if (header != null) { + x_robots_tag = header.get(HeaderFramework.X_ROBOTS_TAG, ""); + if (x_robots_tag.isEmpty()) { + x_robots_tag = header.get(HeaderFramework.X_ROBOTS, ""); + } + } + if (!x_robots_tag.isEmpty()) { // this tag may have values: noarchive, nosnippet, noindex, unavailable_after if (x_robots_tag.indexOf("noarchive",0) >= 0) b += 256; // set bit 8 if (x_robots_tag.indexOf("nosnippet",0) >= 0) b += 512; // set bit 9 @@ -398,7 +402,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable } // response time - addSolr(solrdoc, SolrField.responsetime_i, header.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0")); + addSolr(solrdoc, SolrField.responsetime_i, header == null ? 0 : Integer.parseInt(header.get(HeaderFramework.RESPONSE_TIME_MILLIS, "0"))); } // list all links @@ -487,7 +491,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable addSolr(solrdoc, SolrField.lon_coordinate, yacydoc.lon()); addSolr(solrdoc, SolrField.lat_coordinate, yacydoc.lat()); } - addSolr(solrdoc, SolrField.httpstatus_i, header.getStatusCode()); + addSolr(solrdoc, SolrField.httpstatus_i, header == null ? 
200 : header.getStatusCode()); return solrdoc; } diff --git a/source/net/yacy/search/query/RWIProcess.java b/source/net/yacy/search/query/RWIProcess.java index 54ddf747b..3248598d8 100644 --- a/source/net/yacy/search/query/RWIProcess.java +++ b/source/net/yacy/search/query/RWIProcess.java @@ -221,7 +221,8 @@ public final class RWIProcess extends Thread System.currentTimeMillis() - timer), false); if ( !index.isEmpty() ) { - add(index, true, "local index: " + this.query.getSegment().getLocation(), -1, true, this.maxtime); + add(index, true, "local index: " + this.query.getSegment().getLocation(), -1, this.maxtime); + addFinalize(); } } catch ( final Exception e ) { Log.logException(e); @@ -230,12 +231,15 @@ public final class RWIProcess extends Thread } } + public void addFinalize() { + this.addRunning = false; + } + public void add( final ReferenceContainer index, final boolean local, final String resourceName, final int fullResource, - final boolean finalizeAddAtEnd, final long maxtime) { // we collect the urlhashes and construct a list with urlEntry objects // attention: if minEntries is too high, this method will not terminate within the maxTime @@ -422,10 +426,6 @@ public final class RWIProcess extends Thread } catch ( final InterruptedException e ) { } catch ( final RowSpaceExceededException e ) { - } finally { - if ( finalizeAddAtEnd ) { - this.addRunning = false; - } } //if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true); diff --git a/source/net/yacy/search/query/SnippetProcess.java b/source/net/yacy/search/query/SnippetProcess.java index e2d4b24e9..d381d7d9e 100644 --- a/source/net/yacy/search/query/SnippetProcess.java +++ b/source/net/yacy/search/query/SnippetProcess.java @@ -503,7 +503,7 @@ public class SnippetProcess { sd = sdl.get(0); } if (sd != null) { - solrContent = Switchboard.getSwitchboard().solrScheme.solrGetText(sd); + solrContent = 
Switchboard.getSwitchboard().index.getSolrScheme().solrGetText(sd); } } diff --git a/source/net/yacy/yacy.java b/source/net/yacy/yacy.java index 62564b8b4..d10281b63 100644 --- a/source/net/yacy/yacy.java +++ b/source/net/yacy/yacy.java @@ -666,7 +666,7 @@ public final class yacy { final int cacheMem = (int)(MemoryControl.maxMemory() - MemoryControl.total()); if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up."); - final Segment wordIndex = new Segment(log, new File(new File(indexPrimaryRoot, "freeworld"), "TEXT")); + final Segment wordIndex = new Segment(log, new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"), null); wordIndex.connectRWI(10000, Integer.MAX_VALUE); wordIndex.connectUrlDb(false, false); final Iterator> indexContainerIterator = wordIndex.termIndex().referenceContainerIterator("AAAAAAAAAAAA".getBytes(), false, false); @@ -845,7 +845,7 @@ public final class yacy { try { Iterator> indexContainerIterator = null; if (resource.equals("all")) { - WordIndex = new Segment(log, new File(new File(indexPrimaryRoot, "freeworld"), "TEXT")); + WordIndex = new Segment(log, new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"), null); WordIndex.connectRWI(10000, Integer.MAX_VALUE); WordIndex.connectUrlDb(false, false); indexContainerIterator = WordIndex.termIndex().referenceContainerIterator(wordChunkStartHash.getBytes(), false, false);