- removed dependencies from URIMetadataRow and made direct access to
URIMetadataNode possible, which creates the opportunity to access Solr
objects directly and use their information richness
- lazy initialization of the URIMetadataNode object - should cause less
computation and memory usage during search.
- removed dead code
pull/1/head
Michael Peter Christen 13 years ago
parent cc98496ff3
commit 43f3345c90

@ -44,7 +44,7 @@ failreason_t
## html status return code (i.e. "200" for ok), -1 if not loaded (see content of failreason_t for this case), int (mandatory field)
httpstatus_i
## html status return code (i.e. \"200\" for ok), -1 if not loaded
## redirect url if the error code is 299 < httpstatus_i < 310
#httpstatus_redirect_s

@ -52,7 +52,7 @@ import net.yacy.data.BookmarksDB.Tag;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.logging.Log;
import net.yacy.peers.NewsPool;
import net.yacy.search.Switchboard;
@ -195,7 +195,7 @@ public class Bookmarks {
final BookmarksDB.Bookmark bookmark = sb.bookmarksDB.getBookmark(urlHash);
if (bookmark == null) {
// try to get the bookmark from the LURL database
final URIMetadata urlentry = sb.index.fulltext().getMetadata(ASCII.getBytes(urlHash));
final URIMetadataNode urlentry = sb.index.fulltext().getMetadata(ASCII.getBytes(urlHash));
if (urlentry != null) try {
final Document document = Document.mergeDocuments(urlentry.url(), null, sb.loader.loadDocuments(sb.loader.request(urlentry.url(), true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay));
prop.put("mode_edit", "0"); // create mode

@ -40,7 +40,7 @@ import net.yacy.crawler.data.ResultURLs;
import net.yacy.crawler.data.ResultURLs.EventOrigin;
import net.yacy.crawler.data.ResultURLs.InitExecEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.logging.Log;
import net.yacy.peers.Seed;
import net.yacy.search.Switchboard;
@ -182,7 +182,7 @@ public class CrawlResults {
boolean dark = true;
String urlstr, urltxt;
Seed initiatorSeed, executorSeed;
URIMetadata urle;
URIMetadataNode urle;
int cnt = 0;
final Iterator<Map.Entry<String, InitExecEntry>> i = ResultURLs.results(tabletype);

@ -158,7 +158,7 @@ public class HostBrowser {
String host = uri.getHost();
prop.putHTML("outbound_host", host);
prop.putHTML("inbound_host", host);
String hosthash = ASCII.String(uri.hash(), 6, 12);
String hosthash = ASCII.String(uri.hash(), 6, 6);
// get all files for a specific host from the index
BlockingQueue<SolrDocument> docs = fulltext.getSolr().concurrentQuery(YaCySchema.host_s.name() + ":" + host, 0, 100000, 60000);

@ -278,7 +278,7 @@ public class IndexControlRWIs_p {
WordReferenceRow.urlEntryRow.objectOrder,
index.size());
Reference iEntry;
URIMetadata lurl;
URIMetadataNode lurl;
while (urlIter.hasNext()) {
iEntry = urlIter.next();
lurl = segment.fulltext().getMetadata(iEntry.urlhash());
@ -290,11 +290,7 @@ public class IndexControlRWIs_p {
}
urlIter.remove();
} else {
if (lurl instanceof URIMetadataRow) {
knownURLs.put(iEntry.urlhash(), (URIMetadataRow) lurl);
} else if (lurl instanceof URIMetadataNode) {
knownURLs.put(iEntry.urlhash(), ((URIMetadataNode) lurl).toRow());
}
knownURLs.put(iEntry.urlhash(), lurl.toRow());
}
}

@ -42,6 +42,7 @@ import net.yacy.crawler.data.ResultURLs;
import net.yacy.data.WorkTables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.RotateIterator;
@ -175,7 +176,7 @@ public class IndexControlURLs_p {
}
if (post.containsKey("urlhashdelete")) {
final URIMetadata entry = segment.fulltext().getMetadata(ASCII.getBytes(urlhash));
final URIMetadataNode entry = segment.fulltext().getMetadata(ASCII.getBytes(urlhash));
if (entry == null) {
prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
} else {
@ -233,7 +234,7 @@ public class IndexControlURLs_p {
// generate list
if (post.containsKey("urlhashsimilar")) {
final Iterator<URIMetadata> entryIt = new RotateIterator<URIMetadata>(segment.fulltext().entries(), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), (int) segment.RWICount());
final Iterator<URIMetadataNode> entryIt = new RotateIterator<URIMetadataNode>(segment.fulltext().entries(), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), (int) segment.RWICount());
final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:<br />");
URIMetadata entry;
int i = 0, rows = 0, cols = 0;
@ -347,7 +348,7 @@ public class IndexControlURLs_p {
prop.put("genUrlProfile_urlhash", urlhash);
return prop;
}
final URIMetadata le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.fulltext().getMetadata(entry.referrerHash());
final URIMetadataNode le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.fulltext().getMetadata(entry.referrerHash());
if (entry.url() == null) {
prop.put("genUrlProfile", "1");
prop.put("genUrlProfile_urlhash", urlhash);

@ -55,7 +55,7 @@ import net.yacy.document.WordTokenizer;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Segment;
import net.yacy.server.serverObjects;
@ -131,7 +131,7 @@ public class ViewFile {
pre = post.getBoolean("pre");
} catch (final MalformedURLException e) {}
URIMetadata urlEntry = null;
URIMetadataNode urlEntry = null;
// get the urlEntry that belongs to the url hash
//boolean ue = urlHash.length() > 0 && indexSegment.exists(ASCII.getBytes(urlHash));
//if (ue) Log.logInfo("ViewFile", "exists(" + urlHash + ")");

@ -35,7 +35,7 @@ import net.yacy.cora.lod.vocabulary.YaCyMetadata;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.document.LibraryProvider;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Segment;
@ -86,12 +86,12 @@ public class Vocabulary_p {
if (p >= 0) t = t.substring(p + 1);
}
if (discoverFromTitle || discoverFromTitleSplitted) {
URIMetadata m = segment.fulltext().getMetadata(u.hash());
URIMetadataNode m = segment.fulltext().getMetadata(u.hash());
if (m != null) t = m.dc_title();
if (t.endsWith(".jpg") || t.endsWith(".gif")) continue;
}
if (discoverFromAuthor) {
URIMetadata m = segment.fulltext().getMetadata(u.hash());
URIMetadataNode m = segment.fulltext().getMetadata(u.hash());
if (m != null) t = m.dc_creator();
}
t = t.replaceAll("_", " ").replaceAll("\"", " ").replaceAll("'", " ").replaceAll(",", " ").replaceAll(" ", " ").trim();

@ -37,6 +37,7 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.RequestHeader.FileType;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
@ -99,7 +100,7 @@ public class yacydoc {
}
if (urlhash == null || urlhash.isEmpty()) return prop;
final URIMetadata entry = segment.fulltext().getMetadata(urlhash.getBytes());
final URIMetadataNode entry = segment.fulltext().getMetadata(urlhash.getBytes());
if (entry == null) return prop;
if (entry.url() == null) {

@ -34,7 +34,7 @@ import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.data.ZURL.FailCategory;
import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.peers.Protocol;
import net.yacy.search.Switchboard;
import net.yacy.server.serverObjects;
@ -110,7 +110,7 @@ public class urls {
if (urlhashes.length() % 12 != 0) return prop;
final int count = urlhashes.length() / 12;
int c = 0;
URIMetadata entry;
URIMetadataNode entry;
DigestURI referrer;
for (int i = 0; i < count; i++) {
entry = sb.index.fulltext().getMetadata(ASCII.getBytes(urlhashes.substring(12 * i, 12 * (i + 1))));

@ -60,7 +60,7 @@ import net.yacy.document.Document;
import net.yacy.document.LibraryProvider;
import net.yacy.document.Parser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.Bitfield;
@ -653,7 +653,7 @@ public class yacysearch {
return prop;
}
final String recommendHash = post.get("recommendref", ""); // urlhash
final URIMetadata urlentry = indexSegment.fulltext().getMetadata(UTF8.getBytes(recommendHash));
final URIMetadataNode urlentry = indexSegment.fulltext().getMetadata(UTF8.getBytes(recommendHash));
if ( urlentry != null ) {
Document[] documents = null;
try {
@ -689,7 +689,7 @@ public class yacysearch {
return prop;
}
final String bookmarkHash = post.get("bookmarkref", ""); // urlhash
final URIMetadata urlentry = indexSegment.fulltext().getMetadata(UTF8.getBytes(bookmarkHash));
final URIMetadataNode urlentry = indexSegment.fulltext().getMetadata(UTF8.getBytes(bookmarkHash));
if ( urlentry != null ) {
try {
sb.tables.bookmarks.createBookmark(

@ -59,7 +59,7 @@ import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.SMBLoader;
import net.yacy.interaction.contentcontrol.ContentControlFilterUpdateThread;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.workflow.WorkflowProcessor;
import net.yacy.peers.SeedDB;
@ -447,7 +447,7 @@ public final class CrawlStacker {
// check if the url is double registered
final String dbocc = this.nextQueue.urlExists(url.hash()); // returns the name of the queue if entry exists
final URIMetadata oldEntry = this.indexSegment.fulltext().getMetadata(url.hash());
final URIMetadataNode oldEntry = this.indexSegment.fulltext().getMetadata(url.hash());
if (oldEntry == null) {
if (dbocc != null) {
// do double-check

@ -33,7 +33,7 @@ import net.yacy.crawler.data.CrawlProfile;
import net.yacy.document.parser.sitemapParser;
import net.yacy.document.parser.sitemapParser.URLEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
@ -84,7 +84,7 @@ public class SitemapImporter extends Thread {
final String dbocc = this.sb.urlExists(nexturlhash);
if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) {
// the url was already loaded. we need to check the date
final URIMetadata oldEntry = this.sb.index.fulltext().getMetadata(nexturlhash);
final URIMetadataNode oldEntry = this.sb.index.fulltext().getMetadata(nexturlhash);
if (oldEntry != null) {
final Date modDate = oldEntry.moddate();
// check if modDate is null

@ -37,7 +37,7 @@ import net.yacy.crawler.retrieval.Response;
import net.yacy.document.Document;
import net.yacy.document.Parser.Failure;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.repository.LoaderDispatcher;
import net.yacy.search.index.Segment;
import net.yacy.search.snippet.TextSnippet;
@ -106,7 +106,7 @@ public class YMarkMetadata {
public EnumMap<METADATA, String> getMetadata() {
final EnumMap<METADATA, String> metadata = new EnumMap<METADATA, String>(METADATA.class);
final URIMetadata urlEntry = this.indexSegment.fulltext().getMetadata(this.uri.hash());
final URIMetadataNode urlEntry = this.indexSegment.fulltext().getMetadata(this.uri.hash());
if (urlEntry != null) {
metadata.put(METADATA.SIZE, String.valueOf(urlEntry.size()));
metadata.put(METADATA.FRESHDATE, ISO8601Formatter.FORMATTER.format(urlEntry.freshdate()));

@ -52,16 +52,16 @@ import org.apache.solr.common.SolrDocument;
*/
public class URIMetadataNode implements URIMetadata {
private final byte[] hash;
private final String urlRaw, keywords;
private DigestURI url;
private Bitfield flags;
private final int imagec, audioc, videoc, appc;
private double lat, lon;
private long ranking; // during generation of a search result this value is set
private final SolrDocument doc;
private final String snippet;
private WordReference word; // this is only used if the url is transported via remote search requests
private byte[] hash = null;
private String urlRaw = null, keywords = null;
private DigestURI url = null;
private Bitfield flags = null;
private int imagec = -1, audioc = -1, videoc = -1, appc = -1;
private double lat = Double.NaN, lon = Double.NaN;
private long ranking = -1; // during generation of a search result this value is set
private SolrDocument doc = null;
private String snippet = null;
private WordReference word = null; // this is only used if the url is transported via remote search requests
public URIMetadataNode(final SolrDocument doc) {
this.doc = doc;
@ -76,30 +76,6 @@ public class URIMetadataNode implements URIMetadata {
Log.logException(e);
this.url = null;
}
// to set the flags bitfield we need to pre-load some values from the Solr document
this.keywords = getString(YaCySchema.keywords);
this.imagec = getInt(YaCySchema.imagescount_i);
this.audioc = getInt(YaCySchema.audiolinkscount_i);
this.videoc = getInt(YaCySchema.videolinkscount_i);
this.appc = getInt(YaCySchema.videolinkscount_i);
this.lon = 0.0d;
this.lat = 0.0d;
String latlon = (String) this.doc.getFieldValue(YaCySchema.coordinate_p.name());
if (latlon != null) {
int p = latlon.indexOf(',');
if (p > 0) {
this.lat = Double.parseDouble(latlon.substring(0, p));
this.lon = Double.parseDouble(latlon.substring(p + 1));
}
}
this.flags = new Bitfield();
if (this.keywords != null && this.keywords.indexOf("indexof") >= 0) this.flags.set(Condenser.flag_cat_indexof, true);
if (this.lon != 0.0d || this.lat != 0.0d) this.flags.set(Condenser.flag_cat_haslocation, true);
if (this.imagec > 0) this.flags.set(Condenser.flag_cat_hasimage, true);
if (this.audioc > 0) this.flags.set(Condenser.flag_cat_hasaudio, true);
if (this.videoc > 0) this.flags.set(Condenser.flag_cat_hasvideo, true);
if (this.appc > 0) this.flags.set(Condenser.flag_cat_hasapp, true);
}
public URIMetadataNode(final SolrDocument doc, final WordReference searchedWord, final long ranking) {
@ -206,16 +182,32 @@ public class URIMetadataNode implements URIMetadata {
@Override
public String dc_subject() {
    // Lazily fetch the keyword list from the backing Solr document on
    // first access and cache it for subsequent calls.
    String k = this.keywords;
    if (k == null) {
        k = getString(YaCySchema.keywords);
        this.keywords = k;
    }
    return k;
}
@Override
public double lat() {
    // Lazy initialization: this.lat is seeded with Double.NaN and the
    // coordinate is parsed from the Solr document on first access.
    // BUG FIX: the original guard "this.lat == Double.NaN" is ALWAYS false,
    // because NaN compares unequal to everything including itself, so the
    // lazy parse never ran and lat() always returned NaN. Use
    // Double.isNaN() instead.
    if (Double.isNaN(this.lat)) {
        // default to 0/0 ("no location") when no coordinate is stored
        this.lon = 0.0d;
        this.lat = 0.0d;
        String latlon = (String) this.doc.getFieldValue(YaCySchema.coordinate_p.name());
        if (latlon != null) {
            // stored format is "<lat>,<lon>"
            int p = latlon.indexOf(',');
            if (p > 0) {
                this.lat = Double.parseDouble(latlon.substring(0, p));
                this.lon = Double.parseDouble(latlon.substring(p + 1));
            }
        }
    }
    return this.lat;
}
@Override
public double lon() {
    // BUG FIX: "this.lon == Double.NaN" is always false (NaN != NaN),
    // so the lazy parse was never triggered; use Double.isNaN().
    // lat() parses and caches both latitude and longitude as a side effect.
    if (Double.isNaN(this.lon)) lat();
    return this.lon;
}
@ -242,7 +234,7 @@ public class URIMetadataNode implements URIMetadata {
@Override
public char doctype() {
    // Derive the document type from the stored MIME type; when the Solr
    // document carries no content_type entry, fall back to guessing the
    // type from the URL. Uses the lazy url() accessor (post-refactoring
    // form); the stale duplicate guard referencing this.url directly is
    // removed — it was dead code left over from the diff.
    ArrayList<String> a = getArrayList(YaCySchema.content_type);
    if (a == null || a.size() == 0) return Response.docType(url());
    return Response.docType(a.get(0));
}
@ -268,6 +260,15 @@ public class URIMetadataNode implements URIMetadata {
@Override
public Bitfield flags() {
// Lazily assemble the flags bitfield from the lazily-loaded Solr fields.
// Each accessor below (dc_subject, lon/lat, limage, ...) pulls and caches
// its own value from the Solr document on first use, so building the
// bitfield here only touches the fields that are actually needed.
if (flags == null) {
this.flags = new Bitfield();
// "indexof" in the keywords marks directory-listing ("Index of") pages
if (dc_subject() != null && dc_subject().indexOf("indexof") >= 0) this.flags.set(Condenser.flag_cat_indexof, true);
// any non-zero coordinate counts as "has location"
if (lon() != 0.0d || lat() != 0.0d) this.flags.set(Condenser.flag_cat_haslocation, true);
if (limage() > 0) this.flags.set(Condenser.flag_cat_hasimage, true);
if (laudio() > 0) this.flags.set(Condenser.flag_cat_hasaudio, true);
if (lvideo() > 0) this.flags.set(Condenser.flag_cat_hasvideo, true);
if (lapp() > 0) this.flags.set(Condenser.flag_cat_hasapp, true);
}
return this.flags;
}
@ -288,21 +289,33 @@ public class URIMetadataNode implements URIMetadata {
@Override
public int limage() {
    // Image-link count, read lazily from the Solr document;
    // -1 marks "not yet loaded".
    int c = this.imagec;
    if (c == -1) {
        c = getInt(YaCySchema.imagescount_i);
        this.imagec = c;
    }
    return c;
}
@Override
public int laudio() {
    // Audio-link count, read lazily from the Solr document;
    // -1 marks "not yet loaded".
    int c = this.audioc;
    if (c == -1) {
        c = getInt(YaCySchema.audiolinkscount_i);
        this.audioc = c;
    }
    return c;
}
@Override
public int lvideo() {
    // Video-link count, read lazily from the Solr document;
    // -1 marks "not yet loaded".
    int c = this.videoc;
    if (c == -1) {
        c = getInt(YaCySchema.videolinkscount_i);
        this.videoc = c;
    }
    return c;
}
@Override
public int lapp() {
    // Application-link count, read lazily from the Solr document;
    // -1 marks "not yet loaded".
    // BUG FIX: the original read YaCySchema.videolinkscount_i (copy-paste
    // error, also present in the removed eager-initialization code), which
    // made lapp() return the video-link count. Read the app-link counter
    // field instead.
    if (this.appc == -1) {
        this.appc = getInt(YaCySchema.applinkscount_i);
    }
    return this.appc;
}
@ -337,7 +350,7 @@ public class URIMetadataNode implements URIMetadata {
return false;
}
public static StringBuilder corePropList(URIMetadata md) {
protected static StringBuilder corePropList(URIMetadata md) {
// generate a parseable string; this is a simple property-list
final StringBuilder s = new StringBuilder(300);

@ -300,10 +300,6 @@ public class URIMetadataRow implements URIMetadata {
}
}
public Row.Entry toRowEntry() {
return this.entry;
}
@Override
public byte[] hash() {
// return a url-hash, based on the md5 algorithm

@ -28,8 +28,6 @@ package net.yacy.kelondro.data.word;
import java.util.ArrayList;
import java.util.Collection;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import net.yacy.cora.date.MicroDate;
import net.yacy.cora.document.ASCII;
@ -79,7 +77,6 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
* object for termination of concurrent blocking queue processing
*/
public static final Row.Entry poisonRowEntry = urlEntryRow.newEntry();
private static final WordReferenceRow poison = new WordReferenceRow(poisonRowEntry);
// static properties
private static final int col_urlhash = 0; // h 12 the url hash b64-encoded
@ -205,69 +202,6 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
this.entry.setCol(col_posofphrase, word.numOfPhrase);
}
public static class ExternalParser {
private static final String PIN = "_";
private final BlockingQueue<String> in;
private final BlockingQueue<WordReferenceRow> out;
private Thread[] worker;
public ExternalParser(final int concurrency) {
this.in = new LinkedBlockingQueue<String>();
this.out = new LinkedBlockingQueue<WordReferenceRow>();
for (int i = 0; i < concurrency; i++) {
this.worker[i] = new Thread() {
@Override
public void run() {
Thread.currentThread().setName("WordReferenceRow.ExternalParser:" + concurrency);
String s;
try {
while ((s = ExternalParser.this.in.take()) != PIN) {
ExternalParser.this.out.put(new WordReferenceRow(s));
}
} catch (final InterruptedException e) {
}
}
};
this.worker[i].start();
}
}
public ExternalParser() {
this(Runtime.getRuntime().availableProcessors());
}
public void put(final String s) {
try {
this.in.put(s);
} catch (final InterruptedException e) {
}
}
public void terminate() {
for (@SuppressWarnings("unused") final Thread w : this.worker) {
try {
this.in.put(PIN);
} catch (final InterruptedException e) {
}
}
for (final Thread w : this.worker) {
try {
if (w.isAlive()) w.join();
} catch (final InterruptedException e) {
}
}
try {
this.out.put(poison);
} catch (final InterruptedException e) {
}
}
public WordReferenceRow take() {
WordReferenceRow row;
try {
row = this.out.take();
} catch (final InterruptedException e) {
return poison;
}
return row;
}
}
public WordReferenceRow(final String external) {
this.entry = urlEntryRow.newEntry(external, true);
}
@ -276,9 +210,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
this.entry = urlEntryRow.newEntry(row);
}
public WordReferenceRow(final byte[] row, final int offset, final boolean clone) {
this.entry = urlEntryRow.newEntry(row, offset, clone);
}
public WordReferenceRow(final Row.Entry rentry) {
// no cloning is necessary since there is no further manipulation after this initial instantiation

@ -55,7 +55,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
*/
public static final WordReferenceVars poison = new WordReferenceVars();
private static int cores = Runtime.getRuntime().availableProcessors();
public static final byte[] default_language = UTF8.getBytes("uk");
protected static final byte[] default_language = UTF8.getBytes("uk");
private final Bitfield flags;
private long lastModified;
@ -274,7 +274,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
return this.posofphrase;
}
public WordReferenceRow toRowEntry() {
private WordReferenceRow toRowEntry() {
return new WordReferenceRow(
this.urlHash,
this.urllength, // byte-length of complete URL
@ -470,13 +470,13 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
return vars;
}
public static class TransformDistributor extends Thread {
private static class TransformDistributor extends Thread {
private ReferenceContainer<WordReference> container;
private BlockingQueue<WordReferenceVars> out;
private long maxtime;
public TransformDistributor(final ReferenceContainer<WordReference> container, final BlockingQueue<WordReferenceVars> out, final long maxtime) {
private TransformDistributor(final ReferenceContainer<WordReference> container, final BlockingQueue<WordReferenceVars> out, final long maxtime) {
this.container = container;
this.out = out;
this.maxtime = maxtime;
@ -521,19 +521,19 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
}
}
public static class TransformWorker extends Thread {
private static class TransformWorker extends Thread {
private BlockingQueue<Row.Entry> in;
private BlockingQueue<WordReferenceVars> out;
private long maxtime;
public TransformWorker(final BlockingQueue<WordReferenceVars> out, final long maxtime) {
private TransformWorker(final BlockingQueue<WordReferenceVars> out, final long maxtime) {
this.in = new LinkedBlockingQueue<Row.Entry>();
this.out = out;
this.maxtime = maxtime;
}
public void add(final Row.Entry entry) {
private void add(final Row.Entry entry) {
try {
this.in.put(entry);
} catch (final InterruptedException e) {

@ -293,7 +293,7 @@ public class RowSet extends RowCollection implements Index, Iterable<Row.Entry>,
if (this.rowdef.objectOrder != null && this.rowdef.objectOrder instanceof Base64Order) {
// first try to find in sorted area
assert this.rowdef.objectOrder.wellformed(a, astart, this.rowdef.primaryKeyLength) : "not wellformed: " + UTF8.String(a, astart, this.rowdef.primaryKeyLength);
assert this.rowdef.objectOrder.wellformed(a, astart, this.rowdef.primaryKeyLength) : "not wellformed: " + ASCII.String(a, astart, this.rowdef.primaryKeyLength);
}
// first try to find in sorted area

@ -35,7 +35,6 @@ import net.yacy.cora.document.ASCII;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
@ -175,16 +174,12 @@ public class Transmission {
notFoundx.add(e.urlhash());
continue;
}
final URIMetadata r = Transmission.this.segment.fulltext().getMetadata(e.urlhash());
final URIMetadataNode r = Transmission.this.segment.fulltext().getMetadata(e.urlhash());
if (r == null) {
notFoundx.add(e.urlhash());
this.badReferences.put(e.urlhash());
} else {
if (r instanceof URIMetadataRow) {
this.references.put(e.urlhash(), (URIMetadataRow) r);
} else if (r instanceof URIMetadataNode) {
this.references.put(e.urlhash(), ((URIMetadataNode) r).toRow());
}
this.references.put(e.urlhash(), r.toRow());
}
}
// now delete all references that were not found

@ -571,7 +571,7 @@ public class WebStructureGraph {
for ( final MultiProtocolURI u : lro.globalRefURLs ) {
if (Switchboard.getSwitchboard().shallTerminate()) break;
du = DigestURI.toDigestURI(u);
hosthash = ASCII.String(du.hash(), 6, 12);
hosthash = ASCII.String(du.hash(), 6, 6);
if (!exists(hosthash)) {
// this must be recorded as an host with no references
synchronized ( this.structure_new ) {

@ -147,6 +147,7 @@ import net.yacy.interaction.contentcontrol.ContentControlImportThread;
import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.rwi.ReferenceContainer;
@ -1529,7 +1530,7 @@ public final class Switchboard extends serverSwitch
if ( urlhash.length == 0 ) {
return null;
}
final URIMetadata le = this.index.fulltext().getMetadata(urlhash);
final URIMetadataNode le = this.index.fulltext().getMetadata(urlhash);
if ( le != null ) {
return le.url();
}

@ -198,17 +198,17 @@ public final class Fulltext implements Iterable<byte[]> {
* @param obrwi
* @return
*/
public URIMetadata getMetadata(WordReference wre, long weight) {
public URIMetadataNode getMetadata(WordReference wre, long weight) {
if (wre == null) return null; // all time was already wasted in takeRWI to get another element
return getMetadata(wre.urlhash(), wre, weight);
}
public URIMetadata getMetadata(final byte[] urlHash) {
public URIMetadataNode getMetadata(final byte[] urlHash) {
if (urlHash == null) return null;
return getMetadata(urlHash, null, 0);
}
private URIMetadata getMetadata(final byte[] urlHash, WordReference wre, long weight) {
private URIMetadataNode getMetadata(final byte[] urlHash, WordReference wre, long weight) {
// get the metadata from Solr
try {
@ -227,46 +227,9 @@ public final class Fulltext implements Iterable<byte[]> {
final Row.Entry entry = this.urlIndexFile.remove(urlHash);
if (entry == null) return null;
URIMetadataRow row = new URIMetadataRow(entry, wre, weight);
this.putDocument(this.solrScheme.metadata2solr(row));
return row;
} catch (final IOException e) {
Log.logException(e);
}
return null;
}
public SolrDocument getDocument(WordReference wre, long weight) {
if (wre == null) return null; // all time was already wasted in takeRWI to get another element
return getDocument(wre.urlhash(), wre, weight);
}
public SolrDocument getDocument(final byte[] urlHash) {
if (urlHash == null) return null;
return getDocument(urlHash, null, 0);
}
private SolrDocument getDocument(final byte[] urlHash, WordReference wre, long weight) {
// get the document from Solr
try {
SolrDocument doc = this.solr.get(ASCII.String(urlHash));
if (doc != null) {
if (this.urlIndexFile != null) this.urlIndexFile.remove(urlHash);
return doc;
}
} catch (IOException e) {
Log.logException(e);
}
// get the document from the old metadata index
if (this.urlIndexFile != null) try {
// slow migration to solr
final Row.Entry entry = this.urlIndexFile.remove(urlHash);
if (entry == null) return null;
URIMetadataRow row = new URIMetadataRow(entry, wre, weight);
this.putDocument(this.solrScheme.metadata2solr(row));
return ClientUtils.toSolrDocument(getSolrScheme().metadata2solr(row));
SolrInputDocument solrInput = this.solrScheme.metadata2solr(row);
this.putDocument(solrInput);
return new URIMetadataNode(ClientUtils.toSolrDocument(solrInput), wre, weight);
} catch (final IOException e) {
Log.logException(e);
}
@ -303,6 +266,7 @@ public final class Fulltext implements Iterable<byte[]> {
public void putMetadata(final URIMetadata entry) throws IOException {
if (entry instanceof URIMetadataNode) {
putDocument(ClientUtils.toSolrInputDocument(((URIMetadataNode) entry).getDocument()));
return;
}
assert entry instanceof URIMetadataRow;
URIMetadataRow row = (URIMetadataRow) entry;
@ -399,12 +363,12 @@ public final class Fulltext implements Iterable<byte[]> {
true);
}
public CloneableIterator<URIMetadata> entries() {
public CloneableIterator<URIMetadataNode> entries() {
// enumerates entry elements
final Iterator<byte[]> ids = iterator();
return new CloneableIterator<URIMetadata>() {
return new CloneableIterator<URIMetadataNode>() {
@Override
public CloneableIterator<URIMetadata> clone(final Object secondHash) {
public CloneableIterator<URIMetadataNode> clone(final Object secondHash) {
return this;
}
@Override
@ -412,7 +376,7 @@ public final class Fulltext implements Iterable<byte[]> {
return ids.hasNext();
}
@Override
public final URIMetadata next() {
public final URIMetadataNode next() {
byte[] id = ids.next();
if (id == null) return null;
return getMetadata(id);
@ -551,7 +515,7 @@ public final class Fulltext implements Iterable<byte[]> {
this.count++;
}
} else {
final Iterator<URIMetadata> i = entries(); // iterates indexURLEntry objects
final Iterator<URIMetadataNode> i = entries(); // iterates indexURLEntry objects
URIMetadata entry;
String url;
while (i.hasNext()) {
@ -650,7 +614,7 @@ public final class Fulltext implements Iterable<byte[]> {
// collect hashes from all domains
// fetch urls from the database to determine the host in clear text
URIMetadata urlref;
URIMetadataNode urlref;
if (count < 0 || count > domainSamples.size()) count = domainSamples.size();
this.statsDump = new ArrayList<HostStat>();
final TreeSet<String> set = new TreeSet<String>();
@ -687,7 +651,7 @@ public final class Fulltext implements Iterable<byte[]> {
*/
public Map<String, HostStat> domainHashResolver(final Map<String, URLHashCounter> domainSamples) {
final HashMap<String, HostStat> hostMap = new HashMap<String, HostStat>();
URIMetadata urlref;
URIMetadataNode urlref;
final ScoreMap<String> hosthashScore = new ConcurrentScoreMap<String>();
for (final Map.Entry<String, URLHashCounter> e: domainSamples.entrySet()) {
@ -708,7 +672,7 @@ public final class Fulltext implements Iterable<byte[]> {
// fetch urls from the database to determine the host in clear text
final Iterator<String> j = domainScore.keys(false); // iterate urlhash-examples in reverse order (biggest first)
URIMetadata urlref;
URIMetadataNode urlref;
String urlhash;
count += 10; // make some more to prevent that we have to do this again after deletions too soon.
if (count < 0 || domainScore.sizeSmaller(count)) count = domainScore.size();

@ -57,6 +57,7 @@ import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.data.citation.CitationReferenceFactory;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
@ -506,7 +507,7 @@ public class Segment {
if (urlhash == null) return 0;
// determine the url string
final URIMetadata entry = fulltext().getMetadata(urlhash);
final URIMetadataNode entry = fulltext().getMetadata(urlhash);
if (entry == null) return 0;
if (entry.url() == null) return 0;

@ -59,6 +59,7 @@ import net.yacy.document.LibraryProvider;
import net.yacy.interaction.contentcontrol.ContentControlFilterUpdateThread;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
@ -635,7 +636,7 @@ public final class RWIProcess extends Thread
if ( obrwi == null ) {
return null; // all time was already wasted in takeRWI to get another element
}
final URIMetadata page = this.query.getSegment().fulltext().getMetadata(obrwi.getElement(), obrwi.getWeight());
final URIMetadataNode page = this.query.getSegment().fulltext().getMetadata(obrwi.getElement(), obrwi.getWeight());
if ( page == null ) {
try {
this.misses.putUnique(obrwi.getElement().urlhash());
@ -896,7 +897,7 @@ public final class RWIProcess extends Thread
}
final Iterator<String> domhashs = this.hostNavigator.keys(false);
URIMetadata row;
URIMetadataNode row;
byte[] urlhash;
String hosthash, hostname;
if ( this.hostResolver != null ) {

Loading…
Cancel
Save