From ab06a6edd2ee88911bd68478fbdf3b4129d2b26e Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 2 Jun 2009 15:20:10 +0000 Subject: [PATCH] renamed topwords to topics and enhanced computation methods of topics topics will now only be computed using the document title, not the document url, because the host navigator is now responsible for statistical effects of urls. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6011 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/yacy/search.java | 9 +- htroot/yacysearchtrailer.java | 22 ++- .../de/anomic/plasma/plasmaSearchEvent.java | 13 +- .../plasma/plasmaSearchRankingProcess.java | 147 ++++++++++-------- source/de/anomic/yacy/yacyClient.java | 4 +- 5 files changed, 104 insertions(+), 91 deletions(-) diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 041adf213..63e40b6ac 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -32,7 +32,6 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.Map; -import java.util.Set; import java.util.TreeSet; import de.anomic.content.RSSMessage; @@ -49,6 +48,7 @@ import de.anomic.plasma.plasmaSearchQuery; import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSearchEvent.ResultEntry; +import de.anomic.plasma.plasmaSearchRankingProcess.NavigatorEntry; import de.anomic.server.serverCore; import de.anomic.server.serverObjects; import de.anomic.server.serverProfiling; @@ -321,11 +321,10 @@ public final class search { // prepare reference hints final long timer = System.currentTimeMillis(); - final Set ws = theSearch.references(10); + final ArrayList ws = theSearch.topics(10); final StringBuilder refstr = new StringBuilder(); - final Iterator j = ws.iterator(); - while (j.hasNext()) { - refstr.append(",").append(j.next()); + for (NavigatorEntry e: ws) { + refstr.append(",").append(e.name); } prop.put("references", (refstr.length() > 0) ? refstr.substring(1) : refstr.toString()); serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(theQuery.id(true), "reference collection", ws.size(), System.currentTimeMillis() - timer), false); diff --git a/htroot/yacysearchtrailer.java b/htroot/yacysearchtrailer.java index 8b6ed6877..1ce1c6dd8 100644 --- a/htroot/yacysearchtrailer.java +++ b/htroot/yacysearchtrailer.java @@ -26,7 +26,6 @@ import java.util.ArrayList; import java.util.Iterator; -import java.util.Set; import java.util.TreeSet; import de.anomic.http.httpRequestHeader; @@ -36,7 +35,7 @@ import de.anomic.plasma.plasmaProfiling; import de.anomic.plasma.plasmaSearchEvent; import de.anomic.plasma.plasmaSearchQuery; import de.anomic.plasma.plasmaSwitchboard; -import de.anomic.plasma.plasmaSearchRankingProcess.hostnaventry; +import de.anomic.plasma.plasmaSearchRankingProcess.NavigatorEntry; import de.anomic.server.serverObjects; import de.anomic.server.serverProfiling; import de.anomic.server.serverSwitch; @@ -65,17 +64,17 @@ public class yacysearchtrailer { // compose search navigation - ArrayList hostNavigator = theSearch.getHostNavigator(10); + ArrayList hostNavigator = theSearch.getHostNavigator(10); if (hostNavigator == null) { prop.put("navigation", 0); } else { prop.put("navigation", 1); - hostnaventry entry; + NavigatorEntry entry; int i; for (i = 0; i < hostNavigator.size(); i++) { entry = hostNavigator.get(i); - prop.put("navigation_domains_" + i + "_domain", plasmaSearchQuery.navurla(0, display, theQuery, theQuery.urlMask, "site:" + entry.host) + entry.host + " (" + entry.count + ")"); - prop.putJSON("navigation_domains_" + i + "_domain-json", plasmaSearchQuery.navurla(0, display, theQuery, theQuery.urlMask, "site:" + entry.host) + entry.host + " (" + entry.count + ")"); + prop.put("navigation_domains_" + i + "_domain", plasmaSearchQuery.navurla(0, display, theQuery, theQuery.urlMask, "site:" + entry.name) + entry.name + " (" + entry.count + ")"); + prop.putJSON("navigation_domains_" + i + "_domain-json", plasmaSearchQuery.navurla(0, display, theQuery, theQuery.urlMask, "site:" + entry.name) + entry.name + " (" + entry.count + ")"); prop.put("navigation_domains_" + i + "_nl", 1); } i--; @@ -84,16 +83,13 @@ public class yacysearchtrailer { } // attach the bottom line with search references (topwords) - final Set references = theSearch.references(20); + final ArrayList references = theSearch.topics(20); if (references.size() > 0) { // get the topwords final TreeSet topwords = new TreeSet(NaturalOrder.naturalComparator); - String tmp = ""; - final Iterator i = references.iterator(); - while (i.hasNext()) { - tmp = i.next(); - if (tmp.matches("[a-z]+")) { - topwords.add(tmp); + for (NavigatorEntry e: references) { + if (e.name.matches("[a-z]+")) { + topwords.add(e.name); } } diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index ce68b0840..67946cfe8 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -32,7 +32,6 @@ import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.Map; -import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; import java.util.concurrent.ConcurrentHashMap; @@ -53,7 +52,7 @@ import de.anomic.kelondro.util.SortStore; import de.anomic.kelondro.util.Log; import de.anomic.plasma.parser.Word; import de.anomic.plasma.parser.Condenser; -import de.anomic.plasma.plasmaSearchRankingProcess.hostnaventry; +import de.anomic.plasma.plasmaSearchRankingProcess.NavigatorEntry; import de.anomic.plasma.plasmaSnippetCache.MediaSnippet; import de.anomic.server.serverProfiling; import de.anomic.yacy.yacySearch; @@ -99,7 +98,7 @@ public final class plasmaSearchEvent { long urlRetrievalAllTime; long snippetComputationAllTime; public ResultURLs crawlResults; - private ArrayList hostNavigator; + private ArrayList hostNavigator; @SuppressWarnings("unchecked") private plasmaSearchEvent(final plasmaSearchQuery query, @@ -559,7 +558,7 @@ public final class plasmaSearchEvent { // place the result to the result vector if (!result.exists(resultEntry)) { result.push(resultEntry, Long.valueOf(rankedCache.getOrder().cardinal(resultEntry.word()))); - rankedCache.addReferences(resultEntry); + rankedCache.addTopics(resultEntry); } //System.out.println("DEBUG SNIPPET_LOADING: thread " + id + " got " + resultEntry.url()); } @@ -579,7 +578,7 @@ public final class plasmaSearchEvent { Log.logInfo("search", "sorted out hash " + urlhash + " during search: " + reason); } - public ArrayList getHostNavigator(int maxentries) { + public ArrayList getHostNavigator(int maxentries) { if (this.hostNavigator != null) return this.hostNavigator; if (localSearchThread != null && localSearchThread.isAlive()) { try {Thread.sleep(100L);} catch (final InterruptedException e) {} @@ -778,9 +777,9 @@ public final class plasmaSearchEvent { //assert e != null; } - public Set references(final int count) { + public ArrayList topics(final int count) { // returns a set of words that are computed as toplist - return this.rankedCache.getReferences(count); + return this.rankedCache.getTopicNavigator(count); } public static class ResultEntry { diff --git a/source/de/anomic/plasma/plasmaSearchRankingProcess.java b/source/de/anomic/plasma/plasmaSearchRankingProcess.java index 0b20aeb72..050cc9793 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProcess.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProcess.java @@ -48,7 +48,6 @@ import de.anomic.kelondro.text.Segment; import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.text.referencePrototype.WordReference; import de.anomic.kelondro.text.referencePrototype.WordReferenceVars; -import de.anomic.kelondro.util.ScoreCluster; import de.anomic.kelondro.util.SortStack; import de.anomic.kelondro.util.FileUtils; import de.anomic.plasma.parser.Word; @@ -71,13 +70,13 @@ public final class plasmaSearchRankingProcess { private int remote_peerCount, remote_indexCount, remote_resourceSize, local_resourceSize; private final ReferenceOrder order; private final ConcurrentHashMap urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion) - private final ScoreCluster ref; // reference score computation for the commonSense heuristic private final int[] flagcount; // flag counter private final TreeSet misses; // contains url-hashes that could not been found in the LURL-DB private final Segment indexSegment; private HashMap>[] localSearchContainerMaps; private final int[] domZones; - private ConcurrentHashMap hostNavigator; + private final ConcurrentHashMap hostNavigator; + private final ConcurrentHashMap ref; // reference score computation for the commonSense heuristic public plasmaSearchRankingProcess( final Segment indexSegment, @@ -99,13 +98,13 @@ public final class plasmaSearchRankingProcess { this.remote_resourceSize = 0; this.local_resourceSize = 0; this.urlhashes = new ConcurrentHashMap(0, 0.75f, concurrency); - this.ref = new ScoreCluster(); this.misses = new TreeSet(); this.indexSegment = indexSegment; this.flagcount = new int[32]; for (int i = 0; i < 32; i++) {this.flagcount[i] = 0;} - this.domZones = new int[8]; this.hostNavigator = new ConcurrentHashMap(); + this.ref = new ConcurrentHashMap(); + this.domZones = new int[8]; for (int i = 0; i < 8; i++) {this.domZones[i] = 0;} } @@ -232,52 +231,6 @@ public final class plasmaSearchRankingProcess { serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.PRESORT, index.size(), System.currentTimeMillis() - timer), false); } - public class hoststat { - public int count; - public String hashsample; - public hoststat(String urlhash) { - this.count = 1; - this.hashsample = urlhash; - } - public void inc() { - this.count++; - } - } - - public static final Comparator hscomp = new Comparator() { - public int compare(hoststat o1, hoststat o2) { - if (o1.count < o2.count) return 1; - if (o2.count < o1.count) return -1; - return 0; - } - }; - - public class hostnaventry { - public int count; - public String host; - public hostnaventry(String host, int count) { - this.host = host; - this.count = count; - } - } - - public ArrayList getHostNavigator(int maxentries) { - hoststat[] hsa = this.hostNavigator.values().toArray(new hoststat[this.hostNavigator.size()]); - Arrays.sort(hsa, hscomp); - int rc = Math.min(maxentries, hsa.length); - ArrayList result = new ArrayList(); - URLMetadataRow mr; - yacyURL url; - for (int i = 0; i < rc; i++) { - mr = indexSegment.urlMetadata().load(hsa[i].hashsample, null, 0); - if (mr == null) continue; - url = mr.metadata().url(); - if (url == null) continue; - result.add(new hostnaventry(url.getHost(), hsa[i].count)); - } - return result; - } - private boolean testFlags(final WordReference ientry) { if (query.constraint == null) return true; // test if ientry matches with filter @@ -424,37 +377,103 @@ public final class plasmaSearchRankingProcess { return this.misses.iterator(); } - public Set getReferences(final int count) { + public class hoststat { + public int count; + public String hashsample; + public hoststat(String urlhash) { + this.count = 1; + this.hashsample = urlhash; + } + public void inc() { + this.count++; + } + } + + public static final Comparator hscomp = new Comparator() { + public int compare(hoststat o1, hoststat o2) { + if (o1.count < o2.count) return 1; + if (o2.count < o1.count) return -1; + return 0; + } + }; + + public class NavigatorEntry { + public int count; + public String name; + public NavigatorEntry(String name, int count) { + this.name = name; + this.count = count; + } + } + + public ArrayList getHostNavigator(int count) { + hoststat[] hsa = this.hostNavigator.values().toArray(new hoststat[this.hostNavigator.size()]); + Arrays.sort(hsa, hscomp); + int rc = Math.min(count, hsa.length); + ArrayList result = new ArrayList(); + URLMetadataRow mr; + yacyURL url; + for (int i = 0; i < rc; i++) { + mr = indexSegment.urlMetadata().load(hsa[i].hashsample, null, 0); + if (mr == null) continue; + url = mr.metadata().url(); + if (url == null) continue; + result.add(new NavigatorEntry(url.getHost(), hsa[i].count)); + } + return result; + } + + public static final Comparator> mecomp = new Comparator>() { + public int compare(Map.Entry o1, Map.Entry o2) { + if (o1.getValue().intValue() < o2.getValue().intValue()) return 1; + if (o2.getValue().intValue() < o1.getValue().intValue()) return -1; + return 0; + } + }; + + @SuppressWarnings("unchecked") + public ArrayList getTopicNavigator(final int count) { // create a list of words that had been computed by statistics over all // words that appeared in the url or the description of all urls - final Object[] refs = ref.getScores(count, false, 2, Integer.MAX_VALUE); - final TreeSet s = new TreeSet(String.CASE_INSENSITIVE_ORDER); - for (int i = 0; i < refs.length; i++) { - s.add((String) refs[i]); + + Map.Entry[] a = this.ref.entrySet().toArray(new Map.Entry[this.ref.size()]); + Arrays.sort(a, mecomp); + int rc = Math.min(count, a.length); + ArrayList result = new ArrayList(); + Map.Entry e; + int c; + for (int i = 0; i < rc; i++) { + e = a[i]; + c = e.getValue().intValue(); + if (c == 0) break; + result.add(new NavigatorEntry(e.getKey(), c)); } - return s; + return result; } - public void addReferences(final String[] words) { + public void addTopic(final String[] words) { String word; for (int i = 0; i < words.length; i++) { word = words[i].toLowerCase(); + Integer c; if ((word.length() > 2) && ("http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_".indexOf(word) < 0) && - (!(query.queryHashes.contains(Word.word2hash(word))))) - ref.incScore(word); + (!(query.queryHashes.contains(Word.word2hash(word))))) { + c = ref.get(word); + if (c == null) ref.put(word, 1); else ref.put(word, c.intValue() + 1); + } } } - protected void addReferences(final plasmaSearchEvent.ResultEntry resultEntry) { + protected void addTopics(final plasmaSearchEvent.ResultEntry resultEntry) { // take out relevant information for reference computation if ((resultEntry.url() == null) || (resultEntry.title() == null)) return; - final String[] urlcomps = htmlFilterContentScraper.urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url + //final String[] urlcomps = htmlFilterContentScraper.urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url final String[] descrcomps = resultEntry.title().toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description // add references - addReferences(urlcomps); - addReferences(descrcomps); + //addTopic(urlcomps); + addTopic(descrcomps); } public ReferenceOrder getOrder() { diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 3eaa91a63..4cbbcabb6 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -604,8 +604,8 @@ public final class yacyClient { yacyCore.log.logInfo("remote search (client): peer " + target.getName() + " sent references " + references); if (references != null) { // add references twice, so they can be countet (must have at least 2 entries) - containerCache.addReferences(references.split(",")); - containerCache.addReferences(references.split(",")); + containerCache.addTopic(references.split(",")); + containerCache.addTopic(references.split(",")); } }