From 11b78539406a234cf9492318791459b475305fbf Mon Sep 17 00:00:00 2001
From: orbiter
Date: Sun, 27 Jun 2010 21:38:16 +0000
Subject: [PATCH] added a configuration page for search heuristics. currently
you can switch on there: - a site-operation heuristic that loads all direct
links from a portal page if the site-operator is used - a direct crawl for
search results from scroogle for the given search terms The configuration
page can be found directly beside the network configuration page
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6951 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
defaults/yacy.init | 5 ++
htroot/ConfigHeuristics_p.html | 72 ++++++++++++++++++
htroot/ConfigHeuristics_p.java | 57 ++++++++++++++
htroot/env/templates/submenuConfig.template | 1 +
htroot/yacysearch.java | 4 +-
htroot/yacysearchitem.html | 4 +-
htroot/yacysearchitem.java | 22 +++---
source/de/anomic/search/ReferenceOrder.java | 84 +++++++++++++--------
source/de/anomic/search/SearchEvent.java | 2 +-
source/de/anomic/search/Switchboard.java | 4 +-
10 files changed, 205 insertions(+), 50 deletions(-)
create mode 100644 htroot/ConfigHeuristics_p.html
create mode 100644 htroot/ConfigHeuristics_p.java
diff --git a/defaults/yacy.init b/defaults/yacy.init
index 86ac2d0a9..138a65991 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -941,3 +941,8 @@ segment.process.default_tmp = default
# this is only shown, if the about.body is filled
about.headline =
about.body =
+
+# search heuristics
+heuristic.site = false
+heuristic.scroogle = false
+
diff --git a/htroot/ConfigHeuristics_p.html b/htroot/ConfigHeuristics_p.html
new file mode 100644
index 000000000..3a42dc786
--- /dev/null
+++ b/htroot/ConfigHeuristics_p.html
@@ -0,0 +1,72 @@
+
+
+
 YaCy '#[clientname]#': Heuristics Configuration
+ #%env/templates/metas.template%#
+
+
+ #%env/templates/header.template%#
+ #%env/templates/submenuConfig.template%#
+ Heuristics Configuration
+
 A heuristic is an 'experience-based technique that helps in problem solving, learning and discovery' (wikipedia). The search heuristics that can be switched on here are techniques that help the discovery of possible search results based on link guessing, in-search crawling and requests to other search engines.
+ When a search heuristic is used, the resulting links are not used directly as search result but the loaded pages are indexed and stored like other content. This ensures that blacklists can be used and that the searched word actually appears on the page that was discovered by the heuristic.
+
+
+
+
+
#(heuristic)#::
-
![heuristic:#[name]# (redundant) heuristic#[name]# (redundant)](/env/grafics/heuristic_redundant.gif)
::
-
![heuristic:#[name]# (new link) heuristic#[name]# (new link)](/env/grafics/heuristic_new.gif)
+
![heuristic:#[name]# (redundant) heuristic:#[name]# (redundant)](/env/grafics/heuristic_redundant.gif)
::
+
![heuristic:#[name]# (new link) heuristic:#[name]# (new link)](/env/grafics/heuristic_new.gif)
#(/heuristic)#
#(authorized)#::

diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java
index a7c606499..035f8d47b 100644
--- a/htroot/yacysearchitem.java
+++ b/htroot/yacysearchitem.java
@@ -117,17 +117,6 @@ public class yacysearchitem {
prop.putHTML("content_authorized_recommend_deletelink", "/yacysearch.html?query=" + theQuery.queryString.replace(' ', '+') + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&deleteref=" + new String(result.hash()) + "&urlmaskfilter=.*");
prop.putHTML("content_authorized_recommend_recommendlink", "/yacysearch.html?query=" + theQuery.queryString.replace(' ', '+') + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&recommendref=" + new String(result.hash()) + "&urlmaskfilter=.*");
prop.put("content_authorized_urlhash", new String(result.hash()));
- SearchEvent.HeuristicResult heuristic = theSearch.getHeuristic(result.hash());
- if (heuristic == null) {
- prop.put("content_heuristic", 0);
- } else {
- if (heuristic.redundant) {
- prop.put("content_heuristic", 1);
- } else {
- prop.put("content_heuristic", 2);
- }
- prop.put("content_heuristic_name", heuristic.heuristicName);
- }
String resulthashString = new String(result.hash());
prop.putHTML("content_title", result.title());
prop.putXML("content_title-xml", result.title());
@@ -160,6 +149,17 @@ public class yacysearchitem {
prop.put("content_description", desc);
prop.putXML("content_description-xml", desc);
prop.putJSON("content_description-json", desc);
+ SearchEvent.HeuristicResult heuristic = theSearch.getHeuristic(result.hash());
+ if (heuristic == null) {
+ prop.put("content_heuristic", 0);
+ } else {
+ if (heuristic.redundant) {
+ prop.put("content_heuristic", 1);
+ } else {
+ prop.put("content_heuristic", 2);
+ }
+ prop.put("content_heuristic_name", heuristic.heuristicName);
+ }
EventTracker.update("SEARCH", new ProfilingGraph.searchEvent(theQuery.id(true), SearchEvent.FINALIZATION + "-" + item, 0, 0), false, 30000, ProfilingGraph.maxTime);
return prop;
diff --git a/source/de/anomic/search/ReferenceOrder.java b/source/de/anomic/search/ReferenceOrder.java
index 16b791144..ef9ea2f25 100644
--- a/source/de/anomic/search/ReferenceOrder.java
+++ b/source/de/anomic/search/ReferenceOrder.java
@@ -117,7 +117,6 @@ public class ReferenceOrder {
private final BlockingQueue
decodedEntries;
public NormalizeWorker(final BlockingQueue out, Semaphore termination) {
- // normalize ranking: find minimum and maximum of separate ranking criteria
this.out = out;
this.termination = termination;
this.decodedEntries = new LinkedBlockingQueue();
@@ -131,38 +130,8 @@ public class ReferenceOrder {
}
public void run() {
-
- Map doms0 = new HashMap();
- Integer int1 = 1;
-
- WordReferenceVars iEntry;
- String dom;
- Integer count;
try {
- // calculate min and max for normalization
- while ((iEntry = decodedEntries.take()) != WordReferenceVars.poison) {
- out.put(iEntry);
- // find min/max
- if (min == null) min = iEntry.clone(); else min.min(iEntry);
- if (max == null) max = iEntry.clone(); else max.max(iEntry);
- // update domcount
- dom = new String(iEntry.metadataHash()).substring(6);
- count = doms0.get(dom);
- if (count == null) {
- doms0.put(dom, int1);
- } else {
- doms0.put(dom, Integer.valueOf(count.intValue() + 1));
- }
- }
-
- // update domain score
- Map.Entry entry;
- final Iterator> di = doms0.entrySet().iterator();
- while (di.hasNext()) {
- entry = di.next();
- doms.addScore(entry.getKey(), (entry.getValue()).intValue());
- }
- if (!doms.isEmpty()) maxdomcount = doms.getMaxScore();
+ addNormalizer(decodedEntries, out);
} catch (InterruptedException e) {
Log.logException(e);
} catch (Exception e) {
@@ -177,6 +146,57 @@ public class ReferenceOrder {
}
}
+ /**
+ * normalize ranking: find minimum and maximum of separate ranking criteria
+ * @param decodedEntries
+ * @param out
+ * @throws InterruptedException
+ */
+ public void addNormalizer(BlockingQueue decodedEntries, final BlockingQueue out) throws InterruptedException {
+ WordReferenceVars iEntry;
+ Map doms0 = new HashMap();
+ String dom;
+ Integer count;
+ final Integer int1 = 1;
+ while ((iEntry = decodedEntries.take()) != WordReferenceVars.poison) {
+ out.put(iEntry);
+ // find min/max
+ if (min == null) min = iEntry.clone(); else min.min(iEntry);
+ if (max == null) max = iEntry.clone(); else max.max(iEntry);
+ // update domcount
+ dom = new String(iEntry.metadataHash()).substring(6);
+ count = doms0.get(dom);
+ if (count == null) {
+ doms0.put(dom, int1);
+ } else {
+ doms0.put(dom, Integer.valueOf(count.intValue() + 1));
+ }
+ }
+
+ // update domain score
+ Map.Entry entry;
+ final Iterator> di = doms0.entrySet().iterator();
+ while (di.hasNext()) {
+ entry = di.next();
+ doms.addScore(entry.getKey(), (entry.getValue()).intValue());
+ }
+ if (!doms.isEmpty()) this.maxdomcount = doms.getMaxScore();
+ }
+
+ public void addNormalizer(WordReferenceVars iEntry, final BlockingQueue out) throws InterruptedException {
+ out.put(iEntry);
+
+ // find min/max
+ if (min == null) min = iEntry.clone(); else min.min(iEntry);
+ if (max == null) max = iEntry.clone(); else max.max(iEntry);
+
+ // update domcount
+ String dom = new String(iEntry.metadataHash()).substring(6);
+ doms.addScore(dom, 1);
+
+ if (!doms.isEmpty()) this.maxdomcount = doms.getMaxScore();
+ }
+
public int authority(final byte[] urlHash) {
return (doms.getScore(new String(urlHash, 6, 6)) << 8) / (1 + this.maxdomcount);
}
diff --git a/source/de/anomic/search/SearchEvent.java b/source/de/anomic/search/SearchEvent.java
index d58b3f26e..7816611ae 100644
--- a/source/de/anomic/search/SearchEvent.java
+++ b/source/de/anomic/search/SearchEvent.java
@@ -319,7 +319,7 @@ public final class SearchEvent {
return this.rankedCache.getAuthorNavigator(maxentries);
}
- public void addHeuristicResult(byte[] urlhash, String heuristicName, boolean redundant) {
+ public void addHeuristic(byte[] urlhash, String heuristicName, boolean redundant) {
synchronized (this.heuristics) {
this.heuristics.put(urlhash, new HeuristicResult(urlhash, heuristicName, redundant));
}
diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java
index 26c4b15ee..9383c0b1e 100644
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@@ -1930,7 +1930,7 @@ public final class Switchboard extends serverSwitch {
public void addToIndex(final DigestURI url, final SearchEvent searchEvent, final String heuristicName) throws IOException, ParserException {
final Segments.Process process = Segments.Process.LOCALCRAWLING;
if (indexSegments.segment(process).urlMetadata.exists(url.hash())) {
- searchEvent.addHeuristicResult(url.hash(), heuristicName, true);
+ searchEvent.addHeuristic(url.hash(), heuristicName, true);
return; // don't do double-work
}
final Request request = loader.request(url, true, true);
@@ -1939,9 +1939,9 @@ public final class Switchboard extends serverSwitch {
log.logInfo("Heuristic: cannot load " + url.toNormalform(false, false) + ": " + acceptedError);
return;
}
+ searchEvent.addHeuristic(url.hash(), heuristicName, false);
new Thread() {public void run() {
try {
- searchEvent.addHeuristicResult(url.hash(), heuristicName, false);
Response response = loader.load(request, CacheStrategy.IFFRESH, Long.MAX_VALUE);
if (response == null) throw new IOException("response == null");
if (response.getContent() == null) throw new IOException("content == null");