added feature as requested in

http://forum.yacy-websuche.de/viewtopic.php?f=18&t=3461
The search can now be configured with a non-display host list.
The search will always exclude the given list of hosts unless they are
requested directly using the host navigation.
pull/1/head
Michael Peter Christen 14 years ago
parent 42661fa0cd
commit 0bcef2d156

@ -762,6 +762,9 @@ search.navigation=hosts,authors,namespace,topics,filetype,protocol
all search results are valid without verification all search results are valid without verification
search.verify = iffresh search.verify = iffresh
search.excludehosts=
search.excludehosth=
# in case that a link verification fails then the corresponding index reference can be # in case that a link verification fails then the corresponding index reference can be
# deleted to clean up the index. If this property is set then failed index verification in # deleted to clean up the index. If this property is set then failed index verification in
# the cases of nocache, iffresh and ifexist causes an index deletion # the cases of nocache, iffresh and ifexist causes an index deletion

@ -109,12 +109,18 @@
</select> </select>
</dd> </dd>
<dt>Exclude Hosts</dt>
<dd>List of hosts that shall be excluded from search results by default but can be included using the site:&lt;host&gt; operator:<br/>
<input type="text" name="search.excludehosts" value="#[search.excludehosts]#" size="60" /><br/>
#[search.excludehosth]#
</dd>
<dt>'About' Column<br/>(shown in a column alongside<br/>with the search result page)</dt> <dt>'About' Column<br/>(shown in a column alongside<br/>with the search result page)</dt>
<dd><input type="text" name="about.headline" value="#[about.headline]#" size="60" />(Headline)</br> <dd><input type="text" name="about.headline" value="#[about.headline]#" size="60" />(Headline)</br>
<textarea name="about.body" cols="60" rows="8">#[about.body]#</textarea>(Content) <textarea name="about.body" cols="60" rows="8">#[about.body]#</textarea>(Content)
</dd> </dd>
<dt>&nbsp;</dt> <dt>&nbsp;</dt>
<dd> <dd>
<input type="submit" name="searchpage_set" value="Change Search Page" />&nbsp;&nbsp; <input type="submit" name="searchpage_set" value="Change Search Page" />&nbsp;&nbsp;
<input type="submit" name="searchpage_default" value="Set to Default Values" /> <input type="submit" name="searchpage_default" value="Set to Default Values" />

@ -26,6 +26,7 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants; import net.yacy.search.SwitchboardConstants;
import de.anomic.data.WorkTables; import de.anomic.data.WorkTables;
@ -93,6 +94,10 @@ public class ConfigPortal {
sb.setConfig("about.headline", post.get("about.headline", "")); sb.setConfig("about.headline", post.get("about.headline", ""));
sb.setConfig("about.body", post.get("about.body", "")); sb.setConfig("about.body", post.get("about.body", ""));
String excludehosts = post.get("search.excludehosts", "");
sb.setConfig("search.excludehosts", excludehosts);
sb.setConfig("search.excludehosth", DigestURI.hosthashes(excludehosts));
// construct navigation String // construct navigation String
String nav = ""; String nav = "";
if (post.getBoolean("search.navigation.hosts", false)) nav += "hosts,"; if (post.getBoolean("search.navigation.hosts", false)) nav += "hosts,";
@ -126,8 +131,10 @@ public class ConfigPortal {
sb.setConfig("search.result.show.pictures", false); sb.setConfig("search.result.show.pictures", false);
sb.setConfig(SwitchboardConstants.SEARCH_VERIFY, "iffresh"); sb.setConfig(SwitchboardConstants.SEARCH_VERIFY, "iffresh");
sb.setConfig(SwitchboardConstants.SEARCH_VERIFY_DELETE, "true"); sb.setConfig(SwitchboardConstants.SEARCH_VERIFY_DELETE, "true");
prop.put("about.headline", ""); sb.setConfig("about.headline", "");
prop.put("about.body", ""); sb.setConfig("about.body", "");
sb.setConfig("search.excludehosts", "");
sb.setConfig("search.excludehosth", "");
} }
} }
@ -167,6 +174,9 @@ public class ConfigPortal {
prop.put("about.headline", sb.getConfig("about.headline", "")); prop.put("about.headline", sb.getConfig("about.headline", ""));
prop.put("about.body", sb.getConfig("about.body", "")); prop.put("about.body", sb.getConfig("about.body", ""));
prop.put("search.excludehosts", sb.getConfig("search.excludehosts", ""));
prop.put("search.excludehosth", sb.getConfig("search.excludehosth", ""));
final String browserPopUpPage = sb.getConfig(SwitchboardConstants.BROWSER_POP_UP_PAGE, "ConfigBasic.html"); final String browserPopUpPage = sb.getConfig(SwitchboardConstants.BROWSER_POP_UP_PAGE, "ConfigBasic.html");
prop.put("popupFront", 0); prop.put("popupFront", 0);
prop.put("popupSearch", 0); prop.put("popupSearch", 0);

@ -244,6 +244,7 @@ public final class search {
null, null,
false, false,
sitehash, sitehash,
null,
authorhash, authorhash,
DigestURI.TLD_any_zone_filter, DigestURI.TLD_any_zone_filter,
client, client,
@ -305,6 +306,7 @@ public final class search {
constraint, constraint,
false, false,
sitehash, sitehash,
null,
authorhash, authorhash,
DigestURI.TLD_any_zone_filter, DigestURI.TLD_any_zone_filter,
client, client,

@ -618,6 +618,7 @@ public class yacysearch {
constraint, constraint,
true, true,
sitehash, sitehash,
DigestURI.hosthashess(sb.getConfig("search.excludehosth", "")),
authorhash, authorhash,
DigestURI.TLD_any_zone_filter, DigestURI.TLD_any_zone_filter,
client, client,

@ -30,6 +30,8 @@ package net.yacy.kelondro.data.meta;
import java.io.File; import java.io.File;
import java.io.Serializable; import java.io.Serializable;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.util.HashSet;
import java.util.Set;
import net.yacy.cora.document.ASCII; import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.MultiProtocolURI;
@ -71,6 +73,37 @@ public class DigestURI extends MultiProtocolURI implements Serializable {
} }
return (url == null) ? null : ASCII.String(url.hash(), 6, 6); return (url == null) ? null : ASCII.String(url.hash(), 6, 6);
} }
/**
 * Convert a comma-separated list of host names into a concatenation of host hashes.
 * Every entry is trimmed; empty entries are skipped, and so are entries for which
 * {@code hosthash} returns null or a hash that is not exactly 6 characters long.
 * @param hostlist comma-separated host names; may be null or empty
 * @return the 6-character host hashes appended to each other without any separator,
 *         or an empty string if no entry produced a hash
 */
public static String hosthashes(final String hostlist) {
    // a missing list means "no hosts"; avoid a NullPointerException on split
    if (hostlist == null || hostlist.length() == 0) return "";
    final StringBuilder hashes = new StringBuilder(hostlist.length());
    for (final String entry: hostlist.split(",")) {
        // String.split never yields null elements, so only blank entries need to be skipped
        final String host = entry.trim();
        if (host.length() == 0) continue;
        final String hash = hosthash(host);
        // keep only well-formed hashes so the result can be cut into 6-character pieces again
        if (hash == null || hash.length() != 6) continue;
        hashes.append(hash);
    }
    return hashes.toString();
}
/**
 * Split a concatenation of host hashes, as produced by {@code hosthashes(String)},
 * into a set of 6-character host hashes.
 * The input is cut into consecutive pieces of 6 characters. A trailing piece that is
 * shorter than 6 characters (e.g. from a manually edited configuration value) is ignored
 * instead of causing a StringIndexOutOfBoundsException.
 * @param hosthashes host hashes appended to each other without separator; may be null or empty
 * @return the set of host hashes, or null if the input contains no complete hash
 */
public static Set<String> hosthashess(String hosthashes) {
    if (hosthashes == null || hosthashes.length() == 0) return null;
    final int hashLength = 6;
    final Set<String> h = new HashSet<String>();
    // the last valid start index is the one that still leaves a complete hash
    final int lastStart = hosthashes.length() - hashLength;
    for (int i = 0; i <= lastStart; i += hashLength) {
        h.add(hosthashes.substring(i, i + hashLength));
    }
    // callers treat null as "no excludes"; do not hand out an empty set
    return h.isEmpty() ? null : h;
}
/** /**
* DigestURI from File * DigestURI from File

@ -29,8 +29,10 @@ package net.yacy.search.query;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.net.URLEncoder; import java.net.URLEncoder;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map; import java.util.Map;
import java.util.Set;
import java.util.SortedSet; import java.util.SortedSet;
import java.util.TreeSet; import java.util.TreeSet;
import java.util.regex.Matcher; import java.util.regex.Matcher;
@ -124,6 +126,7 @@ public final class QueryParams {
private final Segment indexSegment; private final Segment indexSegment;
public final String host; // this is the client host that starts the query, not a site operator public final String host; // this is the client host that starts the query, not a site operator
public final String sitehash; // this is a domain hash, 6 bytes long or null public final String sitehash; // this is a domain hash, 6 bytes long or null
public final Set<String> siteexcludes; // set of domain hashes that are excluded if not included by sitehash
public final String authorhash; public final String authorhash;
public final String tenant; public final String tenant;
public final Modifier modifier; public final Modifier modifier;
@ -182,6 +185,7 @@ public final class QueryParams {
this.snippetCacheStrategy = null; this.snippetCacheStrategy = null;
this.host = null; this.host = null;
this.sitehash = null; this.sitehash = null;
this.siteexcludes = null;
this.authorhash = null; this.authorhash = null;
this.remotepeer = null; this.remotepeer = null;
this.time = Long.valueOf(System.currentTimeMillis()); this.time = Long.valueOf(System.currentTimeMillis());
@ -208,6 +212,7 @@ public final class QueryParams {
final Searchdom domType, final int domMaxTargets, final Searchdom domType, final int domMaxTargets,
final Bitfield constraint, final boolean allofconstraint, final Bitfield constraint, final boolean allofconstraint,
final String site, final String site,
final Set<String> siteexcludes,
final String authorhash, final String authorhash,
final int domainzone, final int domainzone,
final String host, final String host,
@ -250,6 +255,7 @@ public final class QueryParams {
this.constraint = constraint; this.constraint = constraint;
this.allofconstraint = allofconstraint; this.allofconstraint = allofconstraint;
this.sitehash = site; assert site == null || site.length() == 6; this.sitehash = site; assert site == null || site.length() == 6;
this.siteexcludes = siteexcludes != null && siteexcludes.size() == 0 ? null: siteexcludes;
this.authorhash = authorhash; assert authorhash == null || !authorhash.isEmpty(); this.authorhash = authorhash; assert authorhash == null || !authorhash.isEmpty();
this.snippetCacheStrategy = snippetCacheStrategy; this.snippetCacheStrategy = snippetCacheStrategy;
this.host = host; this.host = host;
@ -491,6 +497,8 @@ public final class QueryParams {
context.append(asterisk); context.append(asterisk);
context.append(this.sitehash); context.append(this.sitehash);
context.append(asterisk); context.append(asterisk);
context.append(this.siteexcludes);
context.append(asterisk);
context.append(this.authorhash); context.append(this.authorhash);
context.append(asterisk); context.append(asterisk);
context.append(this.targetlang); context.append(this.targetlang);

@ -311,6 +311,9 @@ public final class RWIProcess extends Thread
// check site constraints // check site constraints
final String hosthash = iEntry.hosthash(); final String hosthash = iEntry.hosthash();
if ( this.query.sitehash == null ) { if ( this.query.sitehash == null ) {
if (this.query.siteexcludes != null && this.query.siteexcludes.contains(hosthash)) {
continue pollloop;
}
// no site constraint there; maybe collect host navigation information // no site constraint there; maybe collect host navigation information
if ( nav_hosts && this.query.urlMask_isCatchall ) { if ( nav_hosts && this.query.urlMask_isCatchall ) {
this.hostNavigator.inc(hosthash); this.hostNavigator.inc(hosthash);

Loading…
Cancel
Save