diff --git a/htroot/Load_RSS_p.java b/htroot/Load_RSS_p.java index f4ff43a0f..bab9f7922 100644 --- a/htroot/Load_RSS_p.java +++ b/htroot/Load_RSS_p.java @@ -42,6 +42,7 @@ import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.SpaceExceededException; import net.yacy.crawler.HarvestProcess; +import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.retrieval.RSSLoader; import net.yacy.crawler.retrieval.Response; import net.yacy.data.WorkTables; @@ -62,7 +63,7 @@ public class Load_RSS_p { final Switchboard sb = (Switchboard)env; final String collection = post == null ? "user" : CommonPattern.SPACE.matcher(post.get("collection", "user").trim()).replaceAll(""); - final String[] collections = collection.length() == 0 ? new String[0] : collection.split(","); + Map collections = CrawlProfile.collectionParser(collection); boolean collectionEnabled = sb.index.fulltext().getDefaultConfiguration().isEmpty() || sb.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.collection_sxt); prop.put("showload_collectionEnabled", collectionEnabled ? 
1 : 0); prop.put("showload_collection", collection); diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java index 4d3737b9b..a07bdf125 100644 --- a/source/net/yacy/crawler/data/CrawlProfile.java +++ b/source/net/yacy/crawler/data/CrawlProfile.java @@ -26,7 +26,9 @@ package net.yacy.crawler.data; import java.text.DateFormat; +import java.util.HashMap; import java.util.Iterator; +import java.util.LinkedHashMap; import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; @@ -43,6 +45,7 @@ import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.CrawlSwitchboard; import net.yacy.kelondro.data.word.Word; +import net.yacy.search.query.QueryParams; import net.yacy.server.serverObjects; public class CrawlProfile extends ConcurrentHashMap implements Map { @@ -259,15 +262,29 @@ public class CrawlProfile extends ConcurrentHashMap implements M //if (r == null) return null; return r; } + + private Map cmap = null; /** * get the collections for this crawl * @return a list of collection names */ - public String[] collections() { + public Map collections() { + if (cmap != null) return cmap; final String r = get(COLLECTIONS); - if (r == null) return new String[0]; - return r.split(","); + this.cmap = collectionParser(r); + return this.cmap; + } + + public static Map collectionParser(String collectionString) { + if (collectionString == null || collectionString.length() == 0) return new HashMap(); + String[] cs = collectionString.split(","); + final Map cm = new LinkedHashMap(); + for (String c: cs) { + int p = c.indexOf(':'); + if (p < 0) cm.put(c, QueryParams.catchall_pattern); else cm.put(c.substring(0, p), Pattern.compile(c.substring(p + 1))); + } + return cm; } /** diff --git a/source/net/yacy/crawler/retrieval/RSSLoader.java b/source/net/yacy/crawler/retrieval/RSSLoader.java index 61a6d0ecc..80dffa45a 100644 --- 
a/source/net/yacy/crawler/retrieval/RSSLoader.java +++ b/source/net/yacy/crawler/retrieval/RSSLoader.java @@ -31,6 +31,7 @@ import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.regex.Pattern; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.RSSFeed; @@ -58,10 +59,10 @@ public class RSSLoader extends Thread { private final DigestURI urlf; private final Switchboard sb; - private final String[] collections; + private final Map collections; private final ClientIdentification.Agent agent; - public RSSLoader(final Switchboard sb, final DigestURI urlf, final String[] collections, final ClientIdentification.Agent agent) { + public RSSLoader(final Switchboard sb, final DigestURI urlf, final Map collections, final ClientIdentification.Agent agent) { this.sb = sb; this.urlf = urlf; this.collections = collections; @@ -93,7 +94,7 @@ public class RSSLoader extends Thread { recordAPI(this.sb, null, this.urlf, feed, 7, "seldays"); } - public static void indexAllRssFeed(final Switchboard sb, final DigestURI url, final RSSFeed feed, String[] collections) { + public static void indexAllRssFeed(final Switchboard sb, final DigestURI url, final RSSFeed feed, Map collections) { int loadCount = 0; List list = new ArrayList(); Map urlmap = new HashMap(); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index d0c6fdf79..583205489 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2685,7 +2685,7 @@ public final class Switchboard extends serverSwitch { private void storeDocumentIndex( final Response queueEntry, - final String[] collections, + final Map collections, final Document document, final Condenser condenser, final SearchEvent searchEvent, @@ -2808,7 +2808,7 @@ public final class Switchboard extends serverSwitch { final Map links, final SearchEvent searchEvent, final String heuristicName, - final String[] 
collections) { + final Map collections) { List urls = new ArrayList(); // add the landing page to the index. should not load that again since it should be in the cache @@ -2978,7 +2978,7 @@ public final class Switchboard extends serverSwitch { * @throws IOException * @throws Parser.Failure */ - public void addToIndex(final Collection urls, final SearchEvent searchEvent, final String heuristicName, final String[] collections) { + public void addToIndex(final Collection urls, final SearchEvent searchEvent, final String heuristicName, final Map collections) { Map urlmap = new HashMap(); for (DigestURI url: urls) urlmap.put(ASCII.String(url.hash()), url); if (searchEvent != null) { @@ -3421,7 +3421,7 @@ public final class Switchboard extends serverSwitch { } // add all pages to the index - addAllToIndex(url, links, searchEvent, "site", new String[]{"site"}); + addAllToIndex(url, links, searchEvent, "site", CrawlProfile.collectionParser("site")); } } catch (final Throwable e ) { ConcurrentLog.logException(e); @@ -3535,7 +3535,7 @@ public final class Switchboard extends serverSwitch { + feedName + "' rss feed"); // add all pages to the index - addAllToIndex(null, links, searchEvent, feedName, new String[]{"rss"}); + addAllToIndex(null, links, searchEvent, feedName, CrawlProfile.collectionParser("rss")); } } catch (final Throwable e ) { //Log.logException(e); diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index b5870827e..d33d020a5 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -36,6 +36,7 @@ import java.util.Map; import java.util.Set; import java.util.TreeMap; import java.util.concurrent.BlockingQueue; +import java.util.regex.Pattern; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocumentList; @@ -575,7 +576,7 @@ public class Segment { public SolrInputDocument storeDocument( final DigestURI url, final DigestURI referrerURL, - 
final String[] collections, + final Map collections, final ResponseHeader responseHeader, final Document document, final Condenser condenser, diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index e267c2e7d..8d5828283 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -42,6 +42,7 @@ import java.util.Properties; import java.util.Set; import java.util.TreeMap; import java.util.concurrent.BlockingQueue; +import java.util.regex.Pattern; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; @@ -195,7 +196,15 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri return sd; } - public void addURIAttributes(final SolrInputDocument doc, final boolean allAttr, final DigestURI digestURI, final char doctype) { + /** + * add uri attributes to solr document + * @param doc + * @param allAttr + * @param digestURI + * @param doctype + * @return the normalized url + */ + public String addURIAttributes(final SolrInputDocument doc, final boolean allAttr, final DigestURI digestURI, final char doctype) { add(doc, CollectionSchema.id, ASCII.String(digestURI.hash())); String us = digestURI.toNormalform(true); add(doc, CollectionSchema.sku, us); @@ -236,6 +245,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (allAttr || contains(CollectionSchema.url_parameter_key_sxt)) add(doc, CollectionSchema.url_parameter_key_sxt, searchpart.keySet().toArray(new String[searchpart.size()])); if (allAttr || contains(CollectionSchema.url_parameter_value_sxt)) add(doc, CollectionSchema.url_parameter_value_sxt, searchpart.values().toArray(new String[searchpart.size()])); } + return us; } public SolrInputDocument metadata2solr(final URIMetadataRow md) { @@ -346,7 +356,7 @@ public class CollectionConfiguration extends SchemaConfiguration 
implements Seri } public SolrVector yacy2solr( - final String id, final String[] collections, final ResponseHeader responseHeader, + final String id, final Map collections, final ResponseHeader responseHeader, final Document document, final Condenser condenser, final DigestURI referrerURL, final String language, final IndexCell citations, final WebgraphConfiguration webgraph) { @@ -354,7 +364,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri SolrVector doc = new SolrVector(); final DigestURI digestURI = document.dc_source(); boolean allAttr = this.isEmpty(); - addURIAttributes(doc, allAttr, digestURI, Response.docType(digestURI)); + String url = addURIAttributes(doc, allAttr, digestURI, Response.docType(digestURI)); Set processTypes = new LinkedHashSet(); @@ -378,7 +388,13 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri processTypes.add(ProcessType.CITATION); // postprocessing needed } - if (allAttr || contains(CollectionSchema.collection_sxt) && collections != null && collections.length > 0) add(doc, CollectionSchema.collection_sxt, collections); + if ((allAttr || contains(CollectionSchema.collection_sxt)) && collections != null && collections.size() > 0) { /* the null/empty guard must also hold when allAttr is set, otherwise entrySet() fails on a null map */ + List cs = new ArrayList(); + for (Map.Entry e: collections.entrySet()) { + if (e.getValue().matcher(url).matches()) cs.add(e.getKey()); + } + add(doc, CollectionSchema.collection_sxt, cs); + } List titles = document.titles(); if (allAttr || contains(CollectionSchema.title)) { @@ -1166,19 +1182,25 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri * @param httpstatus * @throws IOException */ - public SolrInputDocument err(final DigestURI digestURI, final String[] collections, final String failReason, final FailType failType, final int httpstatus) throws IOException { + public SolrInputDocument err(final 
DigestURI digestURI, final Map collections, final String failReason, final FailType failType, final int 
httpstatus) throws IOException { boolean allAttr = this.isEmpty(); assert allAttr || contains(CollectionSchema.failreason_s); final SolrInputDocument doc = new SolrInputDocument(); - addURIAttributes(doc, allAttr, digestURI, Response.docType(digestURI)); + String url = addURIAttributes(doc, allAttr, digestURI, Response.docType(digestURI)); if (allAttr || contains(CollectionSchema.load_date_dt)) add(doc, CollectionSchema.load_date_dt, new Date()); // fail reason and status if (allAttr || contains(CollectionSchema.failreason_s)) add(doc, CollectionSchema.failreason_s, failReason); if (allAttr || contains(CollectionSchema.failtype_s)) add(doc, CollectionSchema.failtype_s, failType.name()); if (allAttr || contains(CollectionSchema.httpstatus_i)) add(doc, CollectionSchema.httpstatus_i, httpstatus); - if (allAttr || contains(CollectionSchema.collection_sxt)) add(doc, CollectionSchema.collection_sxt, collections); + if ((allAttr || contains(CollectionSchema.collection_sxt)) && collections != null && collections.size() > 0) { /* the null/empty guard must also hold when allAttr is set, otherwise entrySet() fails on a null map */ + List cs = new ArrayList(); + for (Map.Entry e: collections.entrySet()) { + if (e.getValue().matcher(url).matches()) cs.add(e.getKey()); + } + add(doc, CollectionSchema.collection_sxt, cs); + } return doc; } diff --git a/source/net/yacy/search/schema/WebgraphConfiguration.java b/source/net/yacy/search/schema/WebgraphConfiguration.java index 5d944ca3d..e3bf994ef 100644 --- a/source/net/yacy/search/schema/WebgraphConfiguration.java +++ b/source/net/yacy/search/schema/WebgraphConfiguration.java @@ -37,6 +37,7 @@ import java.util.Map; import java.util.Properties; import java.util.Set; import java.util.concurrent.BlockingQueue; +import java.util.regex.Pattern; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrInputDocument; @@ -114,7 +115,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial 
public void addEdges( final Subgraph subgraph, - final DigestURI source, final ResponseHeader 
responseHeader, String[] collections, int clickdepth_source, + final DigestURI source, final ResponseHeader responseHeader, Map collections, int clickdepth_source, final Map alllinks, final Map images, final boolean inbound, final Set links, final IndexCell citations) { @@ -146,11 +147,17 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial add(edge, WebgraphSchema.load_date_dt, loadDate); } if (allAttr || contains(WebgraphSchema.last_modified)) add(edge, WebgraphSchema.last_modified, responseHeader == null ? new Date() : responseHeader.lastModified()); - add(edge, WebgraphSchema.collection_sxt, collections); + final String source_url_string = source.toNormalform(false); + if ((allAttr || contains(WebgraphSchema.collection_sxt)) && collections != null && collections.size() > 0) { /* guard tests the same webgraph field that is written below; the null/empty check must also hold when allAttr is set */ + List cs = new ArrayList(); + for (Map.Entry e: collections.entrySet()) { + if (e.getValue().matcher(source_url_string).matches()) cs.add(e.getKey()); + } + add(edge, WebgraphSchema.collection_sxt, cs); + } // add the source attributes add(edge, WebgraphSchema.source_id_s, source_id); - final String source_url_string = source.toNormalform(false); int pr_source = source_url_string.indexOf("://",0); if (allAttr || contains(WebgraphSchema.source_protocol_s)) add(edge, WebgraphSchema.source_protocol_s, source_url_string.substring(0, pr_source)); if (allAttr || contains(WebgraphSchema.source_urlstub_s)) add(edge, WebgraphSchema.source_urlstub_s, source_url_string.substring(pr_source + 3));