diff --git a/defaults/federatecfg/datacite.solr.schema b/defaults/federatecfg/datacite.solr.schema
new file mode 100644
index 000000000..11823deae
--- /dev/null
+++ b/defaults/federatecfg/datacite.solr.schema
@@ -0,0 +1,32 @@
+## API datacite.org
+## This service is also available as an API. We use Solr Search Handler for our API calls, the endpoint is: http://search.datacite.org/api.
+
+## Please check Solr's common query parameters documentation in order to understand how to use API.
+## Examples
+
+## http://search.datacite.org/api?q=wind simple search for wind
+## http://search.datacite.org/api?q=wind&fl=doi,title&rows=5 search for wind, retrieve only doi and title, and return (at max.) 5 results
+## http://search.datacite.org/api?q=wind&fl=doi,title&wt=csv csv output
+## http://search.datacite.org/api?q=wind&fl=doi,title&wt=json&indent=true json output
+
+## YaCy solrconnector specific settings
+## the base URL used to access the system
+_baseurl = http://search.datacite.org/
+## Solr core, is appended to the _baseurl
+_corename = api
+## some systems store an identifier instead of a URL for the resource; this prefix is prepended to the identifier read from _skufieldname
+_skuprefix = http://dx.doi.org/
+## the field name of the url of resource (in yacy/solr = sku)
+_skufieldname = doi
+
+## field mappings
+## YaCyFieldname = remoteFieldname
+keywords = subject
+author = creator
+publisher_t = publisher
+title = title
+description_txt = description
+language_s = language
+text_t = description
+size_i = size
+coordinate_p = geoLocationPoint
\ No newline at end of file
diff --git a/defaults/heuristicopensearch.conf b/defaults/heuristicopensearch.conf
index 143f25a03..712674aba 100644
--- a/defaults/heuristicopensearch.conf
+++ b/defaults/heuristicopensearch.conf
@@ -14,8 +14,15 @@
#Blekko = http://blekko.com/ws/{searchTerms}+/rss # get 20 results from blekko
#Faroo-News = http://www.faroo.com/api?q={searchTerms}&start={startIndex}&length=20&l=en&src=news&f=rss # get results from Faroo news-search
-#openBDB = http://www.openbdb.com/b/{searchTerms}.xml # Open Book Database
#WordPress.com = http://en.search.wordpress.com/?q={searchTerms}&f=feed&page={startPage?} #Search WordPress.com Blogs
#Sueddeutsche.de = http://suche.sueddeutsche.de/query/{searchTerms}?output=rss # Sueddeutsche Zeitung Artikel Archiv
#Los Angeles Times = http://framework.latimes.com/?s={searchTerms}&feed=rss2
#Archive-It = http://archive-it.org/seam/resource/opensearch?q={searchTerms}&n=20 # archiving cultural heritage on the web
+
+## In addition to OpenSearch systems, other connectors are available to query foreign systems.
+## The syntax is
+## SystemName = cfgfile:_connectortype_:_schemaconfig_
+## where cfgfile: is a fixed prefix (signalling that this is not an OpenSearch URL),
+## _connectortype_ is the type of connector to use (currently available: solrconnector),
+## _schemaconfig_ is the config file with the field name mappings (the file has to exist in DATA/SETTINGS/federatecfg)
+#datacite.org = cfgfile:solrconnector:datacite.solr.schema # International Consortium for data citation
diff --git a/htroot/ConfigHeuristics_p.java b/htroot/ConfigHeuristics_p.java
index a171ea078..6bd9768c9 100644
--- a/htroot/ConfigHeuristics_p.java
+++ b/htroot/ConfigHeuristics_p.java
@@ -25,7 +25,6 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-import com.google.common.io.Files;
import java.io.File;
@@ -37,9 +36,10 @@ import net.yacy.search.Switchboard;
import java.io.IOException;
import java.util.Iterator;
+import net.yacy.cora.federate.FederateSearchManager;
-import net.yacy.cora.federate.opensearch.OpenSearchConnector;
import net.yacy.cora.federate.solr.SchemaConfiguration;
+import net.yacy.cora.storage.Files;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.schema.WebgraphSchema;
import net.yacy.server.serverObjects;
@@ -66,9 +66,9 @@ public class ConfigHeuristics_p {
if (post.containsKey("searchresultglobal_off")) sb.setConfig(SwitchboardConstants.HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL, false);
if (post.containsKey("opensearch_on")) {
sb.setConfig(SwitchboardConstants.HEURISTIC_OPENSEARCH, true);
- // re-read config (and create work table)
- OpenSearchConnector os = new OpenSearchConnector(sb, true);
- if (os.getSize() == 0) {
+ // re-read config; build the path with File(parent, child) so the separator between the
+ // data path and "DATA/SETTINGS" is not lost (plain string concatenation dropped it)
+ FederateSearchManager.getManager().init(new File(sb.getDataPath(), "DATA/SETTINGS/heuristicopensearch.conf").getAbsolutePath());
+ if (FederateSearchManager.getManager().getSize() == 0) {
osderrmsg = "no active search targets are configured";
}
}
@@ -77,8 +77,8 @@ public class ConfigHeuristics_p {
final boolean metafieldavailable = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_rel_s.name())
&& (sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_protocol_s.name()) && sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_urlstub_s.name()));
if (metafieldavailable) {
- OpenSearchConnector osc = new OpenSearchConnector(sb, false);
- if (osc.discoverFromSolrIndex(sb)) {
+ //OpenSearchConnector osc = new OpenSearchConnector(sb, false);
+ if (FederateSearchManager.getManager().discoverFromSolrIndex(sb)) {
osderrmsg = "started background search for target systems, refresh page after some minutes";
} else {
osderrmsg = "Error: webgraph Solr index not enabled";
@@ -98,8 +98,7 @@ public class ConfigHeuristics_p {
if (tmpname != null && tmpurl !=null) {
if (!tmpname.isEmpty() && !tmpurl.isEmpty() && tmpurl.toLowerCase().contains("{searchterms}")) {
final String tmpcomment = post.get("ossys_newcomment");
- OpenSearchConnector osc = new OpenSearchConnector(sb,false);
- osc.add (tmpname,tmpurl,false,tmpcomment);
+ FederateSearchManager.getManager().addOpenSearchTarget(tmpname,tmpurl,false,tmpcomment);
} else osderrmsg = "Url template must contain '{searchTerms}'";
}
}
@@ -143,6 +142,10 @@ public class ConfigHeuristics_p {
if ((post.containsKey("resettodefaultosdlist") || !osdConfig.exists()) && osdDefaultConfig.exists()) {
try {
Files.copy(osdDefaultConfig, osdConfig);
+ File defdir = new File(sb.dataPath, "DATA/SETTINGS/federatecfg");
+ if (!defdir.exists()) {
+ Files.copy(new File(sb.appPath, "defaults/federatecfg"), defdir);
+ }
} catch (final IOException ex) {
osderrmsg = "file I/O error during copy";
}
@@ -240,7 +243,7 @@ public class ConfigHeuristics_p {
// re-read config (and create/update work table)
if (sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, true)) {
- new OpenSearchConnector(sb, true);
+ FederateSearchManager.getManager().init(f.getAbsolutePath());
}
}
}
diff --git a/htroot/ConfigNetwork_p.java b/htroot/ConfigNetwork_p.java
index 2ca5a194b..97e408c8c 100644
--- a/htroot/ConfigNetwork_p.java
+++ b/htroot/ConfigNetwork_p.java
@@ -127,8 +127,8 @@ public class ConfigNetwork_p
sb.peers.mySeed().setPeerTags(MapTools.string2set(normalizedList(post.get("peertags")), ","));
}
- sb.setConfig("cluster.mode", post.get(SwitchboardConstants.CLUSTER_MODE, SwitchboardConstants.CLUSTER_MODE_PUBLIC_PEER));
- sb.setConfig("cluster.peers.ipport", checkIPPortList(post.get("cluster.peers.ipport", "")));
+ sb.setConfig(SwitchboardConstants.CLUSTER_MODE, post.get(SwitchboardConstants.CLUSTER_MODE, SwitchboardConstants.CLUSTER_MODE_PUBLIC_PEER));
+ sb.setConfig(SwitchboardConstants.CLUSTER_PEERS_IPPORT, checkIPPortList(post.get(SwitchboardConstants.CLUSTER_PEERS_IPPORT, "")));
sb.setConfig(
"cluster.peers.yacydomain",
checkYaCyDomainList(post.get("cluster.peers.yacydomain", "")));
diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java
index 24e7688d0..4743c59f7 100644
--- a/htroot/yacysearch.java
+++ b/htroot/yacysearch.java
@@ -45,7 +45,7 @@ import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.feed.RSSMessage;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
-import net.yacy.cora.federate.opensearch.OpenSearchConnector;
+import net.yacy.cora.federate.FederateSearchManager;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.geo.GeoLocation;
import net.yacy.cora.lod.vocabulary.Tagging;
@@ -719,10 +719,10 @@ public class yacysearch {
sb.heuristicSite(theSearch, modifier.sitehost);
}
if ( heuristicBlekko >= 0 && authenticated && !stealthmode ) {
- OpenSearchConnector.query(sb, theSearch);
+ FederateSearchManager.getManager().search(theSearch);
}
if (sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, false) && authenticated && !stealthmode) {
- OpenSearchConnector.query(sb, theSearch);
+ FederateSearchManager.getManager().search(theSearch);
}
}
diff --git a/source/net/yacy/cora/federate/AbstractFederateSearchConnector.java b/source/net/yacy/cora/federate/AbstractFederateSearchConnector.java
new file mode 100644
index 000000000..48fd05134
--- /dev/null
+++ b/source/net/yacy/cora/federate/AbstractFederateSearchConnector.java
@@ -0,0 +1,197 @@
+/**
+ * AbstractFederateSearchConnector.java
+ * Copyright 2015 by Burkhard Buelte
+ * First released 19.01.2015 at http://yacy.net
+ *
+ * This library is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2.1 of the License, or (at your option)
+ * any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program in the file lgpl21.txt If not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+package net.yacy.cora.federate;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import javax.servlet.http.HttpServletResponse;
+import net.yacy.cora.document.id.DigestURL;
+import net.yacy.cora.federate.solr.SchemaConfiguration;
+import net.yacy.cora.federate.solr.SchemaDeclaration;
+import net.yacy.cora.federate.solr.SolrType;
+import net.yacy.cora.sorting.ReversibleScoreMap;
+import net.yacy.cora.storage.Configuration;
+import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.kelondro.data.meta.URIMetadataNode;
+import net.yacy.search.Switchboard;
+import net.yacy.search.query.SearchEvent;
+import net.yacy.search.schema.CollectionSchema;
+import org.apache.solr.common.SolrDocument;
+
+/**
+ * Base implementation class for federated search connectors, providing the basic
+ * functionality to search non-YaCy systems:
+ * <ul>
+ * <li>init() reads the config file</li>
+ * <li>toYaCySchema() converts remote schema fields to YaCy internal schema
+ * names, called by query()</li>
+ * <li>query() needs to be implemented in the specific connectors</li>
+ * <li>search() calls query() in a thread and adds the results to the internal search request</li>
+ * </ul>
+ * Subclasses need to implement query() and may override toYaCySchema() if more
+ * than a basic field mapping is needed.
+ */
+abstract public class AbstractFederateSearchConnector implements FederateSearchConnector {
+
+ public String instancename; // identifying name of this connector instance; set by init() and used as thread, log and heuristic name
+ protected SchemaConfiguration localcfg; // schema conversion config, one entry per field: yacyFieldname = remoteFieldname
+ public long lastaccesstime = -1; // last time accessed, used for search delay calculation (-1 until set)
+ protected String baseurl; // not assigned in this class; presumably the query target set by subclasses - TODO confirm
+
+    /**
+     * Initializes the connector from its schema config file, which maps YaCy
+     * field names to the field names of the remote system and may hold further
+     * connector specific settings. Every connector needs at least a query
+     * target (where to query) and some definition to convert the remote search
+     * result to the internal result presentation (field mapping).
+     *
+     * @param instance internal name of this connector instance
+     * @param cfgFileName path of the schema config file, e.g. DATA/SETTINGS/federatecfg/instanceName.schema
+     * @return true if the file was read and contains a mapping for "sku" or a
+     * "_skufieldname" entry, false otherwise
+     */
+    @Override
+    public boolean init(String instance, String cfgFileName) {
+        this.instancename = instance;
+        this.localcfg = null; // never keep a stale mapping from an earlier init when this one fails
+
+        final File instanceCfgFile = new File(cfgFileName);
+        if (!instanceCfgFile.exists()) {
+            ConcurrentLog.config(this.instancename, "schema file not found " + cfgFileName);
+            return false;
+        }
+
+        try {
+            this.localcfg = new SchemaConfiguration(instanceCfgFile);
+        } catch (IOException ex) {
+            ConcurrentLog.config(this.instancename, "error reading schema " + cfgFileName + ": " + ex.getMessage());
+            return false;
+        }
+
+        // mandatory to contain a mapping for "sku" or alternatively "_skufieldname" for a conversion to a final url
+        if (this.localcfg.contains(CollectionSchema.sku) || this.localcfg.contains("_skufieldname")) {
+            return true;
+        }
+        ConcurrentLog.config(this.instancename, "mandatory mapping for sku or _skufieldname missing in " + cfgFileName);
+        return false;
+    }
+
+    /**
+     * Queries the remote system in a background thread and adds the results to
+     * the search event. If theSearch.addResultsToLocalIndex is true, the result
+     * urls are additionally handed to the crawler.
+     *
+     * @param theSearch search event receiving the results
+     */
+    @Override
+    public void search(final SearchEvent theSearch) {
+
+        final Thread job = new Thread() {
+            @Override
+            public void run() {
+                Thread.currentThread().setName("heuristic:" + instancename);
+                theSearch.oneFeederStarted();
+                List<URIMetadataNode> doclist = null;
+                try {
+                    doclist = query(theSearch.getQuery());
+                    if (doclist != null) {
+                        // addNodes doesn't allow null, so empty maps are passed
+                        Map<String, LinkedHashSet<String>> snippets = new HashMap<String, LinkedHashSet<String>>();
+                        Map<String, ReversibleScoreMap<String>> facets = new HashMap<String, ReversibleScoreMap<String>>();
+                        theSearch.addNodes(doclist, facets, snippets, false, instancename, doclist.size());
+
+                        for (URIMetadataNode doc : doclist) {
+                            theSearch.addHeuristic(doc.hash(), instancename, false);
+                        }
+                    }
+                } finally {
+                    // that's all we need to display the search result; always balance
+                    // oneFeederStarted(), even if the remote query failed with an exception
+                    theSearch.oneFeederTerminated();
+                }
+
+                // optional: add to crawler to get the full resource (later)
+                if (doclist != null && !doclist.isEmpty() && theSearch.addResultsToLocalIndex) {
+                    Collection<DigestURL> urls = new ArrayList<DigestURL>();
+                    for (URIMetadataNode doc : doclist) {
+                        urls.add(doc.url());
+                    }
+                    Switchboard.getSwitchboard().addToCrawler(urls, false);
+                }
+            }
+        };
+        job.start();
+    }
+
+    /**
+     * Converts a remote schema result to YaCy schema using the fieldname
+     * mapping provided as config file.
+     *
+     * @param doc remote result (with remote fieldnames)
+     * @return result document with field names according to the YaCy schema
+     */
+    protected URIMetadataNode toYaCySchema(final SolrDocument doc) {
+        // set YaCy id: either a direct "sku" mapping, or "_skufieldname" with an optional "_skuprefix"
+        String urlstr;
+        if (localcfg.contains("sku")) {
+            urlstr = (String) doc.getFieldValue(localcfg.get("sku").getValue());
+        } else {
+            urlstr = (String) doc.getFieldValue(localcfg.get("_skufieldname").getValue());
+            if (this.localcfg.contains("_skuprefix")) {
+                String skuprefix = this.localcfg.get("_skuprefix").getValue();
+                urlstr = skuprefix + urlstr;
+            }
+        }
+
+        URIMetadataNode newdoc = new URIMetadataNode(urlstr);
+        Iterator<Configuration.Entry> it = localcfg.entryIterator();
+        while (it.hasNext()) {
+            Configuration.Entry et = it.next();
+            String yacyfieldname = et.key(); // config defines yacyfieldname = remotefieldname
+            String remotefieldname = et.getValue();
+            if (remotefieldname == null || remotefieldname.isEmpty()) {
+                continue;
+            }
+            // only fields contained in the local default configuration are copied
+            if (!Switchboard.getSwitchboard().index.fulltext().getDefaultConfiguration().contains(yacyfieldname)) {
+                continue;
+            }
+
+            SchemaDeclaration est = CollectionSchema.valueOf(yacyfieldname);
+            if (est.isMultiValued()) {
+                Collection<Object> values = doc.getFieldValues(remotefieldname);
+                if (values != null) {
+                    newdoc.addField(yacyfieldname, values);
+                }
+            } else if (doc.getFieldValue(remotefieldname) != null) {
+                Object val = doc.getFirstValue(remotefieldname);
+                // watch out for type conversion
+                try {
+                    if (est.getType() == SolrType.num_integer && val instanceof String) {
+                        newdoc.setField(yacyfieldname, Integer.parseInt((String) val));
+                    } else {
+                        newdoc.setField(yacyfieldname, val);
+                    }
+                } catch (Exception ex) {
+                    continue; // catch possible parse or type mismatch, skip the field
+                }
+            }
+        }
+
+        newdoc.addField(CollectionSchema.httpstatus_i.name(), HttpServletResponse.SC_OK); // yacy required
+        return newdoc;
+    }
+}
diff --git a/source/net/yacy/cora/federate/FederateSearchConnector.java b/source/net/yacy/cora/federate/FederateSearchConnector.java
new file mode 100644
index 000000000..4f7ebfa8e
--- /dev/null
+++ b/source/net/yacy/cora/federate/FederateSearchConnector.java
@@ -0,0 +1,62 @@
+/**
+ * FederateSearchConnector.java
+ * Copyright 2015 by Burkhard Buelte
+ * First released 19.01.2015 at http://yacy.net
+ *
+ * This library is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2.1 of the License, or (at your option)
+ * any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program in the file lgpl21.txt If not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+package net.yacy.cora.federate;
+
+import java.util.List;
+import net.yacy.kelondro.data.meta.URIMetadataNode;
+import net.yacy.search.query.QueryParams;
+import net.yacy.search.query.SearchEvent;
+
+
+/**
+ * Interface for a query connector to search and gather query results from
+ * non-YaCy systems (for the YaCy heuristic options)
+ */
+public interface FederateSearchConnector {
+
+    /**
+     * Loads the configuration for this connector. Every connector needs at
+     * least a query target (where to query) and some definition to convert the
+     * remote search result to the internal result presentation (field mapping).
+     *
+     * @param instanceName internal name of the connector instance
+     * @param cfg connector specific config parameter (e.g. the path of a
+     * schema config file or a query url template)
+     * @return true if success false if not
+     */
+    boolean init(String instanceName, String cfg);
+
+    /**
+     * Queries a remote system and adds the result metadata to the search events
+     * result list. If SearchEvent.addResultsToLocalIndex (=default) result urls
+     * are added to the crawler.
+     *
+     * @param theSearch search event receiving the results
+     */
+    void search(SearchEvent theSearch);
+
+    /**
+     * Queries a remote system and returns the search result with field names
+     * according to YaCy schema.
+     *
+     * @param query the query to send to the remote system
+     * @return result (metadata) in YaCy schema format
+     */
+    List<URIMetadataNode> query(QueryParams query);
+
+}
diff --git a/source/net/yacy/cora/federate/FederateSearchManager.java b/source/net/yacy/cora/federate/FederateSearchManager.java
new file mode 100644
index 000000000..8d2ba017d
--- /dev/null
+++ b/source/net/yacy/cora/federate/FederateSearchManager.java
@@ -0,0 +1,427 @@
+/**
+ * FederateSearchManager.java
+ * Copyright 2015 by Burkhard Buelte
+ * First released 19.01.2015 at http://yacy.net
+ *
+ * This library is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2.1 of the License, or (at your option)
+ * any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program in the file lgpl21.txt If not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+package net.yacy.cora.federate;
+
+import net.yacy.cora.federate.opensearch.OpenSearchConnector;
+import java.io.File;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+import net.yacy.cora.document.analysis.Classification;
+import net.yacy.cora.document.id.MultiProtocolURL;
+import net.yacy.cora.federate.solr.connector.SolrConnector;
+import net.yacy.cora.federate.yacy.CacheStrategy;
+import net.yacy.cora.storage.Configuration;
+import net.yacy.cora.storage.Configuration.Entry;
+import net.yacy.cora.storage.Files;
+import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.document.parser.xml.opensearchdescriptionReader;
+import net.yacy.kelondro.data.meta.URIMetadataNode;
+import net.yacy.kelondro.util.Bitfield;
+import net.yacy.search.Switchboard;
+import net.yacy.search.SwitchboardConstants;
+import net.yacy.search.query.QueryGoal;
+import net.yacy.search.query.QueryModifier;
+import net.yacy.search.query.QueryParams;
+import net.yacy.search.query.SearchEvent;
+import net.yacy.search.schema.WebgraphSchema;
+import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.SolrDocumentList;
+
+/**
+ * Handling of queries to the configured remote systems (OpenSearch targets and Solr connectors).
+ */
+public class FederateSearchManager {
+
+    private final int accessDelay = 15000; // delay between connects (in ms)
+
+    private File confFile = null; // later initialized to DATA/SETTINGS/heuristicopensearch.conf
+    private HashSet<AbstractFederateSearchConnector> conlist; // connector list (only enabled targets)
+    protected Configuration cfg; // content of the config file, (re)loaded by the constructor and init()
+    private static FederateSearchManager manager = null; // self reference for static .getManager()
+
+    /**
+     * Creates the manager and loads all enabled search targets from
+     * DATA/SETTINGS/heuristicopensearch.conf. If that file does not exist it is
+     * copied from the defaults first (together with the federatecfg directory,
+     * if missing). The created instance is registered for getManager().
+     *
+     * @param sb the switchboard providing data and application paths; if null
+     * only an empty connector list is created
+     */
+    public FederateSearchManager(Switchboard sb) {
+        super();
+        this.conlist = new HashSet<AbstractFederateSearchConnector>();
+
+        // from here we need Switchboard settings
+        if (sb == null) {
+            return;
+        }
+        // Data needed active name, url(template), desc, rule-when-to-use, specifics
+        confFile = new File(sb.getDataPath(), "DATA/SETTINGS/heuristicopensearch.conf");
+        if (!confFile.exists()) {
+            try {
+                Files.copy(new File(sb.appPath, "defaults/heuristicopensearch.conf"), confFile);
+                File defdir = new File(sb.dataPath, "DATA/SETTINGS/federatecfg");
+                if (!defdir.exists()) {
+                    Files.copy(new File(sb.appPath, "defaults/federatecfg"), defdir);
+                }
+            } catch (IOException ex) {
+                // without the defaults there is simply nothing to load below, but leave a trace
+                ConcurrentLog.logException(ex);
+            }
+        }
+        // read settings config file
+        if (confFile.exists()) {
+            try {
+                cfg = new Configuration(confFile);
+                Iterator<Entry> it = cfg.entryIterator();
+                while (it.hasNext()) {
+                    Entry cfgentry = it.next();
+                    String url = cfgentry.getValue();
+                    if (cfgentry.enabled() && url != null && !url.isEmpty()) {
+                        String name = cfgentry.key();
+                        if (url.startsWith("cfgfile:")) { // is cfgfile with field mappings (no opensearch url)
+                            // format prefix:connectortype:configfilename
+                            // example cfgfile:solrconnector:testsys.solr.schema
+                            String[] parts = url.split(":");
+                            // the length check keeps a truncated entry from throwing ArrayIndexOutOfBoundsException
+                            if (parts.length >= 3 && parts[1].equalsIgnoreCase("solrconnector")) {
+                                SolrFederateSearchConnector sfc = new SolrFederateSearchConnector();
+                                if (sfc.init(name, sb.getDataPath()+ "/DATA/SETTINGS/federatecfg/" + parts[2])) {
+                                    conlist.add(sfc);
+                                }
+                            } else {
+                                ConcurrentLog.config("FederateSearchManager", "Error in configuration of: " + url);
+                            }
+                        } else { // handle opensearch url template
+                            OpenSearchConnector osc = new OpenSearchConnector();
+                            if (osc.init(name, url)) {
+                                conlist.add(osc);
+                            }
+                        }
+                    }
+                }
+            } catch (IOException ex) {
+                ConcurrentLog.logException(ex);
+            }
+        }
+        manager = this; // reference for static access via .getManager()
+    }
+
+    /**
+     * Returns the manager instance, creating it from the current Switchboard
+     * on first use. There should be only one instance running, so use this
+     * method to get or initialize the manager.
+     *
+     * @return the manager instance
+     */
+    public static FederateSearchManager getManager() {
+        if (manager != null) {
+            return manager;
+        }
+        manager = new FederateSearchManager(Switchboard.getSwitchboard());
+        return manager;
+    }
+
+    /**
+     * Sends a query request to remote systems configured.
+     * If search query domain is LOCAL (or theSearch is null) procedure does nothing.
+     *
+     * @param theSearch search event receiving the results
+     */
+    public void search(SearchEvent theSearch) {
+        if (theSearch == null || theSearch.query.isLocal()) {
+            return;
+        }
+        Set<AbstractFederateSearchConnector> picklist = getBest(theSearch.getQuery());
+        for (AbstractFederateSearchConnector fsc : picklist) {
+            fsc.search(theSearch);
+        }
+    }
+
+    /**
+     * Sends a query to configured remote systems and collects the results of
+     * all connectors selected by getBest().
+     * NOTE(review): the query is only executed if query.isLocal() is true (the
+     * String variant of this method builds such a query), which is the opposite
+     * of the check in search() - confirm this is intended.
+     *
+     * @param query the query to send
+     * @return list of results according to YaCy schema, or null if the query
+     * is not a local one
+     */
+    public List<URIMetadataNode> query(QueryParams query) {
+        if (!query.isLocal()) {
+            return null;
+        }
+        List<URIMetadataNode> sdl = new ArrayList<URIMetadataNode>();
+        Set<AbstractFederateSearchConnector> picklist = getBest(query);
+        for (AbstractFederateSearchConnector fsc : picklist) {
+            List<URIMetadataNode> part = fsc.query(query);
+            if (part != null) { // a connector may deliver null, addAll(null) would throw
+                sdl.addAll(part);
+            }
+        }
+        return sdl;
+    }
+
+    /**
+     * Takes a search string, converts it to queryparams (search domain LOCAL,
+     * up to 100 results from offset 0) and calls query(queryparams)
+     *
+     * @param querystr the search string
+     * @return list of remote query results according to YaCy schema
+     */
+    public List<URIMetadataNode> query(String querystr) {
+
+        final QueryGoal goal = new QueryGoal(querystr);
+        final Switchboard switchboard = Switchboard.getSwitchboard();
+        final Bitfield constraint = new Bitfield();
+        final QueryParams params = new QueryParams(
+                goal,
+                new QueryModifier(),
+                Integer.MAX_VALUE,
+                "",
+                Classification.ContentDomain.ALL,
+                "", //lang
+                null,
+                CacheStrategy.IFFRESH,
+                100, 0, //count, offset
+                ".*", //urlmask
+                null,
+                null,
+                QueryParams.Searchdom.LOCAL,
+                constraint,
+                false,
+                null,
+                MultiProtocolURL.TLD_any_zone_filter,
+                "",
+                false,
+                switchboard.index,
+                switchboard.getRanking(),
+                "",//userAgent
+                false,
+                false,
+                0.0, 0.0, -1.0,
+                new String[0]);
+
+        return query(params);
+    }
+
+    /**
+     * Add a search target system/connector to the config file and, if it is
+     * active, to the list of connectors in use.
+     *
+     * @param name name of the target system (key in the config file)
+     * @param urlTemplate query template url
+     * @param active true if the target shall be enabled
+     * @param comment comment stored with the config entry
+     * @return true if the target was written to the config file, false if no
+     * config file is known, the name is empty or the file could not be read or
+     * written
+     */
+    public boolean addOpenSearchTarget(String name, String urlTemplate, boolean active, String comment) {
+        if (confFile == null || name == null || name.isEmpty()) {
+            return false;
+        }
+
+        final Configuration conf;
+        try {
+            conf = new Configuration(confFile);
+            conf.add(name, null, active);
+            Configuration.Entry e = conf.get(name);
+            e.setValue(urlTemplate);
+            e.setEnable(active);
+            e.setComment(comment);
+            conf.put(name, e);
+        } catch (final IOException e1) {
+            ConcurrentLog.logException(e1);
+            return false;
+        }
+
+        try {
+            conf.commit();
+        } catch (final IOException ex) {
+            ConcurrentLog.warn("FederateSearchManager", "config file write error");
+            return false; // the target was not persisted, so do not report success
+        }
+
+        if (active) {
+            OpenSearchConnector osd = new OpenSearchConnector();
+            if (osd.init(name, urlTemplate)) {
+                conlist.add(osd);
+            }
+        }
+        return true;
+    }
+
+ /**
+ * Get the number of active remote query target systems, i.e. the number of
+ * connectors currently held in the connector list (only enabled/active
+ * targets are added to it).
+ *
+ * @return size of the connector list
+ */
+ public int getSize() {
+ return conlist.size();
+ }
+
+    /**
+     * Get best systems from configured targets for this search. Currently the
+     * query itself is not evaluated; only the minimum access delay is enforced.
+     *
+     * @param query the query to find suitable targets for
+     * @return set of search target connectors which were not accessed within
+     * the last accessDelay milliseconds
+     */
+    protected Set<AbstractFederateSearchConnector> getBest(final QueryParams query) {
+        HashSet<AbstractFederateSearchConnector> retset = new HashSet<AbstractFederateSearchConnector>();
+        // currently only enforces limits (min access delay, frequency)
+        for (AbstractFederateSearchConnector fsc : conlist) {
+            // check access time
+            if (fsc.lastaccesstime + accessDelay < System.currentTimeMillis()) { // enforce 15 sec delay between searches to same system
+                retset.add(fsc);
+            }
+        }
+        return retset;
+    }
+
+    /**
+     * Discover opensearch description links from the local (embedded) Solr
+     * webgraph index, using the fields target_rel_s, target_protocol_s and
+     * target_urlstub_s, and add found systems (disabled) to the config file.
+     * The check of the found links runs as a background thread which stops
+     * after about one hour at the latest.
+     *
+     * @param sb the switchboard giving access to the index and settings
+     * @return true if the background discover job was started or the index
+     * holds no candidate links, false if the job could not be started
+     */
+    public boolean discoverFromSolrIndex(Switchboard sb) {
+        if (sb == null) {
+            return false;
+        }
+        // check if needed Solr fields are available (selected)
+        if (!sb.index.fulltext().useWebgraph()) {
+            ConcurrentLog.severe("FederateSearchManager", "Error on connecting to embedded Solr webgraph index");
+            return false;
+        }
+        final SolrConnector connector = sb.index.fulltext().getWebgraphConnector();
+        final boolean metafieldavailable = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_rel_s.name())
+                && (sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_protocol_s.name()) && sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_urlstub_s.name()))
+                && sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false);
+        if (!metafieldavailable) {
+            ConcurrentLog.warn("FederateSearchManager", "webgraph option and webgraph Schema fields target_rel_s, target_protocol_s and target_urlstub_s must be switched on");
+            return false;
+        }
+        // the solr search
+        final String webgraphquerystr = WebgraphSchema.target_rel_s.getSolrFieldName() + ":search";
+        final String[] webgraphqueryfields = {WebgraphSchema.target_protocol_s.getSolrFieldName(), WebgraphSchema.target_urlstub_s.getSolrFieldName()};
+        // alternatively target_protocol_s + "://" +target_host_s + target_path_s
+
+        final long numfound;
+        try {
+            SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, null, 0, 1, webgraphqueryfields);
+            numfound = docList.getNumFound();
+            if (numfound == 0) {
+                ConcurrentLog.info("FederateSearchManager", "no results found, abort discover job");
+                return true;
+            }
+            ConcurrentLog.info("FederateSearchManager", "start checking " + Long.toString(numfound) + " found index results");
+        } catch (final IOException ex) {
+            ConcurrentLog.logException(ex);
+            return false;
+        }
+
+        final long stoptime = System.currentTimeMillis() + 1000 * 3600; // make sure job doesn't run forever
+
+        // job to iterate through Solr index to find links to opensearchdescriptions
+        // started as background job as connect timeouts may cause it run a long time
+        final Thread job = new Thread() {
+            @Override
+            public void run() {
+                try {
+                    boolean doloop = true;
+                    int loopnr = 0;
+                    Set<String> dblmem = new HashSet<String>(); // temp memory for already checked url
+                    while (doloop) {
+                        ConcurrentLog.info("FederateSearchManager", "start Solr query loop at " + Integer.toString(loopnr * 20) + " of " + Long.toString(numfound));
+                        SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, null, loopnr * 20, 20, webgraphqueryfields); // check chunk of 20 result documents
+                        loopnr++;
+                        if (stoptime < System.currentTimeMillis()) {// stop after max 1h
+                            doloop = false;
+                            ConcurrentLog.info("FederateSearchManager", "long running discover task aborted");
+                        }
+                        if (docList != null && docList.size() > 0) {
+                            for (SolrDocument sdoc : docList) {
+
+                                String hrefurltxt = sdoc.getFieldValue(WebgraphSchema.target_protocol_s.getSolrFieldName()) + "://" + sdoc.getFieldValue(WebgraphSchema.target_urlstub_s.getSolrFieldName());
+                                try {
+                                    URL url = new URL(hrefurltxt);
+                                    //TODO: check Blacklist
+                                    if (dblmem.add(url.getAuthority())) { // use only main path to detect double entries
+                                        opensearchdescriptionReader os = new opensearchdescriptionReader(hrefurltxt);
+                                        if (os.getRSSorAtomUrl() != null) {
+                                            // add found system to config file, report success only if it really was added
+                                            if (addOpenSearchTarget(os.getShortName(), os.getRSSorAtomUrl(), false, os.getItem("LongName"))) {
+                                                ConcurrentLog.info("FederateSearchManager", "added " + os.getShortName() + " " + hrefurltxt);
+                                            } else {
+                                                ConcurrentLog.info("FederateSearchManager", "could not add " + os.getShortName() + " " + hrefurltxt);
+                                            }
+                                        } else {
+                                            ConcurrentLog.info("FederateSearchManager", "osd.xml check failed (no RSS or Atom support) for " + hrefurltxt);
+                                        }
+                                    }
+                                } catch (final MalformedURLException ex) {
+                                    // a broken link in the index is no reason to stop, skip it
+                                    ConcurrentLog.info("FederateSearchManager", "skipped malformed url " + hrefurltxt);
+                                }
+                            }
+                        } else {
+                            doloop = false;
+                        }
+                    }
+                    ConcurrentLog.info("FederateSearchManager", "finisched Solr query (checked " + Integer.toString(dblmem.size()) + " unique opensearchdescription links found in " + Long.toString(numfound) + " results)");
+                } catch (final IOException ex) {
+                    ConcurrentLog.logException(ex);
+                }
+            }
+        };
+        job.start();
+        return true;
+    }
+
+    /**
+     * Read or reread opensearch config file and initialize connectors. Only
+     * enabled entries are held in memory; the connector list is replaced by
+     * the content of the file.
+     *
+     * @param cfgFileName path of the config file (heuristicopensearch.conf);
+     * schema files of cfgfile: entries are expected in the subdirectory
+     * federatecfg next to it
+     * @return true if the config file was read, false if it does not exist or
+     * could not be read
+     */
+    public boolean init(String cfgFileName) {
+        confFile = new File(cfgFileName);
+        if (!confFile.exists()) {
+            return false;
+        }
+        try {
+            cfg = new Configuration(confFile);
+            if (!this.conlist.isEmpty()) this.conlist.clear(); // prevent double entries
+            Iterator<Entry> it = cfg.entryIterator();
+            while (it.hasNext()) {
+                Entry cfgentry = it.next();
+                if (!cfgentry.enabled()) { // hold only enabled in memory
+                    continue;
+                }
+                String name = cfgentry.key();
+                String url = cfgentry.getValue();
+                if (url == null || url.isEmpty()) {
+                    continue;
+                }
+                if (url.startsWith("cfgfile:")) { // is cfgfile with field mappings (no opensearch url)
+                    // config entry has 3 parts separated by : 1=cfgfile 2=connectortype 3=relative path to connector-cfg-file
+                    // example cfgfile:solrconnector:testsys.solr.schema
+                    String[] parts = url.split(":");
+                    // the length check keeps a truncated entry from throwing ArrayIndexOutOfBoundsException
+                    if (parts.length >= 3 && parts[1].equalsIgnoreCase("solrconnector")) {
+                        SolrFederateSearchConnector sfc = new SolrFederateSearchConnector();
+                        if (sfc.init(name, confFile.getParent()+"/federatecfg/"+parts[2])) {
+                            conlist.add(sfc);
+                        }
+                    } else {
+                        ConcurrentLog.config("FederateSearchManager", "Init error in configuration of: " + url);
+                    }
+                } else { // handle opensearch url template
+                    OpenSearchConnector osd = new OpenSearchConnector();
+                    if (osd.init(name, url)) {
+                        conlist.add(osd);
+                    }
+                }
+            }
+        } catch (IOException ex) {
+            ConcurrentLog.logException(ex);
+            return false;
+        }
+        return true;
+    }
+
+}
diff --git a/source/net/yacy/cora/federate/SolrFederateSearchConnector.java b/source/net/yacy/cora/federate/SolrFederateSearchConnector.java
new file mode 100644
index 000000000..7e9fceaaa
--- /dev/null
+++ b/source/net/yacy/cora/federate/SolrFederateSearchConnector.java
@@ -0,0 +1,119 @@
+/**
+ * SolrFederateSearchConnector.java
+ * Copyright 2015 by Burkhard Buelte
+ * First released 19.01.2015 at http://yacy.net
+ *
+ * This library is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2.1 of the License, or (at your option)
+ * any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program in the file lgpl21.txt. If not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+package net.yacy.cora.federate;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import net.yacy.cora.federate.solr.connector.RemoteSolrConnector;
+import net.yacy.cora.federate.solr.connector.SolrConnector;
+import net.yacy.cora.federate.solr.instance.RemoteInstance;
+import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.kelondro.data.meta.URIMetadataNode;
+import net.yacy.search.query.QueryParams;
+import org.apache.solr.client.solrj.SolrQuery;
+import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.SolrDocumentList;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.params.CommonParams;
+import org.apache.solr.common.params.ModifiableSolrParams;
+
+/**
+ * Search connector to collect query results from remote Solr systems which
+ * provide results as Solr documents. Remote errors are logged, never thrown.
+ */
+public class SolrFederateSearchConnector extends AbstractFederateSearchConnector {
+
+    private String corename;
+
+    /**
+     * Initializes the local schema cfg (superclass) and reads _baseurl and _corename from it.
+     * @param instance name of the remote system, also used as log tag
+     * @param cfgFileName config file with the field mappings and connector settings
+     * @return false if the superclass init failed or no _baseurl is given (_corename is optional)
+     */
+    @Override
+    public boolean init(String instance, String cfgFileName) {
+        boolean initResult = super.init(instance, cfgFileName); // init local schema cfg
+        if (initResult) {
+            String url = this.localcfg.contains("_baseurl") ? this.localcfg.get("_baseurl").getValue() : null;
+            if (url != null && !url.isEmpty()) {
+                setBaseurl(url);
+            } else {
+                ConcurrentLog.config(instance, "no _baseurl given in config file "+cfgFileName);
+                initResult = false;
+            }
+            String core = this.localcfg.contains("_corename") ? this.localcfg.get("_corename").getValue() : null;
+            if (core == null) ConcurrentLog.config(instance, "no _corename given in config file " + cfgFileName); // not mandatory
+            setCoreName(core == null ? "" : core);
+        }
+        return initResult;
+    }
+
+    /** Sets the base url of the remote system; a missing trailing slash is appended. */
+    public void setBaseurl(String url) {
+        this.baseurl = url.endsWith("/") ? url : url + "/";
+    }
+
+    /** Sets the name of the remote Solr core to query. */
+    public void setCoreName(String core) {
+        this.corename = core;
+    }
+
+    /**
+     * Core query implementation, used by all query and search routines to query the remote system.
+     * @param query the query; its query string and itemsPerPage are sent to the remote system
+     * @return list of solr documents (metadata) according to local YaCy internal schema, empty on error
+     */
+    @Override
+    public List<URIMetadataNode> query(QueryParams query) {
+        List<URIMetadataNode> docs = new ArrayList<>();
+        Collection<String> remotecorename = new ArrayList<>();
+        remotecorename.add(corename);
+        ModifiableSolrParams msp = new SolrQuery(query.getQueryGoal().getQueryString(false));
+        msp.add(CommonParams.QT, "/"); // important to override default append of /select
+        msp.add(CommonParams.ROWS, Integer.toString(query.itemsPerPage));
+        try {
+            RemoteInstance instance = new RemoteInstance(baseurl, remotecorename, corename, 20000);
+            try {
+                SolrConnector solrConnector = new RemoteSolrConnector(instance, false, null);
+                try {
+                    this.lastaccesstime = System.currentTimeMillis();
+                    SolrDocumentList docList = solrConnector.getDocumentListByParams(msp);
+                    for (SolrDocument doc : docList) { // convert to YaCy schema documentlist
+                        docs.add(toYaCySchema(doc));
+                    }
+                } catch (IOException | SolrException e) {
+                    ConcurrentLog.warn("SolrFederateSearchConnector", "query of " + this.instancename + " failed: " + e);
+                } finally {
+                    solrConnector.close();
+                }
+            } catch (Throwable ee) { // best effort: a broken remote system must not break the search
+                ConcurrentLog.warn("SolrFederateSearchConnector", "unexpected error querying " + this.instancename + ": " + ee);
+            } finally {
+                instance.close();
+            }
+        } catch (IOException eee) {
+            ConcurrentLog.warn("SolrFederateSearchConnector", "no connection to " + baseurl + ": " + eee);
+        }
+        return docs;
+    }
+}
diff --git a/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java b/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java
index 68fee2161..defc0e1cc 100644
--- a/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java
+++ b/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java
@@ -19,107 +19,45 @@
*/
package net.yacy.cora.federate.opensearch;
-import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Set;
-
-import net.yacy.cora.federate.solr.connector.SolrConnector;
-import net.yacy.cora.storage.Configuration;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import net.yacy.cora.document.encoding.UTF8;
+import net.yacy.cora.document.feed.RSSFeed;
+import net.yacy.cora.document.feed.RSSMessage;
+import net.yacy.cora.document.feed.RSSReader;
+import net.yacy.cora.document.id.DigestURL;
+import net.yacy.cora.document.id.MultiProtocolURL;
+import net.yacy.cora.federate.AbstractFederateSearchConnector;
+import net.yacy.cora.federate.FederateSearchConnector;
+import net.yacy.cora.protocol.ClientIdentification;
+import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.cora.util.SpaceExceededException;
-import net.yacy.document.parser.xml.opensearchdescriptionReader;
-import net.yacy.kelondro.blob.Tables;
-import net.yacy.search.Switchboard;
-import net.yacy.search.SwitchboardConstants;
-import net.yacy.search.query.SearchEvent;
-import net.yacy.search.schema.WebgraphSchema;
-
-import org.apache.solr.common.SolrDocument;
-import org.apache.solr.common.SolrDocumentList;
+import net.yacy.document.TextParser;
+import net.yacy.kelondro.data.meta.URIMetadataNode;
+import net.yacy.search.query.QueryParams;
+import net.yacy.search.schema.CollectionSchema;
/**
* Handling of queries to remote OpenSearch systems. Iterates to a list of
- * configured systems until number of needed results are available. Uses a
- * temporary work table to store search template urls for the iteration during
- * search.
+ * configured systems until number of needed results are available.
*/
-public class OpenSearchConnector {
-
- private File confFile = null; // later initialized to DATA/SETTINGS/heuristicopensearch.conf
- private int size = 0; // remember the size of active opensearch targets
-
- public OpenSearchConnector(Switchboard sb, boolean createworktable) {
- super();
- if (sb == null) {
- return;
- }
-
- confFile = new File(sb.getDataPath(), "DATA/SETTINGS/heuristicopensearch.conf");
-
- if (createworktable) { // read from config file and create worktable
- sb.tables.clear("opensearchsys");
- try {
- Configuration cfg = new Configuration(confFile);
-
- // copy active opensearch systems to a work table (opensearchsys)
- Iterator cfgentries = cfg.entryIterator();
- while (cfgentries.hasNext()) {
- Configuration.Entry e = cfgentries.next();
- if (e.enabled()) {
- String title = e.key(); // get the title
- String urlstr = e.getValue(); // get the search template url
+public class OpenSearchConnector extends AbstractFederateSearchConnector implements FederateSearchConnector {
- Tables.Data row = new Tables.Data();
- row.put("title", title);
- row.put("url", urlstr);
- try {
- sb.tables.insert("opensearchsys", row);
- } catch (final SpaceExceededException ex) {
- ConcurrentLog.logException(ex);
- }
- }
- }
- size = sb.tables.size("opensearchsys");
- } catch (final IOException ex) {
- ConcurrentLog.logException(ex);
- }
- }
- }
-
- /**
- * Sends a search request to remote systems listed in worktable until the
- * searchevent contains less than needed results. Depending on already
- * collected search results none to all configured systems are queried to
- * complete available search results.
- * if query search domain is LOCAL procedure does nothing.
- */
- static public void query(Switchboard sb, SearchEvent theSearch) {
- if (theSearch != null && sb != null) {
- if (!theSearch.query.isLocal()) {
- try {
- Iterator ossysworktable = sb.tables.iterator("opensearchsys");
- //int needres = theSearch.query.neededResults(); // get number of needed results
- while (ossysworktable.hasNext() /*&& theSearch.query.getResultCount() < needres*/) {
- Tables.Row row = ossysworktable.next();
- String osurl = row.get("url", "");
- String name = row.get("title", "");
- sb.heuristicRSS(parseSearchTemplate(osurl, theSearch.query.getQueryGoal().getQueryString(false), 0, theSearch.query.itemsPerPage), theSearch, name);
- }
- } catch (final IOException ex) {
- ConcurrentLog.warn("OpenSearchConnector.query", "failed reading table opensearchsys");
- }
- }
- }
+ @Override
+ public boolean init(final String name, final String urltemplate) {
+ this.baseurl = urltemplate;
+ this.instancename = name;
+ this.localcfg = null; // no field mapping needed
+ return true;
}
/**
* replace Opensearchdescription search template parameter with actual values
*/
- private static String parseSearchTemplate(String searchurltemplate, String query, int start, int rows) {
+ private String parseSearchTemplate(String searchurltemplate, String query, int start, int rows) {
String tmps = searchurltemplate.replaceAll("\\?}", "}"); // some optional parameters may include question mark '{param?}='
tmps = tmps.replace("{startIndex}", Integer.toString(start));
tmps = tmps.replace("{startPage}", "");
@@ -131,138 +69,76 @@ public class OpenSearchConnector {
}
/**
- * add a opensearch target system to the config file
+     * Queries the remote system and returns the result list (waits until results
+     * are transmitted or timeout). This is the main access routine used for the
+     * search and query operation. For internal access delay time, also the
+     * this.lastaccesstime needs to be set here.
+ *
+ * @return query results (metadata) with fields according to YaCy schema
*/
- public boolean add(String name, String url, boolean active, String comment) {
- if (confFile == null) {
- return false;
- }
+ @Override
+ public List query(QueryParams query) {
+ List docs = new ArrayList();
+ // see http://www.loc.gov/standards/sru/
+ String searchurl = this.parseSearchTemplate(baseurl, query.getQueryGoal().getQueryString(false), 0, query.itemsPerPage);
try {
- Configuration conf = new Configuration(confFile);
- if (name != null && !name.isEmpty()) {
- conf.add(name, null, active);
- Configuration.Entry e = conf.get(name);
- e.setValue(url);
- e.setEnable(active);
- e.setComment(comment);
- conf.put(name, e);
- try {
- conf.commit();
- } catch (final IOException ex) {
- ConcurrentLog.warn("OpenSearchConnector.add", "config file write error");
- }
- return true;
- }
- } catch (final IOException e1) {
- ConcurrentLog.logException(e1);
- return false;
- }
- return false;
- }
-
- /**
- * Get the number of active remote opensearch target systems
- */
- public int getSize() {
- return size;
- }
-
- /**
- * Discover opensearch description links from local (embedded) Solr index using
- * meta data field 'outboundlinks_tag_txt' and add found systems to the
- * config file
- *
- * @return true if background discover job was started, false if job not started
- */
- public boolean discoverFromSolrIndex(final Switchboard sb) {
- if (sb == null) {
- return false;
- }
- // check if needed Solr fields are available (selected)
- if (!sb.index.fulltext().useWebgraph()) {
- ConcurrentLog.severe("OpenSearchConnector.Discover", "Error on connecting to embedded Solr webgraph index");
- return false;
- }
- final SolrConnector connector = sb.index.fulltext().getWebgraphConnector();
- final boolean metafieldavailable = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_rel_s.name())
- && ( sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_protocol_s.name()) && sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_urlstub_s.name()) )
- && sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false);
- if (!metafieldavailable) {
- ConcurrentLog.warn("OpenSearchConnector.Discover", "webgraph option and webgraph Schema fields target_rel_s, target_protocol_s and target_urlstub_s must be switched on");
- return false;
- }
- // the solr query
- final String webgraphquerystr = WebgraphSchema.target_rel_s.getSolrFieldName() + ":search";
- final String[] webgraphqueryfields = { WebgraphSchema.target_protocol_s.getSolrFieldName() , WebgraphSchema.target_urlstub_s.getSolrFieldName()};
- // alternatively target_protocol_s + "://" +target_host_s + target_path_s
-
- final long numfound;
- try {
- SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, null, 0, 1, webgraphqueryfields);
- numfound = docList.getNumFound();
- if (numfound == 0) {
- ConcurrentLog.info("OpenSearchConnector.Discover", "no results found, abort discover job");
- return true;
- }
- ConcurrentLog.info("OpenSearchConnector.Discover", "start checking " + Long.toString(numfound) + " found index results");
- } catch (final IOException ex) {
- ConcurrentLog.logException(ex);
- return false;
- }
+ MultiProtocolURL aurl = new MultiProtocolURL(MultiProtocolURL.unescape(searchurl));
+ try {
+ this.lastaccesstime = System.currentTimeMillis();
+ final HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyIntranetCrawlerAgent);
+ byte[] result = httpClient.GETbytes(aurl, null, null, false);
+ RSSReader rssReader = RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, result);
+ if (rssReader != null) {
+ final RSSFeed feed = rssReader.getFeed();
+ if (feed != null) {
+ for (final RSSMessage item : feed) {
+ try {
+ DigestURL uri = new DigestURL(item.getLink());
+
+ URIMetadataNode doc = new URIMetadataNode(uri);
+ doc.setField(CollectionSchema.charset_s.getSolrFieldName(), UTF8.charset.name());
+ doc.setField(CollectionSchema.author.getSolrFieldName(), item.getAuthor());
+ doc.setField(CollectionSchema.title.getSolrFieldName(), item.getTitle());
+ doc.setField(CollectionSchema.language_s.getSolrFieldName(), item.getLanguage());
+ doc.setField(CollectionSchema.last_modified.getSolrFieldName(), item.getPubDate());
+ final String mime = TextParser.mimeOf(uri);
+ if (mime != null) {
+ doc.setField(CollectionSchema.content_type.getSolrFieldName(), mime);
+ }
+ if (item.getCategory().isEmpty()) {
+ doc.setField(CollectionSchema.keywords.getSolrFieldName(), Arrays.toString(item.getSubject()));
+ } else {
+ doc.setField(CollectionSchema.keywords.getSolrFieldName(), Arrays.toString(item.getSubject()) + " " + item.getCategory());
+ }
+ doc.setField(CollectionSchema.publisher_t.getSolrFieldName(), item.getCopyright());
- final long stoptime = System.currentTimeMillis() + 1000 * 3600; // make sure job doesn't run forever
+                                // TODO: we likely got only a search related snippet (take it as text content)
+                                // we need a way to differentiate metadata from full crawl data in the index (would be also good for rwi transferred/received metadata)
+                                // or consider adding this to the snippet cache, without adding text_t
+ doc.setField(CollectionSchema.text_t.getSolrFieldName(), item.getDescriptions());
- // job to iterate through Solr index to find links to opensearchdescriptions
- // started as background job as connect timeouts may cause it run a long time
- final Thread job = new Thread() {
- @Override
- public void run() {
- try {
- boolean doloop = true;
- int loopnr = 0;
- Set dblmem = new HashSet(); // temp memory for already checked url
- while (doloop) {
- ConcurrentLog.info("OpenSearchConnector.Discover", "start Solr query loop at " + Integer.toString(loopnr * 20) + " of " + Long.toString(numfound));
- SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, null, loopnr * 20, 20,webgraphqueryfields); // check chunk of 20 result documents
- loopnr++;
- if (stoptime < System.currentTimeMillis()) {// stop after max 1h
- doloop = false;
- ConcurrentLog.info("OpenSearchConnector.Discover", "long running discover task aborted");
- }
- if (docList != null && docList.size() > 0) {
- Iterator docidx = docList.iterator();
- while (docidx.hasNext()) {
- SolrDocument sdoc = docidx.next();
-
- String hrefurltxt = sdoc.getFieldValue(WebgraphSchema.target_protocol_s.getSolrFieldName()) + "://" + sdoc.getFieldValue(WebgraphSchema.target_urlstub_s.getSolrFieldName());
- try {
- URL url = new URL(hrefurltxt);
- //TODO: check Blacklist
- if (dblmem.add(url.getAuthority())) { // use only main path to detect double entries
- opensearchdescriptionReader os = new opensearchdescriptionReader(hrefurltxt);
- if (os.getRSSorAtomUrl() != null) {
- // add found system to config file
- add(os.getShortName(), os.getRSSorAtomUrl(), false, os.getItem("LongName"));
- ConcurrentLog.info("OpenSearchConnector.Discover", "added " + os.getShortName() + " " + hrefurltxt);
- } else {
- ConcurrentLog.info("OpenSearchConnector.Discover", "osd.xml check failed (no RSS or Atom support) for " + hrefurltxt);
- }
- }
- } catch (final MalformedURLException ex) {
+ if (item.getLat() != 0.0 && item.getLon() != 0.0) {
+ doc.setField(CollectionSchema.coordinate_p.getSolrFieldName(), item.getLat() + "," + item.getLon());
+ }
+ if (item.getSize() > 0) {
+ doc.setField(CollectionSchema.size_i.getSolrFieldName(), item.getSize());
}
+
+ docs.add(doc);
+ } catch (final MalformedURLException e) {
}
- } else {
- doloop = false;
}
+ ConcurrentLog.info("OpenSerachConnector", "received " + docs.size() + " results from " + this.instancename);
}
- ConcurrentLog.info("OpenSearchConnector.Discover", "finisched Solr query (checked " + Integer.toString(dblmem.size()) + " unique opensearchdescription links found in " + Long.toString(numfound) + " results)");
- } catch (final IOException ex) {
- ConcurrentLog.logException(ex);
}
+ } catch (IOException ex) {
+ ConcurrentLog.logException(ex);
+ ConcurrentLog.info("OpenSearchConnector", "no connection to " + searchurl);
}
- };
- job.start();
- return true;
+ } catch (MalformedURLException ee) {
+ ConcurrentLog.warn("OpenSearchConnector", "malformed url " + searchurl);
+ }
+ return docs;
}
}
diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java
index 644763ff7..e2f53f647 100644
--- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java
+++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java
@@ -69,7 +69,7 @@ public class URIMetadataNode extends SolrDocument {
private static final long serialVersionUID = -256046934741561968L;
protected String keywords = null;
- protected DigestURL url = null;
+ protected DigestURL url;
protected Bitfield flags = null;
protected int imagec = -1, audioc = -1, videoc = -1, appc = -1;
protected double lat = Double.NaN, lon = Double.NaN;
@@ -150,7 +150,6 @@ public class URIMetadataNode extends SolrDocument {
for (String name : doc.getFieldNames()) {
this.addField(name, doc.getFieldValue(name));
}
- this.snippet = "";
Float scorex = (Float) doc.getFieldValue("score"); // this is a special field containing the ranking score of a search result
this.score = scorex == null ? 0.0f : scorex.floatValue();
final byte[] hash = ASCII.getBytes(getString(CollectionSchema.id)); // TODO: can we trust this id ?
@@ -169,6 +168,24 @@ public class URIMetadataNode extends SolrDocument {
this.score = scorex;
}
+    public URIMetadataNode (final String urlstr) { // creates a node that holds only sku and id, both derived from urlstr
+        super();
+        try {
+            url = new DigestURL(urlstr);
+            this.setField(CollectionSchema.sku.name(), url.toNormalform(true));
+            this.setField(CollectionSchema.id.name(), ASCII.String(url.hash()));
+        } catch (final MalformedURLException e) {
+            ConcurrentLog.logException(e);
+            this.url = null; // malformed url: the node is still created, but without url, sku and id
+        }
+    }
+    public URIMetadataNode(DigestURL theurl) { // creates a node that holds only sku and id, both derived from theurl
+        super();
+        url = theurl;
+        this.setField(CollectionSchema.sku.name(), url.toNormalform(true)); // dereferences url, so theurl must not be null
+        this.setField(CollectionSchema.id.name(), ASCII.String(url.hash()));
+    }
+
/**
* Get the content domain of a document. This tries to get the content domain from the mime type
* and if this fails it uses alternatively the content domain from the file extension.
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index d735dba3a..084faae18 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -714,16 +714,7 @@ public final class Switchboard extends serverSwitch {
this.log.info("surrogates.out Path = " + this.surrogatesOutPath.getAbsolutePath());
this.surrogatesOutPath.mkdirs();
*/
- // copy opensearch heuristic config (if not exist)
- final File osdConfig = new File(getDataPath(), "DATA/SETTINGS/heuristicopensearch.conf");
- if (!osdConfig.exists()) {
- final File osdDefaultConfig = new File(getAppPath(), "defaults/heuristicopensearch.conf");
- this.log.info("heuristic.opensearch list Path = " + osdDefaultConfig.getAbsolutePath());
- try {
- Files.copy(osdDefaultConfig, osdConfig);
- } catch (final IOException ex) { }
- }
-
+
// create the release download directory
this.releasePath =
getDataPath(SwitchboardConstants.RELEASE_PATH, SwitchboardConstants.RELEASE_PATH_DEFAULT);
@@ -3615,7 +3606,9 @@ public final class Switchboard extends serverSwitch {
* @param urlpattern the search query url (e.g. http://search.org?query=searchword)
* @param searchEvent
* @param feedName short/internal name of the remote system
+ * @deprecated use FederateSearchManager(SearchEvent) instead
*/
+ @Deprecated // not used (since 2015-01-18, v1.81)
public final void heuristicRSS(
final String urlpattern,
final SearchEvent searchEvent,