diff --git a/defaults/federatecfg/datacite.solr.schema b/defaults/federatecfg/datacite.solr.schema new file mode 100644 index 000000000..11823deae --- /dev/null +++ b/defaults/federatecfg/datacite.solr.schema @@ -0,0 +1,32 @@ +## API datacite.org +## This service is also available as an API. We use Solr Search Handler for our API calls, the endpoint is: http://search.datacite.org/api. + +## Please check Solr's common query parameters documentation in order to understand how to use the API. +## Examples + +## http://search.datacite.org/api?q=wind simple search for wind +## http://search.datacite.org/api?q=wind&fl=doi,title&rows=5 search for wind, retrieve only doi and title, and return (at max.) 5 results +## http://search.datacite.org/api?q=wind&fl=doi,title&wt=csv csv output +## http://search.datacite.org/api?q=wind&fl=doi,title&wt=json&indent=true json output + +## YaCy solrconnector specific settings +## the base url to access the system +_baseurl = http://search.datacite.org/ +## Solr core, is appended to the _baseurl +_corename = api +## some systems store an identifier instead of a url for the resource; the prefix is prepended to the identifier read from _skufieldname +_skuprefix = http://dx.doi.org/ +## the name of the remote field holding the url of the resource (in yacy/solr = sku) +_skufieldname = doi + +## field mappings +## YaCyFieldname = remoteFieldname +keywords = subject +author = creator +publisher_t = publisher +title = title +description_txt = description +language_s = language +text_t = description +size_i = size +coordinate_p = geoLocationPoint \ No newline at end of file diff --git a/defaults/heuristicopensearch.conf b/defaults/heuristicopensearch.conf index 143f25a03..712674aba 100644 --- a/defaults/heuristicopensearch.conf +++ b/defaults/heuristicopensearch.conf @@ -14,8 +14,15 @@ #Blekko = http://blekko.com/ws/{searchTerms}+/rss # get 20 results from blekko #Faroo-News = http://www.faroo.com/api?q={searchTerms}&start={startIndex}&length=20&l=en&src=news&f=rss # get results from
Faroo news-search -#openBDB = http://www.openbdb.com/b/{searchTerms}.xml # Open Book Database #WordPress.com = http://en.search.wordpress.com/?q={searchTerms}&f=feed&page={startPage?} #Search WordPress.com Blogs #Sueddeutsche.de = http://suche.sueddeutsche.de/query/{searchTerms}?output=rss # Sueddeutsche Zeitung Artikel Archiv #Los Angeles Times = http://framework.latimes.com/?s={searchTerms}&feed=rss2 #Archive-It = http://archive-it.org/seam/resource/opensearch?q={searchTerms}&n=20 # archiving cultural heritage on the web + +## In addition to OpenSearch systems other connectors are available to query foreign systems +## the syntax is +## SystemName = cfgfile:_connectortype_:_schemaconfig_ +## where cfgfile: is a fixed prefix (to signal this is not an opensearch url) +## _connectortype_ is the type of connector to use (currently available: solrconnector) +## _schemaconfig_ is the config file with field name mappings (the file has to exist in DATA/SETTINGS/federatecfg) +#datacite.org = cfgfile:solrconnector:datacite.solr.schema # International Consortium for data citation diff --git a/htroot/ConfigHeuristics_p.java b/htroot/ConfigHeuristics_p.java index a171ea078..6bd9768c9 100644 --- a/htroot/ConfigHeuristics_p.java +++ b/htroot/ConfigHeuristics_p.java @@ -25,7 +25,6 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -import com.google.common.io.Files; import java.io.File; @@ -37,9 +36,10 @@ import net.yacy.search.Switchboard; import java.io.IOException; import java.util.Iterator; +import net.yacy.cora.federate.FederateSearchManager; -import net.yacy.cora.federate.opensearch.OpenSearchConnector; import net.yacy.cora.federate.solr.SchemaConfiguration; +import net.yacy.cora.storage.Files; import net.yacy.search.SwitchboardConstants; import net.yacy.search.schema.WebgraphSchema; import net.yacy.server.serverObjects; @@ -66,9 +66,9 @@ public class ConfigHeuristics_p { if
(post.containsKey("searchresultglobal_off")) sb.setConfig(SwitchboardConstants.HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL, false); if (post.containsKey("opensearch_on")) { sb.setConfig(SwitchboardConstants.HEURISTIC_OPENSEARCH, true); - // re-read config (and create work table) - OpenSearchConnector os = new OpenSearchConnector(sb, true); - if (os.getSize() == 0) { + // re-read config; build the path with File so a separator is placed between data path and DATA/SETTINGS + FederateSearchManager.getManager().init(new File(sb.getDataPath(), "DATA/SETTINGS/heuristicopensearch.conf").getAbsolutePath()); + if (FederateSearchManager.getManager().getSize() == 0) { osderrmsg = "no active search targets are configured"; } } @@ -77,8 +77,8 @@ public class ConfigHeuristics_p { final boolean metafieldavailable = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_rel_s.name()) && (sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_protocol_s.name()) && sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_urlstub_s.name())); if (metafieldavailable) { - OpenSearchConnector osc = new OpenSearchConnector(sb, false); - if (osc.discoverFromSolrIndex(sb)) { + //OpenSearchConnector osc = new OpenSearchConnector(sb, false); + if (FederateSearchManager.getManager().discoverFromSolrIndex(sb)) { osderrmsg = "started background search for target systems, refresh page after some minutes"; } else { osderrmsg = "Error: webgraph Solr index not enabled"; @@ -98,8 +98,7 @@ public class ConfigHeuristics_p { if (tmpname != null && tmpurl !=null) { if (!tmpname.isEmpty() && !tmpurl.isEmpty() && tmpurl.toLowerCase().contains("{searchterms}")) { final String tmpcomment = post.get("ossys_newcomment"); - OpenSearchConnector osc = new OpenSearchConnector(sb,false); - osc.add (tmpname,tmpurl,false,tmpcomment); + FederateSearchManager.getManager().addOpenSearchTarget(tmpname,tmpurl,false,tmpcomment); } else osderrmsg = "Url template must contain '{searchTerms}'"; } } @@ -143,6 +142,10 @@ public class ConfigHeuristics_p { if
((post.containsKey("resettodefaultosdlist") || !osdConfig.exists()) && osdDefaultConfig.exists()) { try { Files.copy(osdDefaultConfig, osdConfig); + File defdir = new File(sb.dataPath, "DATA/SETTINGS/federatecfg"); + if (!defdir.exists()) { + Files.copy(new File(sb.appPath, "defaults/federatecfg"), defdir); + } } catch (final IOException ex) { osderrmsg = "file I/O error during copy"; } @@ -240,7 +243,7 @@ public class ConfigHeuristics_p { // re-read config (and create/update work table) if (sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, true)) { - new OpenSearchConnector(sb, true); + FederateSearchManager.getManager().init(f.getAbsolutePath()); } } } diff --git a/htroot/ConfigNetwork_p.java b/htroot/ConfigNetwork_p.java index 2ca5a194b..97e408c8c 100644 --- a/htroot/ConfigNetwork_p.java +++ b/htroot/ConfigNetwork_p.java @@ -127,8 +127,8 @@ public class ConfigNetwork_p sb.peers.mySeed().setPeerTags(MapTools.string2set(normalizedList(post.get("peertags")), ",")); } - sb.setConfig("cluster.mode", post.get(SwitchboardConstants.CLUSTER_MODE, SwitchboardConstants.CLUSTER_MODE_PUBLIC_PEER)); - sb.setConfig("cluster.peers.ipport", checkIPPortList(post.get("cluster.peers.ipport", ""))); + sb.setConfig(SwitchboardConstants.CLUSTER_MODE, post.get(SwitchboardConstants.CLUSTER_MODE, SwitchboardConstants.CLUSTER_MODE_PUBLIC_PEER)); + sb.setConfig(SwitchboardConstants.CLUSTER_PEERS_IPPORT, checkIPPortList(post.get(SwitchboardConstants.CLUSTER_PEERS_IPPORT, ""))); sb.setConfig( "cluster.peers.yacydomain", checkYaCyDomainList(post.get("cluster.peers.yacydomain", ""))); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 24e7688d0..4743c59f7 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -45,7 +45,7 @@ import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.feed.RSSMessage; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; -import 
net.yacy.cora.federate.opensearch.OpenSearchConnector; +import net.yacy.cora.federate.FederateSearchManager; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.geo.GeoLocation; import net.yacy.cora.lod.vocabulary.Tagging; @@ -719,10 +719,10 @@ public class yacysearch { sb.heuristicSite(theSearch, modifier.sitehost); } if ( heuristicBlekko >= 0 && authenticated && !stealthmode ) { - OpenSearchConnector.query(sb, theSearch); + FederateSearchManager.getManager().search(theSearch); } if (sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, false) && authenticated && !stealthmode) { - OpenSearchConnector.query(sb, theSearch); + FederateSearchManager.getManager().search(theSearch); } } diff --git a/source/net/yacy/cora/federate/AbstractFederateSearchConnector.java b/source/net/yacy/cora/federate/AbstractFederateSearchConnector.java new file mode 100644 index 000000000..48fd05134 --- /dev/null +++ b/source/net/yacy/cora/federate/AbstractFederateSearchConnector.java @@ -0,0 +1,197 @@ +/** + * AbstractFederateSearchConnector.java + * Copyright 2015 by Burkhard Buelte + * First released 19.01.2015 at http://yacy.net + * + * This library is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 2.1 of the License, or (at your option) + * any later version. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt If not, see + * . 
+ */ +package net.yacy.cora.federate; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import javax.servlet.http.HttpServletResponse; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.federate.solr.SchemaConfiguration; +import net.yacy.cora.federate.solr.SchemaDeclaration; +import net.yacy.cora.federate.solr.SolrType; +import net.yacy.cora.sorting.ReversibleScoreMap; +import net.yacy.cora.storage.Configuration; +import net.yacy.cora.util.ConcurrentLog; +import net.yacy.kelondro.data.meta.URIMetadataNode; +import net.yacy.search.Switchboard; +import net.yacy.search.query.SearchEvent; +import net.yacy.search.schema.CollectionSchema; +import org.apache.solr.common.SolrDocument; + +/** + * Base implementation class for Federated Search Connectors providing the basic + * functionality to search non-YaCy systems + * + * Subclasses should/need to override query() and maybe toYaCySchema() if more + * is needed than a basic field mapping + */ +abstract public class AbstractFederateSearchConnector implements FederateSearchConnector { + + public String instancename; // just an identifying name + protected SchemaConfiguration localcfg; // the schema conversion cfg for each fieldname, yacyname = remote fieldname + public long lastaccesstime = -1; // last time accessed, used for search delay calculation + protected String baseurl; + + /** + * Inits the connector with the remote field names and matches to yacy + * schema and other specific settings from config file. Every connector + * needs at least a query target (where to query) and some definition to + * convert the remote search result to the internal result presentation + * (field mapping) + * + * @param instance internal name + * @param cfgFileName e.g.
DATA/SETTINGS/FEDERATECFG/instanceName.SCHEMA + * @return true if success false if not + */ + @Override + public boolean init(String instance, String cfgFileName) { + this.instancename = instance; + File instanceCfgFile = new File(cfgFileName); + if (instanceCfgFile.exists()) { + try { + this.localcfg = new SchemaConfiguration(instanceCfgFile); + } catch (IOException ex) { + ConcurrentLog.config(this.instancename, "error reading schema " + cfgFileName); + return false; + } + // mandatory to contain a mapping for "sku" or alternatively "_skufieldname" for a conversion to a final url + if (this.localcfg.contains(CollectionSchema.sku) || this.localcfg.contains("_skufieldname")) { + return true; + } else { + ConcurrentLog.config(this.instancename, "mandatory mapping for sku or _skufieldname missing in " + cfgFileName); + return false; + } + } else { + this.localcfg = null; + return false; + } + } + + /** + * queries a remote system and adds the results to the searchevent and to + * the crawler if addResultsToLocalIndex is true. The query runs in its own + * thread; this method returns right after starting it. + * + * @param theSearch receiving the results + */ + @Override + public void search(final SearchEvent theSearch) { + + final Thread job = new Thread() { + @Override + public void run() { + Thread.currentThread().setName("heuristic:" + instancename); + theSearch.oneFeederStarted(); + List<URIMetadataNode> doclist = null; + try { + doclist = query(theSearch.getQuery()); + if (doclist != null) { + Map<String, LinkedHashSet<String>> snippets = new HashMap<String, LinkedHashSet<String>>(); // add nodes doesn't allow null + Map<String, ReversibleScoreMap<String>> facets = new HashMap<String, ReversibleScoreMap<String>>(); // add nodes doesn't allow null + theSearch.addNodes(doclist, facets, snippets, false, instancename, doclist.size()); + + for (URIMetadataNode doc : doclist) { + theSearch.addHeuristic(doc.hash(), instancename, false); + } + } + } finally { + // that's all we need to display search result; always balance oneFeederStarted(), even if query() throws + theSearch.oneFeederTerminated(); + } + + // optional: add to crawler to get the full resource (later) + if (doclist != null && !doclist.isEmpty() && theSearch.addResultsToLocalIndex) { + Collection<DigestURL> urls = new ArrayList<DigestURL>(); + for
(URIMetadataNode doc : doclist) { + urls.add(doc.url()); + } + Switchboard.getSwitchboard().addToCrawler(urls, false); + + } + } + }; + job.start(); + } + + /** + * Converts a remote schema result to YaCy schema using the fieldname + * mapping provided as config file + * + * @param doc remote result (with remote fieldnames) + * @return URIMetadataNode with field names according to the YaCy schema + */ + protected URIMetadataNode toYaCySchema(final SolrDocument doc) { + // set YaCy id + String urlstr; + if (localcfg.contains("sku")) { + urlstr = (String) doc.getFieldValue(localcfg.get("sku").getValue()); + } else { + urlstr = (String) doc.getFieldValue(localcfg.get("_skufieldname").getValue()); + if (this.localcfg.contains("_skuprefix")) { + String skuprefix = this.localcfg.get("_skuprefix").getValue(); + urlstr = skuprefix + urlstr; + } + } + + URIMetadataNode newdoc = new URIMetadataNode(urlstr); + Iterator<Configuration.Entry> it = localcfg.entryIterator(); + while (it.hasNext()) { + Configuration.Entry et = it.next(); + String yacyfieldname = et.key(); // config defines yacyfieldname = remotefieldname + String remotefieldname = et.getValue(); + if (remotefieldname != null && !remotefieldname.isEmpty()) { + if (Switchboard.getSwitchboard().index.fulltext().getDefaultConfiguration().contains(yacyfieldname)) { // check if in local config + + SchemaDeclaration est = CollectionSchema.valueOf(yacyfieldname); + if (est.isMultiValued()) { + if (doc.getFieldValues(remotefieldname) != null) { + newdoc.addField(yacyfieldname, doc.getFieldValues(remotefieldname)); // + } + } else { + if (doc.getFieldValue(remotefieldname) != null) { + Object val = doc.getFirstValue(remotefieldname); + // watch out for type conversion + try { + if (est.getType() == SolrType.num_integer && val instanceof String) { + newdoc.setField(yacyfieldname, Integer.parseInt((String) val)); + } else { + newdoc.setField(yacyfieldname, val); + } + } catch (Exception ex) { + continue; // catch possible parse or type mismatch, skip the
field + } + } + } + } + } + } + + newdoc.addField(CollectionSchema.httpstatus_i.name(), HttpServletResponse.SC_OK); // yacy required + return newdoc; + } +} diff --git a/source/net/yacy/cora/federate/FederateSearchConnector.java b/source/net/yacy/cora/federate/FederateSearchConnector.java new file mode 100644 index 000000000..4f7ebfa8e --- /dev/null +++ b/source/net/yacy/cora/federate/FederateSearchConnector.java @@ -0,0 +1,62 @@ +/** + * FederateSearchConnector.java + * Copyright 2015 by Burkhard Buelte + * First released 19.01.2015 at http://yacy.net + * + * This library is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 2.1 of the License, or (at your option) + * any later version. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt If not, see + * . 
+ */ +package net.yacy.cora.federate; + +import java.util.List; +import net.yacy.kelondro.data.meta.URIMetadataNode; +import net.yacy.search.query.QueryParams; +import net.yacy.search.query.SearchEvent; + + +/** + * Interface for a query connector to search and gather query results from + * non-YaCy systems (for the YaCy heuristic options) + */ +public interface FederateSearchConnector { + + /** + * Load the configuration for this connector. Every connector needs at least + * a query target (where to query) and some definition to convert the remote + * search result to the internal result presentation (field mapping) + * + * @param instanceName is also the name of the config file DATA/SETTINGS/instanceName.schema + * @param cfg config parameter + * @return true if success false if not + */ + abstract boolean init(String instanceName, String cfg); + + /** + * Queries a remote system and adds the result metadata to the search events + * result list. If SearchEvent.addResultsToLocalIndex (=default) result urls + * are added to the crawler. + * @param theSearch the search event receiving the results + */ + abstract void search(SearchEvent theSearch); + + /** + * Queries a remote system and returns the search result with field names + * according to YaCy schema.
+ * + * @param query the query to send to the remote system + * @return result (metadata) in YaCy schema format + */ + abstract List<URIMetadataNode> query(QueryParams query); + +} diff --git a/source/net/yacy/cora/federate/FederateSearchManager.java b/source/net/yacy/cora/federate/FederateSearchManager.java new file mode 100644 index 000000000..8d2ba017d --- /dev/null +++ b/source/net/yacy/cora/federate/FederateSearchManager.java @@ -0,0 +1,427 @@ +/** + * FederateSearchManager.java + * Copyright 2015 by Burkhard Buelte + * First released 19.01.2015 at http://yacy.net + * + * This library is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 2.1 of the License, or (at your option) + * any later version. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt If not, see + * <http://www.gnu.org/licenses/>.
+ */ +package net.yacy.cora.federate; + +import net.yacy.cora.federate.opensearch.OpenSearchConnector; +import java.io.File; +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; + +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; +import net.yacy.cora.document.analysis.Classification; +import net.yacy.cora.document.id.MultiProtocolURL; +import net.yacy.cora.federate.solr.connector.SolrConnector; +import net.yacy.cora.federate.yacy.CacheStrategy; +import net.yacy.cora.storage.Configuration; +import net.yacy.cora.storage.Configuration.Entry; +import net.yacy.cora.storage.Files; +import net.yacy.cora.util.ConcurrentLog; +import net.yacy.document.parser.xml.opensearchdescriptionReader; +import net.yacy.kelondro.data.meta.URIMetadataNode; +import net.yacy.kelondro.util.Bitfield; +import net.yacy.search.Switchboard; +import net.yacy.search.SwitchboardConstants; +import net.yacy.search.query.QueryGoal; +import net.yacy.search.query.QueryModifier; +import net.yacy.search.query.QueryParams; +import net.yacy.search.query.SearchEvent; +import net.yacy.search.schema.WebgraphSchema; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; + +/** + * Handling of queries to configured remote OpenSearch systems. 
+ */ +public class FederateSearchManager { + + private final int accessDelay = 15000; // delay between connects (in ms) + + private File confFile = null; // later initialized to DATA/SETTINGS/heuristicopensearch.conf + private HashSet<AbstractFederateSearchConnector> conlist; // connector list + protected Configuration cfg;//PropertiesConfiguration cfg; + private static FederateSearchManager manager = null; // self reference for static .getManager() + + public FederateSearchManager(Switchboard sb) { + super(); + this.conlist = new HashSet<AbstractFederateSearchConnector>(); + + // from here we need Switchboard settings + if (sb == null) { + return; + } + // Data needed active name, url(template), desc, rule-when-to-use, specifics + confFile = new File(sb.getDataPath(), "DATA/SETTINGS/heuristicopensearch.conf"); + if (!confFile.exists()) { + try { + Files.copy(new File(sb.appPath, "defaults/heuristicopensearch.conf"), confFile); + File defdir = new File(sb.dataPath, "DATA/SETTINGS/federatecfg"); + if (!defdir.exists()) { + Files.copy(new File(sb.appPath, "defaults/federatecfg"), defdir); + } + } catch (IOException ex) { + ConcurrentLog.logException(ex); // copying the defaults failed; continue without config + } + } + // read settings config file + if (confFile.exists()) { + try { + cfg = new Configuration(confFile); + Iterator<Entry> it = cfg.entryIterator(); + while (it.hasNext()) { + Entry cfgentry = it.next(); + String url = cfgentry.getValue(); + if (cfgentry.enabled() && url != null && !url.isEmpty()) { + String name = cfgentry.key(); + if (url.startsWith("cfgfile:")) { // is cfgfile with field mappings (no opensearch url) + // format prefix:connectortype:configfilename + // example cfgfile:solrconnector:testsys.solr.schema + String[] parts = url.split(":"); + if (parts.length >= 3 && parts[1].equalsIgnoreCase("solrconnector")) { // all 3 parts are required + SolrFederateSearchConnector sfc = new SolrFederateSearchConnector(); + if (sfc.init(name, sb.getDataPath()+ "/DATA/SETTINGS/federatecfg/" + parts[2])) { + conlist.add(sfc); + } + } else { + ConcurrentLog.config("FederateSearchManager", "Error in configuration of: " + url); + } + } else { // handle opensearch url
template + OpenSearchConnector osc = new OpenSearchConnector(); + if (osc.init(name, url)) { + conlist.add(osc); + } + } + } + } + } catch (IOException ex) { + ConcurrentLog.logException(ex); + } + } + manager = this; // reference for static access via .getManager() + } + + /** + * Get instance of this manager. There should be only one instance running, + * use this to get or initialize the manager. + * + * @return the shared manager instance, created on first call + */ + public static FederateSearchManager getManager() { + if (manager == null) { + manager = new FederateSearchManager(Switchboard.getSwitchboard()); + } + return manager; + } + + /** + * Sends a query request to remote systems configured. + * If search query domain is LOCAL procedure does nothing. + * + * @param theSearch the search event receiving the results, null is ignored + */ + public void search(SearchEvent theSearch) { + if (theSearch != null) { + if (!theSearch.query.isLocal()) { + Set<AbstractFederateSearchConnector> picklist = getBest(theSearch.getQuery()); + for (AbstractFederateSearchConnector fsc : picklist) { + fsc.search(theSearch); + } + } + } + } + + /** + * Sends a query to configured remote systems.
+ * + * @param query + * @return list of results according to YaCy schema, or null if query.isLocal() is false + */ + public List<URIMetadataNode> query(QueryParams query) { + if (query.isLocal()) { + List<URIMetadataNode> sdl = new ArrayList<URIMetadataNode>(); + Set<AbstractFederateSearchConnector> picklist = getBest(query); + for (AbstractFederateSearchConnector fsc : picklist) { + List<URIMetadataNode> part = fsc.query(query); + if (part != null) { // a connector may deliver no result list + sdl.addAll(part); + } + } + return sdl; + } else { + return null; + } + } + + /** + * Takes a search string, converts it to queryparams and calls the + * query(queryparams) + * + * @param querystr + * @return list of remote query results according to YaCy schema + */ + public List<URIMetadataNode> query(String querystr) { + + final QueryGoal qg = new QueryGoal(querystr); + final Switchboard sb = Switchboard.getSwitchboard(); + Bitfield filter = new Bitfield(); + final QueryParams query = new QueryParams( + qg, + new QueryModifier(), + Integer.MAX_VALUE, + "", + Classification.ContentDomain.ALL, + "", //lang + null, + CacheStrategy.IFFRESH, + 100, 0, //count, offset + ".*", //urlmask + null, + null, + QueryParams.Searchdom.LOCAL, + filter, + false, + null, + MultiProtocolURL.TLD_any_zone_filter, + "", + false, + sb.index, + sb.getRanking(), + "",//userAgent + false, + false, + 0.0, 0.0, -1.0, + new String[0]); + + return query(query); + } + + /** + * Add a search target system/connector to the config file + * + * @param name name of the target system (config key), must not be empty + * @param urlTemplate query template url + * @param active if true the target is enabled and a connector is created for it + * @param comment comment stored with the config entry + * @return successful added + */ + public boolean addOpenSearchTarget(String name, String urlTemplate, boolean active, String comment) { + if (confFile == null) { + return false; + } + + try { + Configuration conf = new Configuration(confFile); + if (name != null && !name.isEmpty()) { + conf.add(name, null, active); + Configuration.Entry e = conf.get(name); + e.setValue(urlTemplate); + e.setEnable(active); + e.setComment(comment); + conf.put(name, e); + try { + conf.commit(); + if (active) { + OpenSearchConnector osd = new OpenSearchConnector(); + if (osd.init(name, urlTemplate)) { + conlist.add(osd); + } + } + } catch (final IOException ex) { +
ConcurrentLog.warn("FederateSearchManager", "config file write error"); + } + return true; + } + } catch (final IOException e1) { + ConcurrentLog.logException(e1); + return false; + } + return false; + } + + /** + * Get the number of active remote query target systems + */ + public int getSize() { + return conlist.size(); + } + + /** + * Get best systems from configured targets for this search + * + * @param query the query to pick targets for (currently not evaluated) + * @return set of searchtargetconnectors + */ + protected Set<AbstractFederateSearchConnector> getBest(final QueryParams query) { + HashSet<AbstractFederateSearchConnector> retset = new HashSet<AbstractFederateSearchConnector>(); + // currently only enforces limits (min access delay, frequency) + for (AbstractFederateSearchConnector fsc : conlist) { + // check access time + if (fsc.lastaccesstime + accessDelay < System.currentTimeMillis()) { // enforce 15 sec delay between searches to same system + retset.add(fsc); + } + } + return retset; + } + + /** + * Discover opensearch description links from local (embedded) Solr index + * using meta data field 'outboundlinks_tag_txt' and add found systems to + * the config file + * + * @return true if background discover job was started, false if job not + * started + */ + public boolean discoverFromSolrIndex(Switchboard sb) { + if (sb == null) { + return false; + } + // check if needed Solr fields are available (selected) + if (!sb.index.fulltext().useWebgraph()) { + ConcurrentLog.severe("FederateSearchManager", "Error on connecting to embedded Solr webgraph index"); + return false; + } + final SolrConnector connector = sb.index.fulltext().getWebgraphConnector(); + final boolean metafieldavailable = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_rel_s.name()) + && (sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_protocol_s.name()) && sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_urlstub_s.name())) + && sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false); + if (!metafieldavailable) { +
ConcurrentLog.warn("FederateSearchManager", "webgraph option and webgraph Schema fields target_rel_s, target_protocol_s and target_urlstub_s must be switched on"); + return false; + } + // the solr search + final String webgraphquerystr = WebgraphSchema.target_rel_s.getSolrFieldName() + ":search"; + final String[] webgraphqueryfields = {WebgraphSchema.target_protocol_s.getSolrFieldName(), WebgraphSchema.target_urlstub_s.getSolrFieldName()}; + // alternatively target_protocol_s + "://" +target_host_s + target_path_s + + final long numfound; + try { + SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, null, 0, 1, webgraphqueryfields); + numfound = docList.getNumFound(); + if (numfound == 0) { + ConcurrentLog.info("FederateSearchManager", "no results found, abort discover job"); + return true; + } + ConcurrentLog.info("FederateSearchManager", "start checking " + Long.toString(numfound) + " found index results"); + } catch (final IOException ex) { + ConcurrentLog.logException(ex); + return false; + } + + final long stoptime = System.currentTimeMillis() + 1000 * 3600; // make sure job doesn't run forever + + // job to iterate through Solr index to find links to opensearchdescriptions + // started as background job as connect timeouts may cause it run a long time + final Thread job = new Thread() { + @Override + public void run() { + try { + boolean doloop = true; + int loopnr = 0; + Set<String> dblmem = new HashSet<String>(); // temp memory for already checked url + while (doloop) { + ConcurrentLog.info("FederateSearchManager", "start Solr query loop at " + Integer.toString(loopnr * 20) + " of " + Long.toString(numfound)); + SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, null, loopnr * 20, 20, webgraphqueryfields); // check chunk of 20 result documents + loopnr++; + if (stoptime < System.currentTimeMillis()) {// stop after max 1h + doloop = false; + ConcurrentLog.info("FederateSearchManager", "long running discover task
aborted"); + } + if (docList != null && docList.size() > 0) { + Iterator<SolrDocument> docidx = docList.iterator(); + while (docidx.hasNext()) { + SolrDocument sdoc = docidx.next(); + + String hrefurltxt = sdoc.getFieldValue(WebgraphSchema.target_protocol_s.getSolrFieldName()) + "://" + sdoc.getFieldValue(WebgraphSchema.target_urlstub_s.getSolrFieldName()); + try { + URL url = new URL(hrefurltxt); + //TODO: check Blacklist + if (dblmem.add(url.getAuthority())) { // use only main path to detect double entries + opensearchdescriptionReader os = new opensearchdescriptionReader(hrefurltxt); + if (os.getRSSorAtomUrl() != null) { + // add found system to config file + addOpenSearchTarget(os.getShortName(), os.getRSSorAtomUrl(), false, os.getItem("LongName")); + ConcurrentLog.info("FederateSearchManager", "added " + os.getShortName() + " " + hrefurltxt); + } else { + ConcurrentLog.info("FederateSearchManager", "osd.xml check failed (no RSS or Atom support) for " + hrefurltxt); + } + } + } catch (final MalformedURLException ex) { + // index entry does not form a valid url, skip it + } + } + } else { + doloop = false; + } + } + ConcurrentLog.info("FederateSearchManager", "finisched Solr query (checked " + Integer.toString(dblmem.size()) + " unique opensearchdescription links found in " + Long.toString(numfound) + " results)"); + } catch (final IOException ex) { + ConcurrentLog.logException(ex); + } + } + }; + job.start(); + return true; + } + + /** + * Read or reread opensearch config file and initialize connectors + * + * @param cfgFileName path of the config file to read + * @return true if successful, false if the file does not exist or cannot be read + */ + public boolean init(String cfgFileName) { + confFile = new File(cfgFileName); + if (confFile.exists()) { + try { + cfg = new Configuration(confFile); + if (!this.conlist.isEmpty()) this.conlist.clear(); // prevent double entries + Iterator<Entry> it = cfg.entryIterator(); + while (it.hasNext()) { + Entry cfgentry = it.next(); + if (cfgentry.enabled()) { // hold only enabled in memory + String name = cfgentry.key(); + String url = cfgentry.getValue(); + if (url != null &&
!url.isEmpty()) { + if (url.startsWith("cfgfile:")) { // is cfgfile with field mappings (no opensearch url) + // config entry has 3 parts separated by : 1=cfgfile 2=connectortype 3=relative path to connector-cfg-file + // example cfgfile:solrconnector:testsys.solr.schema + String[] parts = url.split(":"); + if (parts.length >= 3 && parts[1].equalsIgnoreCase("solrconnector")) { // all 3 parts are required + SolrFederateSearchConnector sfc = new SolrFederateSearchConnector(); + if (sfc.init(name, confFile.getParent()+"/federatecfg/"+parts[2])) { + conlist.add(sfc); + } + } else { + ConcurrentLog.config("FederateSearchManager", "Init error in configuration of: " + url); + } + } else { // handle opensearch url template + OpenSearchConnector osd; + osd = new OpenSearchConnector(); + if (osd.init(name, url)) { + conlist.add(osd); + } + } + } + } + } + } catch (IOException ex) { + ConcurrentLog.logException(ex); + return false; + } + return true; + } + ConcurrentLog.warn("FederateSearchManager", "config file not found: " + cfgFileName); + return false; + } + +} diff --git a/source/net/yacy/cora/federate/SolrFederateSearchConnector.java b/source/net/yacy/cora/federate/SolrFederateSearchConnector.java new file mode 100644 index 000000000..7e9fceaaa --- /dev/null +++ b/source/net/yacy/cora/federate/SolrFederateSearchConnector.java @@ -0,0 +1,119 @@ +/** + * SolrFederateSearchConnector.java + * Copyright 2015 by Burkhard Buelte + * First released 19.01.2015 at http://yacy.net + * + * This library is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 2.1 of the License, or (at your option) + * any later version. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt If not, see + * <http://www.gnu.org/licenses/>.
+ */ +package net.yacy.cora.federate; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import net.yacy.cora.federate.solr.connector.RemoteSolrConnector; +import net.yacy.cora.federate.solr.connector.SolrConnector; +import net.yacy.cora.federate.solr.instance.RemoteInstance; +import net.yacy.cora.util.ConcurrentLog; +import net.yacy.kelondro.data.meta.URIMetadataNode; +import net.yacy.search.query.QueryParams; +import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.ModifiableSolrParams; + +/** + * Search connector to collect query results from remote Solr systems which + * provide results as Solr documents + */ +public class SolrFederateSearchConnector extends AbstractFederateSearchConnector { + + private String corename; + + @Override + public boolean init(String instance, String cfgFileName) { + boolean initResult = super.init(instance, cfgFileName); // init local schema cfg + if (initResult) { + if (this.localcfg.contains("_baseurl")) { + setBaseurl(this.localcfg.get("_baseurl").getValue()); + } else { + ConcurrentLog.config(instance, "no _baseurl given in config file "+cfgFileName); + initResult = false; + } + if (this.localcfg.contains("_corename")) { + setCoreName(this.localcfg.get("_corename").getValue()); + } else { + ConcurrentLog.config(instance, "no _corename given in config file "); // not mandatory + this.corename = ""; + } + } + return initResult; + } + + public void setBaseurl(String url) { + if (url.endsWith("/")) { + this.baseurl = url; + } else { + this.baseurl = url + "/"; + } + } + + public void setCoreName(String core) { + this.corename = core; + } + + /** + * Core query implementation + * all query and search routines will use this routine to query the remote
system + * + * @param query + * @return list of solr documents (metadata) according to local YaCy internal schema + */ + @Override + public List query(QueryParams query) { + + List docs = new ArrayList(); + Collection remotecorename = new ArrayList(); + remotecorename.add(corename); + ModifiableSolrParams msp = new SolrQuery(query.getQueryGoal().getQueryString(false)); + msp.add(CommonParams.QT, "/"); // important to override default append of /select + msp.add(CommonParams.ROWS, Integer.toString(query.itemsPerPage)); + try { + RemoteInstance instance = new RemoteInstance(baseurl, remotecorename, corename, 20000); + try { + SolrConnector solrConnector = new RemoteSolrConnector(instance, false, null); + try { + this.lastaccesstime = System.currentTimeMillis(); + SolrDocumentList docList = solrConnector.getDocumentListByParams(msp); + // convert to YaCy schema documentlist + for (SolrDocument doc : docList) { + URIMetadataNode anew = toYaCySchema(doc); + docs.add(anew); + } + } catch (IOException | SolrException e) { + } finally { + solrConnector.close(); + } + } catch (Throwable ee) { + } finally { + instance.close(); + } + } catch (IOException eee) { + } + return docs; + } +} diff --git a/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java b/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java index 68fee2161..defc0e1cc 100644 --- a/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java +++ b/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java @@ -19,107 +19,45 @@ */ package net.yacy.cora.federate.opensearch; -import java.io.File; import java.io.IOException; import java.net.MalformedURLException; -import java.net.URL; -import java.util.HashSet; -import java.util.Iterator; -import java.util.Set; - -import net.yacy.cora.federate.solr.connector.SolrConnector; -import net.yacy.cora.storage.Configuration; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import
net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.feed.RSSFeed; +import net.yacy.cora.document.feed.RSSMessage; +import net.yacy.cora.document.feed.RSSReader; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; +import net.yacy.cora.federate.AbstractFederateSearchConnector; +import net.yacy.cora.federate.FederateSearchConnector; +import net.yacy.cora.protocol.ClientIdentification; +import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.util.ConcurrentLog; -import net.yacy.cora.util.SpaceExceededException; -import net.yacy.document.parser.xml.opensearchdescriptionReader; -import net.yacy.kelondro.blob.Tables; -import net.yacy.search.Switchboard; -import net.yacy.search.SwitchboardConstants; -import net.yacy.search.query.SearchEvent; -import net.yacy.search.schema.WebgraphSchema; - -import org.apache.solr.common.SolrDocument; -import org.apache.solr.common.SolrDocumentList; +import net.yacy.document.TextParser; +import net.yacy.kelondro.data.meta.URIMetadataNode; +import net.yacy.search.query.QueryParams; +import net.yacy.search.schema.CollectionSchema; /** * Handling of queries to remote OpenSearch systems. Iterates to a list of - * configured systems until number of needed results are available. Uses a - * temporary work table to store search template urls for the iteration during - * search. + * configured systems until number of needed results are available. 
*/ -public class OpenSearchConnector { - - private File confFile = null; // later initialized to DATA/SETTINGS/heuristicopensearch.conf - private int size = 0; // remember the size of active opensearch targets - - public OpenSearchConnector(Switchboard sb, boolean createworktable) { - super(); - if (sb == null) { - return; - } - - confFile = new File(sb.getDataPath(), "DATA/SETTINGS/heuristicopensearch.conf"); - - if (createworktable) { // read from config file and create worktable - sb.tables.clear("opensearchsys"); - try { - Configuration cfg = new Configuration(confFile); - - // copy active opensearch systems to a work table (opensearchsys) - Iterator cfgentries = cfg.entryIterator(); - while (cfgentries.hasNext()) { - Configuration.Entry e = cfgentries.next(); - if (e.enabled()) { - String title = e.key(); // get the title - String urlstr = e.getValue(); // get the search template url +public class OpenSearchConnector extends AbstractFederateSearchConnector implements FederateSearchConnector { - Tables.Data row = new Tables.Data(); - row.put("title", title); - row.put("url", urlstr); - try { - sb.tables.insert("opensearchsys", row); - } catch (final SpaceExceededException ex) { - ConcurrentLog.logException(ex); - } - } - } - size = sb.tables.size("opensearchsys"); - } catch (final IOException ex) { - ConcurrentLog.logException(ex); - } - } - } - - /** - * Sends a search request to remote systems listed in worktable until the - * searchevent contains less than needed results. Depending on already - * collected search results none to all configured systems are queried to - * complete available search results. - * if query search domain is LOCAL procedure does nothing. 
- */ - static public void query(Switchboard sb, SearchEvent theSearch) { - if (theSearch != null && sb != null) { - if (!theSearch.query.isLocal()) { - try { - Iterator ossysworktable = sb.tables.iterator("opensearchsys"); - //int needres = theSearch.query.neededResults(); // get number of needed results - while (ossysworktable.hasNext() /*&& theSearch.query.getResultCount() < needres*/) { - Tables.Row row = ossysworktable.next(); - String osurl = row.get("url", ""); - String name = row.get("title", ""); - sb.heuristicRSS(parseSearchTemplate(osurl, theSearch.query.getQueryGoal().getQueryString(false), 0, theSearch.query.itemsPerPage), theSearch, name); - } - } catch (final IOException ex) { - ConcurrentLog.warn("OpenSearchConnector.query", "failed reading table opensearchsys"); - } - } - } + @Override + public boolean init(final String name, final String urltemplate) { + this.baseurl = urltemplate; + this.instancename = name; + this.localcfg = null; // no field mapping needed + return true; } /** * replace Opensearchdescription search template parameter with actual values */ - private static String parseSearchTemplate(String searchurltemplate, String query, int start, int rows) { + private String parseSearchTemplate(String searchurltemplate, String query, int start, int rows) { String tmps = searchurltemplate.replaceAll("\\?}", "}"); // some optional parameters may include question mark '{param?}=' tmps = tmps.replace("{startIndex}", Integer.toString(start)); tmps = tmps.replace("{startPage}", ""); @@ -131,138 +69,76 @@ public class OpenSearchConnector { } /** - * add a opensearch target system to the config file + * queries remote system and returns the resultlist (waits until results + * transmitted or timeout). This is the main access routine used for the + * search and query operation. For internal access delay time, also the + * this.lastaccesstime field needs to be set here.
+ * + * @return query results (metadata) with fields according to YaCy schema */ - public boolean add(String name, String url, boolean active, String comment) { - if (confFile == null) { - return false; - } + @Override + public List query(QueryParams query) { + List docs = new ArrayList(); + // see http://www.loc.gov/standards/sru/ + String searchurl = this.parseSearchTemplate(baseurl, query.getQueryGoal().getQueryString(false), 0, query.itemsPerPage); try { - Configuration conf = new Configuration(confFile); - if (name != null && !name.isEmpty()) { - conf.add(name, null, active); - Configuration.Entry e = conf.get(name); - e.setValue(url); - e.setEnable(active); - e.setComment(comment); - conf.put(name, e); - try { - conf.commit(); - } catch (final IOException ex) { - ConcurrentLog.warn("OpenSearchConnector.add", "config file write error"); - } - return true; - } - } catch (final IOException e1) { - ConcurrentLog.logException(e1); - return false; - } - return false; - } - - /** - * Get the number of active remote opensearch target systems - */ - public int getSize() { - return size; - } - - /** - * Discover opensearch description links from local (embedded) Solr index using - * meta data field 'outboundlinks_tag_txt' and add found systems to the - * config file - * - * @return true if background discover job was started, false if job not started - */ - public boolean discoverFromSolrIndex(final Switchboard sb) { - if (sb == null) { - return false; - } - // check if needed Solr fields are available (selected) - if (!sb.index.fulltext().useWebgraph()) { - ConcurrentLog.severe("OpenSearchConnector.Discover", "Error on connecting to embedded Solr webgraph index"); - return false; - } - final SolrConnector connector = sb.index.fulltext().getWebgraphConnector(); - final boolean metafieldavailable = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_rel_s.name()) - && ( 
sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_protocol_s.name()) && sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_urlstub_s.name()) ) - && sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false); - if (!metafieldavailable) { - ConcurrentLog.warn("OpenSearchConnector.Discover", "webgraph option and webgraph Schema fields target_rel_s, target_protocol_s and target_urlstub_s must be switched on"); - return false; - } - // the solr query - final String webgraphquerystr = WebgraphSchema.target_rel_s.getSolrFieldName() + ":search"; - final String[] webgraphqueryfields = { WebgraphSchema.target_protocol_s.getSolrFieldName() , WebgraphSchema.target_urlstub_s.getSolrFieldName()}; - // alternatively target_protocol_s + "://" +target_host_s + target_path_s - - final long numfound; - try { - SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, null, 0, 1, webgraphqueryfields); - numfound = docList.getNumFound(); - if (numfound == 0) { - ConcurrentLog.info("OpenSearchConnector.Discover", "no results found, abort discover job"); - return true; - } - ConcurrentLog.info("OpenSearchConnector.Discover", "start checking " + Long.toString(numfound) + " found index results"); - } catch (final IOException ex) { - ConcurrentLog.logException(ex); - return false; - } + MultiProtocolURL aurl = new MultiProtocolURL(MultiProtocolURL.unescape(searchurl)); + try { + this.lastaccesstime = System.currentTimeMillis(); + final HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyIntranetCrawlerAgent); + byte[] result = httpClient.GETbytes(aurl, null, null, false); + RSSReader rssReader = RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, result); + if (rssReader != null) { + final RSSFeed feed = rssReader.getFeed(); + if (feed != null) { + for (final RSSMessage item : feed) { + try { + DigestURL uri = new DigestURL(item.getLink()); + + URIMetadataNode doc = new URIMetadataNode(uri); + 
doc.setField(CollectionSchema.charset_s.getSolrFieldName(), UTF8.charset.name()); + doc.setField(CollectionSchema.author.getSolrFieldName(), item.getAuthor()); + doc.setField(CollectionSchema.title.getSolrFieldName(), item.getTitle()); + doc.setField(CollectionSchema.language_s.getSolrFieldName(), item.getLanguage()); + doc.setField(CollectionSchema.last_modified.getSolrFieldName(), item.getPubDate()); + final String mime = TextParser.mimeOf(uri); + if (mime != null) { + doc.setField(CollectionSchema.content_type.getSolrFieldName(), mime); + } + if (item.getCategory().isEmpty()) { + doc.setField(CollectionSchema.keywords.getSolrFieldName(), Arrays.toString(item.getSubject())); + } else { + doc.setField(CollectionSchema.keywords.getSolrFieldName(), Arrays.toString(item.getSubject()) + " " + item.getCategory()); + } + doc.setField(CollectionSchema.publisher_t.getSolrFieldName(), item.getCopyright()); - final long stoptime = System.currentTimeMillis() + 1000 * 3600; // make sure job doesn't run forever + // TODO: we likely got only a search related snippet (take it as text content) + // we need a way to differentiate metadata from full crawl data in the index (would be also good for rwi transferred/received metadata) + // or consider adding this to snippet cache, without adding text_t + doc.setField(CollectionSchema.text_t.getSolrFieldName(), item.getDescriptions()); - // job to iterate through Solr index to find links to opensearchdescriptions - // started as background job as connect timeouts may cause it run a long time - final Thread job = new Thread() { - @Override - public void run() { - try { - boolean doloop = true; - int loopnr = 0; - Set dblmem = new HashSet(); // temp memory for already checked url - while (doloop) { - ConcurrentLog.info("OpenSearchConnector.Discover", "start Solr query loop at " + Integer.toString(loopnr * 20) + " of " + Long.toString(numfound)); - SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, null, loopnr
* 20, 20,webgraphqueryfields); // check chunk of 20 result documents - loopnr++; - if (stoptime < System.currentTimeMillis()) {// stop after max 1h - doloop = false; - ConcurrentLog.info("OpenSearchConnector.Discover", "long running discover task aborted"); - } - if (docList != null && docList.size() > 0) { - Iterator docidx = docList.iterator(); - while (docidx.hasNext()) { - SolrDocument sdoc = docidx.next(); - - String hrefurltxt = sdoc.getFieldValue(WebgraphSchema.target_protocol_s.getSolrFieldName()) + "://" + sdoc.getFieldValue(WebgraphSchema.target_urlstub_s.getSolrFieldName()); - try { - URL url = new URL(hrefurltxt); - //TODO: check Blacklist - if (dblmem.add(url.getAuthority())) { // use only main path to detect double entries - opensearchdescriptionReader os = new opensearchdescriptionReader(hrefurltxt); - if (os.getRSSorAtomUrl() != null) { - // add found system to config file - add(os.getShortName(), os.getRSSorAtomUrl(), false, os.getItem("LongName")); - ConcurrentLog.info("OpenSearchConnector.Discover", "added " + os.getShortName() + " " + hrefurltxt); - } else { - ConcurrentLog.info("OpenSearchConnector.Discover", "osd.xml check failed (no RSS or Atom support) for " + hrefurltxt); - } - } - } catch (final MalformedURLException ex) { + if (item.getLat() != 0.0 && item.getLon() != 0.0) { + doc.setField(CollectionSchema.coordinate_p.getSolrFieldName(), item.getLat() + "," + item.getLon()); + } + if (item.getSize() > 0) { + doc.setField(CollectionSchema.size_i.getSolrFieldName(), item.getSize()); } + + docs.add(doc); + } catch (final MalformedURLException e) { } - } else { - doloop = false; } + ConcurrentLog.info("OpenSerachConnector", "received " + docs.size() + " results from " + this.instancename); } - ConcurrentLog.info("OpenSearchConnector.Discover", "finisched Solr query (checked " + Integer.toString(dblmem.size()) + " unique opensearchdescription links found in " + Long.toString(numfound) + " results)"); - } catch (final IOException ex) { - 
ConcurrentLog.logException(ex); } + } catch (IOException ex) { + ConcurrentLog.logException(ex); + ConcurrentLog.info("OpenSearchConnector", "no connection to " + searchurl); } - }; - job.start(); - return true; + } catch (MalformedURLException ee) { + ConcurrentLog.warn("OpenSearchConnector", "malformed url " + searchurl); + } + return docs; } } diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index 644763ff7..e2f53f647 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -69,7 +69,7 @@ public class URIMetadataNode extends SolrDocument { private static final long serialVersionUID = -256046934741561968L; protected String keywords = null; - protected DigestURL url = null; + protected DigestURL url; protected Bitfield flags = null; protected int imagec = -1, audioc = -1, videoc = -1, appc = -1; protected double lat = Double.NaN, lon = Double.NaN; @@ -150,7 +150,6 @@ public class URIMetadataNode extends SolrDocument { for (String name : doc.getFieldNames()) { this.addField(name, doc.getFieldValue(name)); } - this.snippet = ""; Float scorex = (Float) doc.getFieldValue("score"); // this is a special field containing the ranking score of a search result this.score = scorex == null ? 0.0f : scorex.floatValue(); final byte[] hash = ASCII.getBytes(getString(CollectionSchema.id)); // TODO: can we trust this id ? 
@@ -169,6 +168,24 @@ public class URIMetadataNode extends SolrDocument { this.score = scorex; } + public URIMetadataNode (final String urlstr) { + super(); + try { + url = new DigestURL(urlstr); + this.setField(CollectionSchema.sku.name(), url.toNormalform(true)); + this.setField(CollectionSchema.id.name(), ASCII.String(url.hash())); + } catch (final MalformedURLException e) { + ConcurrentLog.logException(e); + this.url = null; + } + } + public URIMetadataNode(DigestURL theurl) { + super(); + url = theurl; + this.setField(CollectionSchema.sku.name(), url.toNormalform(true)); + this.setField(CollectionSchema.id.name(), ASCII.String(url.hash())); + } + /** * Get the content domain of a document. This tries to get the content domain from the mime type * and if this fails it uses alternatively the content domain from the file extension. diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index d735dba3a..084faae18 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -714,16 +714,7 @@ public final class Switchboard extends serverSwitch { this.log.info("surrogates.out Path = " + this.surrogatesOutPath.getAbsolutePath()); this.surrogatesOutPath.mkdirs(); */ - // copy opensearch heuristic config (if not exist) - final File osdConfig = new File(getDataPath(), "DATA/SETTINGS/heuristicopensearch.conf"); - if (!osdConfig.exists()) { - final File osdDefaultConfig = new File(getAppPath(), "defaults/heuristicopensearch.conf"); - this.log.info("heuristic.opensearch list Path = " + osdDefaultConfig.getAbsolutePath()); - try { - Files.copy(osdDefaultConfig, osdConfig); - } catch (final IOException ex) { } - } - + // create the release download directory this.releasePath = getDataPath(SwitchboardConstants.RELEASE_PATH, SwitchboardConstants.RELEASE_PATH_DEFAULT); @@ -3615,7 +3606,9 @@ public final class Switchboard extends serverSwitch { * @param urlpattern the search query url (e.g. 
http://search.org?query=searchword) * @param searchEvent * @param feedName short/internal name of the remote system + * @deprecated use FederateSearchManager(SearchEvent) instead */ + @Deprecated // not used (since 2015-01-18, v1.81) public final void heuristicRSS( final String urlpattern, final SearchEvent searchEvent,