diff --git a/htroot/api/push_p.html b/htroot/api/push_p.html new file mode 100644 index 000000000..febf7eca3 --- /dev/null +++ b/htroot/api/push_p.html @@ -0,0 +1,76 @@ + + + + +

File Upload

+ #(mode)# + +

This form can be used to upload a file and assign it to a URL. + A typical use case is attaching a content management system directly to YaCy, so that newly changed files are pushed straight to the YaCy indexer.

+
+
+
File Count
+
#[count]#
+
synchronous
+
+
commit
+
+
 
 
+
Files to process:
+ + #{input}# +
 
 
+
+
+
File Number
+
#[count]#
+ +
Data
+
+ +
URL
+
+ +
Last-Modified
+
+ +
Content-Type
+
+ +
Collection
+
+
+
+ #{/input}# +
+ +
+ :: + + Results for the recently submitted file(s). You can also submit the same form using the servlet push_p.json to get push confirmations in JSON format. +
+
count
#[count]#
+
successall
#(successall)#false::true#(/successall)#
+
countsuccess
#[countsuccess]#
+
countfail
#[countfail]#
+
+ + + #{results}# + + + + + + + #{/results}# +
ItemURLSuccessMessage
#[item]##[url]##(success)#fail::ok#(/success)##(success)##[message]#::#[message]##(/success)#
+

+ If you want to push files again, use this form to pre-define a number of upload forms: +

+ +
+

+ #(/mode)# + + \ No newline at end of file diff --git a/htroot/api/push_p.java b/htroot/api/push_p.java new file mode 100644 index 000000000..6ad5a3867 --- /dev/null +++ b/htroot/api/push_p.java @@ -0,0 +1,134 @@ +/** + * push_p + * Copyright 2014 by Michael Peter Christen, mc@yacy.net, Frankfurt a. M., Germany + * First released 12.06.2014 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . 
+ */ + +import java.net.MalformedURLException; +import java.util.Date; + +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.order.Base64Order; +import net.yacy.cora.protocol.Domains; +import net.yacy.cora.protocol.HeaderFramework; +import net.yacy.cora.protocol.RequestHeader; +import net.yacy.cora.protocol.ResponseHeader; +import net.yacy.crawler.data.CrawlProfile; +import net.yacy.crawler.retrieval.Request; +import net.yacy.crawler.retrieval.Response; +import net.yacy.search.IndexingQueueEntry; +import net.yacy.search.Switchboard; +import net.yacy.server.serverObjects; +import net.yacy.server.serverSwitch; + +public class push_p { + + // test: http://localhost:8090/api/push_p.json?count=1&synchronous=false&commit=false&url-0=http://nowhere.cc/example.txt&data-0=%22hello%20world%22&lastModified-0=Tue,%2015%20Nov%201994%2012:45:26%20GMT&contentType-0=text/plain&collection-0=testpush + + public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) { + final Switchboard sb = (Switchboard) env; + final serverObjects prop = new serverObjects(); + + // display mode: this only helps to display a nice input form for test cases + int c = post == null ? 
1 : post.getInt("c", 0); + if (c > 0) { + prop.put("mode", 0); + for (int i = 0; i < c; i++) prop.put("mode_input_" + i + "_count", i); + prop.put("mode_input", c); + prop.put("mode_count", c); + return prop; + } + + // push mode: this does a document upload + prop.put("mode", 1); + if (post == null) return prop; + boolean synchronous = post.getBoolean("synchronous"); + boolean commit = post.getBoolean("commit"); + int count = post.getInt("count", 0); + boolean successall = true; + int countsuccess = 0; + int countfail = 0; + for (int i = 0; i < count; i++) { + try { + prop.put("mode_results_" + i + "_item", i); + String u = post.get("url-" + i, ""); + prop.put("mode_results_" + i + "_url", u); + DigestURL url = new DigestURL(u); + String collection = post.get("collection-" + i, ""); + String lastModified = post.get("lastModified-" + i, ""); // must be in RFC1123 format + String contentType = post.get("contentType-" + i, ""); + String data64 = post.get("data-" + i, ""); // file uploads are base64encoded in YaCyDefaultServlet.parseMultipart + byte[] data = Base64Order.standardCoder.decode(data64); + if ((data == null || data.length == 0) && data64.length() > 0) data = UTF8.getBytes(data64); // for test cases + + // create response header + final RequestHeader requestHeader = new RequestHeader(); + final ResponseHeader responseHeader = new ResponseHeader(200); + responseHeader.put(HeaderFramework.LAST_MODIFIED, lastModified); + responseHeader.put(HeaderFramework.CONTENT_TYPE, contentType); + responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(data.length)); + CrawlProfile profile = sb.crawler.getPushCrawlProfile(collection); + + // create requests and artificial response + final Request request = new Request( + ASCII.getBytes(sb.peers.mySeed().hash), + url, + null, // referrer hash + "", // the name of the document to crawl + new Date(), // current date + profile.handle(), // the name of the prefetch profile. This must not be null! 
+ 0, // depth the crawling depth of the entry + 0, // anchors number of anchors of the parent + 0); // forkfactor sum of anchors of all ancestors + Response response = new Response( + request, + requestHeader, + responseHeader, + profile, + false, // from cache? + data); // content + + // asynchronously push the content to the indexing queue + sb.indexingDocumentProcessor.enQueue(new IndexingQueueEntry( + response, + null, + null)); + prop.put("mode_results_" + i + "_success", "1"); + prop.put("mode_results_" + i + "_success_message", "http://" + Domains.myPublicLocalIP().getHostAddress() + ":" + sb.getConfigInt("port", 8090) + "/solr/select?q=sku:%22" + u + "%22"); + countsuccess++; + } catch (MalformedURLException e) { + e.printStackTrace(); + prop.put("mode_results_" + i + "_success", "0"); + prop.put("mode_results_" + i + "_success_message", e.getMessage()); + successall = false; + countfail++; + } + } + prop.put("mode_results", count); + prop.put("mode_successall", successall ? "1" : "0"); + prop.put("mode_count", count); + prop.put("mode_countsuccess", countsuccess); + prop.put("mode_countfail", countfail); + + if (synchronous && commit) sb.index.fulltext().commit(true); + + return prop; + } + +} diff --git a/htroot/api/push_p.json b/htroot/api/push_p.json new file mode 100644 index 000000000..7b69b22e6 --- /dev/null +++ b/htroot/api/push_p.json @@ -0,0 +1,14 @@ +{#(mode)#:: + "count":"#[count]#", + "successall": #(successall)#"false"::"true"#(/successall)#, +#{results}# + "item-#[item]#":{ + "item":"#[item]#", + "url":"#[url]#", + "success": #(success)#"false"::"true"#(/success)#, + "message": #(success)#"#[message]#"::"#[message]#"#(/success)# + }, +#{/results}# + "countsuccess":#[countsuccess]#, + "countfail":#[countfail]# +#(/mode)#} diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java index 85411379d..bfcb399e1 100644 --- a/source/net/yacy/crawler/CrawlSwitchboard.java +++ 
b/source/net/yacy/crawler/CrawlSwitchboard.java @@ -67,6 +67,7 @@ public final class CrawlSwitchboard { public static final String CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA = "snippetLocalMedia"; public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA = "snippetGlobalMedia"; public static final String CRAWL_PROFILE_SURROGATE = "surrogates"; + public static final String CRAWL_PROFILE_PUSH_STUB = "push_"; public static Set DEFAULT_PROFILES = new HashSet(); static { @@ -96,12 +97,9 @@ public final class CrawlSwitchboard { private final MapHeap profilesPassiveCrawls; private final Map profilesActiveCrawlsCache; //TreeMap(Base64Order.enhancedCoder); private final Map profilesActiveCrawlsCounter; - public CrawlProfile defaultProxyProfile; - public CrawlProfile defaultRemoteProfile; - public CrawlProfile defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile; - public CrawlProfile defaultTextGreedyLearningProfile; - public CrawlProfile defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile; - public CrawlProfile defaultSurrogateProfile; + public CrawlProfile defaultProxyProfile, defaultRemoteProfile, defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile; + public CrawlProfile defaultTextGreedyLearningProfile, defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile, defaultSurrogateProfile; + private Map defaultPushProfiles; // for each collection one profile private final File queuesRoot; private Switchboard switchboard; @@ -110,6 +108,7 @@ public final class CrawlSwitchboard { this.switchboard = switchboard; this.log = this.switchboard.log; this.queuesRoot = this.switchboard.queuesRoot; + this.defaultPushProfiles = new ConcurrentHashMap<>(); this.log.info("Initializing Word Index for the network '" + networkName + "'."); if ( networkName == null || networkName.isEmpty() ) { @@ -493,7 +492,7 @@ public final class CrawlSwitchboard { false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, - true, true, true, + 
true, true, false, true, false, false, @@ -505,6 +504,38 @@ public final class CrawlSwitchboard { UTF8.getBytes(this.defaultSurrogateProfile.handle()), this.defaultSurrogateProfile); } + + public CrawlProfile getPushCrawlProfile(String collection) { + CrawlProfile genericPushProfile = this.defaultPushProfiles.get(collection); + if (genericPushProfile != null) return genericPushProfile; + genericPushProfile = new CrawlProfile( + CRAWL_PROFILE_PUSH_STUB + collection, + CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch + CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch + CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch + CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch + CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch + CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch + CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch + CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch + CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch + CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch + 0, + false, + System.currentTimeMillis(), + -1, + true, true, false, + true, + true, + false, + false, + CacheStrategy.NOCACHE, + collection, + ClientIdentification.yacyIntranetCrawlerAgentName); + this.profilesActiveCrawls.put(UTF8.getBytes(genericPushProfile.handle()), genericPushProfile); + this.defaultPushProfiles.put(collection, genericPushProfile); + return genericPushProfile; + } private void resetProfiles() { this.profilesActiveCrawlsCache.clear(); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 28c567155..5677b6cf1 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2069,7 +2069,6 @@ public final class Switchboard extends serverSwitch { CrawlProfile selentry; for ( final byte[] handle : this.crawler.getActive() ) { selentry = this.crawler.getActive(handle); - assert selentry.handle() != null : "profile.name = " + 
selentry.collectionName(); if ( selentry.handle() == null ) { this.crawler.removeActive(handle); continue;