diff --git a/htroot/CrawlStartExpert.html b/htroot/CrawlStartExpert.html
index 6ae89a7a1..c05569c73 100644
--- a/htroot/CrawlStartExpert.html
+++ b/htroot/CrawlStartExpert.html
@@ -460,6 +460,28 @@
             #(/agentSelect)#
 
+            #(snapshotSelect)#::
+            <fieldset>
+                <legend>Snapshot Creation</legend>
+                <dl>
+                    <dt></dt>
+                    <dd>
+                        <span class="info"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span>
+                            Snapshots are pictures of web pages that can be created during crawl time.
+                            They are first stored as PDF files in subdirectories of HTCACHE/SNAPSHOTS/ and are later converted from PDF to JPG.
+                            Snapshot generation can be controlled with a depth parameter: a snapshot is generated only if the crawl depth of a
+                            document is smaller than or equal to the number given here. If the number is set to -1, no snapshots are generated.
+                        </span></span>
+                        <input type="text" name="snapshotsMaxDepth" id="snapshotsMaxDepth" size="2" maxlength="2" value="-1" />
+                    </dd>
+                    <dt></dt>
+                    <dd>
+                        <input type="radio" name="snapshotsReplaceOld" value="on" checked="checked" /> replace old snapshots with the new one &nbsp;&nbsp;&nbsp;&nbsp;
+                        <input type="radio" name="snapshotsReplaceOld" value="off" /> add new versions for each crawl
+                    </dd>
+                </dl>
+            </fieldset>
+            #(/snapshotSelect)#
 
             <fieldset>
                 <legend>Index Administration</legend>
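Review note: the depth rule described in the info text above is the same comparison the loader applies later in this patch (`entry.depth() <= profile.snapshotMaxdepth()`). Because crawl depths are never negative, the default of -1 disables snapshots without needing a special case. A tiny illustration of the semantics (the helper name is hypothetical, not part of the patch):

```java
// Illustrates the snapshotsMaxDepth semantics described in the help text.
static boolean snapshotWanted(final int crawlDepth, final int snapshotsMaxDepth) {
    // crawl depths start at 0, so a maximum depth of -1 can never match
    return crawlDepth <= snapshotsMaxDepth;
}
```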
diff --git a/htroot/CrawlStartExpert.java b/htroot/CrawlStartExpert.java
index b3f2a6640..425ba8b89 100644
--- a/htroot/CrawlStartExpert.java
+++ b/htroot/CrawlStartExpert.java
@@ -29,6 +29,7 @@ import java.util.List;
 
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
+import net.yacy.cora.util.Html2Image;
 import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.search.Switchboard;
 import net.yacy.search.schema.CollectionSchema;
@@ -511,6 +512,15 @@ public class CrawlStartExpert {
                     ClientIdentification.yacyInternetCrawlerAgentName);
 
+        // ---------- Snapshot generation
+        if (sb.getConfigBool("isTransparentProxy", false) &&
+            sb.getConfigBool("proxyAlwaysFresh", false) &&
+            Html2Image.wkhtmltopdfAvailable() && Html2Image.convertAvailable()) {
+            prop.put("snapshotSelect", 1);
+        } else {
+            prop.put("snapshotSelect", 0);
+        }
+
         // ---------- Index Administration
         // Do Local Indexing
         if (post == null) {
@@ -548,7 +558,7 @@ public class CrawlStartExpert {
                 prop.put("collection", collectionEnabled ? defaultCollection : "");
             }
         }
-        
+
         // return rewrite properties
         return prop;
     }
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 5fc6e2d4d..b9e44265a 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -436,6 +436,11 @@ public class Crawler_p {
                 // check crawlurl was given in sitecrawl
                 if ("url".equals(crawlingMode) && rootURLs.size() == 0) hasCrawlstartDataOK = false;
             }
+
+            String snapshotsMaxDepthString = post.get("snapshotsMaxDepth", "-1");
+            int snapshotsMaxDepth = Integer.parseInt(snapshotsMaxDepthString);
+            boolean snapshotsReplaceOld = post.getBoolean("snapshotsReplaceOld");
+
             // prepare a new crawling profile
             final CrawlProfile profile;
             byte[] handle;
@@ -462,7 +467,8 @@ public class Crawler_p {
                         indexMedia,
                         storeHTCache,
                         crawlOrder,
-                        -1, // temporary; stub commit
+                        snapshotsMaxDepth,
+                        snapshotsReplaceOld,
                         cachePolicy,
                         collection,
                         agentName);
diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java
index dfad2b5a2..7f288c821 100644
--- a/htroot/QuickCrawlLink_p.java
+++ b/htroot/QuickCrawlLink_p.java
@@ -152,7 +152,7 @@ public class QuickCrawlLink_p {
                 obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                 indexText, indexMedia,
                 storeHTCache, remoteIndexing,
-                -1,
+                -1, true,
                 CacheStrategy.IFFRESH,
                 collection,
                 ClientIdentification.yacyIntranetCrawlerAgentName);
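Review note on the `Crawler_p` change above: `Integer.parseInt(snapshotsMaxDepthString)` throws a `NumberFormatException` on malformed form input, which would abort the crawl start. A defensive variant (a sketch, not part of this patch) could fall back to the disabled value:

```java
// Sketch: tolerate malformed "snapshotsMaxDepth" form input.
int snapshotsMaxDepth;
try {
    snapshotsMaxDepth = Integer.parseInt(post.get("snapshotsMaxDepth", "-1"));
} catch (final NumberFormatException e) {
    snapshotsMaxDepth = -1; // malformed input disables snapshots
}
```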
diff --git a/source/net/yacy/cora/util/Html2Image.java b/source/net/yacy/cora/util/Html2Image.java
index 3088e604a..a5e459632 100644
--- a/source/net/yacy/cora/util/Html2Image.java
+++ b/source/net/yacy/cora/util/Html2Image.java
@@ -37,12 +37,8 @@ import java.awt.Graphics;
 import java.awt.image.BufferedImage;
 import java.beans.PropertyChangeEvent;
 import java.beans.PropertyChangeListener;
-import java.io.BufferedReader;
 import java.io.File;
 import java.io.IOException;
-import java.io.InputStreamReader;
-import java.util.ArrayList;
-import java.util.List;
 
 public class Html2Image {
 
@@ -58,11 +54,11 @@ public class Html2Image {
 
     private final static File convertDebian = new File("/usr/bin/convert");
 
-    public boolean wkhtmltopdfAvailable() {
+    public static boolean wkhtmltopdfAvailable() {
         return wkhtmltopdfMac.exists() || wkhtmltopdfDebian.exists();
     }
 
-    public boolean convertAvailable() {
+    public static boolean convertAvailable() {
         return convertMac.exists() || convertDebian.exists();
     }
 
@@ -77,7 +73,7 @@ public class Html2Image {
         final File wkhtmltopdf = wkhtmltopdfMac.exists() ? wkhtmltopdfMac : wkhtmltopdfDebian;
         try {
-            OS.execSynchronous(wkhtmltopdf.getAbsolutePath() + (proxy == null ? " " : " --proxy " + proxy + " ") + url + " " + destination.getAbsolutePath());
+            OS.execSynchronous(wkhtmltopdf.getAbsolutePath() + " --title " + url + (proxy == null ? " " : " --proxy " + proxy + " ") + url + " " + destination.getAbsolutePath());
             return destination.exists();
         } catch (IOException e) {
             e.printStackTrace();
diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java
index 7fb2af71e..11acbcf92 100644
--- a/source/net/yacy/crawler/CrawlSwitchboard.java
+++ b/source/net/yacy/crawler/CrawlSwitchboard.java
@@ -293,7 +293,7 @@ public final class CrawlSwitchboard {
                 sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_MEDIA, true),
                 true,
                 sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_REMOTE, false),
-                -1,
+                -1, true,
                 CacheStrategy.IFFRESH,
                 "robot_" + CRAWL_PROFILE_PROXY,
                 ClientIdentification.yacyProxyAgentName);
@@ -323,7 +323,7 @@ public final class CrawlSwitchboard {
                 true,
                 false,
                 false,
-                -1,
+                -1, true,
                 CacheStrategy.IFFRESH,
                 "robot_" + CRAWL_PROFILE_REMOTE,
                 ClientIdentification.yacyInternetCrawlerAgentName);
@@ -353,7 +353,7 @@ public final class CrawlSwitchboard {
                 false,
                 true,
                 false,
-                -1,
+                -1, true,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
                 ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -383,7 +383,7 @@ public final class CrawlSwitchboard {
                 true,
                 true,
                 false,
-                -1,
+                -1, true,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
                 ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -414,7 +414,7 @@ public final class CrawlSwitchboard {
                 false,
                 true,
                 false,
-                -1,
+                -1, true,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT,
                 ClientIdentification.browserAgentName);
@@ -444,7 +444,7 @@ public final class CrawlSwitchboard {
                 false,
                 true,
                 false,
-                -1,
+                -1, true,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
                 ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -474,7 +474,7 @@ public final class CrawlSwitchboard {
                 true,
                 true,
                 false,
-                -1,
+                -1, true,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
                 ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -504,7 +504,7 @@ public final class CrawlSwitchboard {
                 false,
                 false,
                 false,
-                -1,
+                -1, true,
                 CacheStrategy.NOCACHE,
                 "robot_" + CRAWL_PROFILE_SURROGATE,
                 ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -537,7 +537,7 @@ public final class CrawlSwitchboard {
                 true,
                 false,
                 false,
-                -1,
+                -1, true,
                 CacheStrategy.NOCACHE,
                 collection,
                 ClientIdentification.yacyIntranetCrawlerAgentName);
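Making `wkhtmltopdfAvailable()` and `convertAvailable()` static lets `CrawlStartExpert` probe for the external tools without constructing an `Html2Image` instance, and the added `--title` flag records the source URL in the PDF metadata. A minimal usage sketch, assuming the `writeWkhtmltopdf` signature visible in the `Snapshots` change below (class name, target path, and URL are illustrative):

```java
import java.io.File;

import net.yacy.cora.util.Html2Image;

public class SnapshotProbe { // hypothetical demo class, not part of the patch
    public static void main(final String[] args) {
        // Only attempt a snapshot when both external tools are installed.
        if (Html2Image.wkhtmltopdfAvailable() && Html2Image.convertAvailable()) {
            final File pdf = new File("/tmp/example.pdf"); // illustrative target path
            // a null proxy fetches the page directly instead of via the local YaCy proxy
            final boolean ok = Html2Image.writeWkhtmltopdf("http://example.org/", null, pdf);
            System.out.println("snapshot written: " + ok);
        }
    }
}
```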
diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java
index c44cd6fea..e4ae27b39 100644
--- a/source/net/yacy/crawler/data/CrawlProfile.java
+++ b/source/net/yacy/crawler/data/CrawlProfile.java
@@ -86,7 +86,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
     public static final String INDEXING_URL_MUSTNOTMATCH     = "indexURLMustNotMatch";
     public static final String INDEXING_CONTENT_MUSTMATCH    = "indexContentMustMatch";
     public static final String INDEXING_CONTENT_MUSTNOTMATCH = "indexContentMustNotMatch";
-    public static final String LOADPREVIEWMAXDEPTH           = "loadpreviewmaxdepth"; // if previews shall be loaded, this is positive and denotes the maximum depth; if not, this is -1
+    public static final String SNAPSHOTS_MAXDEPTH            = "snapshotsMaxDepth"; // if snapshots shall be generated, this is positive and denotes the maximum depth; if not, this is -1
+    public static final String SNAPSHOTS_REPLACEOLD          = "snapshotsReplaceOld"; // if true, only one version of a snapshot per day is stored; otherwise several versions per day are kept
 
     private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
     private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;
@@ -142,7 +143,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
             final boolean indexMedia,
             final boolean storeHTCache,
             final boolean remoteIndexing,
-            final int loadPreviewMaxdepth,
+            final int snapshotsMaxDepth,
+            final boolean snapshotsReplaceOld,
             final CacheStrategy cacheStrategy,
             final String collections,
             final String userAgentName) {
@@ -178,7 +180,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
         put(INDEX_MEDIA, indexMedia);
         put(STORE_HTCACHE, storeHTCache);
         put(REMOTE_INDEXING, remoteIndexing);
-        put(LOADPREVIEWMAXDEPTH, loadPreviewMaxdepth);
+        put(SNAPSHOTS_MAXDEPTH, snapshotsMaxDepth);
+        put(SNAPSHOTS_REPLACEOLD, snapshotsReplaceOld);
         put(CACHE_STRAGEGY, cacheStrategy.toString());
         put(COLLECTIONS, CommonPattern.SPACE.matcher(collections.trim()).replaceAll(""));
     }
@@ -575,8 +578,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
         return (r.equals(Boolean.TRUE.toString()));
     }
 
-    public int loadPreviewMaxdepth() {
-        final String r = get(LOADPREVIEWMAXDEPTH);
+    public int snapshotMaxdepth() {
+        final String r = get(SNAPSHOTS_MAXDEPTH);
         if (r == null) return -1;
         try {
             final int i = Integer.parseInt(r);
@@ -588,6 +591,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
         }
     }
 
+    public boolean snapshotReplaceold() {
+        final String r = get(SNAPSHOTS_REPLACEOLD);
+        if (r == null) return false;
+        return (r.equals(Boolean.TRUE.toString()));
+    }
+
     /**
      * get a recrawl date for a given age in minutes
      * @param oldTimeMinutes
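The renamed accessors keep defensive defaults, so profiles written before this change (which lack the new keys) still load: `snapshotMaxdepth()` falls back to -1 and `snapshotReplaceold()` to false. A short sketch reading the settings back with the accessors added above (the `profile` variable is illustrative):

```java
// Reading the snapshot settings from a CrawlProfile; the defaults apply
// when the keys are missing, e.g. in profiles from older versions.
final int maxDepth = profile.snapshotMaxdepth();      // -1 when unset: snapshots disabled
final boolean replace = profile.snapshotReplaceold(); // false when unset: keep all versions
if (maxDepth >= 0) {
    // snapshots will be generated for documents up to this crawl depth
}
```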
diff --git a/source/net/yacy/crawler/data/Snapshots.java b/source/net/yacy/crawler/data/Snapshots.java
index 404176af1..fb9fe3f34 100644
--- a/source/net/yacy/crawler/data/Snapshots.java
+++ b/source/net/yacy/crawler/data/Snapshots.java
@@ -31,6 +31,8 @@ import org.apache.solr.common.SolrDocument;
 import net.yacy.cora.date.GenericFormatter;
 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.id.DigestURL;
+import net.yacy.cora.document.id.MultiProtocolURL;
+import net.yacy.cora.util.Html2Image;
 import net.yacy.search.index.Fulltext;
 import net.yacy.search.schema.CollectionSchema;
@@ -68,13 +70,15 @@
      * @param proxy - a string of the form 'http://<host>:<port>'
      * @return
      */
-    public File downloadPDFSnapshot(final DigestURL url, final int depth, final Date date, String proxy) {
+    public File downloadPDFSnapshot(final DigestURL url, final int depth, final Date date, boolean replaceOld, String proxy) {
+        Collection<File> oldPaths = findPaths(url, depth);
+        if (replaceOld) {
+            for (File oldPath: oldPaths) oldPath.delete();
+        }
         File path = definePath(url, "pdf", depth, date);
         path.getParentFile().mkdirs();
-
-        // STUB
-
-        return path;
+        boolean success = Html2Image.writeWkhtmltopdf(url.toNormalform(true), proxy, path);
+        return success ? path : null;
     }
@@ -122,9 +126,9 @@
      * @param ext
      * @return a set of files for snapshots of the url
      */
-    public Collection<File> findPaths(final DigestURL url, final String ext) {
+    public Collection<File> findPaths(final DigestURL url) {
         for (int i = 0; i < 100; i++) {
-            Collection<File> paths = findPaths(url, ext, i);
+            Collection<File> paths = findPaths(url, i);
             if (paths.size() > 0) return paths;
         }
         return new ArrayList<>(0);
     }
@@ -138,20 +142,23 @@
      * @param depth
      * @return a set of files for snapshots of the url
      */
-    public Collection<File> findPaths(final DigestURL url, final String ext, final int depth) {
+    public Collection<File> findPaths(final DigestURL url, final int depth) {
         String id = ASCII.String(url.hash());
         File pathToShard = pathToShard(url, depth);
-        String[] list = pathToShard.list();
+        String[] list = pathToShard.exists() && pathToShard.isDirectory() ? pathToShard.list() : null; // may be null if path does not exist
         ArrayList<File> paths = new ArrayList<>();
-        for (String f: list) {
-            if (f.startsWith(id) && f.endsWith(ext)) paths.add(new File(pathToShard, f));
+        if (list != null) {
+            final String ext = MultiProtocolURL.getFileExtension(url.getFileName());
+            for (String f: list) {
+                if (f.startsWith(id) && f.endsWith(ext)) paths.add(new File(pathToShard, f));
+            }
         }
         return paths;
     }
 
     private File pathToShard(final DigestURL url, final int depth) {
         String id = ASCII.String(url.hash());
-        File pathToHostDir = new File(storageLocation, url.getHost() + ":" + url.getPort());
+        File pathToHostDir = new File(storageLocation, url.getHost() + "." + url.getPort());
         File pathToDepthDir = new File(pathToHostDir, depth < 10 ? "0" + depth : Integer.toString(depth));
         File pathToShard = new File(pathToDepthDir, id.substring(0, 2));
         return pathToShard;
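The storage layout produced by `pathToShard` is `<host>.<port>/<two-digit depth>/<first two characters of the URL hash>` below the storage location; switching the host/port separator from `:` to `.` also avoids a character that is not allowed in file names on Windows. An illustration of the resulting path (host, port, depth, and hash prefix are made-up values):

```java
import java.io.File;

// Mirrors pathToShard's directory layout with plain strings; illustrative only.
final File storageLocation = new File("DATA/HTCACHE/SNAPSHOTS");
final String host = "example.org";
final int port = 80, depth = 2;
final String hashPrefix = "ab"; // first two characters of the URL hash
final File shard = new File(new File(new File(storageLocation,
        host + "." + port),
        depth < 10 ? "0" + depth : Integer.toString(depth)),
        hashPrefix);
// shard -> DATA/HTCACHE/SNAPSHOTS/example.org.80/02/ab
```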
"could not generate snapshot for " + entry.url().toNormalform(true) : "wrote " + snapshotFile + " for " + entry.url().toNormalform(true))); } return doc; } diff --git a/source/net/yacy/data/ymark/YMarkCrawlStart.java b/source/net/yacy/data/ymark/YMarkCrawlStart.java index 6b0899a6d..103af2add 100644 --- a/source/net/yacy/data/ymark/YMarkCrawlStart.java +++ b/source/net/yacy/data/ymark/YMarkCrawlStart.java @@ -186,7 +186,7 @@ public class YMarkCrawlStart extends HashMap{ crawlingQ, true, true, true, false, true, true, false, - -1, + -1, true, CacheStrategy.IFFRESH, "robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, ClientIdentification.yacyIntranetCrawlerAgentName); // TODO: make this a default profile in CrawlSwitchboard diff --git a/source/net/yacy/http/ProxyCacheHandler.java b/source/net/yacy/http/ProxyCacheHandler.java index 58f3efd03..392372937 100644 --- a/source/net/yacy/http/ProxyCacheHandler.java +++ b/source/net/yacy/http/ProxyCacheHandler.java @@ -70,7 +70,7 @@ public class ProxyCacheHandler extends AbstractRemoteHandler implements Handler final net.yacy.crawler.retrieval.Request yacyRequest = new net.yacy.crawler.retrieval.Request( null, url, - proxyHeaders.referer() == null ? null : new DigestURL(proxyHeaders.referer().toString()).hash(), + proxyHeaders.referer() == null ? null : new DigestURL(proxyHeaders.referer().toNormalform(true)).hash(), "", cachedResponseHeader.lastModified(), sb.crawler.defaultProxyProfile.handle(), diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 571b8221f..74bd8293d 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -346,7 +346,6 @@ public final class Switchboard extends serverSwitch { this.htDocsPath = getDataPath(SwitchboardConstants.HTDOCS_PATH, SwitchboardConstants.HTDOCS_PATH_DEFAULT); this.log.config("HTDOCS Path: " + this.htDocsPath.toString()); - this.snapshots = new Snapshots(new File(this.htDocsPath, "SNAPSHOTS")); this.workPath = getDataPath(SwitchboardConstants.WORK_PATH, SwitchboardConstants.WORK_PATH_DEFAULT); this.workPath.mkdirs(); // if default work files exist, copy them (don't overwrite existing!) @@ -695,7 +694,8 @@ public final class Switchboard extends serverSwitch { final long maxCacheSize = 1024L * 1024L * Long.parseLong(getConfig(SwitchboardConstants.PROXY_CACHE_SIZE, "2")); // this is megabyte Cache.init(this.htCachePath, this.peers.mySeed().hash, maxCacheSize); - + this.snapshots = new Snapshots(new File(this.htCachePath, "SNAPSHOTS")); + // create the surrogates directories this.surrogatesInPath = getDataPath(