From 97f6089a41a4ed40aef84f692690e30f50585f5d Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Mon, 1 Dec 2014 15:03:09 +0100
Subject: [PATCH] YaCy can now create web page snapshots as pdf documents
 which can later be transcoded into jpg for image previews.

To create such pdfs you must first add wkhtmltopdf and imagemagick to your OS:
On a Mac, download wkhtmltox-0.12.1_osx-cocoa-x86-64.pkg from
http://wkhtmltopdf.org/downloads.html and download
http://cactuslab.com/imagemagick/assets/ImageMagick-6.8.9-9.pkg.zip
On Debian, run "apt-get install wkhtmltopdf imagemagick"

Then, in /Settings_p.html?page=ProxyAccess, check "Transparent Proxy" and
"Always Fresh" - these are used by wkhtmltopdf to fetch web pages through the
YaCy proxy. With "Always Fresh" all pages can be served from the proxy cache.

Finally, you will see a new option when starting an expert web crawl: a
maximum crawl depth up to which pdf snapshots are generated. The resulting
pdfs are then available in
DATA/HTCACHE/SNAPSHOTS/<host>.<port>/<depth>/<shard>/<urlhash>.<date>.pdf
---
 htroot/CrawlStartExpert.html                  | 22 +++++++++++++
 htroot/CrawlStartExpert.java                  | 12 ++++++-
 htroot/Crawler_p.java                         |  8 ++++-
 htroot/QuickCrawlLink_p.java                  |  2 +-
 source/net/yacy/cora/util/Html2Image.java     | 10 ++----
 source/net/yacy/crawler/CrawlSwitchboard.java | 18 +++++------
 .../net/yacy/crawler/data/CrawlProfile.java   | 19 +++++++++---
 source/net/yacy/crawler/data/Snapshots.java   | 31 ++++++++++++-------
 .../yacy/crawler/retrieval/HTTPLoader.java    |  9 ++++--
 .../net/yacy/data/ymark/YMarkCrawlStart.java  |  2 +-
 source/net/yacy/http/ProxyCacheHandler.java   |  2 +-
 source/net/yacy/search/Switchboard.java       |  4 +--
 12 files changed, 97 insertions(+), 42 deletions(-)

diff --git a/htroot/CrawlStartExpert.html b/htroot/CrawlStartExpert.html
index 6ae89a7a1..c05569c73 100644
--- a/htroot/CrawlStartExpert.html
+++ b/htroot/CrawlStartExpert.html
@@ -460,6 +460,28 @@
 #(/agentSelect)#
+ #(snapshotSelect)#::
+
+ Snapshot Creation
+
+
+
+ info
+ Snapshots are pictures of web pages that can be created during crawl time. They are first stored as pdf in subdirectories
+ of HTCACHE/SNAPSHOTS/ and are later converted from the pdfs to jpg. Snapshot generation can be controlled using a depth parameter; that
+ means a snapshot is only generated if the crawl depth of a document is less than or equal to the number given here. If the number is set to -1,
+ no snapshots are generated.
+
+
+
+
+ replace old snapshots with the new one&nbsp;&nbsp;&nbsp;
+ add new versions for each crawl
+
+
+
+ #(/snapshotSelect)#
Index Administration
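
Note on the template switch above: the #(snapshotSelect)# block is only rendered when both proxy flags are set and both external tools are installed, as the next CrawlStartExpert.java hunk shows. Below is a minimal standalone sketch of such an availability probe; the Debian binary paths appear in the Html2Image diff further down, while the Mac paths here are assumptions for illustration only.

import java.io.File;

public class SnapshotFeatureGate {

    private static final File WKHTMLTOPDF_DEBIAN = new File("/usr/bin/wkhtmltopdf");
    private static final File WKHTMLTOPDF_MAC = new File("/usr/local/bin/wkhtmltopdf"); // assumed location
    private static final File CONVERT_DEBIAN = new File("/usr/bin/convert");
    private static final File CONVERT_MAC = new File("/opt/local/bin/convert"); // assumed location

    /** true only if both external tools can be found on this host */
    public static boolean toolsAvailable() {
        return (WKHTMLTOPDF_DEBIAN.exists() || WKHTMLTOPDF_MAC.exists())
            && (CONVERT_DEBIAN.exists() || CONVERT_MAC.exists());
    }

    public static void main(String[] args) {
        // combined with the two proxy flags, a check like this drives prop.put("snapshotSelect", 0|1)
        System.out.println("snapshot tools available: " + toolsAvailable());
    }
}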
diff --git a/htroot/CrawlStartExpert.java b/htroot/CrawlStartExpert.java
index b3f2a6640..425ba8b89 100644
--- a/htroot/CrawlStartExpert.java
+++ b/htroot/CrawlStartExpert.java
@@ -29,6 +29,7 @@ import java.util.List;
 
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
+import net.yacy.cora.util.Html2Image;
 import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.search.Switchboard;
 import net.yacy.search.schema.CollectionSchema;
@@ -511,6 +512,15 @@ public class CrawlStartExpert {
                     ClientIdentification.yacyInternetCrawlerAgentName);
         }
 
+        // ---------- Snapshot generation
+        if (sb.getConfigBool("isTransparentProxy", false) &&
+            sb.getConfigBool("proxyAlwaysFresh", false) &&
+            Html2Image.wkhtmltopdfAvailable() && Html2Image.convertAvailable()) {
+            prop.put("snapshotSelect", 1);
+        } else {
+            prop.put("snapshotSelect", 0);
+        }
+
         // ---------- Index Administration
         // Do Local Indexing
         if (post == null) {
@@ -548,7 +558,7 @@ public class CrawlStartExpert {
             prop.put("collection", collectionEnabled ? defaultCollection : "");
         }
     }
-    
+
     // return rewrite properties
     return prop;
 }

diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 5fc6e2d4d..b9e44265a 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -436,6 +436,11 @@ public class Crawler_p {
             // check crawlurl was given in sitecrawl
             if ("url".equals(crawlingMode) && rootURLs.size() == 0) hasCrawlstartDataOK = false;
         }
+
+        String snapshotsMaxDepthString = post.get("snapshotsMaxDepth", "-1");
+        int snapshotsMaxDepth = Integer.parseInt(snapshotsMaxDepthString);
+        boolean snapshotsReplaceOld = post.getBoolean("snapshotsReplaceOld");
+
         // prepare a new crawling profile
         final CrawlProfile profile;
         byte[] handle;
@@ -462,7 +467,8 @@ public class Crawler_p {
                     indexMedia,
                     storeHTCache,
                     crawlOrder,
-                    -1, // temporary; stub commit
+                    snapshotsMaxDepth,
+                    snapshotsReplaceOld,
                     cachePolicy,
                     collection,
                     agentName);

diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java
index dfad2b5a2..7f288c821 100644
--- a/htroot/QuickCrawlLink_p.java
+++ b/htroot/QuickCrawlLink_p.java
@@ -152,7 +152,7 @@ public class QuickCrawlLink_p {
                 obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                 indexText, indexMedia,
                 storeHTCache, remoteIndexing,
-                -1,
+                -1, true,
                 CacheStrategy.IFFRESH,
                 collection,
                 ClientIdentification.yacyIntranetCrawlerAgentName);

diff --git a/source/net/yacy/cora/util/Html2Image.java b/source/net/yacy/cora/util/Html2Image.java
index 3088e604a..a5e459632 100644
--- a/source/net/yacy/cora/util/Html2Image.java
+++ b/source/net/yacy/cora/util/Html2Image.java
@@ -37,12 +37,8 @@ import java.awt.Graphics;
 import java.awt.image.BufferedImage;
 import java.beans.PropertyChangeEvent;
 import java.beans.PropertyChangeListener;
-import java.io.BufferedReader;
 import java.io.File;
 import java.io.IOException;
-import java.io.InputStreamReader;
-import java.util.ArrayList;
-import java.util.List;
 
 public class Html2Image {
 
@@ -58,11 +54,11 @@ public class Html2Image {
     private final static File convertDebian = new File("/usr/bin/convert");
 
-    public boolean wkhtmltopdfAvailable() {
+    public static boolean wkhtmltopdfAvailable() {
         return wkhtmltopdfMac.exists() || wkhtmltopdfDebian.exists();
     }
 
-    public boolean convertAvailable() {
+    public static boolean convertAvailable() {
         return convertMac.exists() || convertDebian.exists();
     }
 
@@ -77,7 +73,7 @@ public class Html2Image {
         final File wkhtmltopdf = wkhtmltopdfMac.exists() ? wkhtmltopdfMac : wkhtmltopdfDebian;
         try {
-            OS.execSynchronous(wkhtmltopdf.getAbsolutePath() + (proxy == null ? " " : " --proxy " + proxy + " ") + url + " " + destination.getAbsolutePath());
+            OS.execSynchronous(wkhtmltopdf.getAbsolutePath() + " --title " + url + (proxy == null ? " " : " --proxy " + proxy + " ") + url + " " + destination.getAbsolutePath());
             return destination.exists();
         } catch (IOException e) {
             e.printStackTrace();
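
The writeWkhtmltopdf change above shells out with a concatenated command string via YaCy's own OS.execSynchronous helper. As a hedged standalone sketch, the same invocation can be expressed with ProcessBuilder, which passes the URL, proxy, and destination as discrete arguments; it only assumes a wkhtmltopdf binary on the PATH and is not the code the patch installs.

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class PdfSnapshot {

    public static boolean writePdf(String url, String proxy, File destination)
            throws IOException, InterruptedException {
        List<String> cmd = new ArrayList<>();
        cmd.add("wkhtmltopdf");
        cmd.add("--title");
        cmd.add(url);                      // embed the source URL as the pdf title, as the patch does
        if (proxy != null) {
            cmd.add("--proxy");
            cmd.add(proxy);                // e.g. "http://127.0.0.1:8090"
        }
        cmd.add(url);
        cmd.add(destination.getAbsolutePath());
        Process p = new ProcessBuilder(cmd).inheritIO().start();
        return p.waitFor() == 0 && destination.exists();
    }

    public static void main(String[] args) throws Exception {
        // hypothetical values: YaCy's transparent proxy on localhost and an example target URL
        boolean ok = writePdf("http://example.org/", "http://127.0.0.1:8090", new File("snapshot.pdf"));
        System.out.println("pdf written: " + ok);
    }
}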
diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java
index 7fb2af71e..11acbcf92 100644
--- a/source/net/yacy/crawler/CrawlSwitchboard.java
+++ b/source/net/yacy/crawler/CrawlSwitchboard.java
@@ -293,7 +293,7 @@ public final class CrawlSwitchboard {
                 sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_MEDIA, true),
                 true,
                 sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_REMOTE, false),
-                -1,
+                -1, true,
                 CacheStrategy.IFFRESH,
                 "robot_" + CRAWL_PROFILE_PROXY,
                 ClientIdentification.yacyProxyAgentName);
@@ -323,7 +323,7 @@ public final class CrawlSwitchboard {
                 true,
                 false,
                 false,
-                -1,
+                -1, true,
                 CacheStrategy.IFFRESH,
                 "robot_" + CRAWL_PROFILE_REMOTE,
                 ClientIdentification.yacyInternetCrawlerAgentName);
@@ -353,7 +353,7 @@ public final class CrawlSwitchboard {
                 false,
                 true,
                 false,
-                -1,
+                -1, true,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
                 ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -383,7 +383,7 @@ public final class CrawlSwitchboard {
                 true,
                 true,
                 false,
-                -1,
+                -1, true,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
                 ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -414,7 +414,7 @@ public final class CrawlSwitchboard {
                 false,
                 true,
                 false,
-                -1,
+                -1, true,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT,
                 ClientIdentification.browserAgentName);
@@ -444,7 +444,7 @@ public final class CrawlSwitchboard {
                 false,
                 true,
                 false,
-                -1,
+                -1, true,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
                 ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -474,7 +474,7 @@ public final class CrawlSwitchboard {
                 true,
                 true,
                 false,
-                -1,
+                -1, true,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
                 ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -504,7 +504,7 @@ public final class CrawlSwitchboard {
                 false,
                 false,
                 false,
-                -1,
+                -1, true,
                 CacheStrategy.NOCACHE,
                 "robot_" + CRAWL_PROFILE_SURROGATE,
                 ClientIdentification.yacyIntranetCrawlerAgentName);
@@ -537,7 +537,7 @@ public final class CrawlSwitchboard {
                 true,
                 false,
                 false,
-                -1,
+                -1, true,
                 CacheStrategy.NOCACHE,
                 collection,
                 ClientIdentification.yacyIntranetCrawlerAgentName);
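
All of the default profiles above receive the same two new constructor arguments: -1 (snapshots disabled) and true (replace old snapshots). Named constants for these literals, purely as a readability aid; YaCy itself passes the literals directly, so the names below are illustrative only.

public final class SnapshotDefaults {
    /** a maximum depth of -1 turns snapshot generation off for a profile */
    public static final int SNAPSHOTS_OFF = -1;
    /** keep only the newest snapshot when one is generated */
    public static final boolean REPLACE_OLD = true;
}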
diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java
index c44cd6fea..e4ae27b39 100644
--- a/source/net/yacy/crawler/data/CrawlProfile.java
+++ b/source/net/yacy/crawler/data/CrawlProfile.java
@@ -86,7 +86,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
     public static final String INDEXING_URL_MUSTNOTMATCH = "indexURLMustNotMatch";
     public static final String INDEXING_CONTENT_MUSTMATCH = "indexContentMustMatch";
     public static final String INDEXING_CONTENT_MUSTNOTMATCH = "indexContentMustNotMatch";
-    public static final String LOADPREVIEWMAXDEPTH = "loadpreviewmaxdepth"; // if previews shall be loaded, this is positive and denotes the maximum depth; if not this is -1
+    public static final String SNAPSHOTS_MAXDEPTH = "snapshotsMaxDepth"; // if previews shall be loaded, this is positive and denotes the maximum depth; if not, this is -1
+    public static final String SNAPSHOTS_REPLACEOLD = "snapshotsReplaceOld"; // if set to true, only one version of a snapshot per day is stored; otherwise we also store different versions per day
 
     private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
     private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;
@@ -142,7 +143,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
                         final boolean indexMedia,
                         final boolean storeHTCache,
                         final boolean remoteIndexing,
-                        final int loadPreviewMaxdepth,
+                        final int snapshotsMaxDepth,
+                        final boolean snapshotsReplaceOld,
                         final CacheStrategy cacheStrategy,
                         final String collections,
                         final String userAgentName) {
@@ -178,7 +180,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         put(INDEX_MEDIA, indexMedia);
         put(STORE_HTCACHE, storeHTCache);
         put(REMOTE_INDEXING, remoteIndexing);
-        put(LOADPREVIEWMAXDEPTH, loadPreviewMaxdepth);
+        put(SNAPSHOTS_MAXDEPTH, snapshotsMaxDepth);
+        put(SNAPSHOTS_REPLACEOLD, snapshotsReplaceOld);
         put(CACHE_STRAGEGY, cacheStrategy.toString());
         put(COLLECTIONS, CommonPattern.SPACE.matcher(collections.trim()).replaceAll(""));
     }
@@ -575,8 +578,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         return (r.equals(Boolean.TRUE.toString()));
     }
 
-    public int loadPreviewMaxdepth() {
-        final String r = get(LOADPREVIEWMAXDEPTH);
+    public int snapshotMaxdepth() {
+        final String r = get(SNAPSHOTS_MAXDEPTH);
         if (r == null) return -1;
         try {
             final int i = Integer.parseInt(r);
@@ -588,6 +591,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         }
     }
 
+    public boolean snapshotReplaceold() {
+        final String r = get(SNAPSHOTS_REPLACEOLD);
+        if (r == null) return false;
+        return (r.equals(Boolean.TRUE.toString()));
+    }
+
     /**
      * get a recrawl date for a given age in minutes
      * @param oldTimeMinutes
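
CrawlProfile stores every setting as a string in the underlying map, so the new accessors above parse on read and fall back to a safe default on missing or malformed values. A small illustrative sketch of that pattern follows; class and method names here are hypothetical, not YaCy API.

import java.util.HashMap;
import java.util.Map;

public class ProfileProps {

    private final Map<String, String> props = new HashMap<>();

    public void put(String key, Object value) {
        props.put(key, String.valueOf(value));
    }

    public int getInt(String key, int dflt) {
        String r = props.get(key);
        if (r == null) return dflt;
        try {
            return Integer.parseInt(r);
        } catch (NumberFormatException e) {
            return dflt;   // mirror snapshotMaxdepth(): fall back on bad data
        }
    }

    public boolean getBool(String key) {
        return Boolean.TRUE.toString().equals(props.get(key));
    }

    public static void main(String[] args) {
        ProfileProps p = new ProfileProps();
        p.put("snapshotsMaxDepth", 2);
        p.put("snapshotsReplaceOld", true);
        System.out.println(p.getInt("snapshotsMaxDepth", -1));  // 2
        System.out.println(p.getInt("missingKey", -1));         // -1
        System.out.println(p.getBool("snapshotsReplaceOld"));   // true
    }
}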
diff --git a/source/net/yacy/crawler/data/Snapshots.java b/source/net/yacy/crawler/data/Snapshots.java
index 404176af1..fb9fe3f34 100644
--- a/source/net/yacy/crawler/data/Snapshots.java
+++ b/source/net/yacy/crawler/data/Snapshots.java
@@ -31,6 +31,8 @@
 import org.apache.solr.common.SolrDocument;
 
 import net.yacy.cora.date.GenericFormatter;
 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.id.DigestURL;
+import net.yacy.cora.document.id.MultiProtocolURL;
+import net.yacy.cora.util.Html2Image;
 import net.yacy.search.index.Fulltext;
 import net.yacy.search.schema.CollectionSchema;
@@ -68,13 +70,15 @@
      * @param proxy - a string of the form 'http://<host>:<port>'
      * @return
      */
-    public File downloadPDFSnapshot(final DigestURL url, final int depth, final Date date, String proxy) {
+    public File downloadPDFSnapshot(final DigestURL url, final int depth, final Date date, boolean replaceOld, String proxy) {
+        Collection<File> oldPaths = findPaths(url, depth);
+        if (replaceOld) {
+            for (File oldPath: oldPaths) oldPath.delete();
+        }
         File path = definePath(url, "pdf", depth, date);
         path.getParentFile().mkdirs();
-
-        // STUB
-
-        return path;
+        boolean success = Html2Image.writeWkhtmltopdf(url.toNormalform(true), proxy, path);
+        return success ? path : null;
     }
 
     /**
@@ -122,9 +126,9 @@
      * @param ext
      * @return a set of files for snapshots of the url
      */
-    public Collection<File> findPaths(final DigestURL url, final String ext) {
+    public Collection<File> findPaths(final DigestURL url) {
         for (int i = 0; i < 100; i++) {
-            Collection<File> paths = findPaths(url, ext, i);
+            Collection<File> paths = findPaths(url, i);
             if (paths.size() > 0) return paths;
         }
         return new ArrayList<>(0);
     }
@@ -138,20 +142,23 @@
      * @param depth
      * @return a set of files for snapshots of the url
      */
-    public Collection<File> findPaths(final DigestURL url, final String ext, final int depth) {
+    public Collection<File> findPaths(final DigestURL url, final int depth) {
         String id = ASCII.String(url.hash());
         File pathToShard = pathToShard(url, depth);
-        String[] list = pathToShard.list();
+        String[] list = pathToShard.exists() && pathToShard.isDirectory() ? pathToShard.list() : null; // may be null if path does not exist
         ArrayList<File> paths = new ArrayList<>();
-        for (String f: list) {
-            if (f.startsWith(id) && f.endsWith(ext)) paths.add(new File(pathToShard, f));
+        if (list != null) {
+            final String ext = MultiProtocolURL.getFileExtension(url.getFileName());
+            for (String f: list) {
+                if (f.startsWith(id) && f.endsWith(ext)) paths.add(new File(pathToShard, f));
+            }
         }
         return paths;
     }
 
     private File pathToShard(final DigestURL url, final int depth) {
         String id = ASCII.String(url.hash());
-        File pathToHostDir = new File(storageLocation, url.getHost() + ":" + url.getPort());
+        File pathToHostDir = new File(storageLocation, url.getHost() + "." + url.getPort());
         File pathToDepthDir = new File(pathToHostDir, depth < 10 ? "0" + depth : Integer.toString(depth));
         File pathToShard = new File(pathToDepthDir, id.substring(0, 2));
         return pathToShard;
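
pathToShard above fixes the on-disk layout the commit message refers to: <host>.<port>/<two-digit depth>/<first two characters of the url hash>. An illustrative reconstruction follows; urlHash stands in for ASCII.String(url.hash()), and all values in main are made up.

import java.io.File;

public class SnapshotPath {

    public static File shardDir(File storage, String host, int port, int depth, String urlHash) {
        File hostDir = new File(storage, host + "." + port);
        File depthDir = new File(hostDir, depth < 10 ? "0" + depth : Integer.toString(depth));
        return new File(depthDir, urlHash.substring(0, 2));
    }

    public static void main(String[] args) {
        File dir = shardDir(new File("DATA/HTCACHE/SNAPSHOTS"), "example.org", 80, 2, "AbCdEfGh");
        System.out.println(dir);  // on Unix: DATA/HTCACHE/SNAPSHOTS/example.org.80/02/Ab
    }
}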
"could not generate snapshot for " + entry.url().toNormalform(true) : "wrote " + snapshotFile + " for " + entry.url().toNormalform(true))); } return doc; } diff --git a/source/net/yacy/data/ymark/YMarkCrawlStart.java b/source/net/yacy/data/ymark/YMarkCrawlStart.java index 6b0899a6d..103af2add 100644 --- a/source/net/yacy/data/ymark/YMarkCrawlStart.java +++ b/source/net/yacy/data/ymark/YMarkCrawlStart.java @@ -186,7 +186,7 @@ public class YMarkCrawlStart extends HashMap{ crawlingQ, true, true, true, false, true, true, false, - -1, + -1, true, CacheStrategy.IFFRESH, "robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, ClientIdentification.yacyIntranetCrawlerAgentName); // TODO: make this a default profile in CrawlSwitchboard diff --git a/source/net/yacy/http/ProxyCacheHandler.java b/source/net/yacy/http/ProxyCacheHandler.java index 58f3efd03..392372937 100644 --- a/source/net/yacy/http/ProxyCacheHandler.java +++ b/source/net/yacy/http/ProxyCacheHandler.java @@ -70,7 +70,7 @@ public class ProxyCacheHandler extends AbstractRemoteHandler implements Handler final net.yacy.crawler.retrieval.Request yacyRequest = new net.yacy.crawler.retrieval.Request( null, url, - proxyHeaders.referer() == null ? null : new DigestURL(proxyHeaders.referer().toString()).hash(), + proxyHeaders.referer() == null ? null : new DigestURL(proxyHeaders.referer().toNormalform(true)).hash(), "", cachedResponseHeader.lastModified(), sb.crawler.defaultProxyProfile.handle(), diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 571b8221f..74bd8293d 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -346,7 +346,6 @@ public final class Switchboard extends serverSwitch { this.htDocsPath = getDataPath(SwitchboardConstants.HTDOCS_PATH, SwitchboardConstants.HTDOCS_PATH_DEFAULT); this.log.config("HTDOCS Path: " + this.htDocsPath.toString()); - this.snapshots = new Snapshots(new File(this.htDocsPath, "SNAPSHOTS")); this.workPath = getDataPath(SwitchboardConstants.WORK_PATH, SwitchboardConstants.WORK_PATH_DEFAULT); this.workPath.mkdirs(); // if default work files exist, copy them (don't overwrite existing!) @@ -695,7 +694,8 @@ public final class Switchboard extends serverSwitch { final long maxCacheSize = 1024L * 1024L * Long.parseLong(getConfig(SwitchboardConstants.PROXY_CACHE_SIZE, "2")); // this is megabyte Cache.init(this.htCachePath, this.peers.mySeed().hash, maxCacheSize); - + this.snapshots = new Snapshots(new File(this.htCachePath, "SNAPSHOTS")); + // create the surrogates directories this.surrogatesInPath = getDataPath(