reactivated on-demand snapshot loading

pull/1/head
Michael Peter Christen 10 years ago
parent 2362ad7c34
commit 932faafffe

@ -26,6 +26,7 @@ import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.util.Collection; import java.util.Collection;
import java.util.Date;
import java.util.Map; import java.util.Map;
import java.util.TreeMap; import java.util.TreeMap;
@ -248,16 +249,21 @@ public class snapshot {
} }
if (pdf || pngjpg) { if (pdf || pngjpg) {
Collection<File> pdfSnapshots = Transactions.findPaths(durl, "pdf", Transactions.State.ANY); Collection<File> pdfSnapshots = Transactions.findPaths(durl, "pdf", Transactions.State.INVENTORY);
File pdfFile = null; File pdfFile = null;
if (pdfSnapshots.size() == 0) { if (pdfSnapshots.size() == 0) {
// if the client is authenticated, we create the pdf on the fly! // if the client is authenticated, we create the pdf on the fly!
if (!authenticated) return null; if (!authenticated) return null;
SolrDocument sd = sb.index.fulltext().getMetadata(durl.hash()); SolrDocument sd = sb.index.fulltext().getMetadata(durl.hash());
boolean success = false;
if (sd == null) {
success = Transactions.store(durl, new Date(), 99, false, true, sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, ClientIdentification.yacyProxyAgent, sb.getConfig("crawler.http.acceptLanguage", null));
} else {
SolrInputDocument sid = sb.index.fulltext().getDefaultConfiguration().toSolrInputDocument(sd); SolrInputDocument sid = sb.index.fulltext().getDefaultConfiguration().toSolrInputDocument(sd);
boolean success = Transactions.store(sid, true, true, sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, ClientIdentification.yacyProxyAgent, sb.getConfig("crawler.http.acceptLanguage", null)); success = Transactions.store(sid, false, true, true, sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, ClientIdentification.yacyProxyAgent, sb.getConfig("crawler.http.acceptLanguage", null));
}
if (success) { if (success) {
pdfSnapshots = Transactions.findPaths(durl, "pdf", Transactions.State.INVENTORY); pdfSnapshots = Transactions.findPaths(durl, "pdf", Transactions.State.ANY);
if (pdfSnapshots.size() != 0) pdfFile = pdfSnapshots.iterator().next(); if (pdfSnapshots.size() != 0) pdfFile = pdfSnapshots.iterator().next();
} }
} else { } else {

@ -146,7 +146,7 @@ public class Transactions {
} }
} }
public static boolean store(final SolrInputDocument doc, final boolean loadImage, final boolean replaceOld, final String proxy, final ClientIdentification.Agent agent, final String acceptLanguage) { public static boolean store(final SolrInputDocument doc, final boolean concurrency, final boolean loadImage, final boolean replaceOld, final String proxy, final ClientIdentification.Agent agent, final String acceptLanguage) {
// GET METADATA FROM DOC // GET METADATA FROM DOC
final String urls = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()); final String urls = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
@ -160,17 +160,11 @@ public class Transactions {
return false; return false;
} }
// CLEAN UP OLD DATA (if wanted) boolean success = loadImage ? store(url, date, depth, concurrency, replaceOld, proxy, agent, acceptLanguage) : true;
Collection<File> oldPaths = Transactions.findPaths(url, depth, null, Transactions.State.INVENTORY); if (success) {
if (replaceOld) {
for (File oldPath: oldPaths) oldPath.delete();
}
// STORE METADATA FOR THE IMAGE // STORE METADATA FOR THE IMAGE
File metadataPath = Transactions.definePath(url, depth, date, "xml", Transactions.State.INVENTORY); File metadataPath = Transactions.definePath(url, depth, date, "xml", Transactions.State.INVENTORY);
metadataPath.getParentFile().mkdirs(); metadataPath.getParentFile().mkdirs();
boolean success = true;
try { try {
if (doc != null) { if (doc != null) {
FileOutputStream fos = new FileOutputStream(metadataPath); FileOutputStream fos = new FileOutputStream(metadataPath);
@ -189,11 +183,29 @@ public class Transactions {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
success = false; success = false;
} }
}
return success;
}
public static boolean store(final DigestURL url, final Date date, final int depth, final boolean concurrency, final boolean replaceOld, final String proxy, final ClientIdentification.Agent agent, final String acceptLanguage) {
// CLEAN UP OLD DATA (if wanted)
Collection<File> oldPaths = Transactions.findPaths(url, depth, null, Transactions.State.INVENTORY);
if (replaceOld && oldPaths != null) {
for (File oldPath: oldPaths) oldPath.delete();
}
// STORE METADATA FOR THE IMAGE
File metadataPath = Transactions.definePath(url, depth, date, "xml", Transactions.State.INVENTORY);
metadataPath.getParentFile().mkdirs();
boolean success = true;
// STORE AN IMAGE // STORE AN IMAGE
if (success && loadImage) { final String urls = url.toNormalform(true);
final File pdfPath = Transactions.definePath(url, depth, date, "pdf", Transactions.State.INVENTORY); final File pdfPath = Transactions.definePath(url, depth, date, "pdf", Transactions.State.INVENTORY);
if (executorRunning.intValue() < Runtime.getRuntime().availableProcessors()) { if (concurrency && executorRunning.intValue() < Runtime.getRuntime().availableProcessors()) {
Thread t = new Thread(){ Thread t = new Thread(){
@Override @Override
public void run() { public void run() {
@ -209,7 +221,6 @@ public class Transactions {
} else { } else {
success = Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, acceptLanguage, pdfPath); success = Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, acceptLanguage, pdfPath);
} }
}
return success; return success;
} }

@ -580,7 +580,7 @@ public class Segment {
String ext = MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase(); String ext = MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase();
if (ext.length() == 0 || url.getFile().length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext)) { if (ext.length() == 0 || url.getFile().length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext)) {
// STORE IMAGE AND METADATA // STORE IMAGE AND METADATA
Transactions.store(vector, crawlProfile.snapshotLoadImage(), crawlProfile.snapshotReplaceold(), proxy, crawlProfile.getAgent(), acceptLanguage); Transactions.store(vector, true, crawlProfile.snapshotLoadImage(), crawlProfile.snapshotReplaceold(), proxy, crawlProfile.getAgent(), acceptLanguage);
} }
} }

Loading…
Cancel
Save