reactivated on-demand snapshot loading

pull/1/head
Michael Peter Christen 10 years ago
parent 2362ad7c34
commit 932faafffe

@ -26,6 +26,7 @@ import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.util.Collection; import java.util.Collection;
import java.util.Date;
import java.util.Map; import java.util.Map;
import java.util.TreeMap; import java.util.TreeMap;
@ -248,16 +249,21 @@ public class snapshot {
} }
if (pdf || pngjpg) { if (pdf || pngjpg) {
Collection<File> pdfSnapshots = Transactions.findPaths(durl, "pdf", Transactions.State.ANY); Collection<File> pdfSnapshots = Transactions.findPaths(durl, "pdf", Transactions.State.INVENTORY);
File pdfFile = null; File pdfFile = null;
if (pdfSnapshots.size() == 0) { if (pdfSnapshots.size() == 0) {
// if the client is authenticated, we create the pdf on the fly! // if the client is authenticated, we create the pdf on the fly!
if (!authenticated) return null; if (!authenticated) return null;
SolrDocument sd = sb.index.fulltext().getMetadata(durl.hash()); SolrDocument sd = sb.index.fulltext().getMetadata(durl.hash());
SolrInputDocument sid = sb.index.fulltext().getDefaultConfiguration().toSolrInputDocument(sd); boolean success = false;
boolean success = Transactions.store(sid, true, true, sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, ClientIdentification.yacyProxyAgent, sb.getConfig("crawler.http.acceptLanguage", null)); if (sd == null) {
success = Transactions.store(durl, new Date(), 99, false, true, sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, ClientIdentification.yacyProxyAgent, sb.getConfig("crawler.http.acceptLanguage", null));
} else {
SolrInputDocument sid = sb.index.fulltext().getDefaultConfiguration().toSolrInputDocument(sd);
success = Transactions.store(sid, false, true, true, sb.getConfigBool("isTransparentProxy", false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, ClientIdentification.yacyProxyAgent, sb.getConfig("crawler.http.acceptLanguage", null));
}
if (success) { if (success) {
pdfSnapshots = Transactions.findPaths(durl, "pdf", Transactions.State.INVENTORY); pdfSnapshots = Transactions.findPaths(durl, "pdf", Transactions.State.ANY);
if (pdfSnapshots.size() != 0) pdfFile = pdfSnapshots.iterator().next(); if (pdfSnapshots.size() != 0) pdfFile = pdfSnapshots.iterator().next();
} }
} else { } else {

@ -146,7 +146,7 @@ public class Transactions {
} }
} }
public static boolean store(final SolrInputDocument doc, final boolean loadImage, final boolean replaceOld, final String proxy, final ClientIdentification.Agent agent, final String acceptLanguage) { public static boolean store(final SolrInputDocument doc, final boolean concurrency, final boolean loadImage, final boolean replaceOld, final String proxy, final ClientIdentification.Agent agent, final String acceptLanguage) {
// GET METADATA FROM DOC // GET METADATA FROM DOC
final String urls = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()); final String urls = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
@ -160,55 +160,66 @@ public class Transactions {
return false; return false;
} }
boolean success = loadImage ? store(url, date, depth, concurrency, replaceOld, proxy, agent, acceptLanguage) : true;
if (success) {
// STORE METADATA FOR THE IMAGE
File metadataPath = Transactions.definePath(url, depth, date, "xml", Transactions.State.INVENTORY);
metadataPath.getParentFile().mkdirs();
try {
if (doc != null) {
FileOutputStream fos = new FileOutputStream(metadataPath);
OutputStreamWriter osw = new OutputStreamWriter(fos);
osw.write(XML_PREFIX);
osw.write(WHITESPACE); osw.write("\n-->\n"); // placeholder for transaction information properties (a hack to attach metadata to metadata)
osw.write("<result name=\"response\" numFound=\"1\" start=\"0\">\n");
EnhancedXMLResponseWriter.writeDoc(osw, doc);
osw.write("</result>\n");
osw.write("</response>\n");
osw.close();
fos.close();
Transactions.announceStorage(url, depth, date, State.INVENTORY);
}
} catch (IOException e) {
ConcurrentLog.logException(e);
success = false;
}
}
return success;
}
public static boolean store(final DigestURL url, final Date date, final int depth, final boolean concurrency, final boolean replaceOld, final String proxy, final ClientIdentification.Agent agent, final String acceptLanguage) {
// CLEAN UP OLD DATA (if wanted) // CLEAN UP OLD DATA (if wanted)
Collection<File> oldPaths = Transactions.findPaths(url, depth, null, Transactions.State.INVENTORY); Collection<File> oldPaths = Transactions.findPaths(url, depth, null, Transactions.State.INVENTORY);
if (replaceOld) { if (replaceOld && oldPaths != null) {
for (File oldPath: oldPaths) oldPath.delete(); for (File oldPath: oldPaths) oldPath.delete();
} }
// STORE METADATA FOR THE IMAGE // STORE METADATA FOR THE IMAGE
File metadataPath = Transactions.definePath(url, depth, date, "xml", Transactions.State.INVENTORY); File metadataPath = Transactions.definePath(url, depth, date, "xml", Transactions.State.INVENTORY);
metadataPath.getParentFile().mkdirs(); metadataPath.getParentFile().mkdirs();
boolean success = true; boolean success = true;
try {
if (doc != null) {
FileOutputStream fos = new FileOutputStream(metadataPath);
OutputStreamWriter osw = new OutputStreamWriter(fos);
osw.write(XML_PREFIX);
osw.write(WHITESPACE); osw.write("\n-->\n"); // placeholder for transaction information properties (a hack to attach metadata to metadata)
osw.write("<result name=\"response\" numFound=\"1\" start=\"0\">\n");
EnhancedXMLResponseWriter.writeDoc(osw, doc);
osw.write("</result>\n");
osw.write("</response>\n");
osw.close();
fos.close();
Transactions.announceStorage(url, depth, date, State.INVENTORY);
}
} catch (IOException e) {
ConcurrentLog.logException(e);
success = false;
}
// STORE AN IMAGE // STORE AN IMAGE
if (success && loadImage) { final String urls = url.toNormalform(true);
final File pdfPath = Transactions.definePath(url, depth, date, "pdf", Transactions.State.INVENTORY); final File pdfPath = Transactions.definePath(url, depth, date, "pdf", Transactions.State.INVENTORY);
if (executorRunning.intValue() < Runtime.getRuntime().availableProcessors()) { if (concurrency && executorRunning.intValue() < Runtime.getRuntime().availableProcessors()) {
Thread t = new Thread(){ Thread t = new Thread(){
@Override @Override
public void run() { public void run() {
executorRunning.incrementAndGet(); executorRunning.incrementAndGet();
try { try {
Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, acceptLanguage, pdfPath); Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, acceptLanguage, pdfPath);
} catch (Throwable e) {} finally { } catch (Throwable e) {} finally {
executorRunning.decrementAndGet(); executorRunning.decrementAndGet();
}
} }
}; }
executor.execute(t); };
} else { executor.execute(t);
success = Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, acceptLanguage, pdfPath); } else {
} success = Html2Image.writeWkhtmltopdf(urls, proxy, agent.userAgent, acceptLanguage, pdfPath);
} }
return success; return success;

@ -580,7 +580,7 @@ public class Segment {
String ext = MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase(); String ext = MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase();
if (ext.length() == 0 || url.getFile().length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext)) { if (ext.length() == 0 || url.getFile().length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext)) {
// STORE IMAGE AND METADATA // STORE IMAGE AND METADATA
Transactions.store(vector, crawlProfile.snapshotLoadImage(), crawlProfile.snapshotReplaceold(), proxy, crawlProfile.getAgent(), acceptLanguage); Transactions.store(vector, true, crawlProfile.snapshotLoadImage(), crawlProfile.snapshotReplaceold(), proxy, crawlProfile.getAgent(), acceptLanguage);
} }
} }

Loading…
Cancel
Save