From 275d65fffe0338145bec8c272f50e3cc6c5e7925 Mon Sep 17 00:00:00 2001 From: reger Date: Sat, 5 Aug 2017 22:30:06 +0200 Subject: [PATCH] Patch last_modified date with internal FirstSeenTime() if no date is provided, to make sure updated documents are indexed with their last-modified date as provided in the current crawl. (Always patching moddate with firstseen might bear the risk of missing actual updates.) --- .../search/schema/CollectionConfiguration.java | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 070565cb3..c2cad79f1 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -534,11 +534,18 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri add(doc, CollectionSchema.author, author); } if (allAttr || contains(CollectionSchema.last_modified)) { - Date lastModified = responseHeader == null ? new Date() : responseHeader.lastModified(); - if (lastModified == null) lastModified = new Date(); - if (document.getLastModified().before(lastModified)) lastModified = document.getLastModified(); - long firstSeen = segment.getFirstSeenTime(digestURL.hash()); - if (firstSeen > 0 && firstSeen < lastModified.getTime()) lastModified = new Date(firstSeen); // patch the date if we have seen the document earlier + Date lastModified = responseHeader == null ?
document.getLastModified() : responseHeader.lastModified(); + if (lastModified == null) { + long firstSeen = segment.getFirstSeenTime(digestURL.hash()); + if (firstSeen > 0) { + lastModified = new Date(firstSeen); // patch the date if we have seen the document earlier + } else { + lastModified = new Date(); + } + } + if (document.getLastModified().before(lastModified)) { + lastModified = document.getLastModified(); + } add(doc, CollectionSchema.last_modified, lastModified); } if (allAttr || contains(CollectionSchema.dates_in_content_dts) || contains(CollectionSchema.dates_in_content_count_i)) {