diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list
index fe796e31c..2da0aa29e 100644
--- a/defaults/solr.keys.list
+++ b/defaults/solr.keys.list
@@ -56,13 +56,15 @@ process_s
## fail reason if a page was not loaded. if the page was loaded then this field is empty, text (mandatory field)
failreason_t
+## fail type if a page was not loaded. This field is either empty, 'excl' or 'fail'
+failtype_s
+
## html status return code (i.e. "200" for ok), -1 if not loaded (see content of failreason_t for this case), int (mandatory field)
httpstatus_i
## redirect url if the error code is 299 < httpstatus_i < 310
#httpstatus_redirect_s
-
### optional but highly recommended values, part of the index distribution process
## time when resource was loaded
diff --git a/source/net/yacy/cora/federate/solr/FailType.java b/source/net/yacy/cora/federate/solr/FailType.java
new file mode 100644
index 000000000..59a57e7f8
--- /dev/null
+++ b/source/net/yacy/cora/federate/solr/FailType.java
@@ -0,0 +1,28 @@
+/**
+ * FailType
+ * Copyright 2012 by Michael Peter Christen
+ * First released 23.11.2012 at http://yacy.net
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program in the file lgpl21.txt
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package net.yacy.cora.federate.solr;
+
+public enum FailType {
+ // coarse classification of why a page was not loaded; the constant's name() is the value written to the failtype_s index field
+ fail, // failed because of network failure (assigned to the TEMPORARY_NETWORK_FAILURE category)
+ excl; // failed because content had to be excluded (assigned to all FINAL_* categories: process/load context, robots rule, redirect rule)
+
+}
diff --git a/source/net/yacy/cora/federate/solr/YaCySchema.java b/source/net/yacy/cora/federate/solr/YaCySchema.java
index 32e90fec8..078e15355 100644
--- a/source/net/yacy/cora/federate/solr/YaCySchema.java
+++ b/source/net/yacy/cora/federate/solr/YaCySchema.java
@@ -27,7 +27,7 @@ import java.util.List;
import org.apache.solr.common.SolrInputDocument;
public enum YaCySchema implements Schema {
-
+
// mandatory
id(SolrType.string, true, true, false, "primary key of document, the URL hash **mandatory field**"),
sku(SolrType.text_en_splitting_tight, true, true, false, true, "url of document"),
@@ -44,6 +44,7 @@ public enum YaCySchema implements Schema {
size_i(SolrType.num_integer, true, true, false, "the size of the raw source"),// int size();
process_s(SolrType.string, true, true, false, "index creation comment"),
failreason_t(SolrType.text_general, true, true, false, "fail reason if a page was not loaded. if the page was loaded then this field is empty"),
+ failtype_s(SolrType.string, true, true, false, "fail type if a page was not loaded. This field is either empty, 'excl' or 'fail'"),
httpstatus_i(SolrType.num_integer, true, true, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
httpstatus_redirect_s(SolrType.num_integer, true, true, false, "html status return code (i.e. \"200\" for ok), -1 if not loaded"),
@@ -192,7 +193,7 @@ public enum YaCySchema implements Schema {
ext_tracker_val(SolrType.num_integer, true, true, true, "number of attribute counts in ext_tracker_txt"),
ext_title_txt(SolrType.text_general, true, true, true, "names matching title expressions"),
ext_title_val(SolrType.num_integer, true, true, true, "number of matching title expressions");
-
+
private String solrFieldName = null; // solr field name in custom solr schema, defaults to solcell schema field name (= same as this.name() )
private final SolrType type;
private final boolean indexed, stored;
diff --git a/source/net/yacy/crawler/data/ZURL.java b/source/net/yacy/crawler/data/ZURL.java
index 771caab8c..134c71c0d 100644
--- a/source/net/yacy/crawler/data/ZURL.java
+++ b/source/net/yacy/crawler/data/ZURL.java
@@ -38,6 +38,7 @@ import java.util.concurrent.LinkedBlockingQueue;
import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.document.UTF8;
+import net.yacy.cora.federate.solr.FailType;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.NaturalOrder;
@@ -63,16 +64,18 @@ public class ZURL implements Iterable {
public enum FailCategory {
// TEMPORARY categories are such failure cases that should be tried again
// FINAL categories are such failure cases that are final and should not be tried again
- TEMPORARY_NETWORK_FAILURE(true), // an entity could not been loaded
- FINAL_PROCESS_CONTEXT(false), // because of a processing context we do not want that url again (i.e. remote crawling)
- FINAL_LOAD_CONTEXT(false), // the crawler configuration does not want to load the entity
- FINAL_ROBOTS_RULE(true), // a remote server denies indexing or loading
- FINAL_REDIRECT_RULE(true); // the remote server redirects this page, thus disallowing reading of content
+ TEMPORARY_NETWORK_FAILURE(true, FailType.fail), // an entity could not be loaded
+ FINAL_PROCESS_CONTEXT(false, FailType.excl), // because of a processing context we do not want that url again (i.e. remote crawling)
+ FINAL_LOAD_CONTEXT(false, FailType.excl), // the crawler configuration does not want to load the entity
+ FINAL_ROBOTS_RULE(true, FailType.excl), // a remote server denies indexing or loading
+ FINAL_REDIRECT_RULE(true, FailType.excl); // the remote server redirects this page, thus disallowing reading of content
public final boolean store;
+ public final FailType failType; // coarse fail type of this category; handed to the solr error document together with the fail reason
- private FailCategory(boolean store) {
+ private FailCategory(boolean store, FailType failType) {
this.store = store;
+ this.failType = failType;
}
}
@@ -180,7 +183,7 @@ public class ZURL implements Iterable {
if (this.solrConnector != null && failCategory.store) {
// send the error to solr
try {
- SolrInputDocument errorDoc = this.solrConfiguration.err(bentry.url(), failCategory.name() + " " + reason, httpcode);
+ SolrInputDocument errorDoc = this.solrConfiguration.err(bentry.url(), failCategory.name() + " " + reason, failCategory.failType, httpcode);
this.solrConnector.add(errorDoc);
} catch (final IOException e) {
Log.logWarning("SOLR", "failed to send error " + bentry.url().toNormalform(true) + " to solr: " + e.getMessage());
diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java
index 7c0a9154e..7ce8d31dd 100644
--- a/source/net/yacy/search/index/SolrConfiguration.java
+++ b/source/net/yacy/search/index/SolrConfiguration.java
@@ -42,6 +42,7 @@ import java.util.Set;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
+import net.yacy.cora.federate.solr.FailType;
import net.yacy.cora.federate.solr.YaCySchema;
import net.yacy.cora.federate.yacy.ConfigurationSet;
import net.yacy.cora.protocol.Domains;
@@ -822,7 +823,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
* @param httpstatus
* @throws IOException
*/
- public SolrInputDocument err(final DigestURI digestURI, final String failReason, final int httpstatus) throws IOException {
+ public SolrInputDocument err(final DigestURI digestURI, final String failReason, final FailType failType, final int httpstatus) throws IOException {
final SolrInputDocument solrdoc = new SolrInputDocument();
add(solrdoc, YaCySchema.id, ASCII.String(digestURI.hash()));
add(solrdoc, YaCySchema.sku, digestURI.toNormalform(true));
@@ -836,6 +837,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
// fail reason and status
if (contains(YaCySchema.failreason_t)) add(solrdoc, YaCySchema.failreason_t, failReason);
+ if (contains(YaCySchema.failtype_s)) add(solrdoc, YaCySchema.failtype_s, failType.name());
if (contains(YaCySchema.httpstatus_i)) add(solrdoc, YaCySchema.httpstatus_i, httpstatus);
return solrdoc;
}