diff --git a/bin/clearcache.sh b/bin/clearcache.sh
new file mode 100755
index 000000000..209be29c5
--- /dev/null
+++ b/bin/clearcache.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+cd "`dirname $0`"
+./apicall.sh "/IndexControlRWIs_p.html?deleteIndex=off&deleteSolr=off&deleteCache=on&deleteCrawlQueues=off&deleteRobots=on&deleteSearchFl=on&deletecomplete=" > /dev/null
diff --git a/bin/clearindex.sh b/bin/clearindex.sh
new file mode 100755
index 000000000..535371981
--- /dev/null
+++ b/bin/clearindex.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+cd "`dirname $0`"
+./apicall.sh "/IndexControlRWIs_p.html?deleteIndex=on&deleteSolr=on&deleteCache=off&deleteCrawlQueues=on&deleteRobots=on&deleteSearchFl=on&deletecomplete=" > /dev/null
diff --git a/bin/importurllist.sh b/bin/importurllist.sh
new file mode 100755
index 000000000..ce943f7e4
--- /dev/null
+++ b/bin/importurllist.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+cd "`dirname $0`"
+./apicall.sh "/Crawler_p.html?bookmarkFolder=/crawlStart&crawlingDomMaxPages=10000&intention=&range=domain&indexMedia=on&recrawl=nodoubles&storeHTCache=on&sitemapURL=&repeat_time=7&crawlingQ=on&crawlingIfOlderUnit=day&cachePolicy=ifexist&indexText=on&crawlingMode=file&crawlingURL=http://&bookmarkTitle=&mustnotmatch=&crawlingstart=import&mustmatch=.*&crawlingIfOlderNumber=7&repeat_unit=seldays&crawlingDepth=0&crawlingFile=$1" > /dev/null
diff --git a/build.xml b/build.xml
index 46c690039..14955a82b 100644
--- a/build.xml
+++ b/build.xml
@@ -405,13 +405,20 @@
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
diff --git a/htroot/WebStructurePicture_p.java b/htroot/WebStructurePicture_p.java
index 89fe5dc4f..310a203d6 100644
--- a/htroot/WebStructurePicture_p.java
+++ b/htroot/WebStructurePicture_p.java
@@ -112,7 +112,7 @@ public class WebStructurePicture_p {
// recursively find domains, up to a specific depth
final GraphPlotter graph = new GraphPlotter();
- if (host != null) place(graph, sb.webStructure, hash, host, nodes, timeout, 0.0, 0.0, 0, depth);
+ if (host != null && hash != null) place(graph, sb.webStructure, hash, host, nodes, timeout, 0.0, 0.0, 0, depth);
//graph.print();
graphPicture = graph.draw(width, height, 40, 40, 16, 16, color_back, color_dot, color_line, color_lineend, color_text);
diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java
index aff6fc76b..3b52f113a 100644
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@@ -98,6 +98,7 @@ import net.yacy.document.TextParser;
import net.yacy.document.content.DCEntry;
import net.yacy.document.content.SurrogateReader;
import net.yacy.document.importer.OAIListFriendsLoader;
+import net.yacy.document.parser.html.Evaluation;
import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
@@ -485,7 +486,7 @@ public final class Switchboard extends serverSwitch {
//starting blog
initBlog();
- // Init User DB
+ // init User DB
this.log.logConfig("Loading User DB");
final File userDbFile = new File(getDataPath(), "DATA/SETTINGS/user.heap");
this.userDB = new UserDB(userDbFile);
@@ -493,7 +494,19 @@ public final class Switchboard extends serverSwitch {
", " + this.userDB.size() + " entries" +
", " + ppRamString(userDbFile.length()/1024));
- // Init bookmarks DB: needs more time since this does a DNS lookup for each Bookmark.
+ // init html parser evaluation scheme
+ File parserPropertiesPath = new File("defaults/");
+ String[] settingsList = parserPropertiesPath.list();
+ for (String l: settingsList) {
+ if (l.startsWith("parser.") && l.endsWith(".properties")) Evaluation.add(new File(parserPropertiesPath, l));
+ }
+ parserPropertiesPath = new File(getDataPath(), "DATA/SETTINGS/");
+ settingsList = parserPropertiesPath.list();
+ for (String l: settingsList) {
+ if (l.startsWith("parser.") && l.endsWith(".properties")) Evaluation.add(new File(parserPropertiesPath, l));
+ }
+
+ // init bookmarks DB: needs more time since this does a DNS lookup for each Bookmark.
// Can be started concurrently
new Thread(){
@Override
diff --git a/source/net/yacy/cora/services/federated/solr/SolrScheme.java b/source/net/yacy/cora/services/federated/solr/SolrScheme.java
index 32a891878..b080849db 100644
--- a/source/net/yacy/cora/services/federated/solr/SolrScheme.java
+++ b/source/net/yacy/cora/services/federated/solr/SolrScheme.java
@@ -61,29 +61,6 @@ public enum SolrScheme {
InetAddress address = Domains.dnsResolve(digestURI.getHost());
if (address != null) solrdoc.addField("attr_ip", address.getHostAddress());
if (digestURI.getHost() != null) solrdoc.addField("attr_host", digestURI.getHost());
- /*
- private final String charset; // the charset of the document
- private final List keywords; // most resources provide a keyword field
- private StringBuilder title; // a document title, taken from title or h1 tag; shall appear as headline of search result
- private final StringBuilder creator; // author or copyright
- private final String publisher; // publisher
- private final List sections; // if present: more titles/headlines appearing in the document
- private final StringBuilder description; // an abstract, if present: short content description
- private Object text; // the clear text, all that is visible
- private final Map anchors; // all links embedded as clickeable entities (anchor tags)
- private final Map rss; // all embedded rss feeds
- private final Map images; // all visible pictures in document
- // the anchors and images - Maps are URL-to-EntityDescription mappings.
- // The EntityDescription appear either as visible text in anchors or as alternative
- // text in image tags.
- private Map hyperlinks, audiolinks, videolinks, applinks;
- private Map emaillinks;
- private MultiProtocolURI favicon;
- private boolean resorted;
- private int inboundLinks, outboundLinks; // counters for inbound and outbound links, are counted after calling notifyWebStructure
- private Set languages;
- private boolean indexingDenied;
- */
solrdoc.addField("title", yacydoc.dc_title());
solrdoc.addField("author", yacydoc.dc_creator());
solrdoc.addField("description", yacydoc.dc_description());
@@ -166,9 +143,17 @@ public enum SolrScheme {
// bold, italic
String[] bold = html.getBold();
- if (bold.length > 0) solrdoc.addField("attr_bold", bold);
+ solrdoc.addField("boldcount_i", bold.length);
+ if (bold.length > 0) {
+ solrdoc.addField("attr_bold", bold);
+ solrdoc.addField("attr_boldcount", html.getBoldCount(bold));
+ }
String[] italic = html.getItalic();
- if (bold.length > 0) solrdoc.addField("attr_italic", italic);
+ solrdoc.addField("italiccount_i", italic.length);
+ if (italic.length > 0) {
+ solrdoc.addField("attr_italic", italic);
+ solrdoc.addField("attr_italiccount", html.getItalicCount(italic));
+ }
String[] li = html.getLi();
solrdoc.addField("licount_i", li.length);
if (li.length > 0) solrdoc.addField("attr_li", li);
@@ -225,6 +210,15 @@ public enum SolrScheme {
// flash embedded
solrdoc.addField("flash_b", html.containsFlash());
+
+ // generic evaluation pattern
+ for (String model: html.getEvaluationModelNames()) {
+ String[] scorenames = html.getEvaluationModelScoreNames(model);
+ if (scorenames.length > 0) {
+ solrdoc.addField("attr_" + model, scorenames);
+ solrdoc.addField("attr_" + model + "count", html.getEvaluationModelScoreCounts(model, scorenames));
+ }
+ }
}
return solrdoc;
}
diff --git a/source/net/yacy/cora/storage/AbstractScoreMap.java b/source/net/yacy/cora/storage/AbstractScoreMap.java
new file mode 100644
index 000000000..59339b1ce
--- /dev/null
+++ b/source/net/yacy/cora/storage/AbstractScoreMap.java
@@ -0,0 +1,39 @@
+/**
+ * AbstractScoreMap
+ * Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
+ * First released 28.04.2011 at http://yacy.net
+ *
+ * $LastChangedDate: 2011-04-14 00:04:23 +0200 (Do, 14 Apr 2011) $
+ * $LastChangedRevision: 7653 $
+ * $LastChangedBy: orbiter $
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program in the file lgpl21.txt
+ * If not, see .
+ */
+
+package net.yacy.cora.storage;
+
+public abstract class AbstractScoreMap implements ScoreMap {
+
+ /**
+ * apply all E/int mappings from an external ScoreMap to this ScoreMap
+ */
+ public void inc(ScoreMap map) {
+ if (map == null) return;
+ for (E entry: map) {
+ this.inc(entry, map.get(entry));
+ }
+ }
+
+}
diff --git a/source/net/yacy/cora/storage/ClusteredScoreMap.java b/source/net/yacy/cora/storage/ClusteredScoreMap.java
index b066013db..148be9b1c 100644
--- a/source/net/yacy/cora/storage/ClusteredScoreMap.java
+++ b/source/net/yacy/cora/storage/ClusteredScoreMap.java
@@ -35,7 +35,7 @@ import java.util.TreeMap;
import net.yacy.cora.document.UTF8;
-public final class ClusteredScoreMap implements ReversibleScoreMap {
+public final class ClusteredScoreMap extends AbstractScoreMap implements ReversibleScoreMap {
protected final Map map; // a mapping from a reference to the cluster key
protected final TreeMap pam; // a mapping from the cluster key to the reference
@@ -48,6 +48,10 @@ public final class ClusteredScoreMap implements ReversibleScoreMap {
gcount = 0;
encnt = 0;
}
+
+ public Iterator iterator() {
+ return map.keySet().iterator();
+ }
public synchronized void clear() {
map.clear();
diff --git a/source/net/yacy/cora/storage/ConcurrentScoreMap.java b/source/net/yacy/cora/storage/ConcurrentScoreMap.java
index 3cdcebb44..973817f80 100644
--- a/source/net/yacy/cora/storage/ConcurrentScoreMap.java
+++ b/source/net/yacy/cora/storage/ConcurrentScoreMap.java
@@ -35,7 +35,7 @@ import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
-public class ConcurrentScoreMap implements ScoreMap {
+public class ConcurrentScoreMap extends AbstractScoreMap implements ScoreMap {
protected final ConcurrentHashMap map; // a mapping from a reference to the cluster key
private long gcount;
@@ -44,6 +44,10 @@ public class ConcurrentScoreMap implements ScoreMap {
map = new ConcurrentHashMap();
gcount = 0;
}
+
+ public Iterator iterator() {
+ return map.keySet().iterator();
+ }
public synchronized void clear() {
map.clear();
diff --git a/source/net/yacy/cora/storage/OrderedScoreMap.java b/source/net/yacy/cora/storage/OrderedScoreMap.java
index 50d7d4ed1..5ecf3e5e9 100644
--- a/source/net/yacy/cora/storage/OrderedScoreMap.java
+++ b/source/net/yacy/cora/storage/OrderedScoreMap.java
@@ -38,7 +38,7 @@ import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicInteger;
-public class OrderedScoreMap implements ScoreMap {
+public class OrderedScoreMap extends AbstractScoreMap implements ScoreMap {
protected final Map map; // a mapping from a reference to the cluster key
@@ -49,6 +49,10 @@ public class OrderedScoreMap implements ScoreMap {
map = new TreeMap(comparator);
}
}
+
+ public Iterator iterator() {
+ return map.keySet().iterator();
+ }
public synchronized void clear() {
map.clear();
diff --git a/source/net/yacy/cora/storage/ScoreMap.java b/source/net/yacy/cora/storage/ScoreMap.java
index c94c4e5c3..7c76280af 100644
--- a/source/net/yacy/cora/storage/ScoreMap.java
+++ b/source/net/yacy/cora/storage/ScoreMap.java
@@ -26,7 +26,7 @@ package net.yacy.cora.storage;
import java.util.Iterator;
-public interface ScoreMap {
+public interface ScoreMap extends Iterable {
public void clear();
@@ -65,4 +65,5 @@ public interface ScoreMap {
public void dec(final E obj);
public void dec(final E obj, final int incrementScore);
+ public void inc(ScoreMap map);
}
diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java
index 6ce544a46..3f7562e44 100644
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@@ -402,7 +402,10 @@ dc_rights
for (Map.Entry entry: anchors.entrySet()) {
url = entry.getKey();
if (url == null) continue;
- if (url.getHost() != null && thishost != null && url.getHost().equals(thishost)) this.inboundlinks.put(url, "anchor"); else this.outboundlinks.put(url, "anchor");
+ if (url.getHost() != null && thishost != null &&
+ url.getHost().endsWith(thishost) ||
+ (thishost.startsWith("www.") && url.getHost().endsWith(thishost.substring(4)))
+ ) this.inboundlinks.put(url, "anchor"); else this.outboundlinks.put(url, "anchor");
u = url.toNormalform(true, false);
String name = entry.getValue().getProperty("name", "");
if (u.startsWith("mailto:")) {
diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java
index 52a6502b3..83b75d9ad 100644
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -44,8 +44,10 @@ import java.util.regex.Pattern;
import javax.swing.event.EventListenerList;
import net.yacy.cora.document.MultiProtocolURI;
+import net.yacy.cora.storage.ClusteredScoreMap;
import net.yacy.document.SentenceReader;
import net.yacy.document.parser.htmlParser;
+import net.yacy.document.parser.html.Evaluation.Element;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
@@ -60,33 +62,51 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// statics: for initialization of the HTMLFilterAbstractScraper
private static final Set linkTags0 = new HashSet(9,0.99f);
private static final Set linkTags1 = new HashSet(7,0.99f);
-
+
+ public enum TagType {
+ singleton, pair;
+ }
+
+ public enum Tag {
+ html(TagType.singleton), // scraped as singleton to get attached properties like 'lang'
+ body(TagType.singleton), // scraped as singleton to get attached properties like 'class'
+ div(TagType.singleton), // scraped as singleton to get attached properties like 'id'
+ img(TagType.singleton),
+ base(TagType.singleton),
+ frame(TagType.singleton),
+ meta(TagType.singleton),
+ area(TagType.singleton),
+ link(TagType.singleton),
+ embed(TagType.singleton), //added by [MN]
+ param(TagType.singleton), //added by [MN]
+
+ a(TagType.pair),
+ h1(TagType.pair),
+ h2(TagType.pair),
+ h3(TagType.pair),
+ h4(TagType.pair),
+ h5(TagType.pair),
+ h6(TagType.pair),
+ title(TagType.pair),
+ b(TagType.pair),
+ strong(TagType.pair),
+ i(TagType.pair),
+ li(TagType.pair),
+ iframe(TagType.pair),
+ script(TagType.pair);
+
+ public TagType type;
+ private Tag(TagType type) {
+ this.type = type;
+ }
+ }
+
// all these tags must be given in lowercase, because the tags from the files are compared in lowercase
static {
- linkTags0.add("html"); // scraped as tag 0 to get attached properties like 'lang'
- linkTags0.add("img");
- linkTags0.add("base");
- linkTags0.add("frame");
- linkTags0.add("meta");
- linkTags0.add("area");
- linkTags0.add("link");
- linkTags0.add("script");
- linkTags0.add("embed"); //added by [MN]
- linkTags0.add("param"); //added by [MN]
-
- linkTags1.add("a");
- linkTags1.add("h1");
- linkTags1.add("h2");
- linkTags1.add("h3");
- linkTags1.add("h4");
- linkTags1.add("h5");
- linkTags1.add("h6");
- linkTags1.add("title");
- linkTags1.add("b");
- linkTags1.add("strong");
- linkTags1.add("i");
- linkTags1.add("li");
- linkTags1.add("iframe");
+ for (Tag tag: Tag.values()) {
+ if (tag.type == TagType.singleton) linkTags0.add(tag.name());
+ if (tag.type == TagType.pair) linkTags1.add(tag.name());
+ }
//