diff --git a/source/de/anomic/crawler/RobotsEntry.java b/source/de/anomic/crawler/RobotsEntry.java index cfec8c15b..87848bc0d 100644 --- a/source/de/anomic/crawler/RobotsEntry.java +++ b/source/de/anomic/crawler/RobotsEntry.java @@ -40,7 +40,6 @@ import net.yacy.cora.document.MultiProtocolURI; public class RobotsEntry { - public static final String ROBOTS_DB_PATH_SEPARATOR = ";"; public static final String HOST_NAME = "hostname"; public static final String ALLOW_PATH_LIST = "allow"; public static final String DISALLOW_PATH_LIST = "disallow"; @@ -64,7 +63,7 @@ public class RobotsEntry { this.denyPathList = new LinkedList(); final String csPl = new String(this.mem.get(DISALLOW_PATH_LIST)); if (csPl.length() > 0){ - final String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR); + final String[] pathArray = csPl.split(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR); if ((pathArray != null)&&(pathArray.length > 0)) { this.denyPathList.addAll(Arrays.asList(pathArray)); } @@ -76,7 +75,7 @@ public class RobotsEntry { this.allowPathList = new LinkedList(); final String csPl = new String(this.mem.get(ALLOW_PATH_LIST)); if (csPl.length() > 0){ - final String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR); + final String[] pathArray = csPl.split(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR); if ((pathArray != null)&&(pathArray.length > 0)) { this.allowPathList.addAll(Arrays.asList(pathArray)); } @@ -116,7 +115,7 @@ public class RobotsEntry { final StringBuilder pathListStr = new StringBuilder(allowPathList.size() * 30); for (String element : allowPathList) { pathListStr.append(element) - .append(ROBOTS_DB_PATH_SEPARATOR); + .append(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR); } this.mem.put(ALLOW_PATH_LIST, pathListStr.substring(0,pathListStr.length()-1).getBytes()); } @@ -127,7 +126,7 @@ public class RobotsEntry { final StringBuilder pathListStr = new StringBuilder(disallowPathList.size() * 30); for (String element : disallowPathList) { pathListStr.append(element) - .append(ROBOTS_DB_PATH_SEPARATOR); + .append(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR); } this.mem.put(DISALLOW_PATH_LIST,pathListStr.substring(0, pathListStr.length()-1).getBytes()); } @@ -197,7 +196,7 @@ public class RobotsEntry { // if the path is null or empty we set it to / if ((path == null) || (path.length() == 0)) path = "/"; // escaping all occurences of ; because this char is used as special char in the Robots DB - else path = path.replaceAll(ROBOTS_DB_PATH_SEPARATOR,"%3B"); + else path = RobotsTxt.ROBOTS_DB_PATH_SEPARATOR_MATCHER.matcher(path).replaceAll("%3B"); for (String element : this.denyPathList) { diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java index 3646c7e63..5a835278b 100644 --- a/source/de/anomic/crawler/RobotsTxt.java +++ b/source/de/anomic/crawler/RobotsTxt.java @@ -32,6 +32,7 @@ import java.util.ArrayList; import java.util.Date; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; +import java.util.regex.Pattern; import org.apache.log4j.Logger; @@ -50,6 +51,7 @@ public class RobotsTxt { private static Logger log = Logger.getLogger(RobotsTxt.class); public static final String ROBOTS_DB_PATH_SEPARATOR = ";"; + public static final Pattern ROBOTS_DB_PATH_SEPARATOR_MATCHER = Pattern.compile(ROBOTS_DB_PATH_SEPARATOR); BEncodedHeap robotsTable; private final ConcurrentHashMap syncObjects; diff --git a/source/de/anomic/crawler/robotsParser.java b/source/de/anomic/crawler/robotsParser.java index 61db83193..6689f75f7 100644 --- a/source/de/anomic/crawler/robotsParser.java +++ b/source/de/anomic/crawler/robotsParser.java @@ -33,6 +33,7 @@ import java.io.IOException; import java.io.InputStreamReader; import java.net.URLDecoder; import java.util.ArrayList; +import java.util.regex.Pattern; /* * A class for Parsing robots.txt files. @@ -55,6 +56,8 @@ import java.util.ArrayList; public final class robotsParser { + private static final Pattern patternTab = Pattern.compile("\t"); + public static final String ROBOTS_USER_AGENT = "User-agent:".toUpperCase(); public static final String ROBOTS_DISALLOW = "Disallow:".toUpperCase(); public static final String ROBOTS_ALLOW = "Allow:".toUpperCase(); @@ -109,7 +112,7 @@ public final class robotsParser { try { lineparser: while ((line = reader.readLine()) != null) { // replacing all tabs with spaces - line = line.replaceAll("\t"," ").trim(); + line = patternTab.matcher(line).replaceAll(" ").trim(); lineUpper = line.toUpperCase(); // parse empty line @@ -218,7 +221,7 @@ public final class robotsParser { } // escaping all occurences of ; because this char is used as special char in the Robots DB - path = path.replaceAll(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR,"%3B"); + path = RobotsTxt.ROBOTS_DB_PATH_SEPARATOR_MATCHER.matcher(path).replaceAll("%3B"); // adding it to the pathlist if (isDisallowRule) { diff --git a/source/de/anomic/data/URLAnalysis.java b/source/de/anomic/data/URLAnalysis.java index 0ab280ac5..60fca4e57 100644 --- a/source/de/anomic/data/URLAnalysis.java +++ b/source/de/anomic/data/URLAnalysis.java @@ -67,6 +67,8 @@ import de.anomic.search.MetadataRepository.Export; public class URLAnalysis { + private static final Pattern patternMinus = Pattern.compile("-"); + /** * processes to analyse URL lists */ @@ -99,7 +101,7 @@ public class URLAnalysis { try { url = in.take(); if (url == poison) break; - update(url.getHost().replaceAll("-", "\\.").split("\\.")); + update(patternMinus.matcher(url.getHost()).replaceAll("\\.").split("\\.")); update(p.matcher(url.getPath()).replaceAll("/").split("/")); } catch (InterruptedException e) { Log.logException(e); diff --git a/source/de/anomic/server/serverObjects.java b/source/de/anomic/server/serverObjects.java index 36c644a55..61abdaaef 100644 --- a/source/de/anomic/server/serverObjects.java +++ b/source/de/anomic/server/serverObjects.java @@ -51,6 +51,7 @@ import java.net.InetAddress; import java.util.ArrayList; import java.util.HashMap; import java.util.Map; +import java.util.regex.Pattern; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.parser.html.CharacterCoding; @@ -62,6 +63,14 @@ import de.anomic.search.Switchboard; public class serverObjects extends HashMap implements Cloneable { + private final static Pattern patternNewline = Pattern.compile("\n"); + private final static Pattern patternDoublequote = Pattern.compile("\""); + private final static Pattern patternSlash = Pattern.compile("/"); + private final static Pattern patternB = Pattern.compile("\b"); + private final static Pattern patternF = Pattern.compile("\f"); + private final static Pattern patternR = Pattern.compile("\r"); + private final static Pattern patternT = Pattern.compile("\t"); + private static final long serialVersionUID = 1L; private boolean localized = true; @@ -164,15 +173,15 @@ public class serverObjects extends HashMap implements Cloneable * @param value a String that will be reencoded for JSON output. * @return the modified String that was added to the map. */ - public String putJSON(final String key, String value) { - value = value.replaceAll("\"", "'"); - value = value.replaceAll("/", "\\/"); - // value = value.replaceAll("\\", "\\\\"); - value = value.replaceAll("\b", "\\b"); - value = value.replaceAll("\f", "\\f"); - value = value.replaceAll("\n", "\\r"); - value = value.replaceAll("\r", "\\r"); - value = value.replaceAll("\t", "\\t"); + public String putJSON(final String key, String value) { + // value = value.replaceAll("\\", "\\\\"); + value = patternDoublequote.matcher(value).replaceAll("'"); + value = patternSlash.matcher(value).replaceAll("\\/"); + value = patternB.matcher(value).replaceAll("\\b"); + value = patternF.matcher(value).replaceAll("\\f"); + value = patternNewline.matcher(value).replaceAll("\\r"); + value = patternR.matcher(value).replaceAll("\\r"); + value = patternT.matcher(value).replaceAll("\\t"); return put(key, value); } public String putJSON(final String key, final byte[] value) { @@ -333,7 +342,7 @@ public class serverObjects extends HashMap implements Cloneable String key, value; for (Map.Entry entry: entrySet()) { key = entry.getKey(); - value = entry.getValue().replaceAll("\n", "\\\\n"); + value = patternNewline.matcher(entry.getValue()).replaceAll("\\\\n"); fos.write((key + "=" + value + "\r\n").getBytes()); } } finally { diff --git a/source/net/yacy/ai/example/SchwarzerPeter.java b/source/net/yacy/ai/example/SchwarzerPeter.java new file mode 100644 index 000000000..e30cf90a4 --- /dev/null +++ b/source/net/yacy/ai/example/SchwarzerPeter.java @@ -0,0 +1,209 @@ +package net.yacy.ai.example; + +import java.util.ArrayList; +import java.util.List; +import java.util.Random; + +import net.yacy.ai.greedy.AbstractFinding; +import net.yacy.ai.greedy.AbstractModel; +import net.yacy.ai.greedy.Finding; +import net.yacy.ai.greedy.Model; +import net.yacy.ai.greedy.Role; + +public class SchwarzerPeter { + + public static enum Kartentyp { + A, B, C, D, E, F, G, H, P; + } + + public static enum Kartenzaehler { + p, q; + } + + public static class Karte { + private Kartentyp kartentyp; + private Kartenzaehler kartenzaehler; + public Karte(Kartentyp kartentyp, Kartenzaehler kartenzaehler) { + this.kartentyp = kartentyp; this.kartenzaehler = kartenzaehler; + } + public boolean equals(Object obj) { + return this.kartentyp == ((Karte) obj).kartentyp && this.kartenzaehler == ((Karte) obj).kartenzaehler; + } + public int hashCode() { + return this.kartentyp.hashCode() + 16 + this.kartenzaehler.hashCode(); + } + public boolean istSchwarzerPeter() { + return this.kartentyp == Kartentyp.P; + } + public static boolean istPaar(Karte k1, Karte k2) { + return k1.kartentyp == k2.kartentyp; + } + } + + public static final List alleKarten; + static { + alleKarten = new ArrayList(33); + for (Kartentyp typ: Kartentyp.values()) { + alleKarten.add(new Karte(typ, Kartenzaehler.p)); + alleKarten.add(new Karte(typ, Kartenzaehler.q)); + } + alleKarten.add(new Karte(Kartentyp.P, Kartenzaehler.p)); + } + + public static final List neuerStapel(Random r) { + List stapel0 = new ArrayList(); + for (Karte karte: alleKarten) stapel0.add(karte); + List stapel1 = new ArrayList(); + while (stapel0.size() > 0) stapel1.add(stapel0.remove(r.nextInt(stapel0.size()))); + return stapel1; + } + + public static class Spieler implements Role { + + private int spielernummer; + private int spieleranzahl; + + public Spieler(int spielernummer, int spieleranzahl) { + this.spielernummer = spielernummer; + this.spieleranzahl = spieleranzahl; + } + + @Override + public Spieler nextRole() { + int n = (this.spielernummer == this.spieleranzahl - 1) ? 0 : this.spielernummer + 1; + return new Spieler(n, this.spieleranzahl); + } + public Spieler linkerNachbar() { + int n = (this.spielernummer == 0) ? this.spieleranzahl - 1 : this.spielernummer - 1; + return new Spieler(n, this.spieleranzahl); + } + + public boolean equals(Object obj) { + return this.spielernummer == ((Spieler) obj).spielernummer; + } + + public int hashCode() { + return this.spielernummer; + } + } + + public static enum Strategy { + nichtsortieren_linksziehen, + nichtsortieren_zufallsziehen, + sortieren_linksziehen, + sortieren_zufallsziehen; + } + + public static class Hand extends ArrayList { + private static final long serialVersionUID = -5274023237476645059L; + private Strategy strategy; + public Hand(Strategy strategy) { + this.strategy = strategy; + } + public void annehmen(Random r, Karte karte) { + if (this.strategy == Strategy.nichtsortieren_linksziehen || this.strategy == Strategy.nichtsortieren_zufallsziehen) { + this.add(this.set(r.nextInt(this.size()), karte)); + } else { + this.add(karte); + } + } + public Karte abgeben(Random r) { + if (this.strategy == Strategy.nichtsortieren_linksziehen || this.strategy == Strategy.sortieren_linksziehen) { + return this.remove(0); + } else { + return this.remove(r.nextInt(this.size())); + } + } + public boolean paerchenAblegen() { +return true; + } + } + + public static class Zug extends AbstractFinding implements Finding { + + public Zug(Spieler spieler, int priority) { + super(spieler, priority); + } + + @Override + public Object clone() { + return this; + } + + @Override + public boolean equals(Object other) { + return true; + } + + @Override + public int hashCode() { + return 0; + } + + } + + public static class Spiel extends AbstractModel implements Model, Cloneable { + + private Hand[] haende; + private Random random; + + public Spiel(Spieler spieler, Random r) { + super(spieler); + this.random = r; + haende = new Hand[spieler.spieleranzahl]; + for (int i = 0; i < spieler.spieleranzahl; i++) haende[i] = new Hand(Strategy.nichtsortieren_linksziehen); + List geben = neuerStapel(r); + while (geben.size() > 0) { + haende[spieler.spielernummer].annehmen(r, geben.remove(0)); + spieler = spieler.nextRole(); + } + } + + @Override + public List explore() { + return new ArrayList(0); + } + + @Override + public void applyFinding(Zug finding) { + haende[this.currentRole().spielernummer].annehmen(random, this.haende[this.currentRole().linkerNachbar().spielernummer].abgeben(random)); + + } + + @Override + public int getRanking(int findings, Spieler role) { + // TODO Auto-generated method stub + return 0; + } + + @Override + public boolean isTermination(Spieler role) { + // TODO Auto-generated method stub + return false; + } + + @Override + public Spieler isTermination() { + // TODO Auto-generated method stub + return null; + } + + @Override + public Object clone() { + // TODO Auto-generated method stub + return null; + } + + @Override + public boolean equals(Object other) { + // TODO Auto-generated method stub + return false; + } + + @Override + public int hashCode() { + // TODO Auto-generated method stub + return 0; + } + } +} diff --git a/source/net/yacy/cora/document/MultiProtocolURI.java b/source/net/yacy/cora/document/MultiProtocolURI.java index 32be6b19a..33e594668 100644 --- a/source/net/yacy/cora/document/MultiProtocolURI.java +++ b/source/net/yacy/cora/document/MultiProtocolURI.java @@ -56,6 +56,7 @@ public class MultiProtocolURI implements Serializable { private static final Pattern backPathPattern = Pattern.compile("(/[^/]+(? 1 && url.charAt(1) == ':') { @@ -684,10 +685,11 @@ public class MultiProtocolURI implements Serializable { return toNormalform(excludeReference, stripAmp, false); } + private static final Pattern ampPattern = Pattern.compile("&"); public String toNormalform(final boolean excludeReference, final boolean stripAmp, final boolean removeSessionID) { String result = toNormalform0(excludeReference, removeSessionID); if (stripAmp) { - result = result.replaceAll("&", "&"); + result = ampPattern.matcher(result).replaceAll("&"); } return result; } diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java index 46fd08481..6ffeece8b 100644 --- a/source/net/yacy/document/parser/htmlParser.java +++ b/source/net/yacy/document/parser/htmlParser.java @@ -27,6 +27,7 @@ import java.net.MalformedURLException; import java.nio.charset.Charset; import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.UnsupportedCharsetException; +import java.util.regex.Pattern; import de.anomic.crawler.retrieval.HTTPLoader; @@ -43,6 +44,8 @@ import net.yacy.kelondro.util.FileUtils; public class htmlParser extends AbstractParser implements Parser { + private static final Pattern patternUnderline = Pattern.compile("_"); + public htmlParser() { super("HTML Parser"); SUPPORTED_EXTENSIONS.add("htm"); @@ -176,7 +179,7 @@ public class htmlParser extends AbstractParser implements Parser { if (encoding.startsWith("MACINTOSH")) encoding = "MacRoman"; // fix wrong fill characters - encoding = encoding.replaceAll("_", "-"); + encoding = patternUnderline.matcher(encoding).replaceAll("-"); if (encoding.matches("GB[_-]?2312([-_]80)?")) return "GB2312"; if (encoding.matches(".*UTF[-_]?8.*")) return "UTF-8";