diff --git a/htroot/BlacklistImpExp_p.html b/htroot/BlacklistImpExp_p.html index 48ac2558d..58b4ebe71 100644 --- a/htroot/BlacklistImpExp_p.html +++ b/htroot/BlacklistImpExp_p.html @@ -80,7 +80,7 @@ -
+
plain text file: Here you can export a blacklist as a regular text file with one blacklist entry per line. diff --git a/htroot/api/blacklists_p.txt b/htroot/api/blacklists_p.txt new file mode 100644 index 000000000..e19c4f510 --- /dev/null +++ b/htroot/api/blacklists_p.txt @@ -0,0 +1,4 @@ +#{lists}# +#{items}##[item]# +#{/items}# +#{/lists}# \ No newline at end of file diff --git a/htroot/yacy/list.java b/htroot/yacy/list.java index b5c634d11..871e91b55 100644 --- a/htroot/yacy/list.java +++ b/htroot/yacy/list.java @@ -51,6 +51,7 @@ public final class list { // return variable that accumulates replacements final serverObjects prop = new serverObjects(); + prop.put("list", ""); // init a empty return (error case) if ((post == null) || (env == null)) return prop; if (!Protocol.authentifyRequest(post, env)) return prop; @@ -66,7 +67,7 @@ public final class list { if ((sb.isRobinsonMode()) && (!sb.isInMyCluster(otherPeerName))) { // if we are a robinson cluster, answer only if this client is known by our network definition - return null; + return prop; } if (col.equals("black")) { @@ -85,8 +86,6 @@ public final class list { } prop.put("list",out.toString()); - } else { - prop.put("list",""); } return prop; diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index b7c741cd6..6c4137cd0 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -224,18 +224,19 @@ public final class Condenser extends Tokenizer { try { int pip = 0; while (wordenum.hasMoreElements()) { - word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH); - if (useForLanguageIdentification) this.languageIdentificator.add(word); - if (word.length() < 2) continue; + word = wordenum.nextElement().toString(); + if (useForLanguageIdentification) this.languageIdentificator.add(word); // langdetect is case sensitive + if (word.length() < 2) continue; + word = word.toLowerCase(Locale.ENGLISH); wprop = this.words.get(word); if (wprop == null) wprop = new Word(0, pip, phrase); if (wprop.flags == null) wprop.flags = flagstemplate.clone(); wprop.flags.set(flagpos, true); - this.words.put(word.toLowerCase(), wprop); + this.words.put(word, wprop); pip++; this.RESULT_NUMB_WORDS++; //this.RESULT_DIFF_WORDS++; - } + } } finally { wordenum.close(); wordenum = null; diff --git a/source/net/yacy/document/DateDetection.java b/source/net/yacy/document/DateDetection.java index 781101665..c5ecb0a1c 100644 --- a/source/net/yacy/document/DateDetection.java +++ b/source/net/yacy/document/DateDetection.java @@ -127,7 +127,7 @@ public class DateDetection { private final static Date TODAY = new Date(); private final static int CURRENT_YEAR = Integer.parseInt(CONFORM.format(TODAY).substring(0, 4)); // we need that to parse dates without given years, see the ShortStyle class - private final static String BODNCG = "(?:\\b|^)"; // begin of date non-capturing group + private final static String BODNCG = "(?:\\s|^)"; // begin of date non-capturing group private final static String EODNCG = "(?:[).:;! ]|$)"; // end of date non-capturing group private final static String SEPARATORNCG = "(?:/|-| - |\\.\\s|,\\s|\\.|,|\\s)"; // separator non-capturing group private final static String DAYCAPTURE = "(\\d{1,2})"; diff --git a/source/net/yacy/document/Tokenizer.java b/source/net/yacy/document/Tokenizer.java index 0420194f5..8d0c8ad05 100644 --- a/source/net/yacy/document/Tokenizer.java +++ b/source/net/yacy/document/Tokenizer.java @@ -56,7 +56,7 @@ public class Tokenizer { public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file //private Properties analysis; - protected final Map words; // a string (the words) to (indexWord) - relation + protected final Map words; // a string (the words) to (indexWord) - relation (key: words are lowercase) private final Set synonyms; // a set of synonyms to the words protected final Map> tags = new HashMap>(); // a set of tags, discovered from Autotagging @@ -68,7 +68,6 @@ public class Tokenizer { this.words = new TreeMap(NaturalOrder.naturalComparator); this.synonyms = new LinkedHashSet(); assert text != null; - final Set currsentwords = new HashSet(); String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1]; for (int i = 0; i < wordcache.length; i++) wordcache[i] = ""; String k; @@ -89,9 +88,9 @@ public class Tokenizer { // handle punktuation (start new sentence) if (word.length() == 1 && SentenceReader.punctuation(word.charAt(0))) { // store sentence - currsentwords.clear(); + if (wordInSentenceCounter > 1) // if no word in sentence repeated punktuation ".....", don't count as sentence + allsentencecounter++; wordInSentenceCounter = 1; - allsentencecounter++; continue; } if (word.length() < wordminsize) continue; @@ -160,7 +159,6 @@ public class Tokenizer { // store word allwordcounter++; - currsentwords.add(word); Word wsp = this.words.get(word); if (wsp != null) { // word already exists @@ -169,7 +167,7 @@ public class Tokenizer { // word does not yet exist, create new word entry wsp = new Word(allwordcounter, wordInSentenceCounter, allsentencecounter + 100); // nomal sentence start at 100 ! wsp.flags = this.RESULT_FLAGS.clone(); - this.words.put(word.toLowerCase(), wsp); + this.words.put(word, wsp); } // we now have the unique handle of the word, put it into the sentence: wordInSentenceCounter++; @@ -214,9 +212,12 @@ public class Tokenizer { // store result this.RESULT_NUMB_WORDS = allwordcounter; // if text doesn't end with punktuation but has words after last found sentence, inc sentence count for trailing text. - this.RESULT_NUMB_SENTENCES = allsentencecounter + (currsentwords.size() > 0 ? 1 : 0); + this.RESULT_NUMB_SENTENCES = allsentencecounter + (wordInSentenceCounter > 1 ? 1 : 0); } - + + /** + * @return returns the words as word/indexWord relation map. All words are lowercase. + */ public Map words() { // returns the words as word/indexWord relation map return this.words; diff --git a/source/net/yacy/document/language/Identificator.java b/source/net/yacy/document/language/Identificator.java index 98dcb5f34..6528f0182 100644 --- a/source/net/yacy/document/language/Identificator.java +++ b/source/net/yacy/document/language/Identificator.java @@ -50,6 +50,11 @@ public final class Identificator { } } + /** + * Append a word to the text to be analyzed. + * Analysis takes letter case into account (this means word should not be upper- or lower cased) + * @param word + */ public void add(final String word) { if (word == null) return; this.detector.append(" " + word); // detector internally caches text up to maxtextlen = default = 10000 chars diff --git a/test/java/net/yacy/document/DateDetectionTest.java b/test/java/net/yacy/document/DateDetectionTest.java index cedcea6de..eba322d7a 100644 --- a/test/java/net/yacy/document/DateDetectionTest.java +++ b/test/java/net/yacy/document/DateDetectionTest.java @@ -28,6 +28,9 @@ public class DateDetectionTest { testtext.add("1.1.2016"); testtext.add("1. Januar 2016"); testtext.add("2016, January 1."); + + testtext.add("beginning text 1.1.2016"); + testtext.add("line break\n1.1.2016"); for (String text : testtext) { Date d = DateDetection.parseLine(text, 0); @@ -82,4 +85,23 @@ public class DateDetectionTest { } } + /** + * Negative test of parseLine method, of class DateDetection + * with cases that represent NOT a date + */ + @Test + public void testParseLineNoDate() { + + // test input representations + Set testtext = new LinkedHashSet(); + testtext.add("3.1.2.0102"); // example of a program version string + // testtext.add("3.1.20.0102"); // date end-capture not working (on modification conflict with YMD parser) + testtext.add("v3.1.21"); + testtext.add("v3.1.22."); + + for (String text : testtext) { + Date d = DateDetection.parseLine(text, 0); + assertNull("not a date: " + text, d); + } + } } diff --git a/test/java/net/yacy/document/TokenizerTest.java b/test/java/net/yacy/document/TokenizerTest.java index e54807105..23e2fbb5f 100644 --- a/test/java/net/yacy/document/TokenizerTest.java +++ b/test/java/net/yacy/document/TokenizerTest.java @@ -2,7 +2,9 @@ package net.yacy.document; import java.net.MalformedURLException; +import java.util.HashSet; import java.util.Map; +import java.util.Set; import net.yacy.cora.document.WordCache; import net.yacy.kelondro.data.word.Word; import org.junit.Test; @@ -36,4 +38,23 @@ public class TokenizerTest { assertEquals("occurence of 'words' ", 2, w.occurrences()); } + /** + * Test of RESULT_NUMB_SENTENCES, of class Tokenizer. + */ + @Test + public void testNumberOfSentences() { + Set testText = new HashSet(); + // text with 5 sentences + testText.add("Sentence One. Sentence Two. Comment on this. This is sentence four! Good By................"); + testText.add("Sentence One. Sentence two. Sentence 3? Sentence 4! Sentence w/o punktuation at end of text"); + testText.add("!!! ! ! ! Sentence One. Sentence two. Sentence 3? Sentence 4! Sentence 5 ! ! ! !!!"); + + WordCache meaningLib = new WordCache(null); + boolean doAutotagging = false; + VocabularyScraper scraper = null; + for (String text : testText) { + Tokenizer t = new Tokenizer(null, text, meaningLib, doAutotagging, scraper); + assertEquals("Tokenizer.RESULT_NUMB_SENTENCES", 5, t.RESULT_NUMB_SENTENCES); + } + } }