luccioman 9 years ago
commit 094aed8664

@ -80,7 +80,7 @@
</fieldset> </fieldset>
</form> </form>
<form action="yacy/list.html" method="get" accept-charset="UTF-8"> <form action="api/blacklists_p.txt" method="get" accept-charset="UTF-8">
<fieldset> <fieldset>
<legend>plain text file:</legend> <legend>plain text file:</legend>
Here you can export a blacklist as a regular text file with one blacklist entry per line. Here you can export a blacklist as a regular text file with one blacklist entry per line.

@ -0,0 +1,4 @@
#{lists}#
#{items}##[item]#
#{/items}#
#{/lists}#

@ -51,6 +51,7 @@ public final class list {
// return variable that accumulates replacements // return variable that accumulates replacements
final serverObjects prop = new serverObjects(); final serverObjects prop = new serverObjects();
prop.put("list", ""); // init a empty return (error case)
if ((post == null) || (env == null)) return prop; if ((post == null) || (env == null)) return prop;
if (!Protocol.authentifyRequest(post, env)) return prop; if (!Protocol.authentifyRequest(post, env)) return prop;
@ -66,7 +67,7 @@ public final class list {
if ((sb.isRobinsonMode()) && (!sb.isInMyCluster(otherPeerName))) { if ((sb.isRobinsonMode()) && (!sb.isInMyCluster(otherPeerName))) {
// if we are a robinson cluster, answer only if this client is known by our network definition // if we are a robinson cluster, answer only if this client is known by our network definition
return null; return prop;
} }
if (col.equals("black")) { if (col.equals("black")) {
@ -85,8 +86,6 @@ public final class list {
} }
prop.put("list",out.toString()); prop.put("list",out.toString());
} else {
prop.put("list","");
} }
return prop; return prop;

@ -224,18 +224,19 @@ public final class Condenser extends Tokenizer {
try { try {
int pip = 0; int pip = 0;
while (wordenum.hasMoreElements()) { while (wordenum.hasMoreElements()) {
word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH); word = wordenum.nextElement().toString();
if (useForLanguageIdentification) this.languageIdentificator.add(word); if (useForLanguageIdentification) this.languageIdentificator.add(word); // langdetect is case sensitive
if (word.length() < 2) continue; if (word.length() < 2) continue;
word = word.toLowerCase(Locale.ENGLISH);
wprop = this.words.get(word); wprop = this.words.get(word);
if (wprop == null) wprop = new Word(0, pip, phrase); if (wprop == null) wprop = new Word(0, pip, phrase);
if (wprop.flags == null) wprop.flags = flagstemplate.clone(); if (wprop.flags == null) wprop.flags = flagstemplate.clone();
wprop.flags.set(flagpos, true); wprop.flags.set(flagpos, true);
this.words.put(word.toLowerCase(), wprop); this.words.put(word, wprop);
pip++; pip++;
this.RESULT_NUMB_WORDS++; this.RESULT_NUMB_WORDS++;
//this.RESULT_DIFF_WORDS++; //this.RESULT_DIFF_WORDS++;
} }
} finally { } finally {
wordenum.close(); wordenum.close();
wordenum = null; wordenum = null;

@ -127,7 +127,7 @@ public class DateDetection {
private final static Date TODAY = new Date(); private final static Date TODAY = new Date();
private final static int CURRENT_YEAR = Integer.parseInt(CONFORM.format(TODAY).substring(0, 4)); // we need that to parse dates without given years, see the ShortStyle class private final static int CURRENT_YEAR = Integer.parseInt(CONFORM.format(TODAY).substring(0, 4)); // we need that to parse dates without given years, see the ShortStyle class
private final static String BODNCG = "(?:\\b|^)"; // begin of date non-capturing group private final static String BODNCG = "(?:\\s|^)"; // begin of date non-capturing group
private final static String EODNCG = "(?:[).:;! ]|$)"; // end of date non-capturing group private final static String EODNCG = "(?:[).:;! ]|$)"; // end of date non-capturing group
private final static String SEPARATORNCG = "(?:/|-| - |\\.\\s|,\\s|\\.|,|\\s)"; // separator non-capturing group private final static String SEPARATORNCG = "(?:/|-| - |\\.\\s|,\\s|\\.|,|\\s)"; // separator non-capturing group
private final static String DAYCAPTURE = "(\\d{1,2})"; private final static String DAYCAPTURE = "(\\d{1,2})";

@ -56,7 +56,7 @@ public class Tokenizer {
public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file
//private Properties analysis; //private Properties analysis;
protected final Map<String, Word> words; // a string (the words) to (indexWord) - relation protected final Map<String, Word> words; // a string (the words) to (indexWord) - relation (key: words are lowercase)
private final Set<String> synonyms; // a set of synonyms to the words private final Set<String> synonyms; // a set of synonyms to the words
protected final Map<String, Set<Tagging.Metatag>> tags = new HashMap<String, Set<Tagging.Metatag>>(); // a set of tags, discovered from Autotagging protected final Map<String, Set<Tagging.Metatag>> tags = new HashMap<String, Set<Tagging.Metatag>>(); // a set of tags, discovered from Autotagging
@ -68,7 +68,6 @@ public class Tokenizer {
this.words = new TreeMap<String, Word>(NaturalOrder.naturalComparator); this.words = new TreeMap<String, Word>(NaturalOrder.naturalComparator);
this.synonyms = new LinkedHashSet<String>(); this.synonyms = new LinkedHashSet<String>();
assert text != null; assert text != null;
final Set<String> currsentwords = new HashSet<String>();
String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1]; String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1];
for (int i = 0; i < wordcache.length; i++) wordcache[i] = ""; for (int i = 0; i < wordcache.length; i++) wordcache[i] = "";
String k; String k;
@ -89,9 +88,9 @@ public class Tokenizer {
// handle punktuation (start new sentence) // handle punktuation (start new sentence)
if (word.length() == 1 && SentenceReader.punctuation(word.charAt(0))) { if (word.length() == 1 && SentenceReader.punctuation(word.charAt(0))) {
// store sentence // store sentence
currsentwords.clear(); if (wordInSentenceCounter > 1) // if no word in sentence repeated punktuation ".....", don't count as sentence
allsentencecounter++;
wordInSentenceCounter = 1; wordInSentenceCounter = 1;
allsentencecounter++;
continue; continue;
} }
if (word.length() < wordminsize) continue; if (word.length() < wordminsize) continue;
@ -160,7 +159,6 @@ public class Tokenizer {
// store word // store word
allwordcounter++; allwordcounter++;
currsentwords.add(word);
Word wsp = this.words.get(word); Word wsp = this.words.get(word);
if (wsp != null) { if (wsp != null) {
// word already exists // word already exists
@ -169,7 +167,7 @@ public class Tokenizer {
// word does not yet exist, create new word entry // word does not yet exist, create new word entry
wsp = new Word(allwordcounter, wordInSentenceCounter, allsentencecounter + 100); // nomal sentence start at 100 ! wsp = new Word(allwordcounter, wordInSentenceCounter, allsentencecounter + 100); // nomal sentence start at 100 !
wsp.flags = this.RESULT_FLAGS.clone(); wsp.flags = this.RESULT_FLAGS.clone();
this.words.put(word.toLowerCase(), wsp); this.words.put(word, wsp);
} }
// we now have the unique handle of the word, put it into the sentence: // we now have the unique handle of the word, put it into the sentence:
wordInSentenceCounter++; wordInSentenceCounter++;
@ -214,9 +212,12 @@ public class Tokenizer {
// store result // store result
this.RESULT_NUMB_WORDS = allwordcounter; this.RESULT_NUMB_WORDS = allwordcounter;
// if text doesn't end with punktuation but has words after last found sentence, inc sentence count for trailing text. // if text doesn't end with punktuation but has words after last found sentence, inc sentence count for trailing text.
this.RESULT_NUMB_SENTENCES = allsentencecounter + (currsentwords.size() > 0 ? 1 : 0); this.RESULT_NUMB_SENTENCES = allsentencecounter + (wordInSentenceCounter > 1 ? 1 : 0);
} }
/**
* @return the words as word/indexWord relation map. All words (the map keys) are lowercase.
*/
public Map<String, Word> words() { public Map<String, Word> words() {
// returns the words as word/indexWord relation map // returns the words as word/indexWord relation map
return this.words; return this.words;

@ -50,6 +50,11 @@ public final class Identificator {
} }
} }
/**
* Append a word to the text to be analyzed.
* The analysis takes letter case into account, so the word should be passed
* in its original case (neither upper- nor lower-cased beforehand).
* @param word the word to append; a null word is ignored
*/
public void add(final String word) { public void add(final String word) {
if (word == null) return; if (word == null) return;
this.detector.append(" " + word); // detector internally caches text up to maxtextlen = default = 10000 chars this.detector.append(" " + word); // detector internally caches text up to maxtextlen = default = 10000 chars

@ -28,6 +28,9 @@ public class DateDetectionTest {
testtext.add("1.1.2016"); testtext.add("1.1.2016");
testtext.add("1. Januar 2016"); testtext.add("1. Januar 2016");
testtext.add("2016, January 1."); testtext.add("2016, January 1.");
testtext.add("beginning text 1.1.2016");
testtext.add("line break\n1.1.2016");
for (String text : testtext) { for (String text : testtext) {
Date d = DateDetection.parseLine(text, 0); Date d = DateDetection.parseLine(text, 0);
@ -82,4 +85,23 @@ public class DateDetectionTest {
} }
} }
/**
 * Negative test of the parseLine method of class DateDetection:
 * none of the inputs below represents a date, so parseLine is expected
 * to return null for each of them.
 */
@Test
public void testParseLineNoDate() {
    // test input representations (LinkedHashSet keeps insertion order while iterating)
    final Set<String> testtext = new LinkedHashSet<String>();
    testtext.add("3.1.2.0102"); // example of a program version string
    // testtext.add("3.1.20.0102"); // date end-capture not working (on modification conflict with YMD parser)
    testtext.add("v3.1.21");
    testtext.add("v3.1.22.");
    for (final String text : testtext) {
        final Date d = DateDetection.parseLine(text, 0);
        // the message names the offending input so a failure is easy to locate
        assertNull("not a date: " + text, d);
    }
}
} }

@ -2,7 +2,9 @@
package net.yacy.document; package net.yacy.document;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.util.HashSet;
import java.util.Map; import java.util.Map;
import java.util.Set;
import net.yacy.cora.document.WordCache; import net.yacy.cora.document.WordCache;
import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.Word;
import org.junit.Test; import org.junit.Test;
@ -36,4 +38,23 @@ public class TokenizerTest {
assertEquals("occurence of 'words' ", 2, w.occurrences()); assertEquals("occurence of 'words' ", 2, w.occurrences());
} }
/**
 * Test of RESULT_NUMB_SENTENCES, of class Tokenizer.
 * Every sample text below is expected to be counted as exactly 5 sentences.
 */
@Test
public void testNumberOfSentences() {
    final Set<String> testText = new HashSet<String>();
    // text with 5 sentences
    // - last sentence followed by a run of repeated punctuation
    testText.add("Sentence One. Sentence Two. Comment on this. This is sentence four! Good By................");
    // - last sentence without any punctuation at the end of the text
    testText.add("Sentence One. Sentence two. Sentence 3? Sentence 4! Sentence w/o punktuation at end of text");
    // - runs of punctuation without words before the first and after the last sentence
    testText.add("!!! ! ! ! Sentence One. Sentence two. Sentence 3? Sentence 4! Sentence 5 ! ! ! !!!");

    final WordCache meaningLib = new WordCache(null);
    final boolean doAutotagging = false;
    final VocabularyScraper scraper = null;
    for (final String text : testText) {
        final Tokenizer t = new Tokenizer(null, text, meaningLib, doAutotagging, scraper);
        // HashSet iteration order is unspecified, so the message names the failing text
        assertEquals("Tokenizer.RESULT_NUMB_SENTENCES for text: " + text, 5, t.RESULT_NUMB_SENTENCES);
    }
}
} }

Loading…
Cancel
Save