added autotagging to document condenser:

- tags that are automatically generated now enrich the dc:subject
- auto-generated tags have a '$' at the beginning of the tag
- auto-generated tags prefix the tag name with the name of the vocabulary they come from;
each tag has the form
$<vocabulary-name>:<tag-printname-space-replaced-by-'_'>
pull/1/head
Michael Peter Christen 13 years ago
parent 0d6176804b
commit a58dc4a91f

@ -20,6 +20,7 @@
package net.yacy.document; package net.yacy.document;
import java.io.ByteArrayInputStream;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
@ -27,7 +28,9 @@ import java.util.HashSet;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import net.yacy.cora.document.UTF8;
import net.yacy.document.WordCache.Dictionary; import net.yacy.document.WordCache.Dictionary;
import net.yacy.document.geolocalization.Localization; import net.yacy.document.geolocalization.Localization;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
@ -117,13 +120,30 @@ public class Autotagging {
* @param text * @param text
* @return * @return
*/ */
public Set<String> tags(String text) { public Set<String> getPrintTagsFromText(String text) {
Set<String> as = new HashSet<String>(); Set<String> as = new HashSet<String>();
if (this.vocabularies.isEmpty()) return as;
final WordTokenizer tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(text)), LibraryProvider.dymLib);
String tag;
while (tokens.hasMoreElements()) {
tag = getPrintTagFromWord(tokens.nextElement().toString());
if (tag != null) as.add(tag);
}
return as; return as;
} }
public static class Vocabulary { public String getPrintTagFromWord(String word) {
if (this.vocabularies.isEmpty()) return null;
Metatag tag;
word = normalizeWord(word);
for (Map.Entry<String, Vocabulary> v: this.vocabularies.entrySet()) {
tag = v.getValue().getMetatag(word);
if (tag != null) return tag.getMetatag();
}
return null;
}
public class Vocabulary {
final String navigatorName; final String navigatorName;
final Map<String, String> tag2print, print2tag; final Map<String, String> tag2print, print2tag;
@ -137,7 +157,7 @@ public class Autotagging {
public Vocabulary(String name, File propFile) throws IOException { public Vocabulary(String name, File propFile) throws IOException {
this(name); this(name);
ArrayList<String> list = FileUtils.getListArray(propFile); ArrayList<String> list = FileUtils.getListArray(propFile);
String k, v; String k, kn, v;
String[] tags; String[] tags;
int p; int p;
vocloop: for (String line: list) { vocloop: for (String line: list) {
@ -161,15 +181,16 @@ public class Autotagging {
v = line.substring(p + 1); v = line.substring(p + 1);
tags = v.split(","); tags = v.split(",");
tagloop: for (String t: tags) { tagloop: for (String t: tags) {
t = t.trim().toLowerCase(); t = normalizeWord(t);
if (t.length() == 0) { if (t.length() == 0) {
continue tagloop; continue tagloop;
} }
this.tag2print.put(t, k); this.tag2print.put(t, k);
this.print2tag.put(k, t); this.print2tag.put(k, t);
} }
this.tag2print.put(k.toLowerCase(), k); kn = normalizeWord(k);
this.print2tag.put(k, k.toLowerCase()); this.tag2print.put(kn, k);
this.print2tag.put(k, kn);
} }
} }
@ -197,12 +218,10 @@ public class Autotagging {
return this.navigatorName; return this.navigatorName;
} }
public String getPrint(final String tag) { public Metatag getMetatag(final String word) {
return this.tag2print.get(tag); String printname = this.tag2print.get(word);
} if (printname == null) return null;
return metatag(this.navigatorName, printname);
public String getTag(final String print) {
return this.print2tag.get(print);
} }
public Set<String> tags() { public Set<String> tags() {
@ -215,6 +234,20 @@ public class Autotagging {
} }
} }
// German umlaut hack for better matching: each pattern matches exactly one
// lower-case special character; normalizeWord replaces every match with the
// ASCII transcription named in the comment of the respective pattern.
private final static Pattern PATTERN_AE = Pattern.compile("\u00E4"); // a-umlaut, replaced by "ae"
private final static Pattern PATTERN_OE = Pattern.compile("\u00F6"); // o-umlaut, replaced by "oe"
private final static Pattern PATTERN_UE = Pattern.compile("\u00FC"); // u-umlaut, replaced by "ue"
private final static Pattern PATTERN_SZ = Pattern.compile("\u00DF"); // sharp s, replaced by "ss"
/**
 * Normalizes a word so that it can be used as lookup key for a vocabulary:
 * surrounding whitespace is removed, the word is converted to lower case and
 * the German special characters are replaced by their ASCII transcription
 * (a-umlaut -> "ae", o-umlaut -> "oe", u-umlaut -> "ue", sharp s -> "ss").
 * @param word the word to normalize, must not be null
 * @return the normalized word
 */
private static final String normalizeWord(String word) {
    // lower-case with a fixed locale: the default-locale variant depends on the
    // environment of the JVM (in a Turkish locale 'I' becomes a dotless i), which
    // would make the same vocabulary match differently on different machines
    word = word.trim().toLowerCase(java.util.Locale.ROOT);
    // the umlaut patterns only match lower-case characters, so they must be applied after lower-casing
    word = PATTERN_AE.matcher(word).replaceAll("ae");
    word = PATTERN_OE.matcher(word).replaceAll("oe");
    word = PATTERN_UE.matcher(word).replaceAll("ue");
    word = PATTERN_SZ.matcher(word).replaceAll("ss");
    return word;
}
public class Metatag { public class Metatag {
private final String vocName; private final String vocName;
private final String print; private final String print;
@ -253,6 +286,8 @@ public class Autotagging {
for (Map.Entry<String, Vocabulary> entry: a.vocabularies.entrySet()) { for (Map.Entry<String, Vocabulary> entry: a.vocabularies.entrySet()) {
System.out.println(entry); System.out.println(entry);
} }
Set<String> tags = a.getPrintTagsFromText("In die Tueren und Fluchttueren muessen noch Schloesser eingebaut werden");
System.out.println(tags);
} }
} }

@ -86,6 +86,7 @@ public final class Condenser {
//private Properties analysis; //private Properties analysis;
private final Map<String, Word> words; // a string (the words) to (indexWord) - relation private final Map<String, Word> words; // a string (the words) to (indexWord) - relation
private final Set<String> tags = new HashSet<String>(); // a set of tags, discovered from Autotagging
//public int RESULT_NUMB_TEXT_BYTES = -1; //public int RESULT_NUMB_TEXT_BYTES = -1;
public int RESULT_NUMB_WORDS = -1; public int RESULT_NUMB_WORDS = -1;
@ -222,6 +223,11 @@ public final class Condenser {
} }
} }
} }
// extend the tags in the document object with autotagging tags
if (!this.tags.isEmpty()) {
document.addTags(this.tags);
}
} }
private void insertTextToWords( private void insertTextToWords(
@ -283,7 +289,7 @@ public final class Condenser {
assert is != null; assert is != null;
final Set<String> currsentwords = new HashSet<String>(); final Set<String> currsentwords = new HashSet<String>();
String word = ""; String word = "";
String k; String k, tag;
int wordlen; int wordlen;
Word wsp; Word wsp;
final Word wsp1; final Word wsp1;
@ -304,6 +310,10 @@ public final class Condenser {
if (this.languageIdentificator != null) this.languageIdentificator.add(word); if (this.languageIdentificator != null) this.languageIdentificator.add(word);
if (word.length() < wordminsize) continue; if (word.length() < wordminsize) continue;
// get tags from autotagging
tag = LibraryProvider.autotagging.getPrintTagFromWord(word);
if (tag != null) this.tags.add(tag);
// distinguish punctuation and words // distinguish punctuation and words
wordlen = word.length(); wordlen = word.length();
if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) { if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) {

@ -103,7 +103,8 @@ public class Document {
this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType; this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
this.charset = charset; this.charset = charset;
this.parserObject = parserObject; this.parserObject = parserObject;
this.keywords = (keywords == null) ? new LinkedList<String>() : Arrays.asList(keywords); this.keywords = new LinkedList<String>();
if (keywords != null) this.keywords.addAll(Arrays.asList(keywords));
this.title = (title == null) ? new StringBuilder(0) : new StringBuilder(title); this.title = (title == null) ? new StringBuilder(0) : new StringBuilder(title);
this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author); this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author);
this.sections = (sections == null) ? new LinkedList<String>() : Arrays.asList(sections); this.sections = (sections == null) ? new LinkedList<String>() : Arrays.asList(sections);
@ -188,6 +189,20 @@ dc_rights
return (this.creator == null) ? "" : this.creator.toString(); return (this.creator == null) ? "" : this.creator.toString();
} }
/**
 * Adds the given tags to the keywords of this document.
 * Tags that are already contained in the keyword list are skipped, so this
 * method does not introduce duplicates. The keywords are reported by dc_subject().
 * The passed set is only read and never modified; the caller can safely keep using it.
 * @param tags the tags to add; if null or empty, nothing happens
 */
public void addTags(Set<String> tags) {
    if (tags == null || tags.isEmpty()) return;
    for (String s: tags) {
        // only add tags that are not yet known as keyword; do not remove
        // anything from the given set because it belongs to the caller
        if (!this.keywords.contains(s)) this.keywords.add(s);
    }
}
public String[] dc_subject() { public String[] dc_subject() {
// sort out doubles and empty words // sort out doubles and empty words
final TreeSet<String> hs = new TreeSet<String>(); final TreeSet<String> hs = new TreeSet<String>();
@ -195,7 +210,7 @@ dc_rights
for (int i = 0; i < this.keywords.size(); i++) { for (int i = 0; i < this.keywords.size(); i++) {
if (this.keywords.get(i) == null) continue; if (this.keywords.get(i) == null) continue;
s = (this.keywords.get(i)).trim(); s = (this.keywords.get(i)).trim();
if (s.length() > 0) hs.add(s.toLowerCase()); if (s.length() > 0) hs.add(s);
} }
final String[] t = new String[hs.size()]; final String[] t = new String[hs.size()];
int i = 0; int i = 0;

@ -115,7 +115,7 @@ public class LibraryProvider
Set<String> allTags = new HashSet<String>() ; Set<String> allTags = new HashSet<String>() ;
allTags.addAll(autotagging.allTags()); // we must copy this into a clone to prevent circularity allTags.addAll(autotagging.allTags()); // we must copy this into a clone to prevent circularity
autotagging.addLocalization(geoLoc); autotagging.addLocalization(geoLoc);
autotagging.addDictionaries(dymLib.getDictionaries()); //autotagging.addDictionaries(dymLib.getDictionaries()); // strange results with this: normal word lists are 'too full'
WordCache.learn(allTags); WordCache.learn(allTags);
} }

Loading…
Cancel
Save