Fixed language detector initialization and NullPointerException cases.

A NullPointerException occurred when using an Identificator instance
which encountered an error in its constructor.
This error could be caused by a missing "langdetect" folder in the
working directory of the main process, or by simultaneous first calls to
the constructor concurrently initializing DetectorFactory.langlist.

Fixes Mantis issue 714 (http://mantis.tokeek.de/view.php?id=714)
pull/97/head
luccioman 9 years ago
parent 8146b97e9b
commit d27adc2b92

@ -24,8 +24,8 @@
package net.yacy.document.language; package net.yacy.document.language;
import java.io.File;
import java.util.ArrayList; import java.util.ArrayList;
import com.cybozu.labs.langdetect.Detector; import com.cybozu.labs.langdetect.Detector;
import com.cybozu.labs.langdetect.DetectorFactory; import com.cybozu.labs.langdetect.DetectorFactory;
import com.cybozu.labs.langdetect.LangDetectException; import com.cybozu.labs.langdetect.LangDetectException;
@ -41,9 +41,11 @@ public final class Identificator {
private Detector detector; private Detector detector;
private Language language; private Language language;
/**
* Default constructor. Requires the DetectorFactory language profiles to be loaded before.
*/
public Identificator() { public Identificator() {
try { try {
if(DetectorFactory.getLangList().isEmpty()) DetectorFactory.loadProfile(new File("langdetect").toString());
this.detector = DetectorFactory.create(); this.detector = DetectorFactory.create();
} catch (LangDetectException e) { } catch (LangDetectException e) {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
@ -56,33 +58,37 @@ public final class Identificator {
* @param word * @param word
*/ */
public void add(final String word) { public void add(final String word) {
if (word == null) return; if (word == null || this.detector == null) {
return;
}
this.detector.append(" " + word); // detector internally caches text up to maxtextlen = default = 10000 chars this.detector.append(" " + word); // detector internally caches text up to maxtextlen = default = 10000 chars
} }
/** /**
* Get the detected language with highest probability * Get the detected language with highest probability
* if detection probability is above 0.3 (30%) * if detection probability is above 0.3 (30%)
* Underlaying detector differentiates zh-cn and zh-tw, these are returned as zh here. * Underlying detector differentiates zh-cn and zh-tw, these are returned as zh here.
* @return 2 char language code (ISO 639-1) * @return 2 char language code (ISO 639-1)
*/ */
public String getLanguage() { public String getLanguage() {
try { if(this.detector != null) {
ArrayList<Language> probabilities = this.detector.getProbabilities(); try {
if(probabilities.isEmpty()) return null; ArrayList<Language> probabilities = this.detector.getProbabilities();
this.language = this.detector.getProbabilities().get(0); if(probabilities.isEmpty()) return null;
} catch (LangDetectException e) { this.language = this.detector.getProbabilities().get(0);
// this contains mostly the message "no features in text" } catch (LangDetectException e) {
//ConcurrentLog.logException(e); // this contains mostly the message "no features in text"
return null; //ConcurrentLog.logException(e);
} return null;
// Return language only if probability is higher than 30% to account for missing language profiles }
if (this.language.prob > 0.3) { // Return language only if probability is higher than 30% to account for missing language profiles
if (this.language.lang.length() == 2) if (this.language.prob > 0.3) {
return this.language.lang; if (this.language.lang.length() == 2) {
else return this.language.lang;
return this.language.lang.substring(0,2); }
} return this.language.lang.substring(0,2);
}
}
return null; return null;
@ -95,8 +101,8 @@ public final class Identificator {
public double getProbability() { public double getProbability() {
if (language != null) { if (language != null) {
return language.prob; return language.prob;
} else }
return 0.0; return 0.0;
} }
} }

@ -217,6 +217,8 @@ import net.yacy.utils.crypt;
import net.yacy.utils.upnp.UPnP; import net.yacy.utils.upnp.UPnP;
import net.yacy.visualization.CircleTool; import net.yacy.visualization.CircleTool;
import com.cybozu.labs.langdetect.DetectorFactory;
import com.cybozu.labs.langdetect.LangDetectException;
import com.google.common.io.Files; import com.google.common.io.Files;
@ -411,6 +413,14 @@ public final class Switchboard extends serverSwitch {
} }
}.start(); }.start();
// init the language detector
this.log.config("Loading language profiles");
try {
DetectorFactory.loadProfile(new File(appPath, "langdetect").toString());
} catch (LangDetectException e) {
ConcurrentLog.logException(e);
}
// init global host name cache // init global host name cache
Domains.init(new File(this.workPath, "globalhosts.list")); Domains.init(new File(this.workPath, "globalhosts.list"));

Loading…
Cancel
Save