Merge remote-tracking branch 'origin/master' into docker

pull/52/head
luc 9 years ago
commit 1cd6adb0da

@ -30,6 +30,7 @@ import java.util.Collection;
import java.util.HashMap; import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import java.util.LinkedHashMap; import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@ -83,7 +84,7 @@ public class Vocabulary_p {
final boolean discoverFromAuthor = post.get("discovermethod", "").equals("author"); final boolean discoverFromAuthor = post.get("discovermethod", "").equals("author");
final boolean discoverFromCSV = post.get("discovermethod", "").equals("csv"); final boolean discoverFromCSV = post.get("discovermethod", "").equals("csv");
final String discoverFromCSVPath = post.get("discoverpath", "").replaceAll("%20", " "); final String discoverFromCSVPath = post.get("discoverpath", "").replaceAll("%20", " ");
final String discoverFromCSVCharset = post.get("charset", StandardCharsets.UTF_8.name()); String discoverFromCSVCharset = post.get("charset", StandardCharsets.UTF_8.name());
final int discovercolumnliteral = post.getInt("discovercolumnliteral", 0); final int discovercolumnliteral = post.getInt("discovercolumnliteral", 0);
final int discovercolumnsynonyms = post.getInt("discovercolumnsynonyms", -1); final int discovercolumnsynonyms = post.getInt("discovercolumnsynonyms", -1);
final int discovercolumnobjectlink = post.getInt("discovercolumnobjectlink", -1); final int discovercolumnobjectlink = post.getInt("discovercolumnobjectlink", -1);
@ -95,7 +96,11 @@ public class Vocabulary_p {
if (!discoverNot) { if (!discoverNot) {
if (discoverFromCSV && discoverFromCSVFile != null && discoverFromCSVFile.exists()) { if (discoverFromCSV && discoverFromCSVFile != null && discoverFromCSVFile.exists()) {
// auto-detect charset, used code from http://jchardet.sourceforge.net/; see also: http://www-archive.mozilla.org/projects/intl/chardet.html // auto-detect charset, used code from http://jchardet.sourceforge.net/; see also: http://www-archive.mozilla.org/projects/intl/chardet.html
FileUtils.checkCharset(discoverFromCSVFile, discoverFromCSVCharset, true); if (discoverFromCSVCharset.equals("autodetect")) {
List<String> charsets = FileUtils.detectCharset(discoverFromCSVFile);
discoverFromCSVCharset = charsets.get(0);
ConcurrentLog.info("FileUtils", "detected charset: " + discoverFromCSVCharset + " used to read " + discoverFromCSVFile.toString());
}
// read file // read file
BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(discoverFromCSVFile), discoverFromCSVCharset)); BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(discoverFromCSVFile), discoverFromCSVCharset));
String line = null; String line = null;
@ -304,10 +309,12 @@ public class Vocabulary_p {
} }
// make charset list for import method selector // make charset list for import method selector
int c = 0; prop.putHTML("create_charset_" + 0 + "_name", "autodetect");
prop.put("create_charset_" + 0 + "_selected", 1);
int c = 1;
for (String cs: Charset.availableCharsets().keySet()) { for (String cs: Charset.availableCharsets().keySet()) {
prop.putHTML("create_charset_" + c + "_name", cs); prop.putHTML("create_charset_" + c + "_name", cs);
prop.put("create_charset_" + c + "_selected", cs.equals("windows-1252") ? 1 : 0); prop.put("create_charset_" + c + "_selected", 0);
c++; c++;
} }
prop.put("create_charset", c); prop.put("create_charset", c);

File diff suppressed because it is too large Load Diff

@ -53,7 +53,6 @@ import java.util.Iterator;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream; import java.util.zip.GZIPInputStream;
@ -935,10 +934,10 @@ public final class FileUtils {
* used code from http://jchardet.sourceforge.net/; * used code from http://jchardet.sourceforge.net/;
* see also: http://www-archive.mozilla.org/projects/intl/chardet.html * see also: http://www-archive.mozilla.org/projects/intl/chardet.html
* @param file * @param file
* @return a set of probable charsets * @return a list of probable charsets
* @throws IOException * @throws IOException
*/ */
public static Set<String> detectCharset(File file) throws IOException { public static List<String> detectCharset(File file) throws IOException {
// auto-detect charset, used code from http://jchardet.sourceforge.net/; see also: http://www-archive.mozilla.org/projects/intl/chardet.html // auto-detect charset, used code from http://jchardet.sourceforge.net/; see also: http://www-archive.mozilla.org/projects/intl/chardet.html
nsDetector det = new nsDetector(nsPSMDetector.ALL); nsDetector det = new nsDetector(nsPSMDetector.ALL);
BufferedInputStream imp = new BufferedInputStream(new FileInputStream(file)); BufferedInputStream imp = new BufferedInputStream(new FileInputStream(file));
@ -953,11 +952,11 @@ public final class FileUtils {
if (!isAscii && !done) done = det.DoIt(buf,len, false); if (!isAscii && !done) done = det.DoIt(buf,len, false);
} }
det.DataEnd(); det.DataEnd();
Set<String> result = new HashSet<>(); List<String> result = new ArrayList<>();
if (isAscii) { if (isAscii) {
result.add("ASCII"); result.add(StandardCharsets.US_ASCII.name());
} else { } else {
for (String c: det.getProbableCharsets()) result.add(c); for (String c: det.getProbableCharsets()) result.add(c); // worst case this returns "nomatch"
} }
return result; return result;
@ -976,7 +975,7 @@ public final class FileUtils {
@Override @Override
public void run() { public void run() {
try { try {
Set<String> charsets = FileUtils.detectCharset(file); List<String> charsets = FileUtils.detectCharset(file);
if (charsets.contains(givenCharset)) { if (charsets.contains(givenCharset)) {
ConcurrentLog.info("checkCharset", "appropriate charset '" + givenCharset + "' for import of " + file + ", is part one detected " + charsets); ConcurrentLog.info("checkCharset", "appropriate charset '" + givenCharset + "' for import of " + file + ", is part one detected " + charsets);
} else { } else {

Loading…
Cancel
Save