# Patch summary (explanatory preamble; everything from the first "diff --git" line on is the patch itself).
#
# 1) htmlFilterContentScraper.java
#    When a "title" tag is scraped and its raw text is shorter than 1024 bytes,
#    the stripped bytes are now converted with toString(this.charset) instead of
#    the no-argument toString(). The removed line carried the marker
#    "TODO: bugfix needed for UTF-8", which this change replaces.
#
# 2) plasmaParser.java (command-line handling)
#    The "-m <mimetype>" and "-c <charset>" checks now use ">=" rather than "=="
#    on args.length. With the old "== 4" test, "-m" was skipped whenever further
#    arguments (such as "-c <charset>") followed it.
#
# 3) plasmaParser.java (output)
#    When a document was parsed, document.getMainLongTitle() is printed before
#    the parsed sentences.
#
# NOTE(review): the label "Document titel: " looks like a typo for "title";
#   it is a runtime string and is left unchanged here -- confirm with the author.
# NOTE(review): this patch arrived with all line breaks collapsed; the line
#   structure below is restored from the hunk headers, and the indentation of
#   the context lines is reconstructed -- verify against the target files.
diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
index b1d854387..1f584547e 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -211,7 +211,8 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
             h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString());
             if (h.length() > 0) headlines[3].add(h);
         }
-        if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) title = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); // TODO: bugfix needed for UTF-8
+        if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024))
+            title = cleanLine(super.stripAll(new serverByteBuffer(text)).toString(this.charset));
     }
 
     private static String cleanLine(String s) {
diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java
index 9b3ffd6ab..d65573b3b 100644
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -725,11 +725,11 @@ public final class plasmaParser {
                 serverFileUtils.write(contentBytes, contentFile);
             }
 
-            if ((args.length == 4)&&(args[2].equalsIgnoreCase("-m"))) {
+            if ((args.length >= 4)&&(args[2].equalsIgnoreCase("-m"))) {
                 contentMimeType = args[3];
             }
 
-            if ((args.length == 6)&&(args[4].equalsIgnoreCase("-c"))) {
+            if ((args.length >= 6)&&(args[4].equalsIgnoreCase("-c"))) {
                 charSet = args[5];
             }
 
@@ -747,6 +747,9 @@ public final class plasmaParser {
 
             // printing out all parsed sentences
             if (document != null) {
+                System.out.print("Document titel: ");
+                System.out.println(document.getMainLongTitle());
+
                 // found text
                 String[] sentences = document.getSentences();
                 if (sentences != null) {