|
|
|
@ -498,7 +498,6 @@ public final class plasmaParser {
|
|
|
|
|
// ... otherwise we make an HTML scraper and transformer
|
|
|
|
|
htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
|
|
|
|
|
OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
|
|
|
|
|
|
|
|
|
|
hfos.write(source);
|
|
|
|
|
hfos.close();
|
|
|
|
|
return transformScraper(location, mimeType, scraper);
|
|
|
|
@ -665,7 +664,7 @@ public final class plasmaParser {
|
|
|
|
|
//java -cp source:lib/commons-collections.jar:lib/commons-pool-1.2.jar de.anomic.plasma.plasmaParser bug.html bug.out
|
|
|
|
|
try {
|
|
|
|
|
File in = new File(args[0]);
|
|
|
|
|
File out = new File(args[1]);
|
|
|
|
|
//File out = new File(args[1]);
|
|
|
|
|
plasmaParser theParser = new plasmaParser();
|
|
|
|
|
theParser.initRealtimeParsableMimeTypes("application/xhtml+xml,text/html,text/plain");
|
|
|
|
|
theParser.initParseableMimeTypes("application/atom+xml,application/gzip,application/java-archive,application/msword,application/octet-stream,application/pdf,application/rdf+xml,application/rss+xml,application/rtf,application/x-gzip,application/x-tar,application/xml,application/zip,text/rss,text/rtf,text/xml,application/x-bzip2");
|
|
|
|
@ -674,8 +673,10 @@ public final class plasmaParser {
|
|
|
|
|
serverFileUtils.copy(theInput, theOutput);
|
|
|
|
|
plasmaParserDocument document = theParser.parseSource(new URL("http://brain/~theli/test.pdf"), null, theOutput.toByteArray());
|
|
|
|
|
//plasmaParserDocument document = theParser.parseSource(new URL("http://brain.yacy"), "application/pdf", theOutput.toByteArray());
|
|
|
|
|
byte[] theText = document.getText();
|
|
|
|
|
serverFileUtils.write(theText, out);
|
|
|
|
|
//byte[] theText = document.getText();
|
|
|
|
|
//serverFileUtils.write(theText, out);
|
|
|
|
|
String[] sentences = document.getSentences();
|
|
|
|
|
for (int i = 0; i < sentences.length; i++) System.out.println("line " + i + ":" + sentences[i]);
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
e.printStackTrace();
|
|
|
|
|
}
|
|
|
|
|