Fixed bugs in UTF-8 decoding and the parser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@346 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 20 years ago
parent 63f9570d3a
commit 712fe9ef18

@ -121,8 +121,9 @@ public class CacheAdmin_p {
else { else {
htmlFilterContentScraper scraper = new htmlFilterContentScraper(url); htmlFilterContentScraper scraper = new htmlFilterContentScraper(url);
OutputStream os = new htmlFilterOutputStream(null, scraper, null, false); OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
plasmaParserDocument document = switchboard.parser.transformScraper(url, "text/html", scraper);
serverFileUtils.copy(file, os); serverFileUtils.copy(file, os);
os.flush();
plasmaParserDocument document = switchboard.parser.transformScraper(url, "text/html", scraper);
info += "<b>HEADLINE:</b><br>" + scraper.getHeadline() + "<br><br>"; info += "<b>HEADLINE:</b><br>" + scraper.getHeadline() + "<br><br>";
info += "<b>HREF:</b><br>" + formatAnchor(document.getHyperlinks()) + "<br>"; info += "<b>HREF:</b><br>" + formatAnchor(document.getHyperlinks()) + "<br>";
info += "<b>MEDIA:</b><br>" + formatAnchor(document.getMedialinks()) + "<br>"; info += "<b>MEDIA:</b><br>" + formatAnchor(document.getMedialinks()) + "<br>";
@ -130,7 +131,7 @@ public class CacheAdmin_p {
info += "<b>TEXT:</b><br><span class=\"small\">" + new String(scraper.getText()) + "</span><br>"; info += "<b>TEXT:</b><br><span class=\"small\">" + new String(scraper.getText()) + "</span><br>";
info += "<b>LINES:</b><br><span class=\"small\">"; info += "<b>LINES:</b><br><span class=\"small\">";
String[] sentences = document.getSentences(); String[] sentences = document.getSentences();
for (int i = 0; i < sentences.length; i++) info += sentences + "<br>"; for (int i = 0; i < sentences.length; i++) info += sentences[i] + "<br>";
info += "</span><br>"; info += "</span><br>";
} }
} catch (Exception e) { } catch (Exception e) {

File diff suppressed because one or more lines are too long

@ -41,6 +41,7 @@
package de.anomic.htmlFilter; package de.anomic.htmlFilter;
import java.net.URL; import java.net.URL;
import java.net.MalformedURLException;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
import java.util.Map; import java.util.Map;
@ -178,4 +179,14 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
System.out.println("TEXT :" + new String(text.getBytes())); System.out.println("TEXT :" + new String(text.getBytes()));
} }
/**
 * Ad-hoc manual check of the scraper's text handling.
 * Feeds a fixed German sample sentence containing an umlaut through
 * scrapeText and prints whatever getText returns, so the result can
 * be inspected on the console.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    String test = "Nokia kürzt bei Forschung und Entwicklung";
    try {
        htmlFilterContentScraper scraper = new htmlFilterContentScraper(new URL("http://localhost"));
        // NOTE(review): getBytes() and new String(byte[]) both use the platform
        // default charset here - confirm that this is the encoding scrapeText expects.
        scraper.scrapeText(test.getBytes());
        System.out.println(new String(scraper.getText()));
    } catch (MalformedURLException e) {
        // Not expected for the constant URL above, but report it instead of
        // failing silently, matching the error handling of the parser's test main.
        e.printStackTrace();
    }
}
} }

@ -498,7 +498,6 @@ public final class plasmaParser {
// ... otherwise we make a html scraper and transformer // ... otherwise we make a html scraper and transformer
htmlFilterContentScraper scraper = new htmlFilterContentScraper(location); htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false); OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
hfos.write(source); hfos.write(source);
hfos.close(); hfos.close();
return transformScraper(location, mimeType, scraper); return transformScraper(location, mimeType, scraper);
@ -665,7 +664,7 @@ public final class plasmaParser {
//java -cp source:lib/commons-collections.jar:lib/commons-pool-1.2.jar de.anomic.plasma.plasmaParser bug.html bug.out //java -cp source:lib/commons-collections.jar:lib/commons-pool-1.2.jar de.anomic.plasma.plasmaParser bug.html bug.out
try { try {
File in = new File(args[0]); File in = new File(args[0]);
File out = new File(args[1]); //File out = new File(args[1]);
plasmaParser theParser = new plasmaParser(); plasmaParser theParser = new plasmaParser();
theParser.initRealtimeParsableMimeTypes("application/xhtml+xml,text/html,text/plain"); theParser.initRealtimeParsableMimeTypes("application/xhtml+xml,text/html,text/plain");
theParser.initParseableMimeTypes("application/atom+xml,application/gzip,application/java-archive,application/msword,application/octet-stream,application/pdf,application/rdf+xml,application/rss+xml,application/rtf,application/x-gzip,application/x-tar,application/xml,application/zip,text/rss,text/rtf,text/xml,application/x-bzip2"); theParser.initParseableMimeTypes("application/atom+xml,application/gzip,application/java-archive,application/msword,application/octet-stream,application/pdf,application/rdf+xml,application/rss+xml,application/rtf,application/x-gzip,application/x-tar,application/xml,application/zip,text/rss,text/rtf,text/xml,application/x-bzip2");
@ -674,8 +673,10 @@ public final class plasmaParser {
serverFileUtils.copy(theInput, theOutput); serverFileUtils.copy(theInput, theOutput);
plasmaParserDocument document = theParser.parseSource(new URL("http://brain/~theli/test.pdf"), null, theOutput.toByteArray()); plasmaParserDocument document = theParser.parseSource(new URL("http://brain/~theli/test.pdf"), null, theOutput.toByteArray());
//plasmaParserDocument document = theParser.parseSource(new URL("http://brain.yacy"), "application/pdf", theOutput.toByteArray()); //plasmaParserDocument document = theParser.parseSource(new URL("http://brain.yacy"), "application/pdf", theOutput.toByteArray());
byte[] theText = document.getText(); //byte[] theText = document.getText();
serverFileUtils.write(theText, out); //serverFileUtils.write(theText, out);
String[] sentences = document.getSentences();
for (int i = 0; i < sentences.length; i++) System.out.println("line " + i + ":" + sentences[i]);
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
} }

@ -105,6 +105,7 @@ public class plasmaSnippetCache {
} }
public result retrieve(URL url, Set queryhashes, boolean fetchOnline) { public result retrieve(URL url, Set queryhashes, boolean fetchOnline) {
// heise = "0OQUNU3JSs05"
if (queryhashes.size() == 0) { if (queryhashes.size() == 0) {
//System.out.println("found no queryhashes for url retrieve " + url); //System.out.println("found no queryhashes for url retrieve " + url);
return new result(null, SOURCE_ERROR, "no query hashes given"); return new result(null, SOURCE_ERROR, "no query hashes given");
@ -250,7 +251,7 @@ public class plasmaSnippetCache {
} catch (IOException e) {} } catch (IOException e) {}
if (header == null) { if (header == null) {
String filename = url.getFile(); String filename = cacheManager.getCachePath(url).getName();
int p = filename.lastIndexOf('.'); int p = filename.lastIndexOf('.');
if ((p < 0) || if ((p < 0) ||
((p >= 0) && (plasmaParser.supportedFileExtContains(filename.substring(p + 1))))) { ((p >= 0) && (plasmaParser.supportedFileExtContains(filename.substring(p + 1))))) {

@ -371,6 +371,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
deployThread("99_indexcachemigration", "index cache migration", "migration of index cache data structures 0.37 -> 0.38", deployThread("99_indexcachemigration", "index cache migration", "migration of index cache data structures 0.37 -> 0.38",
new serverInstantThread(classicCache, "oneStepMigration", "size"), 30000); new serverInstantThread(classicCache, "oneStepMigration", "size"), 30000);
} }
// test routine for snippet fetch
// url = /www.heise.de/mobil/newsticker/meldung/mail/54980
Set query = new HashSet(); query.add("0OQUNU3JSs05"); // 'heise'
//plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/mobil/newsticker/meldung/mail/54980"), query, true);
plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/security/news/foren/go.shtml?read=1&msg_id=7301419&forum_id=72721"), query, true);
} }
private static String ppRamString(int bytes) { private static String ppRamString(int bytes) {

@ -148,6 +148,10 @@ public final class yacy {
plasmaSwitchboard sb = new plasmaSwitchboard(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf"); plasmaSwitchboard sb = new plasmaSwitchboard(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf");
// hardcoded, forced, temporary value-migration
sb.setConfig("htTemplatePath", "htroot/env/templates");
sb.setConfig("parseableExt", "html,htm,txt,php,shtml,asp");
// if we are running an SVN version, we try to detect the used svn revision now ... // if we are running an SVN version, we try to detect the used svn revision now ...
if (vString.equals("@" + "REPL_VERSION" + "@")) { if (vString.equals("@" + "REPL_VERSION" + "@")) {
Properties buildProp = new Properties(); Properties buildProp = new Properties();
@ -188,9 +192,6 @@ public final class yacy {
if (timeout < 60000) timeout = 60000; if (timeout < 60000) timeout = 60000;
int maxSessions = Integer.parseInt(sb.getConfig("httpdMaxSessions", "100")); int maxSessions = Integer.parseInt(sb.getConfig("httpdMaxSessions", "100"));
// hardcoded, forced, temporary value-migration
sb.setConfig("htTemplatePath", "htroot/env/templates");
// create some directories // create some directories
File htRootPath = new File(sb.getRootPath(), sb.getConfig("htRootPath", "htroot")); File htRootPath = new File(sb.getRootPath(), sb.getConfig("htRootPath", "htroot"));
File htDocsPath = new File(sb.getRootPath(), sb.getConfig("htDocsPath", "DATA/HTDOCS")); File htDocsPath = new File(sb.getRootPath(), sb.getConfig("htDocsPath", "DATA/HTDOCS"));

@ -100,7 +100,7 @@ parseableMimeTypes=
# this is important to recognize <a href> - tags as not-html reference # this is important to recognize <a href> - tags as not-html reference
# These files will be excluded from indexing _(Please keep extensions in alphabetical order)_ # These files will be excluded from indexing _(Please keep extensions in alphabetical order)_
mediaExt=ace,arj,asf,avi,bin,bz2,css,deb,doc,dmg,gif,gz,hqx,img,iso,jar,jpe,jpg,jpeg,mpeg,mov,mp3,mpg,ogg,png,pdf,ppt,ps,ram,rar,rm,rpm,sit,swf,sxc,sxd,sxi,sxw,tar,tgz,torrent,wmv,xcf,xls,zip mediaExt=ace,arj,asf,avi,bin,bz2,css,deb,doc,dmg,gif,gz,hqx,img,iso,jar,jpe,jpg,jpeg,mpeg,mov,mp3,mpg,ogg,png,pdf,ppt,ps,ram,rar,rm,rpm,sit,swf,sxc,sxd,sxi,sxw,tar,tgz,torrent,wmv,xcf,xls,zip
parseableExt=html,htm,txt parseableExt=html,htm,txt,php,shtml,asp
# Promotion Strings # Promotion Strings
# These strings appear in the Web Mask of the YACY search client # These strings appear in the Web Mask of the YACY search client

Loading…
Cancel
Save