bugfixed utf-8 decoding and parser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@346 6c8d7289-2bf4-0310-a012-ef5d649a1542
20 years ago · 712fe9ef18
parent 63f9570d3a
commit 712fe9ef18
9 changed files with 94 additions and 506 deletions
--- a/htroot/CacheAdmin_p.java
+++ b/htroot/CacheAdmin_p.java
@@ -121,8 +121,9 @@ public class CacheAdmin_p {
                    else {
                        htmlFilterContentScraper scraper = new htmlFilterContentScraper(url);
                        OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
-                        plasmaParserDocument document = switchboard.parser.transformScraper(url, "text/html", scraper);
                        serverFileUtils.copy(file, os);
+                        os.flush();
+                        plasmaParserDocument document = switchboard.parser.transformScraper(url, "text/html", scraper);
                        info += "<b>HEADLINE:</b><br>" + scraper.getHeadline() + "<br><br>";
                        info += "<b>HREF:</b><br>" + formatAnchor(document.getHyperlinks()) + "<br>";
                        info += "<b>MEDIA:</b><br>" + formatAnchor(document.getMedialinks()) + "<br>";
@@ -130,7 +131,7 @@ public class CacheAdmin_p {
                        info += "<b>TEXT:</b><br><span class=\"small\">" + new String(scraper.getText()) + "</span><br>";
                        info += "<b>LINES:</b><br><span class=\"small\">";
                        String[] sentences = document.getSentences();
-                        for (int i = 0; i < sentences.length; i++) info += sentences + "<br>";
+                        for (int i = 0; i < sentences.length; i++) info += sentences[i] + "<br>";
                        info += "</span><br>";
                    }
                } catch (Exception e) {
--- a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -41,6 +41,7 @@
 package de.anomic.htmlFilter;
 import java.net.URL;
 import java.net.MalformedURLException;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Map;
@@ -178,4 +179,14 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
 	System.out.println("TEXT    :" + new String(text.getBytes()));
    }
    /**
     * Manual smoke test: passes a short sentence containing the non-ASCII
     * character 'ü' to a scraper via scrapeText and prints the text that the
     * scraper returns from getText.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        String test = "Nokia kürzt bei Forschung und Entwicklung";
        try {
            htmlFilterContentScraper scraper = new htmlFilterContentScraper(new URL("http://localhost"));
            // NOTE(review): getBytes() and new String(byte[]) both use the platform
            // default charset, so the output of this check depends on the JVM's
            // file.encoding -- confirm which encoding scrapeText expects.
            scraper.scrapeText(test.getBytes());
            System.out.println(new String(scraper.getText()));
        } catch (MalformedURLException e) {
            // Not expected for the constant URL above, but report it instead of
            // silently swallowing it, like the other test main in this change does.
            e.printStackTrace();
        }
    }
 }
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -498,7 +498,6 @@ public final class plasmaParser {
                // ... otherwise we make a html scraper and transformer
                htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
                OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
                hfos.write(source);
                hfos.close();
                return transformScraper(location, mimeType, scraper);
@@ -665,7 +664,7 @@ public final class plasmaParser {
        //java -cp source:lib/commons-collections.jar:lib/commons-pool-1.2.jar de.anomic.plasma.plasmaParser bug.html bug.out
        try {
            File in = new File(args[0]);
-			File out = new File(args[1]);
+            //File out = new File(args[1]);
            plasmaParser theParser = new plasmaParser();
            theParser.initRealtimeParsableMimeTypes("application/xhtml+xml,text/html,text/plain");
            theParser.initParseableMimeTypes("application/atom+xml,application/gzip,application/java-archive,application/msword,application/octet-stream,application/pdf,application/rdf+xml,application/rss+xml,application/rtf,application/x-gzip,application/x-tar,application/xml,application/zip,text/rss,text/rtf,text/xml,application/x-bzip2");
@@ -674,8 +673,10 @@ public final class plasmaParser {
            serverFileUtils.copy(theInput, theOutput);
            plasmaParserDocument document = theParser.parseSource(new URL("http://brain/~theli/test.pdf"), null, theOutput.toByteArray());
            //plasmaParserDocument document = theParser.parseSource(new URL("http://brain.yacy"), "application/pdf", theOutput.toByteArray());
-			byte[] theText = document.getText();
+            //byte[] theText = document.getText();
-			serverFileUtils.write(theText, out);
+            //serverFileUtils.write(theText, out);
            String[] sentences = document.getSentences();
            for (int i = 0; i < sentences.length; i++) System.out.println("line " + i + ":" + sentences[i]);
        } catch (Exception e) {
            e.printStackTrace();
        }
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -105,6 +105,7 @@ public class plasmaSnippetCache {
    }
    public result retrieve(URL url, Set queryhashes, boolean fetchOnline) {
        // heise = "0OQUNU3JSs05"
        if (queryhashes.size() == 0) {
            //System.out.println("found no queryhashes for url retrieve " + url);
            return new result(null, SOURCE_ERROR, "no query hashes given");
@@ -250,7 +251,7 @@ public class plasmaSnippetCache {
        } catch (IOException e) {}
        if (header == null) {
-            String filename = url.getFile();
+            String filename = cacheManager.getCachePath(url).getName();
            int p = filename.lastIndexOf('.');
            if ((p < 0) ||
                ((p >= 0) && (plasmaParser.supportedFileExtContains(filename.substring(p + 1))))) {
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -371,6 +371,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
            deployThread("99_indexcachemigration", "index cache migration", "migration of index cache data structures 0.37 -> 0.38",
            new serverInstantThread(classicCache, "oneStepMigration", "size"), 30000);
        }
        // test routine for snippet fetch
        // url = /www.heise.de/mobil/newsticker/meldung/mail/54980
        Set query = new HashSet(); query.add("0OQUNU3JSs05"); // 'heise'
        //plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/mobil/newsticker/meldung/mail/54980"), query, true);
        plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/security/news/foren/go.shtml?read=1&msg_id=7301419&forum_id=72721"), query, true);
    }
    private static String ppRamString(int bytes) {
--- a/source/yacy.java
+++ b/source/yacy.java
@@ -148,6 +148,10 @@ public final class yacy {
            plasmaSwitchboard sb = new plasmaSwitchboard(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf");
            // hardcoded, forced, temporary value-migration
            sb.setConfig("htTemplatePath", "htroot/env/templates");
            sb.setConfig("parseableExt", "html,htm,txt,php,shtml,asp");
            // if we are running an SVN version, we try to detect the used svn revision now ...
            if (vString.equals("@" + "REPL_VERSION" + "@")) {
                Properties buildProp = new Properties();
@@ -188,9 +192,6 @@ public final class yacy {
            if (timeout < 60000) timeout = 60000;
            int maxSessions   = Integer.parseInt(sb.getConfig("httpdMaxSessions", "100"));
            // hardcoded, forced, temporary value-migration
            sb.setConfig("htTemplatePath", "htroot/env/templates");
            // create some directories
            File htRootPath = new File(sb.getRootPath(), sb.getConfig("htRootPath", "htroot"));
            File htDocsPath = new File(sb.getRootPath(), sb.getConfig("htDocsPath", "DATA/HTDOCS"));
--- a/yacy.init
+++ b/yacy.init
@@ -100,7 +100,7 @@ parseableMimeTypes=
 # this is important to recognize <a href> - tags as not-html reference
 # These files will be excluded from indexing _(Please keep extensions in alphabetical order)_
 mediaExt=ace,arj,asf,avi,bin,bz2,css,deb,doc,dmg,gif,gz,hqx,img,iso,jar,jpe,jpg,jpeg,mpeg,mov,mp3,mpg,ogg,png,pdf,ppt,ps,ram,rar,rm,rpm,sit,swf,sxc,sxd,sxi,sxw,tar,tgz,torrent,wmv,xcf,xls,zip
-parseableExt=html,htm,txt
+parseableExt=html,htm,txt,php,shtml,asp
 # Promotion Strings
 # These strings appear in the Web Mask of the YACY search client