From de4f30bb2eb85608f6d3be71c6c44af8ddf0c1dc Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 16 Jun 2010 15:22:31 +0000 Subject: [PATCH] UTF-8 fix git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6923 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/net/yacy/document/parser/htmlParser.java | 2 ++ .../net/yacy/kelondro/data/meta/URIMetadataRow.java | 12 +++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java index b201baab4..0e325b3bd 100644 --- a/source/net/yacy/document/parser/htmlParser.java +++ b/source/net/yacy/document/parser/htmlParser.java @@ -42,6 +42,7 @@ import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Idiom; import net.yacy.document.ParserException; +import net.yacy.document.parser.html.CharacterCoding; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ScraperInputStream; import net.yacy.document.parser.html.TransformerWriter; @@ -260,6 +261,7 @@ public class htmlParser extends AbstractParser implements Idiom { Document document = new htmlParser().parse(url, "text/html", null, new ByteArrayInputStream(content)); String title = document.dc_title(); System.out.println(title); + System.out.println(CharacterCoding.unicode2html(title, false)); } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java index 907e3a2b7..d185e2988 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java @@ -376,13 +376,23 @@ public class URIMetadataRow implements URIMetadata { // parse elements from comp field; byte[] c = this.entry.getColBytes(col_comp, true); List cl = ByteBuffer.split(c, (byte) 10); - this.comp = new Components( + try { + this.comp = new Components( + (cl.size() > 0) ? new String(cl.get(0), "UTF-8") : "", + hash(), + (cl.size() > 1) ? new String(cl.get(1), "UTF-8") : "", + (cl.size() > 2) ? new String(cl.get(2), "UTF-8") : "", + (cl.size() > 3) ? new String(cl.get(3), "UTF-8") : "", + (cl.size() > 4) ? new String(cl.get(4), "UTF-8") : ""); + } catch (UnsupportedEncodingException e) { + this.comp = new Components( (cl.size() > 0) ? new String(cl.get(0)) : "", hash(), (cl.size() > 1) ? new String(cl.get(1)) : "", (cl.size() > 2) ? new String(cl.get(2)) : "", (cl.size() > 3) ? new String(cl.get(3)) : "", (cl.size() > 4) ? new String(cl.get(4)) : ""); + } return this.comp; }