diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java
index f077feb7c..b4e27a421 100644
--- a/source/net/yacy/document/TextParser.java
+++ b/source/net/yacy/document/TextParser.java
@@ -66,6 +66,7 @@ import net.yacy.document.parser.xlsParser;
 import net.yacy.document.parser.zipParser;
 import net.yacy.document.parser.images.genericImageParser;
 import net.yacy.document.parser.images.metadataImageParser;
+import net.yacy.document.parser.images.svgParser;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.kelondro.util.MemoryControl;
 
@@ -105,6 +106,7 @@ public final class TextParser {
         initParser(new rtfParser());
         initParser(new sevenzipParser());
         initParser(new sidAudioParser());
+        initParser(new svgParser());
         initParser(new swfParser());
         initParser(new tarParser());
         initParser(new torrentParser());
diff --git a/source/net/yacy/document/parser/images/svgParser.java b/source/net/yacy/document/parser/images/svgParser.java
new file mode 100644
index 000000000..dda4ff7b5
--- /dev/null
+++ b/source/net/yacy/document/parser/images/svgParser.java
@@ -0,0 +1,326 @@
+/**
+ * svgParser.java
+ * Copyright 2015 by Burkhard Buelte
+ * First released 26.09.2015 at http://yacy.net
+ *
+ * This library is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation; either version 2.1 of the License, or (at your option)
+ * any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program in the file lgpl21.txt If not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+package net.yacy.document.parser.images;
+
+import java.io.EOFException;
+import java.io.InputStream;
+import java.util.LinkedHashMap;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import net.yacy.cora.document.id.AnchorURL;
+import net.yacy.cora.document.id.DigestURL;
+import net.yacy.cora.document.id.MultiProtocolURL;
+import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.cora.util.NumberTools;
+import net.yacy.document.AbstractParser;
+import net.yacy.document.Document;
+import net.yacy.document.Parser;
+import net.yacy.document.VocabularyScraper;
+import net.yacy.document.parser.html.ImageEntry;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Metadata parser for svg image files (which are xml files), written against
+ * SVG 1.1 (Second Edition), http://www.w3.org/TR/SVG/metadata.html#MetadataElement
+ * <p>
+ * Parsing stops as soon as the first metadata element has been read, or when
+ * one of a few common graphic elements is reached. Document level metadata is
+ * expected in front of the picture data (as proposed in the spec), like
+ * <pre>
+ * &lt;svg width="..." height="..."&gt;
+ *   &lt;title&gt;...&lt;/title&gt;
+ *   &lt;desc&gt;...&lt;/desc&gt;
+ *   &lt;metadata&gt;...&lt;/metadata&gt;
+ *   &lt;... other/&gt;
+ * &lt;/svg&gt;
+ * </pre>
+ */
+public class svgParser extends AbstractParser implements Parser {
+
+    /**
+     * SAX features which are switched off for every parser created here. svg
+     * files come from arbitrary web servers and usually carry a DOCTYPE that
+     * points to an external DTD; without this the parser could fetch that DTD
+     * or other external entities (network access, XXE) while indexing.
+     */
+    private static final String[] DISABLED_SAX_FEATURES = {
+        "http://xml.org/sax/features/external-general-entities",
+        "http://xml.org/sax/features/external-parameter-entities",
+        "http://apache.org/xml/features/nonvalidating/load-external-dtd"
+    };
+
+    /** one SAX parser per thread, created lazily by {@link #getParser()} */
+    private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();
+
+    public svgParser() {
+        super("SVG Image Parser");
+        this.SUPPORTED_EXTENSIONS.add("svg");
+        this.SUPPORTED_MIME_TYPES.add("image/svg+xml");
+    }
+
+    /**
+     * Returns the SAX parser of the calling thread and creates it on first use.
+     * External entities and external DTD loading are disabled on a best effort
+     * basis: a feature the XML implementation does not know is logged and skipped.
+     *
+     * @return the thread local SAX parser
+     * @throws SAXException if no parser can be configured
+     */
+    private static SAXParser getParser() throws SAXException {
+        SAXParser parser = tlSax.get();
+        if (parser == null) {
+            try {
+                final SAXParserFactory factory = SAXParserFactory.newInstance();
+                for (final String feature : DISABLED_SAX_FEATURES) {
+                    try {
+                        factory.setFeature(feature, false);
+                    } catch (final ParserConfigurationException | SAXException e) {
+                        // hardening only, a parser without this feature still works
+                        ConcurrentLog.logException(e);
+                    }
+                }
+                parser = factory.newSAXParser();
+            } catch (final ParserConfigurationException e) {
+                throw new SAXException(e.getMessage(), e);
+            }
+            tlSax.set(parser);
+        }
+        return parser;
+    }
+
+    /**
+     * Converts a svg width/height attribute to a pixel count.
+     *
+     * @param size attribute value, may be null
+     * @return the number if given in pixel ("100px") or as plain number, null
+     *         for a missing or empty value and for other units like "100%"
+     */
+    private static Integer pixelSize(final String size) {
+        final String value = size == null ? "" : size.trim();
+        if (value.isEmpty()) {
+            return null; // also avoids charAt(-1) on width=""
+        }
+        final char last = value.charAt(value.length() - 1);
+        if (value.indexOf("px") > 0 || (last >= '0' && last <= '9')) {
+            return NumberTools.parseIntDecSubstring(value);
+        }
+        return null;
+    }
+
+    /**
+     * Reads title, description and pixel size from the head of a svg file.
+     * The title falls back to the file name (or the url tokens), the
+     * description to the url tokens; the description is also used as text.
+     *
+     * @param location url of the svg file
+     * @param mimeType mime type, passed on to the resulting document
+     * @param charset not used, the resulting document is declared as UTF-8
+     * @param scraper not used
+     * @param timezoneOffset not used
+     * @param source the svg content
+     * @return array with one document
+     * @throws Parser.Failure if the content can not be parsed
+     * @throws InterruptedException rethrown if one surfaces while parsing
+     */
+    @Override
+    public Document[] parse(
+            final AnchorURL location,
+            final String mimeType,
+            final String charset,
+            final VocabularyScraper scraper,
+            final int timezoneOffset,
+            final InputStream source) throws Parser.Failure, InterruptedException {
+
+        try {
+            final SAXParser saxParser = getParser();
+            final svgMetaDataHandler metaData = new svgMetaDataHandler();
+            try {
+                saxParser.parse(source, metaData);
+            } catch (final SAXException e) {
+                // an EOFException is thrown by the handler on purpose after the metadata
+                // has been captured to skip the remainder (not an error, just a way out of SAX)
+                if (!(e.getException() instanceof EOFException)) {
+                    throw new Parser.Failure("Unexpected error while parsing svg file. " + e.getMessage(), location);
+                }
+            }
+
+            String docTitle = metaData.getTitle();
+            if (docTitle == null) { // use filename like in genericParser
+                docTitle = location.getFileName().isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(location.getFileName());
+            }
+            String docDescription = metaData.getDescription();
+            if (docDescription == null) { // use url token as in genericParser
+                docDescription = location.toTokens();
+            }
+
+            LinkedHashMap<DigestURL, ImageEntry> images = null;
+            // add this image to the map of images to register size (as in genericImageParser)
+            final Integer width = metaData.getWidth();
+            final Integer height = metaData.getHeight();
+            if (width != null && height != null) {
+                images = new LinkedHashMap<DigestURL, ImageEntry>();
+                images.put(location, new ImageEntry(location, "", width, height, -1));
+            }
+
+            // create the parser document
+            return new Document[]{new Document(
+                    location,
+                    mimeType,
+                    "UTF-8",
+                    this,
+                    null,
+                    null,
+                    AbstractParser.singleList(docTitle),
+                    null,
+                    "",
+                    null,
+                    null,
+                    0.0f, 0.0f,
+                    docDescription, // text - for this image description is best text we have
+                    null,
+                    null,
+                    images,
+                    false,
+                    null)};
+        } catch (final Exception e) {
+            if (e instanceof InterruptedException) {
+                throw (InterruptedException) e;
+            }
+            if (e instanceof Parser.Failure) {
+                throw (Parser.Failure) e;
+            }
+
+            ConcurrentLog.logException(e);
+            throw new Parser.Failure("Unexpected error while parsing svg file. " + e.getMessage(), location);
+        }
+    }
+
+    /**
+     * SAX handler for svg metadata
+     */
+    public class svgMetaDataHandler extends DefaultHandler {
+
+        private final StringBuilder buffer = new StringBuilder();
+        private boolean scrapeMetaData = false; // true if within metadata tag
+
+        private String docTitle = null; // document level title
+        private String docDescription = null; // document level description
+        private String imgWidth = null; // size as given in the svg tag
+        private String imgHeight = null;
+
+        public svgMetaDataHandler() {
+        }
+
+        @Override
+        public void characters(final char ch[], final int start, final int length) {
+            buffer.append(ch, start, length);
+        }
+
+        @Override
+        public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
+            if (scrapeMetaData) {
+                // not implemented yet TODO: interpret RDF content
+                // may contain RDF + DC, DC, CC ...
+            } else if (null != tag) {
+                switch (tag) {
+                    case "svg":
+                        imgHeight = atts.getValue("height");
+                        imgWidth = atts.getValue("width");
+                        break;
+                    case "metadata":
+                        scrapeMetaData = true;
+                        break;
+                    // some common graphic elements as stop condition (skip reading remainder of input), metadata is expected before graphic content
+                    case "g":
+                    case "line":
+                    case "path":
+                    case "rect":
+                        throw new SAXException("EOF svg Metadata", new EOFException());
+                }
+            }
+            buffer.setLength(0);
+        }
+
+        @Override
+        public void endElement(final String uri, final String name, final String tag) throws SAXException {
+            if (scrapeMetaData) {
+                // stop condition, scrape only first metadata element
+                if ("metadata".equals(tag)) {
+                    scrapeMetaData = false;
+                    buffer.setLength(0);
+                    // we have read metadata, other data are not of interest here, end parsing
+                    throw new SAXException("EOF svg Metadata", new EOFException());
+                }
+            } else if ("title".equals(tag)) {
+                // keep the first title: a document level title is expected before titles of nested elements
+                if (this.docTitle == null) {
+                    this.docTitle = bufferedText();
+                }
+            } else if ("desc".equals(tag)) {
+                if (this.docDescription == null) {
+                    this.docDescription = bufferedText();
+                }
+            }
+            buffer.setLength(0);
+        }
+
+        /**
+         * @return trimmed text collected since the last tag, or null if it is blank
+         */
+        private String bufferedText() {
+            final String text = buffer.toString().trim();
+            return text.isEmpty() ? null : text;
+        }
+
+        /**
+         * @return document level title or null
+         */
+        public String getTitle() {
+            return docTitle;
+        }
+
+        /**
+         * @return document level description or null
+         */
+        public String getDescription() {
+            return docDescription;
+        }
+
+        /**
+         * @return image width in pixel or null
+         */
+        public Integer getWidth() {
+            return pixelSize(imgWidth);
+        }
+
+        /**
+         * @return image height in pixel or null
+         */
+        public Integer getHeight() {
+            return pixelSize(imgHeight);
+        }
+    }
+}