allow multiple parser options instead of printing an error

pull/1/head
Michael Peter Christen 13 years ago
parent c02d742e53
commit 50c576599b

@ -26,10 +26,8 @@ import java.io.File;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
@ -75,8 +73,8 @@ public final class TextParser {
private static final Object v = new Object(); private static final Object v = new Object();
private static final Parser genericIdiom = new genericParser(); private static final Parser genericIdiom = new genericParser();
private static final Map<String, Parser> mime2parser = new ConcurrentHashMap<String, Parser>(); private static final Map<String, Set<Parser>> mime2parser = new ConcurrentHashMap<String, Set<Parser>>();
private static final Map<String, Parser> ext2parser = new ConcurrentHashMap<String, Parser>(); private static final Map<String, Set<Parser>> ext2parser = new ConcurrentHashMap<String, Set<Parser>>();
private static final Map<String, String> ext2mime = new ConcurrentHashMap<String, String>(); private static final Map<String, String> ext2mime = new ConcurrentHashMap<String, String>();
private static final Map<String, Object> denyMime = new ConcurrentHashMap<String, Object>(); private static final Map<String, Object> denyMime = new ConcurrentHashMap<String, Object>();
private static final Map<String, Object> denyExtensionx = new ConcurrentHashMap<String, Object>(); private static final Map<String, Object> denyExtensionx = new ConcurrentHashMap<String, Object>();
@ -86,7 +84,7 @@ public final class TextParser {
initParser(new csvParser()); initParser(new csvParser());
initParser(new docParser()); initParser(new docParser());
initParser(new gzipParser()); initParser(new gzipParser());
initParser(new htmlParser("HTML Parser")); initParser(new htmlParser());
initParser(new genericImageParser()); initParser(new genericImageParser());
initParser(new mmParser()); initParser(new mmParser());
initParser(new odtParser()); initParser(new odtParser());
@ -105,17 +103,17 @@ public final class TextParser {
initParser(new vsdParser()); initParser(new vsdParser());
initParser(new xlsParser()); initParser(new xlsParser());
initParser(new zipParser()); initParser(new zipParser());
initParser(new RDFaParser("RDFa Parser")); initParser(new RDFaParser());
initParser(new rdfParser()); initParser(new rdfParser());
if (Switchboard.getSwitchboard().getConfigBool("parserAugmentation.RDFa", true)) initParser(new RDFaParser("RDFa Parser")); if (Switchboard.getSwitchboard().getConfigBool("parserAugmentation.RDFa", true)) initParser(new RDFaParser());
if (Switchboard.getSwitchboard().getConfigBool("parserAugmentation", true)) initParser(new AugmentParser("Augment Parser")); if (Switchboard.getSwitchboard().getConfigBool("parserAugmentation", true)) initParser(new AugmentParser());
} }
public static Set<Parser> parsers() { public static Set<Parser> parsers() {
final Set<Parser> c = new HashSet<Parser>(); final Set<Parser> c = new HashSet<Parser>();
c.addAll(ext2parser.values()); for (Set<Parser> pl: ext2parser.values()) c.addAll(pl);
c.addAll(mime2parser.values()); for (Set<Parser> pl: mime2parser.values()) c.addAll(pl);
return c; return c;
} }
@ -125,25 +123,31 @@ public final class TextParser {
// process the mime types // process the mime types
final String mimeType = normalizeMimeType(mime); final String mimeType = normalizeMimeType(mime);
if (prototypeMime == null) prototypeMime = mimeType; if (prototypeMime == null) prototypeMime = mimeType;
final Parser p0 = mime2parser.get(mimeType); Set<Parser> p0 = mime2parser.get(mimeType);
if (p0 != null) log.logSevere("parser for mime '" + mimeType + "' was set to '" + p0.getName() + "', overwriting with new parser '" + parser.getName() + "'."); if (p0 == null) {
mime2parser.put(mimeType, parser); p0 = new HashSet<Parser>();
mime2parser.put(mimeType, p0);
}
p0.add(parser);
Log.logInfo("PARSER", "Parser for mime type '" + mimeType + "': " + parser.getName()); Log.logInfo("PARSER", "Parser for mime type '" + mimeType + "': " + parser.getName());
} }
if (prototypeMime != null) for (String ext: parser.supportedExtensions()) { if (prototypeMime != null) for (String ext: parser.supportedExtensions()) {
ext = ext.toLowerCase(); ext = ext.toLowerCase();
final String s = ext2mime.get(ext); final String s = ext2mime.get(ext);
if (s != null) log.logSevere("parser for extension '" + ext + "' was set to mime '" + s + "', overwriting with new mime '" + prototypeMime + "'."); if (s != null && !s.equals(prototypeMime)) log.logWarning("parser for extension '" + ext + "' was set to mime '" + s + "', overwriting with new mime '" + prototypeMime + "'.");
ext2mime.put(ext, prototypeMime); ext2mime.put(ext, prototypeMime);
} }
for (String ext: parser.supportedExtensions()) { for (String ext: parser.supportedExtensions()) {
// process the extensions // process the extensions
ext = ext.toLowerCase(); ext = ext.toLowerCase();
final Parser p0 = ext2parser.get(ext); Set<Parser> p0 = ext2parser.get(ext);
if (p0 != null) log.logSevere("parser for extension '" + ext + "' was set to '" + p0.getName() + "', overwriting with new parser '" + parser.getName() + "'."); if (p0 == null) {
ext2parser.put(ext, parser); p0 = new HashSet<Parser>();
ext2parser.put(ext, p0);
}
p0.add(parser);
Log.logInfo("PARSER", "Parser for extension '" + ext + "': " + parser.getName()); Log.logInfo("PARSER", "Parser for extension '" + ext + "': " + parser.getName());
} }
} }
@ -187,7 +191,7 @@ public final class TextParser {
) throws Parser.Failure { ) throws Parser.Failure {
if (log.isFine()) log.logFine("Parsing '" + location + "' from byte-array"); if (log.isFine()) log.logFine("Parsing '" + location + "' from byte-array");
mimeType = normalizeMimeType(mimeType); mimeType = normalizeMimeType(mimeType);
List<Parser> idioms = null; Set<Parser> idioms = null;
try { try {
idioms = parsers(location, mimeType); idioms = parsers(location, mimeType);
} catch (final Parser.Failure e) { } catch (final Parser.Failure e) {
@ -211,7 +215,7 @@ public final class TextParser {
) throws Parser.Failure { ) throws Parser.Failure {
if (log.isFine()) log.logFine("Parsing '" + location + "' from stream"); if (log.isFine()) log.logFine("Parsing '" + location + "' from stream");
mimeType = normalizeMimeType(mimeType); mimeType = normalizeMimeType(mimeType);
List<Parser> idioms = null; Set<Parser> idioms = null;
try { try {
idioms = parsers(location, mimeType); idioms = parsers(location, mimeType);
} catch (final Parser.Failure e) { } catch (final Parser.Failure e) {
@ -225,7 +229,7 @@ public final class TextParser {
// then we use only one stream-oriented parser. // then we use only one stream-oriented parser.
if (idioms.size() == 1 || contentLength > Integer.MAX_VALUE) { if (idioms.size() == 1 || contentLength > Integer.MAX_VALUE) {
// use a specific stream-oriented parser // use a specific stream-oriented parser
return parseSource(location, mimeType, idioms.get(0), charset, contentLength, sourceStream); return parseSource(location, mimeType, idioms.iterator().next(), charset, contentLength, sourceStream);
} }
// in case that we know more parsers we first transform the content into a byte[] and use that as base // in case that we know more parsers we first transform the content into a byte[] and use that as base
@ -267,7 +271,7 @@ public final class TextParser {
private static Document[] parseSource( private static Document[] parseSource(
final DigestURI location, final DigestURI location,
final String mimeType, final String mimeType,
final List<Parser> parsers, final Set<Parser> parsers,
final String charset, final String charset,
final byte[] sourceArray final byte[] sourceArray
) throws Parser.Failure { ) throws Parser.Failure {
@ -334,8 +338,8 @@ public final class TextParser {
public static String supports(final MultiProtocolURI url, final String mimeType) { public static String supports(final MultiProtocolURI url, final String mimeType) {
try { try {
// try to get a parser. If this works, we don't need the parser itself, we just return null to show that everything is ok. // try to get a parser. If this works, we don't need the parser itself, we just return null to show that everything is ok.
final List<Parser> idioms = parsers(url, mimeType); final Set<Parser> idioms = parsers(url, mimeType);
return (idioms == null || idioms.isEmpty() || (idioms.size() == 1 && idioms.get(0).getName().equals(genericIdiom.getName()))) ? "no parser found" : null; return (idioms == null || idioms.isEmpty() || (idioms.size() == 1 && idioms.iterator().next().getName().equals(genericIdiom.getName()))) ? "no parser found" : null;
} catch (final Parser.Failure e) { } catch (final Parser.Failure e) {
// in case that a parser is not available, return an error string describing the problem. // in case that a parser is not available, return an error string describing the problem.
return e.getMessage(); return e.getMessage();
@ -355,17 +359,17 @@ public final class TextParser {
* @return a list of Idiom parsers that may be appropriate for the given criteria * @return a list of Idiom parsers that may be appropriate for the given criteria
* @throws Parser.Failure * @throws Parser.Failure
*/ */
private static List<Parser> parsers(final MultiProtocolURI url, String mimeType1) throws Parser.Failure { private static Set<Parser> parsers(final MultiProtocolURI url, String mimeType1) throws Parser.Failure {
final List<Parser> idioms = new ArrayList<Parser>(2); final Set<Parser> idioms = new HashSet<Parser>(2);
// check extension // check extension
String ext = url.getFileExtension(); String ext = url.getFileExtension();
Parser idiom; Set<Parser> idiom;
if (ext != null && ext.length() > 0) { if (ext != null && ext.length() > 0) {
ext = ext.toLowerCase(); ext = ext.toLowerCase();
if (denyExtensionx.containsKey(ext)) throw new Parser.Failure("file extension '" + ext + "' is denied (1)", url); if (denyExtensionx.containsKey(ext)) throw new Parser.Failure("file extension '" + ext + "' is denied (1)", url);
idiom = ext2parser.get(ext); idiom = ext2parser.get(ext);
if (idiom != null) idioms.add(idiom); if (idiom != null) idioms.addAll(idiom);
} }
// check given mime type // check given mime type
@ -373,12 +377,12 @@ public final class TextParser {
mimeType1 = normalizeMimeType(mimeType1); mimeType1 = normalizeMimeType(mimeType1);
if (denyMime.containsKey(mimeType1)) throw new Parser.Failure("mime type '" + mimeType1 + "' is denied (1)", url); if (denyMime.containsKey(mimeType1)) throw new Parser.Failure("mime type '" + mimeType1 + "' is denied (1)", url);
idiom = mime2parser.get(mimeType1); idiom = mime2parser.get(mimeType1);
if (idiom != null && !idioms.contains(idiom)) idioms.add(idiom); if (idiom != null && !idioms.contains(idiom)) idioms.addAll(idiom);
} }
// check mime type computed from extension // check mime type computed from extension
final String mimeType2 = ext2mime.get(ext); final String mimeType2 = ext2mime.get(ext);
if (mimeType2 != null && (idiom = mime2parser.get(mimeType2)) != null && !idioms.contains(idiom)) idioms.add(idiom); if (mimeType2 != null && (idiom = mime2parser.get(mimeType2)) != null && !idioms.contains(idiom)) idioms.addAll(idiom);
// always add the generic parser // always add the generic parser
idioms.add(genericIdiom); idioms.add(genericIdiom);
@ -412,9 +416,9 @@ public final class TextParser {
if (denyExtensionx.containsKey(ext)) return "file extension '" + ext + "' is denied (2)"; if (denyExtensionx.containsKey(ext)) return "file extension '" + ext + "' is denied (2)";
final String mimeType = ext2mime.get(ext); final String mimeType = ext2mime.get(ext);
if (mimeType == null) return "no parser available"; if (mimeType == null) return "no parser available";
final Parser idiom = mime2parser.get(mimeType); final Set<Parser> idiom = mime2parser.get(mimeType);
assert idiom != null; assert idiom != null;
if (idiom == null) return "no parser available (internal error!)"; if (idiom == null || idiom.size() == 0) return "no parser available (internal error!)";
return null; return null;
} }

@ -6,45 +6,24 @@ import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.Set; import java.util.Set;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.parser.rdfa.impl.RDFaParser; import net.yacy.document.parser.rdfa.impl.RDFaParser;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
import de.anomic.data.ymark.YMarkUtil; import de.anomic.data.ymark.YMarkUtil;
public class AugmentParser extends RDFaParser { public class AugmentParser extends AbstractParser implements Parser {
public AugmentParser(String name) { RDFaParser rdfaParser;
super(name);
System.out.println("augmented parser was initialized"); public AugmentParser() {
super("AugmentParser");
this.rdfaParser = new RDFaParser();
this.SUPPORTED_EXTENSIONS.remove("htm"); System.out.println("augmented parser was initialized");
this.SUPPORTED_EXTENSIONS.remove("html");
this.SUPPORTED_EXTENSIONS.remove("shtml");
this.SUPPORTED_EXTENSIONS.remove("xhtml");
this.SUPPORTED_EXTENSIONS.remove("php");
this.SUPPORTED_EXTENSIONS.remove("php3");
this.SUPPORTED_EXTENSIONS.remove("php4");
this.SUPPORTED_EXTENSIONS.remove("php5");
this.SUPPORTED_EXTENSIONS.remove("cfm");
this.SUPPORTED_EXTENSIONS.remove("asp");
this.SUPPORTED_EXTENSIONS.remove("aspx");
this.SUPPORTED_EXTENSIONS.remove("tex");
this.SUPPORTED_EXTENSIONS.remove("txt");
this.SUPPORTED_EXTENSIONS.remove("jsp");
this.SUPPORTED_EXTENSIONS.remove("mf");
this.SUPPORTED_EXTENSIONS.remove("pl");
this.SUPPORTED_EXTENSIONS.remove("py");
this.SUPPORTED_MIME_TYPES.remove("text/html");
this.SUPPORTED_MIME_TYPES.remove("text/xhtml+xml");
this.SUPPORTED_MIME_TYPES.remove("application/xhtml+xml");
this.SUPPORTED_MIME_TYPES.remove("application/x-httpd-php");
this.SUPPORTED_MIME_TYPES.remove("application/x-tex");
this.SUPPORTED_MIME_TYPES.remove("text/plain");
this.SUPPORTED_MIME_TYPES.remove("text/sgml");
this.SUPPORTED_MIME_TYPES.remove("text/csv");
this.SUPPORTED_EXTENSIONS.add("html"); this.SUPPORTED_EXTENSIONS.add("html");
this.SUPPORTED_EXTENSIONS.add("php"); this.SUPPORTED_EXTENSIONS.add("php");
@ -59,7 +38,7 @@ public class AugmentParser extends RDFaParser {
String charset, InputStream source) throws Failure, String charset, InputStream source) throws Failure,
InterruptedException { InterruptedException {
Document[] htmlDocs = super.parse(url, mimeType, charset, source); Document[] htmlDocs = this.rdfaParser.parse(url, mimeType, charset, source);
try { try {
source.reset(); source.reset();
} catch (IOException e) { } catch (IOException e) {

@ -53,8 +53,8 @@ public class htmlParser extends AbstractParser implements Parser {
private static final Pattern patternUnderline = Pattern.compile("_"); private static final Pattern patternUnderline = Pattern.compile("_");
public htmlParser(String name) { public htmlParser() {
super(name); super("Streaming HTML Parser");
this.SUPPORTED_EXTENSIONS.add("htm"); this.SUPPORTED_EXTENSIONS.add("htm");
this.SUPPORTED_EXTENSIONS.add("html"); this.SUPPORTED_EXTENSIONS.add("html");
this.SUPPORTED_EXTENSIONS.add("phtml"); this.SUPPORTED_EXTENSIONS.add("phtml");
@ -299,7 +299,7 @@ public class htmlParser extends AbstractParser implements Parser {
try { try {
url = new DigestURI(args[0]); url = new DigestURI(args[0]);
final byte[] content = url.get(ClientIdentification.getUserAgent(), 3000); final byte[] content = url.get(ClientIdentification.getUserAgent(), 3000);
final Document[] document = new htmlParser("HTML Parser").parse(url, "text/html", null, new ByteArrayInputStream(content)); final Document[] document = new htmlParser().parse(url, "text/html", null, new ByteArrayInputStream(content));
final String title = document[0].dc_title(); final String title = document[0].dc_title();
System.out.println(title); System.out.println(title);
System.out.println(CharacterCoding.unicode2html(title, false)); System.out.println(CharacterCoding.unicode2html(title, false));

@ -3,13 +3,21 @@
*/ */
package net.yacy.document.parser.rdfa.impl; package net.yacy.document.parser.rdfa.impl;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet; import java.util.HashSet;
import java.util.Set; import java.util.Set;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.parser.htmlParser; import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.rdfa.IRDFaTriple; import net.yacy.document.parser.rdfa.IRDFaTriple;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
@ -19,35 +27,13 @@ import net.yacy.kelondro.logging.Log;
* @author fgandon * @author fgandon
* *
*/ */
public class RDFaParser extends htmlParser { public class RDFaParser extends AbstractParser implements Parser {
public RDFaParser(String name) { private final htmlParser hp;
super(name);
this.SUPPORTED_EXTENSIONS.remove("htm"); public RDFaParser() {
this.SUPPORTED_EXTENSIONS.remove("html"); super("RDFa Parser");
this.SUPPORTED_EXTENSIONS.remove("shtml"); this.hp = new htmlParser();
this.SUPPORTED_EXTENSIONS.remove("xhtml");
this.SUPPORTED_EXTENSIONS.remove("php");
this.SUPPORTED_EXTENSIONS.remove("php3");
this.SUPPORTED_EXTENSIONS.remove("php4");
this.SUPPORTED_EXTENSIONS.remove("php5");
this.SUPPORTED_EXTENSIONS.remove("cfm");
this.SUPPORTED_EXTENSIONS.remove("asp");
this.SUPPORTED_EXTENSIONS.remove("aspx");
this.SUPPORTED_EXTENSIONS.remove("tex");
this.SUPPORTED_EXTENSIONS.remove("txt");
this.SUPPORTED_EXTENSIONS.remove("jsp");
this.SUPPORTED_EXTENSIONS.remove("mf");
this.SUPPORTED_EXTENSIONS.remove("pl");
this.SUPPORTED_EXTENSIONS.remove("py");
this.SUPPORTED_MIME_TYPES.remove("text/html");
this.SUPPORTED_MIME_TYPES.remove("text/xhtml+xml");
this.SUPPORTED_MIME_TYPES.remove("application/xhtml+xml");
this.SUPPORTED_MIME_TYPES.remove("application/x-httpd-php");
this.SUPPORTED_MIME_TYPES.remove("application/x-tex");
this.SUPPORTED_MIME_TYPES.remove("text/plain");
this.SUPPORTED_MIME_TYPES.remove("text/sgml");
this.SUPPORTED_MIME_TYPES.remove("text/csv");
this.SUPPORTED_EXTENSIONS.add("html"); this.SUPPORTED_EXTENSIONS.add("html");
this.SUPPORTED_EXTENSIONS.add("php"); this.SUPPORTED_EXTENSIONS.add("php");
@ -116,7 +102,7 @@ public class RDFaParser extends htmlParser {
Document[] htmlDocs = null; Document[] htmlDocs = null;
try { try {
htmlDocs = super.parse(url, mimeType, charset, source); htmlDocs = this.hp.parse(url, mimeType, charset, source);
source.reset(); source.reset();
} catch (IOException e1) { } catch (IOException e1) {
@ -129,9 +115,9 @@ public class RDFaParser extends htmlParser {
private Document convertAllTriplesToDocument(DigestURI url, private Document convertAllTriplesToDocument(DigestURI url,
String mimeType, String charset, IRDFaTriple[] allTriples) { String mimeType, String charset, IRDFaTriple[] allTriples) {
Set<String> languages = new HashSet<String>(2); //Set<String> languages = new HashSet<String>(2);
Set<String> keywords = new HashSet<String>(allTriples.length); Set<String> keywords = new HashSet<String>(allTriples.length);
Set<String> sections = new HashSet<String>(5); //Set<String> sections = new HashSet<String>(5);
String all = ""; String all = "";
for (IRDFaTriple irdFaTriple : allTriples) { for (IRDFaTriple irdFaTriple : allTriples) {
@ -166,4 +152,51 @@ public class RDFaParser extends htmlParser {
} }
} }
/**
 * Command-line entry point: runs this parser once over a local file or a URL.
 * The parse result is discarded; only failures are reported on the console.
 *
 * @param args a single argument, either the path of an existing file or a URL
 */
public static void main(String[] args) {
    if (args.length < 1) {
        System.out
                .println("Usage: one and only one argument giving a file path or a URL.");
        return;
    }

    // Resolve the argument to a URL in both cases, so the stream handed to the
    // parser always has a source. Previously the URL stayed null for local files
    // and the later openStream() call failed with a NullPointerException.
    URL aURL = null;
    File aFile = new File(args[0]);
    try {
        aURL = aFile.exists() ? aFile.toURI().toURL() : new URL(args[0]);
    } catch (MalformedURLException e) {
        // neither an existing file nor a well-formed URL; reported just below
    }
    if (aURL == null) {
        System.out.println("File or URL not recognized.");
        return;
    }

    // Open the resource exactly once and make sure it is closed again.
    InputStream source = null;
    try {
        source = aURL.openStream();
        RDFaParser aParser = new RDFaParser();
        aParser.parse(new DigestURI(args[0]), "", "", source);
    } catch (IOException e) {
        // also covers FileNotFoundException and MalformedURLException
        e.printStackTrace();
    } catch (Failure e) {
        e.printStackTrace();
    } catch (InterruptedException e) {
        // keep the interrupt visible to whoever runs this thread
        Thread.currentThread().interrupt();
        e.printStackTrace();
    } finally {
        if (source != null) {
            try {
                source.close();
            } catch (IOException ignored) {
                // nothing sensible left to do if closing fails
            }
        }
    }
}
} }

@ -1,67 +0,0 @@
package net.yacy.document.parser.rdfa;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import net.yacy.document.Parser.Failure;
import net.yacy.document.parser.rdfa.impl.RDFaParser;
import net.yacy.kelondro.data.meta.DigestURI;
/**
 * Small command-line driver that runs the {@link RDFaParser} once over a
 * local file or a URL. The parse result is discarded; only failures are
 * reported on the console.
 */
public class main {

    /**
     * @param args a single argument, either the path of an existing file or a URL
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            System.out
                    .println("Usage: one and only one argument giving a file path or a URL.");
            return;
        }

        // Resolve the argument to a URL in both cases, so the stream handed to the
        // parser always has a source. Previously the URL stayed null for local files
        // and the later openStream() call failed with a NullPointerException.
        URL aURL = null;
        File aFile = new File(args[0]);
        try {
            aURL = aFile.exists() ? aFile.toURI().toURL() : new URL(args[0]);
        } catch (MalformedURLException e) {
            // neither an existing file nor a well-formed URL; reported just below
        }
        if (aURL == null) {
            System.out.println("File or URL not recognized.");
            return;
        }

        // Open the resource exactly once and make sure it is closed again.
        // InputStream is spelled out in full because this file does not import it.
        java.io.InputStream source = null;
        try {
            source = aURL.openStream();
            RDFaParser aParser = new RDFaParser("html");
            aParser.parse(new DigestURI(args[0]), "", "", source);
        } catch (IOException e) {
            // also covers FileNotFoundException and MalformedURLException
            e.printStackTrace();
        } catch (Failure e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // keep the interrupt visible to whoever runs this thread
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } finally {
            if (source != null) {
                try {
                    source.close();
                } catch (IOException ignored) {
                    // nothing sensible left to do if closing fails
                }
            }
        }
    }
}
Loading…
Cancel
Save