Fix for pattern matcher in HTML parser: replace line breaks with spaces in script code and comments before matching

pull/1/head
Michael Peter Christen 13 years ago
parent 8a6edc0031
commit b1e7c11fba

@@ -68,6 +68,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private static final Set<String> linkTags0 = new HashSet<String>(12,0.99f); private static final Set<String> linkTags0 = new HashSet<String>(12,0.99f);
private static final Set<String> linkTags1 = new HashSet<String>(15,0.99f); private static final Set<String> linkTags1 = new HashSet<String>(15,0.99f);
private static final Pattern LB = Pattern.compile("\n");
public enum TagType { public enum TagType {
singleton, pair; singleton, pair;
} }
@@ -167,7 +169,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.frames = new SizeLimitedSet<MultiProtocolURI>(maxLinks); this.frames = new SizeLimitedSet<MultiProtocolURI>(maxLinks);
this.iframes = new SizeLimitedSet<MultiProtocolURI>(maxLinks); this.iframes = new SizeLimitedSet<MultiProtocolURI>(maxLinks);
this.metas = new SizeLimitedMap<String, String>(maxLinks); this.metas = new SizeLimitedMap<String, String>(maxLinks);
this.script = new HashSet<MultiProtocolURI>(); this.script = new SizeLimitedSet<MultiProtocolURI>(maxLinks);
this.title = EMPTY_STRING; this.title = EMPTY_STRING;
this.headlines = new ArrayList[6]; this.headlines = new ArrayList[6];
for (int i = 0; i < this.headlines.length; i++) this.headlines[i] = new ArrayList<String>(); for (int i = 0; i < this.headlines.length; i++) this.headlines[i] = new ArrayList<String>();
@@ -498,7 +500,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.script.add(absolutePath(src)); this.script.add(absolutePath(src));
this.evaluationScores.match(Element.scriptpath, src); this.evaluationScores.match(Element.scriptpath, src);
} else { } else {
this.evaluationScores.match(Element.scriptcode, text); this.evaluationScores.match(Element.scriptcode, LB.matcher(new String(text)).replaceAll(" "));
} }
} }
@@ -509,7 +511,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
@Override @Override
public void scrapeComment(final char[] comment) { public void scrapeComment(final char[] comment) {
this.evaluationScores.match(Element.comment, comment); this.evaluationScores.match(Element.comment, LB.matcher(new String(comment)).replaceAll(" "));
} }
private String recursiveParse(final char[] inlineHtml) { private String recursiveParse(final char[] inlineHtml) {

Loading…
Cancel
Save