/**
 * A {@link LinkedHashMap} that holds at most {@code sizeLimit} entries.
 *
 * <p>The map is created with the default {@code LinkedHashMap} constructor, so
 * entries are kept in insertion order. Whenever an insertion makes the map
 * larger than the limit, the eldest (first inserted) entry is dropped.
 * Overwriting the value of an existing key does not change its position.
 *
 * @param <K> type of the keys
 * @param <V> type of the mapped values
 */
public class SizeLimitedMap<K, V> extends LinkedHashMap<K, V> implements Map<K, V>, Cloneable, Serializable {

    private static final long serialVersionUID = 6088727126150060068L;

    /** Maximum number of entries this map retains. */
    final int sizeLimit;

    /**
     * Creates an empty map that retains at most {@code sizeLimit} entries.
     *
     * @param sizeLimit maximum number of entries; with a limit of zero (or
     *        less) every inserted entry is evicted again immediately
     */
    public SizeLimitedMap(int sizeLimit) {
        this.sizeLimit = sizeLimit;
    }

    /**
     * Eviction hook invoked by {@code LinkedHashMap} after a new entry was inserted.
     *
     * @param eldest the entry that would be removed; not inspected here
     * @return {@code true} if the map has grown beyond {@code sizeLimit}
     */
    @Override
    protected boolean removeEldestEntry(final Map.Entry<K, V> eldest) {
        return size() > this.sizeLimit;
    }
}
+ */ + +package net.yacy.cora.storage; + +import java.io.Serializable; +import java.util.AbstractSet; +import java.util.Iterator; +import java.util.Set; + +public class SizeLimitedSet extends AbstractSet implements Set, Cloneable, Serializable { + + private static final long serialVersionUID = -1674392695322189500L; + + private transient SizeLimitedMap map; + + private static final Object OBJECT = new Object(); + + public SizeLimitedSet(int sizeLimit) { + map = new SizeLimitedMap(sizeLimit); + } + + public Iterator iterator() { + return map.keySet().iterator(); + } + + public int size() { + return map.size(); + } + + public boolean isEmpty() { + return map.isEmpty(); + } + + public boolean contains(Object o) { + return map.containsKey(o); + } + + public boolean add(E e) { + return map.put(e, OBJECT) == null; + } + + public boolean remove(Object o) { + return map.remove(o) == OBJECT; + } + + public void clear() { + map.clear(); + } + + @SuppressWarnings("unchecked") + public Object clone() { + try { + SizeLimitedSet n = (SizeLimitedSet) super.clone(); + n.map = (SizeLimitedMap) map.clone(); + return n; + } catch (CloneNotSupportedException e) { + throw new InternalError(); + } + } + + +} diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index f7e944f6e..9015f00b3 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -46,7 +46,8 @@ import javax.swing.event.EventListenerList; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.sorting.ClusteredScoreMap; -import net.yacy.cora.storage.HashARC; +import net.yacy.cora.storage.SizeLimitedMap; +import net.yacy.cora.storage.SizeLimitedSet; import net.yacy.cora.util.NumberTools; import net.yacy.document.SentenceReader; import net.yacy.document.parser.htmlParser; @@ -158,14 +159,14 @@ public class ContentScraper extends AbstractScraper implements 
Scraper { this.root = root; this.maxLinks = maxLinks; this.evaluationScores = new Evaluation(); - this.rss = new HashARC(maxLinks); - this.css = new HashARC(maxLinks); - this.anchors = new HashARC(maxLinks); - this.images = new HashARC(maxLinks); - this.embeds = new HashARC(maxLinks); - this.frames = new HashSet(); - this.iframes = new HashSet(); - this.metas = new HashARC(maxLinks); + this.rss = new SizeLimitedMap(maxLinks); + this.css = new SizeLimitedMap(maxLinks); + this.anchors = new SizeLimitedMap(maxLinks); + this.images = new SizeLimitedMap(maxLinks); + this.embeds = new SizeLimitedMap(maxLinks); + this.frames = new SizeLimitedSet(maxLinks); + this.iframes = new SizeLimitedSet(maxLinks); + this.metas = new SizeLimitedMap(maxLinks); this.script = new HashSet(); this.title = EMPTY_STRING; this.headlines = new ArrayList[6];