From 86adfef30f6863b484e21f9a24ba72b279c81b37 Mon Sep 17 00:00:00 2001 From: luccioman Date: Fri, 13 Jan 2017 16:10:59 +0100 Subject: [PATCH] Added automated unit tests and perfs test for WebStructureGraph class. Fixed references count when multiple links target the same domain name in one document. --- .../peers/graphics/WebStructureGraph.java | 138 ++++----- .../peers/graphics/WebStructureGraphTest.java | 263 ++++++++++++++++++ 2 files changed, 337 insertions(+), 64 deletions(-) create mode 100644 test/java/net/yacy/peers/graphics/WebStructureGraphTest.java diff --git a/source/net/yacy/peers/graphics/WebStructureGraph.java b/source/net/yacy/peers/graphics/WebStructureGraph.java index 769545ab4..28a335842 100644 --- a/source/net/yacy/peers/graphics/WebStructureGraph.java +++ b/source/net/yacy/peers/graphics/WebStructureGraph.java @@ -78,8 +78,9 @@ public class WebStructureGraph { private final static ConcurrentLog log = new ConcurrentLog("WebStructureGraph"); - /** Backup file */ + /** Eventual backup file */ private final File structureFile; + /** Older structure entries (notably loaded from the backup file) */ private final TreeMap structure_old; // ',' to {}* @@ -95,11 +96,17 @@ public class WebStructureGraph { /** Entry used to terminate the worker thread */ private final static LearnObject leanrefObjectPOISON = new LearnObject(null, null); - private static class LearnObject { + /** + * Used to feed a new entry to this web structure + */ + protected static class LearnObject { + /** Source URL */ private final DigestURL url; + + /** Target link URLs */ private final Set globalRefURLs; - private LearnObject(final DigestURL url, final Set globalRefURLs) { + protected LearnObject(final DigestURL url, final Set globalRefURLs) { this.url = url; this.globalRefURLs = globalRefURLs; } @@ -118,20 +125,19 @@ public class WebStructureGraph { this.structureFile = structureFile; this.publicRefDNSResolvingQueue = new LinkedBlockingQueue(); - // load web structure + // load web structure from file if exists Map loadedStructureB; try { - loadedStructureB = - (this.structureFile.exists()) - ? FileUtils.loadMapB(this.structureFile) - : new TreeMap(); + if(this.structureFile != null && this.structureFile.exists()) { + loadedStructureB = FileUtils.loadMapB(this.structureFile); + log.info("loaded dump of " + loadedStructureB.size() + " entries from " + this.structureFile.toString()); + } else { + loadedStructureB = new TreeMap(); + } } catch (final OutOfMemoryError e ) { loadedStructureB = new TreeMap(); } - if ( loadedStructureB != null ) { - this.structure_old.putAll(loadedStructureB); - } - log.info("loaded dump of " + loadedStructureB.size() + " entries from " + this.structureFile.toString()); + this.structure_old.putAll(loadedStructureB); // delete out-dated entries in case the structure is too big if ( this.structure_old.size() > maxhosts ) { @@ -611,33 +617,29 @@ public class WebStructureGraph { } - private void learnrefs(final LearnObject lro) { - final Set refhosts = new HashSet(); - String hosthash; - for ( final DigestURL u : lro.globalRefURLs ) { - if (Switchboard.getSwitchboard().shallTerminate()) break; - hosthash = ASCII.String(u.hash(), 6, 6); - if (!exists(hosthash)) { - // this must be recorded as an host with no references - synchronized ( this.structure_new ) { - this.structure_new.put(hosthash + "," + u.getHost(), UTF8.getBytes(none2refstr())); - } - } - refhosts.add(hosthash); - } + protected void learnrefs(final LearnObject lro) { final DigestURL url = lro.url; - hosthash = ASCII.String(url.hash(), 6, 6); + final String sourceHosthash = ASCII.String(url.hash(), 6, 6); // parse the new reference string and join it with the stored references - final StructureEntry structure = outgoingReferences(hosthash); + final StructureEntry structure = outgoingReferences(sourceHosthash); final Map refs = (structure == null) ? new HashMap() : structure.references; int c; - for (String dom: refhosts) { + for (final DigestURL u : lro.globalRefURLs) { + String domain = ASCII.String(u.hash(), 6, 6); + if (Switchboard.getSwitchboard() != null && Switchboard.getSwitchboard().shallTerminate()) break; + if (!exists(domain)) { + // this must be recorded as an host with no references + synchronized ( this.structure_new ) { + this.structure_new.put(domain + "," + u.getHost(), UTF8.getBytes(none2refstr())); + } + } c = 0; - if ( refs.containsKey(dom) ) { - c = (refs.get(dom)).intValue(); + Integer existingCount = refs.get(domain); + if ( existingCount != null) { + c = existingCount.intValue(); } - refs.put(dom, Integer.valueOf(++c)); + refs.put(domain, Integer.valueOf(++c)); } // check if the maxref is exceeded @@ -667,7 +669,7 @@ public class WebStructureGraph { // store the map back to the structure synchronized ( this.structure_new ) { - this.structure_new.put(hosthash + "," + url.getHost(), UTF8.getBytes(map2refstr(refs))); + this.structure_new.put(sourceHosthash + "," + url.getHost(), UTF8.getBytes(map2refstr(refs))); } } @@ -781,11 +783,17 @@ public class WebStructureGraph { } public static class StructureEntry implements Comparable { + /** the tail of the host hash */ + public String hosthash; - public String hosthash; // the tail of the host hash - public String hostname; // the host name - public String date; // date of latest change - public Map references; // a map from the referenced host hash to the number of referenced to that host + /** the host name */ + public String hostname; + + /** date of latest change */ + public String date; + + /** a map from the referenced host hash to the number of referenced to that host */ + public Map references; private StructureEntry(final String hosthash, final String hostname) { this(hosthash, hostname, GenericFormatter.SHORT_DAY_FORMATTER.format(), new HashMap()); @@ -831,33 +839,35 @@ public class WebStructureGraph { } // save to web structure file - log.info("Saving Web Structure File: new = " - + this.structure_new.size() - + " entries, old = " - + this.structure_old.size() - + " entries"); - final long time = System.currentTimeMillis(); - joinOldNew(); - log.info("dumping " + structure_old.size() + " entries to " + structureFile.toString()); - if ( !this.structure_old.isEmpty() ) { - synchronized ( this.structure_old ) { - if ( !this.structure_old.isEmpty() ) { - FileUtils - .saveMapB( - this.structureFile, - this.structure_old, - "Web Structure Syntax: ',' to {}*"); - final long t = Math.max(1, System.currentTimeMillis() - time); - log.info("Saved Web Structure File: " - + this.structure_old.size() - + " entries in " - + t - + " milliseconds, " - + (this.structure_old.size() * 1000 / t) - + " entries/second"); - } - this.structure_old.clear(); - } + if(this.structureFile != null) { + log.info("Saving Web Structure File: new = " + + this.structure_new.size() + + " entries, old = " + + this.structure_old.size() + + " entries"); + final long time = System.currentTimeMillis(); + joinOldNew(); + log.info("dumping " + structure_old.size() + " entries to " + structureFile.toString()); + if ( !this.structure_old.isEmpty() ) { + synchronized ( this.structure_old ) { + if ( !this.structure_old.isEmpty() ) { + FileUtils + .saveMapB( + this.structureFile, + this.structure_old, + "Web Structure Syntax: ',' to {}*"); + final long t = Math.max(1, System.currentTimeMillis() - time); + log.info("Saved Web Structure File: " + + this.structure_old.size() + + " entries in " + + t + + " milliseconds, " + + (this.structure_old.size() * 1000 / t) + + " entries/second"); + } + this.structure_old.clear(); + } + } } } } diff --git a/test/java/net/yacy/peers/graphics/WebStructureGraphTest.java b/test/java/net/yacy/peers/graphics/WebStructureGraphTest.java new file mode 100644 index 000000000..5d095fcd5 --- /dev/null +++ b/test/java/net/yacy/peers/graphics/WebStructureGraphTest.java @@ -0,0 +1,263 @@ +// WebStructureGraphTest.java +// Copyright 2017 by luccioman; https://github.com/luccioman +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package net.yacy.peers.graphics; + +import java.net.MalformedURLException; +import java.util.HashSet; +import java.util.Set; + +import org.junit.Assert; +import org.junit.Test; + +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.util.ConcurrentLog; +import net.yacy.peers.graphics.WebStructureGraph.LearnObject; +import net.yacy.peers.graphics.WebStructureGraph.StructureEntry; + +/** + * Unit tests for {@link WebStructureGraph} + * + * @author luccioman + * + */ +public class WebStructureGraphTest { + + /** + * Most basic out going references unit test + */ + @Test + public void testOutgoingReferences() throws MalformedURLException { + WebStructureGraph graph = new WebStructureGraph(null); + try { + final DigestURL source = new DigestURL("http://source.net/index.html"); + final String sourceHash = source.hosthash(); + final Set targets = new HashSet<>(); + + final DigestURL target = new DigestURL("http://target.com/index.html"); + final String targetHash = target.hosthash(); + targets.add(target); + + LearnObject lro = new LearnObject(source, targets); + graph.learnrefs(lro); + + /* Check that reference from the exact source URL is retrieved from structure */ + StructureEntry outRefs = graph.outgoingReferences(sourceHash); + + Assert.assertNotNull(outRefs); + Assert.assertEquals("source.net", outRefs.hostname); + Assert.assertNotNull(outRefs.references); + Assert.assertEquals(1, outRefs.references.size()); + Assert.assertEquals(Integer.valueOf(1), outRefs.references.get(targetHash)); + + /* Check that reference from the host name URL is retrieved from structure */ + outRefs = graph.outgoingReferences(new DigestURL("http://source.net").hosthash()); + + Assert.assertNotNull(outRefs); + Assert.assertEquals("source.net", outRefs.hostname); + Assert.assertNotNull(outRefs.references); + Assert.assertEquals(1, outRefs.references.size()); + Assert.assertEquals(Integer.valueOf(1), outRefs.references.get(targetHash)); + + } finally { + graph.close(); + } + } + + /** + * Out going references from one source document to different resources on the same target host + */ + @Test + public void testOutgoingFromOneToMultipleSameTargeHost() throws MalformedURLException { + WebStructureGraph graph = new WebStructureGraph(null); + try { + final DigestURL source = new DigestURL("http://source.net/index.html"); + final String sourceHash = source.hosthash(); + final Set targets = new HashSet<>(); + + final DigestURL indexTarget = new DigestURL("http://target.com/index.html"); + targets.add(indexTarget); + + final DigestURL pathTarget = new DigestURL("http://target.com/path/doc.html"); + targets.add(pathTarget); + + final DigestURL queryTarget = new DigestURL("http://target.com/path/query?param=value"); + targets.add(queryTarget); + + LearnObject lro = new LearnObject(source, targets); + graph.learnrefs(lro); + + /* Check that accumulated references from the host name URL is retrieved from structure */ + StructureEntry outRefs = graph.outgoingReferences(sourceHash); + + Assert.assertNotNull(outRefs); + Assert.assertEquals("source.net", outRefs.hostname); + Assert.assertNotNull(outRefs.references); + /* One accumulated host target reference */ + Assert.assertEquals(1, outRefs.references.size()); + /* 3 accumulated links to that target host */ + Assert.assertEquals(Integer.valueOf(3), outRefs.references.get(indexTarget.hosthash())); + + } finally { + graph.close(); + } + } + + /** + * Most basic incoming references unit test + */ + @Test + public void testIncomingReferences() throws MalformedURLException { + + WebStructureGraph graph = new WebStructureGraph(null); + try { + final DigestURL source = new DigestURL("http://source.net/index.html"); + final String sourceHash = source.hosthash(); + final Set targets = new HashSet<>(); + + final DigestURL target = new DigestURL("http://target.com/index.html"); + final String targetHash = target.hosthash(); + targets.add(target); + + LearnObject lro = new LearnObject(source, targets); + graph.learnrefs(lro); + + /* Check that reference to the exact target URL is retrieved from structure */ + StructureEntry inRefs = graph.incomingReferences(targetHash); + + Assert.assertNotNull(inRefs); + Assert.assertEquals("target.com", inRefs.hostname); + Assert.assertNotNull(inRefs.references); + Assert.assertEquals(1, inRefs.references.size()); + Assert.assertEquals(Integer.valueOf(1), inRefs.references.get(sourceHash)); + + /* Check that reference to the host name target URL is retrieved from structure */ + inRefs = graph.incomingReferences(new DigestURL("http://target.com").hosthash()); + + Assert.assertNotNull(inRefs); + Assert.assertEquals("target.com", inRefs.hostname); + Assert.assertNotNull(inRefs.references); + Assert.assertEquals(1, inRefs.references.size()); + Assert.assertEquals(Integer.valueOf(1), inRefs.references.get(sourceHash)); + + } finally { + graph.close(); + } + } + + /** + * Incoming references from multiple sources on the same host to one target URL + */ + @Test + public void testIncomingReferencesFromMultipleSourcesOnOneHost() throws MalformedURLException { + + WebStructureGraph graph = new WebStructureGraph(null); + try { + final DigestURL indexSource = new DigestURL("http://source.net/index.html"); + final String sourceHash = indexSource.hosthash(); + Set targets = new HashSet<>(); + + final DigestURL target = new DigestURL("http://target.com/index.html"); + final String targetHash = target.hosthash(); + targets.add(target); + + LearnObject lro = new LearnObject(indexSource, targets); + graph.learnrefs(lro); + + final DigestURL pathSource = new DigestURL("http://source.net/path/doc.html"); + targets = new HashSet<>(); + targets.add(target); + + lro = new LearnObject(pathSource, targets); + graph.learnrefs(lro); + + final DigestURL querySource = new DigestURL("http://source.net/query?param=value"); + targets = new HashSet<>(); + targets.add(target); + + lro = new LearnObject(querySource, targets); + graph.learnrefs(lro); + + /* Check that reference to the exact target URL is retrieved from structure */ + StructureEntry inRefs = graph.incomingReferences(targetHash); + + Assert.assertNotNull(inRefs); + Assert.assertEquals("target.com", inRefs.hostname); + Assert.assertNotNull(inRefs.references); + /* One accumulated host source reference */ + Assert.assertEquals(1, inRefs.references.size()); + /* 3 accumulated links from that host */ + Assert.assertEquals(Integer.valueOf(3), inRefs.references.get(sourceHash)); + + } finally { + graph.close(); + } + } + + /** + * Simple performance measurements with a test structure filled to its limits. + */ + public static void main(String args[]) throws MalformedURLException { + WebStructureGraph graph = new WebStructureGraph(null); + try { + long beginTime = System.nanoTime(); + /* Generate maxhosts structure entries */ + for(int i = 0; i < WebStructureGraph.maxhosts; i++) { + final DigestURL source = new DigestURL("http://source" + i + ".net/index.html"); + final Set targets = new HashSet<>(); + + /* Generate maxref targets */ + for(int j = 0; j < WebStructureGraph.maxref; j++) { + final DigestURL target = new DigestURL("http://target" + String.valueOf(j) + ".com/index.html"); + targets.add(target); + } + + LearnObject lro = new LearnObject(source, targets); + graph.learnrefs(lro); + } + long endTime = System.nanoTime(); + System.out.println("testPerfs test structure initialisation time : " + ((endTime - beginTime) / 1000000000) + " seconds"); + + beginTime = System.nanoTime(); + /* Loop and look for incoming references on each sample generated target */ + for(int j = 0; j < WebStructureGraph.maxref; j++) { + String targetHash = new DigestURL("http://target" + j + ".com/index.html").hosthash(); + graph.incomingReferences(targetHash); + } + endTime = System.nanoTime(); + System.out.println("testPerfs incomingReferences running time : " + ((endTime - beginTime) / 1000000000) + " seconds"); + + beginTime = System.nanoTime(); + /* Loop and look for outgoing references on each sample generated source */ + for(int i = 0; i < WebStructureGraph.maxhosts; i++) { + String sourceHash = new DigestURL("http://source" + i + ".net/index.html").hosthash(); + graph.outgoingReferences(sourceHash); + } + endTime = System.nanoTime(); + System.out.println("testPerfs outgoingReferences running time : " + ((endTime - beginTime) / 1000000000) + " seconds"); + + } finally { + graph.close(); + ConcurrentLog.shutdown(); + } + } + +}