diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index a35888020..b39cca1b4 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -204,7 +204,7 @@ public class MultiProtocolURL implements Serializable, Comparable 7 && url.substring(0,7).equalsIgnoreCase("mailto:")) { p = 6; } else { url = "http://" + url; diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index 54dcf21c3..62f518d74 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -491,6 +491,9 @@ dc_rights return this.lat; } + /** + * sorts all links (anchors) into individual collections + */ private void resortLinks() { if (this.resorted) return; synchronized (this) { @@ -513,6 +516,14 @@ dc_rights } for (final AnchorURL url: this.anchors) { if (url == null) continue; + u = url.toNormalform(true); + final String name = url.getNameProperty(); + // check mailto scheme first (not supposed to get into in/outboundlinks or hyperlinks -> crawler can't process) + if (url.getProtocol().equals("mailto")) { + this.emaillinks.put(u.substring(7), name); // TODO: check why key as string instead of Digest/AnchorURL + continue; + } + final boolean noindex = url.getRelProperty().toLowerCase().indexOf("noindex",0) >= 0; final boolean nofollow = url.getRelProperty().toLowerCase().indexOf("nofollow",0) >= 0; if ((thishost == null && url.getHost() == null) || @@ -523,31 +534,24 @@ dc_rights } else { this.outboundlinks.put(url, "anchor" + (noindex ? " noindex" : "") + (nofollow ? 
" nofollow" : "")); } - u = url.toNormalform(true); - final String name = url.getNameProperty(); - if (u.startsWith("mailto:")) { - this.emaillinks.put(u.substring(7), name); - } else { - extpos = u.lastIndexOf('.'); - if (extpos > 0) { - if (((qpos = u.indexOf('?')) >= 0) && (qpos > extpos)) { - ext = u.substring(extpos + 1, qpos).toLowerCase(); - } else { - ext = u.substring(extpos + 1).toLowerCase(); - } - if (Classification.isMediaExtension(ext)) { - // this is not a normal anchor, its a media link - if (Classification.isImageExtension(ext)) { - collectedImages.put(url, new ImageEntry(url, name, -1, -1, -1)); - } - else if (Classification.isAudioExtension(ext)) this.audiolinks.put(url, name); - else if (Classification.isVideoExtension(ext)) this.videolinks.put(url, name); - else if (Classification.isApplicationExtension(ext)) this.applinks.put(url, name); - } + extpos = u.lastIndexOf('.'); + if (extpos > 0) { + if (((qpos = u.indexOf('?')) >= 0) && (qpos > extpos)) { + ext = u.substring(extpos + 1, qpos).toLowerCase(); + } else { + ext = u.substring(extpos + 1).toLowerCase(); + } + if (Classification.isMediaExtension(ext)) { + // this is not a normal anchor, it's a media link + if (Classification.isImageExtension(ext)) { // TODO: guess on a-tag href extension (may not be correct) + collectedImages.put(url, new ImageEntry(url, name, -1, -1, -1)); + } else if (Classification.isAudioExtension(ext)) this.audiolinks.put(url, name); + else if (Classification.isVideoExtension(ext)) this.videolinks.put(url, name); + else if (Classification.isApplicationExtension(ext)) this.applinks.put(url, name); } - // in any case we consider this as a link and let the parser decide if that link can be followed - this.hyperlinks.put(url, name); } + // in any case we consider this as a link and let the parser decide if that link can be followed + this.hyperlinks.put(url, name); } // add image links that we collected from the anchors to the image map diff --git 
a/test/net/yacy/cora/document/id/MultiProtocolURLTest.java b/test/net/yacy/cora/document/id/MultiProtocolURLTest.java
index 94d0d9118..b3c3abf14 100644
--- a/test/net/yacy/cora/document/id/MultiProtocolURLTest.java
+++ b/test/net/yacy/cora/document/id/MultiProtocolURLTest.java
@@ -144,6 +144,28 @@ public class MultiProtocolURLTest {
         }
     }
 
+    /**
+     * Test getProtocol(): expects the protocol in lower case, and "file" for a path given without a scheme.
+     */
+    @Test
+    public void testGetProtocol() throws MalformedURLException {
+        Map<String, String> testurls = new HashMap<String, String>();
+        // key = url string to test, value = expected protocol
+        testurls.put("http://host.com", "http");
+        testurls.put("HTTPS://host.com", "https");
+        testurls.put("Ftp://host.com", "ftp");
+        testurls.put("SMB://host.com", "smb");
+        testurls.put("/file.com", "file");
+        testurls.put("file://host.com/file.com", "file");
+        testurls.put("MailTo:Abc@host.com", "mailto");
+
+        for (Map.Entry<String, String> entry : testurls.entrySet()) {
+            MultiProtocolURL url = new MultiProtocolURL(entry.getKey());
+            // JUnit argument order is (message, expected, actual)
+            assertEquals("test " + entry.getKey(), entry.getValue(), url.getProtocol());
+        }
+    }
+
     /**
      * Test of toNormalform method, of class MultiProtocolURL.
      */