From ce237aefad10e631d39d45bff6579772d6c609fb Mon Sep 17 00:00:00 2001 From: karlchenofhell Date: Thu, 19 Oct 2006 15:27:24 +0000 Subject: [PATCH] - assortment-sizes table from PerformanceQueues_p.html is not shown if not used - escape query- and fragment-part of an url as well - new resolveBackpath for urls: http://www.yacy-forum.de/viewtopic.php?t=2679#24867 git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2815 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/net/URL.java | 160 ++++++++++++------- source/de/anomic/plasma/plasmaWordIndex.java | 2 +- 2 files changed, 101 insertions(+), 61 deletions(-) diff --git a/source/de/anomic/net/URL.java b/source/de/anomic/net/URL.java index c6a9f9874..a20bafc48 100644 --- a/source/de/anomic/net/URL.java +++ b/source/de/anomic/net/URL.java @@ -72,11 +72,11 @@ public class URL { path = url.substring(q); } - resolveBackpath(); + path = resolveBackpath(path); identPort(url); identRef(); identQuest(); - if (path.indexOf('%')==-1) escapePath(); + escape(); } else { // this is not a http or ftp url if (protocol.equals("mailto")) { @@ -142,10 +142,10 @@ public class URL { this.quest = baseURL.quest; this.ref = baseURL.ref; - resolveBackpath(); + path = resolveBackpath(path); identRef(); identQuest(); - if (path.indexOf('%')==-1) escapePath(); + escape(); } } @@ -157,27 +157,74 @@ public class URL { this.path = path; identRef(); identQuest(); - if (path.indexOf('%')==-1) escapePath(); + escape(); } - private void resolveBackpath() throws MalformedURLException { - // resolve '..' + // resolve '..' + private String resolveBackpath(String path) /* throws MalformedURLException */ { + /* original version by [MC] int p; while ((p = path.indexOf("/..")) >= 0) { String head = path.substring(0, p); int q = head.lastIndexOf('/'); if (q < 0) throw new MalformedURLException("backpath cannot be resolved in path = " + path); path = head.substring(0, q) + path.substring(p + 3); + }*/ + + /* by [MT] */ + if (path.length() == 0 || path.charAt(0) != '/') { path = "/" + path; } + + Pattern pathPattern = Pattern.compile("(/[^/\\.]+/)[.]{2}(?=/)|/\\.(?=/)|/(?=/)"); + Matcher matcher = pathPattern.matcher(path); + while (matcher.find()) { + path = matcher.replaceAll(""); + matcher.reset(path); } + + /* another version at http://www.yacy-forum.de/viewtopic.php?p=26871#26871 */ + + return path; + } + + /** + * Escapes the following parts of the url, this object already contains: + * + */ + private void escape() { + if (path != null && path.indexOf('%') == -1) escapePath(); + if (quest != null && quest.indexOf('%') == -1) escapeQuest(); + if (ref != null && ref.indexOf('%') == -1) escapeRef(); } private void escapePath() { - String[] pathp = path.split("/",-1); + String[] pathp = path.split("/", -1); String ptmp = ""; - for (int i = 0; i0) ? 1 : 0); + path = ptmp.substring((ptmp.length() > 0) ? 1 : 0); + } + + private void escapeRef() { + ref = escape(ref); + } + + private void escapeQuest() { + String[] questp = quest.split("&", -1); + String qtmp = ""; + for (int i = 0; i < questp.length; i++) { + if (questp[i].indexOf('=') != -1) { + qtmp += "&" + escape(questp[i].substring(0, questp[i].indexOf('='))); + qtmp += "=" + escape(questp[i].substring(questp[i].indexOf('=') + 1)); + } else { + qtmp += "&" + escape(questp[i]); + } + } + quest = qtmp.substring((qtmp.length() > 0) ? 1 : 0); } final static String[] hex = { @@ -220,21 +267,19 @@ public class URL { * with the UTF-8-in-URL proposal. This is what happens: * * * * @param s The string to be encoded @@ -259,7 +304,7 @@ public class URL { || ch == '.' || ch == '!' || ch == '~' || ch == '*' || ch == '\'' || ch == '(' - || ch == ')' || ch == '%') { + || ch == ')') { sbuf.append((char)ch); } else if (ch <= 0x007f) { // other ASCII sbuf.append(hex[ch]); @@ -277,29 +322,25 @@ public class URL { // from: http://www.w3.org/International/unescape.java public static String unescape(String s) { - StringBuffer sbuf = new StringBuffer () ; + StringBuffer sbuf = new StringBuffer(); int l = s.length(); int ch = -1; int b, sumb = 0; - for (int i = 0, more = -1 ; i < l ; i++) { + for (int i = 0, more = -1; i < l; i++) { /* Get next byte b from URL segment s */ switch (ch = s.charAt(i)) { - case '%': - ch = s.charAt (++i) ; - int hb = (Character.isDigit ((char) ch) - ? ch - '0' - : 10+Character.toLowerCase((char) ch) - 'a') & 0xF ; - ch = s.charAt (++i) ; - int lb = (Character.isDigit ((char) ch) - ? ch - '0' - : 10+Character.toLowerCase ((char) ch)-'a') & 0xF ; - b = (hb << 4) | lb ; - break ; - case '+': - b = ' ' ; - break ; - default: - b = ch ; + case '%': + ch = s.charAt(++i) ; + int hb = (Character.isDigit ((char) ch) ? ch - '0' : 10 + Character.toLowerCase((char) ch) - 'a') & 0xF; + ch = s.charAt(++i) ; + int lb = (Character.isDigit ((char) ch) ? ch - '0' : 10 + Character.toLowerCase ((char) ch) - 'a') & 0xF; + b = (hb << 4) | lb; + break; + case '+': + b = ' '; + break; + default: + b = ch; } /* Decode byte b as UTF-8, sumb collects incomplete chars */ if ((b & 0xc0) == 0x80) { // 10xxxxxx (continuation byte) @@ -325,7 +366,7 @@ public class URL { } /* We don't test if the UTF-8 encoding is well-formed */ } - return sbuf.toString() ; + return sbuf.toString(); } private void identPort(String inputURL) throws MalformedURLException { @@ -427,19 +468,16 @@ public class URL { } else if (this.protocol.equals("https")) { if (this.port < 0 || this.port == 443) { defaultPort = true; } } - String path = this.getFile(includeReference); - - if (path.length() == 0 || path.charAt(0) != '/') { path = "/" + path; } - - Pattern pathPattern = Pattern.compile("(/[^/\\.]+/)[.]{2}(?=/)|/\\.(?=/)|/(?=/)"); - Matcher matcher = pathPattern.matcher(path); - while (matcher.find()) { - path = matcher.replaceAll(""); - matcher.reset(path); - } + String path = resolveBackpath(this.getFile(includeReference)); - if (defaultPort) { return this.protocol + "://" + (this.userInfo!=null?this.userInfo+"@":"") + this.getHost().toLowerCase() + path; } - return this.protocol + "://" + (this.userInfo!=null?this.userInfo+"@":"")+ this.getHost().toLowerCase() + ((defaultPort) ? "" : (":" + this.port)) + path; + if (defaultPort) { + return this.protocol + "://" + + ((this.userInfo != null) ? (this.userInfo + "@") : ("")) + + this.getHost().toLowerCase() + path; + } + return this.protocol + "://" + + ((this.userInfo != null) ? (this.userInfo + "@") : ("")) + + this.getHost().toLowerCase() + ((defaultPort) ? ("") : (":" + this.port)) + path; } public boolean equals(URL other) { @@ -486,7 +524,9 @@ public class URL { new String[]{null,"http://www.bla.org/bli bla blo"}, new String[]{null,"http://www.blubb.org/bli bla/ blo blubb/bla.html"}, new String[]{null,"http://california-press-release.com/30/Hendrick Chevrolet, the renowned car dealer for Chevrolet in Cary, North Carolina (NC) announces the arrival of 2007 Chevrolet Cobalt SS Coupe For further information, call Hendrick Chevrolet on (800)-857-4909.php"}, - new String[]{"http://california-press-release.com","30/Hendrick Chevrolet, the renowned car dealer for Chevrolet in Cary, North Carolina (NC) announces the arrival of 2007 Chevrolet Cobalt SS Coupe For further information, call Hendrick Chevrolet on (800)-857-4909.php"} + new String[]{"http://california-press-release.com","/30/Hendrick%20Chevrolet%2c%20the%20renowned%20car%20dealer%20for%20Chevrolet%20in%20Cary%2c%20North%20Carolina%20(NC)%20announces%20the%20arrival%20of%202007%20Chevrolet%20Cobalt%20SS%20%20Coupe%20For%20further%20information%2c%20call%20Hendrick%20Chevrolet%20on%20(800)-857-4909.php"}, + new String[]{null, "http://www.anomic.de/home/test?x=1&täst=xyß#höme"}, + new String[]{null, "http://www.anomic.de/home/test?x&test=#"} }; String environment, url; de.anomic.net.URL aURL = null; @@ -500,15 +540,15 @@ public class URL { } else { try {aURL = new de.anomic.net.URL(new de.anomic.net.URL(environment), url);} catch (MalformedURLException e) {aURL = null;} try {jURL = new java.net.URL(new java.net.URL(environment), url);} catch (MalformedURLException e) {jURL = null;} - }/* + } if (((aURL == null) && (jURL != null)) || ((aURL != null) && (jURL == null)) || - ((aURL != null) && (jURL != null) && (!(jURL.toString().equals(aURL.toString()))))) {*/ + ((aURL != null) && (jURL != null) && (!(jURL.toString().equals(aURL.toString()))))) { System.out.println("Difference for environment=" + environment + ", url=" + url + ":"); System.out.println((jURL == null) ? "jURL rejected input" : "jURL=" + jURL.toString()); System.out.println((aURL == null) ? "aURL rejected input" : "aURL=" + aURL.toString()); - System.out.println(); - //} + System.out.println((aURL == null || unescape(aURL.toString()) == null) ? "aURL rejected input" : "back=" + unescape(aURL.toString())); + } } } } diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 63bc44184..f6150c8fc 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -139,7 +139,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { } public int[] assortmentsSizes() { - return (assortmentCluster == null) ? new int[assortmentCount] : assortmentCluster.sizes(); + return (assortmentCluster == null) ? null : assortmentCluster.sizes(); } public int assortmentsCacheChunkSizeAvg() {