Made the "tld:" modifier case-insensitive and IDN-compliant.

This allows internationalized top-level domains containing non-ASCII
characters to be typed as the value of the "tld:" modifier.
pull/149/head
luccioman 7 years ago
parent a4494d6e01
commit f9cba827c0

@ -29,6 +29,7 @@
import java.io.IOException; import java.io.IOException;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.net.IDN;
import java.net.URLEncoder; import java.net.URLEncoder;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.ArrayList; import java.util.ArrayList;
@ -36,6 +37,7 @@ import java.util.Collection;
import java.util.ConcurrentModificationException; import java.util.ConcurrentModificationException;
import java.util.HashMap; import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import java.util.Locale;
import java.util.Map; import java.util.Map;
import java.util.SortedSet; import java.util.SortedSet;
import java.util.TreeSet; import java.util.TreeSet;
@ -497,18 +499,34 @@ public class yacysearch {
modifier.add("/heuristic"); modifier.add("/heuristic");
} }
final int tldp = querystring.indexOf("tld:", 0); final String tldModifierPrefix = "tld:";
final int tldp = querystring.indexOf(tldModifierPrefix, 0);
if (tldp >= 0) { if (tldp >= 0) {
int ftb = querystring.indexOf(' ', tldp); int ftb = querystring.indexOf(' ', tldp);
if (ftb == -1) ftb = querystring.length(); if (ftb == -1) {
tld = querystring.substring(tldp + 4, ftb); ftb = querystring.length();
querystring = querystring.replace("tld:" + tld, ""); }
modifier.add("tld:" + tld); tld = querystring.substring(tldp + tldModifierPrefix.length(), ftb);
querystring = querystring.replace(tldModifierPrefix + tld, "");
modifier.add(tldModifierPrefix + tld);
while ( tld.length() > 0 && tld.charAt(0) == '.' ) { while ( tld.length() > 0 && tld.charAt(0) == '.' ) {
tld = tld.substring(1); tld = tld.substring(1);
} }
if (tld.length() == 0) tld = null; if (tld.length() == 0) {
tld = null;
} else {
try {
/* Convert to the same lower case ASCII Compatible Encoding that is used in normalized URLs */
tld = IDN.toASCII(tld, 0);
} catch(final IllegalArgumentException e){
ConcurrentLog.warn("LOCAL_SEARCH", "Failed to convert tld modifier value " + tld + "to ASCII Compatible Encoding (ACE)", e);
}
/* Domain name in an URL is case insensitive : convert now modifier to lower case for further processing over normalized URLs */
tld = tld.toLowerCase(Locale.ROOT);
}
} }
if (urlmask == null || urlmask.isEmpty()) urlmask = ".*"; //if no urlmask was given if (urlmask == null || urlmask.isEmpty()) urlmask = ".*"; //if no urlmask was given
// read the language from the language-restrict option 'lr' // read the language from the language-restrict option 'lr'

Loading…
Cancel
Save