write info about robots.txt evaluation into getpageinfo_p.xml

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8038 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent f8b8c82421
commit eb1c7c041d

@ -25,6 +25,7 @@ public class getpageinfo_p {
prop.put("desc", ""); prop.put("desc", "");
prop.put("lang", ""); prop.put("lang", "");
prop.put("robots-allowed", "3"); //unknown prop.put("robots-allowed", "3"); //unknown
prop.put("robotsInfo", ""); //unknown
prop.put("sitemap", ""); prop.put("sitemap", "");
prop.put("favicon",""); prop.put("favicon","");
prop.put("sitelist", ""); prop.put("sitelist", "");
@ -39,6 +40,7 @@ public class getpageinfo_p {
String url=post.get("url"); String url=post.get("url");
if (url.toLowerCase().startsWith("ftp://")) { if (url.toLowerCase().startsWith("ftp://")) {
prop.put("robots-allowed", "1"); prop.put("robots-allowed", "1");
prop.put("robotsInfo", "ftp does not follow robots.txt");
prop.putXML("title", "FTP: " + url); prop.putXML("title", "FTP: " + url);
return prop; return prop;
} else if (!url.startsWith("http://") && } else if (!url.startsWith("http://") &&
@ -114,6 +116,7 @@ public class getpageinfo_p {
Log.logException(e); Log.logException(e);
} }
// robotsEntry may be null at this point (the neighbouring statements already guard for it),
// so it must never be dereferenced without a null check.
// isDisallowed() has to be evaluated before getInfo(): it stores the reason for its
// verdict inside the entry, and getInfo() only reads that stored reason back.
prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo());
// get the sitemap URL of the domain // get the sitemap URL of the domain
final MultiProtocolURI sitemapURL = robotsEntry == null ? null : robotsEntry.getSitemap(); final MultiProtocolURI sitemapURL = robotsEntry == null ? null : robotsEntry.getSitemap();

@ -4,6 +4,7 @@
<desc>#[desc]#</desc> <desc>#[desc]#</desc>
<lang>#[lang]#</lang> <lang>#[lang]#</lang>
<robots>#(robots-allowed)#0::1::#(/robots-allowed)#</robots> <robots>#(robots-allowed)#0::1::#(/robots-allowed)#</robots>
<robotsInfo>#[robotsInfo]#</robotsInfo>
<sitemap>#[sitemap]#</sitemap> <sitemap>#[sitemap]#</sitemap>
<favicon>#[favicon]#</favicon> <favicon>#[favicon]#</favicon>
<sitelist>#[sitelist]#</sitelist> <sitelist>#[sitelist]#</sitelist>

@ -59,10 +59,12 @@ public class RobotsTxtEntry {
private final Map<String, byte[]> mem; private final Map<String, byte[]> mem;
private final List<String> allowPathList, denyPathList; private final List<String> allowPathList, denyPathList;
private final String hostName, agentName; private final String hostName, agentName;
private String info; // this is filled if robots disallowed access; then the reason is noted there;
protected RobotsTxtEntry(final String hostName, final Map<String, byte[]> mem) { protected RobotsTxtEntry(final String hostName, final Map<String, byte[]> mem) {
this.hostName = hostName.toLowerCase(); this.hostName = hostName.toLowerCase();
this.mem = mem; this.mem = mem;
this.info = "";
if (this.mem.containsKey(DISALLOW_PATH_LIST)) { if (this.mem.containsKey(DISALLOW_PATH_LIST)) {
this.denyPathList = new LinkedList<String>(); this.denyPathList = new LinkedList<String>();
@ -122,7 +124,7 @@ public class RobotsTxtEntry {
this.allowPathList.addAll(allowPathList); this.allowPathList.addAll(allowPathList);
final StringBuilder pathListStr = new StringBuilder(allowPathList.size() * 30); final StringBuilder pathListStr = new StringBuilder(allowPathList.size() * 30);
for (String element : allowPathList) { for (final String element : allowPathList) {
pathListStr.append(element) pathListStr.append(element)
.append(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR); .append(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR);
} }
@ -133,7 +135,7 @@ public class RobotsTxtEntry {
this.denyPathList.addAll(disallowPathList); this.denyPathList.addAll(disallowPathList);
final StringBuilder pathListStr = new StringBuilder(disallowPathList.size() * 30); final StringBuilder pathListStr = new StringBuilder(disallowPathList.size() * 30);
for (String element : disallowPathList) { for (final String element : disallowPathList) {
pathListStr.append(element) pathListStr.append(element)
.append(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR); .append(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR);
} }
@ -167,11 +169,11 @@ public class RobotsTxtEntry {
* @return the sitemap url or null if no sitemap url is given * @return the sitemap url or null if no sitemap url is given
*/ */
public MultiProtocolURI getSitemap() { public MultiProtocolURI getSitemap() {
String url = this.mem.containsKey(SITEMAP)? UTF8.String(this.mem.get(SITEMAP)): null; final String url = this.mem.containsKey(SITEMAP)? UTF8.String(this.mem.get(SITEMAP)): null;
if (url == null) return null; if (url == null) return null;
try { try {
return new MultiProtocolURI(url); return new MultiProtocolURI(url);
} catch (MalformedURLException e) { } catch (final MalformedURLException e) {
return null; return null;
} }
} }
@ -217,23 +219,35 @@ public class RobotsTxtEntry {
return 0; return 0;
} }
public boolean isDisallowed(MultiProtocolURI subpathURL) { public boolean isDisallowed(final MultiProtocolURI subpathURL) {
String path = subpathURL.getFile(); String path = subpathURL.getFile();
if ((this.mem == null) || (this.denyPathList.isEmpty())) return false; if (this.mem == null) {
this.info = "no robots file available";
return false;
}
if (this.denyPathList.isEmpty()) {
this.info = "no entry in robots.txt";
return false;
}
// if the path is null or empty we set it to / // if the path is null or empty we set it to /
if ((path == null) || (path.length() == 0)) path = "/"; if (path == null || path.length() == 0) path = "/";
// escaping all occurences of ; because this char is used as special char in the Robots DB // escaping all occurences of ; because this char is used as special char in the Robots DB
else path = RobotsTxt.ROBOTS_DB_PATH_SEPARATOR_MATCHER.matcher(path).replaceAll("%3B"); else path = RobotsTxt.ROBOTS_DB_PATH_SEPARATOR_MATCHER.matcher(path).replaceAll("%3B");
for (String element : this.denyPathList) { for (final String element : this.denyPathList) {
// disallow rule // disallow rule
if (path.startsWith(element)) { if (path.startsWith(element)) {
this.info = "path '" + path + "' starts with '" + element + "' from deny path list";
return true; return true;
} }
} }
this.info = "path '" + path + "' does not start with any element from deny path list";
return false; return false;
} }
/**
 * Returns the current content of the {@code info} field.
 * <p>
 * {@code isDisallowed(...)} overwrites that field on every call with a short
 * explanation of its result, so this method reports the reason for the most
 * recent check made on this entry.
 *
 * @return the stored explanation text
 */
public String getInfo() {
return this.info;
}
} }
Loading…
Cancel
Save