write info about robots.txt evaluation into getpageinfo_p.xml

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8038 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent f8b8c82421
commit eb1c7c041d

@ -25,6 +25,7 @@ public class getpageinfo_p {
prop.put("desc", ""); prop.put("desc", "");
prop.put("lang", ""); prop.put("lang", "");
prop.put("robots-allowed", "3"); //unknown prop.put("robots-allowed", "3"); //unknown
prop.put("robotsInfo", ""); //unknown
prop.put("sitemap", ""); prop.put("sitemap", "");
prop.put("favicon",""); prop.put("favicon","");
prop.put("sitelist", ""); prop.put("sitelist", "");
@ -39,6 +40,7 @@ public class getpageinfo_p {
String url=post.get("url"); String url=post.get("url");
if (url.toLowerCase().startsWith("ftp://")) { if (url.toLowerCase().startsWith("ftp://")) {
prop.put("robots-allowed", "1"); prop.put("robots-allowed", "1");
prop.put("robotsInfo", "ftp does not follow robots.txt");
prop.putXML("title", "FTP: " + url); prop.putXML("title", "FTP: " + url);
return prop; return prop;
} else if (!url.startsWith("http://") && } else if (!url.startsWith("http://") &&
@ -114,6 +116,7 @@ public class getpageinfo_p {
Log.logException(e); Log.logException(e);
} }
// robotsEntry can be null here (the lookup above may have failed and only logged the exception).
// A missing entry is reported as "allowed" and must not be dereferenced for its info text.
// isDisallowed() has to run before getInfo(): it is the call that records the explanation.
prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
// fall back to the same empty default that is set at the top of the servlet
// NOTE(review): the title above is written with putXML while this uses putHTML - confirm which escaping the XML template expects
prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo());
// get the sitemap URL of the domain // get the sitemap URL of the domain
final MultiProtocolURI sitemapURL = robotsEntry == null ? null : robotsEntry.getSitemap(); final MultiProtocolURI sitemapURL = robotsEntry == null ? null : robotsEntry.getSitemap();

@ -4,6 +4,7 @@
<desc>#[desc]#</desc> <desc>#[desc]#</desc>
<lang>#[lang]#</lang> <lang>#[lang]#</lang>
<robots>#(robots-allowed)#0::1::#(/robots-allowed)#</robots> <robots>#(robots-allowed)#0::1::#(/robots-allowed)#</robots>
<robotsInfo>#[robotsInfo]#</robotsInfo>
<sitemap>#[sitemap]#</sitemap> <sitemap>#[sitemap]#</sitemap>
<favicon>#[favicon]#</favicon> <favicon>#[favicon]#</favicon>
<sitelist>#[sitelist]#</sitelist> <sitelist>#[sitelist]#</sitelist>

@ -1,4 +1,4 @@
//RobotsEntry.java //RobotsEntry.java
//------------------------------------- //-------------------------------------
//part of YACY //part of YACY
//(C) by Michael Peter Christen; mc@yacy.net //(C) by Michael Peter Christen; mc@yacy.net
@ -43,7 +43,7 @@ import net.yacy.kelondro.util.ByteArray;
public class RobotsTxtEntry { public class RobotsTxtEntry {
private static final String HOST_NAME = "hostname"; private static final String HOST_NAME = "hostname";
private static final String ALLOW_PATH_LIST = "allow"; private static final String ALLOW_PATH_LIST = "allow";
private static final String DISALLOW_PATH_LIST = "disallow"; private static final String DISALLOW_PATH_LIST = "disallow";
@ -54,16 +54,18 @@ public class RobotsTxtEntry {
private static final String CRAWL_DELAY = "crawlDelay"; private static final String CRAWL_DELAY = "crawlDelay";
private static final String CRAWL_DELAY_MILLIS = "crawlDelayMillis"; private static final String CRAWL_DELAY_MILLIS = "crawlDelayMillis";
private static final String AGENT_NAME = "agentname"; private static final String AGENT_NAME = "agentname";
// this is a simple record structure that holds all properties of the robots.txt entry of a single host
private final Map<String, byte[]> mem; private final Map<String, byte[]> mem;
private final List<String> allowPathList, denyPathList; private final List<String> allowPathList, denyPathList;
private final String hostName, agentName; private final String hostName, agentName;
private String info; // explanation of the most recent isDisallowed() decision; written for allowed and disallowed outcomes alike
protected RobotsTxtEntry(final String hostName, final Map<String, byte[]> mem) { protected RobotsTxtEntry(final String hostName, final Map<String, byte[]> mem) {
this.hostName = hostName.toLowerCase(); this.hostName = hostName.toLowerCase();
this.mem = mem; this.mem = mem;
this.info = "";
if (this.mem.containsKey(DISALLOW_PATH_LIST)) { if (this.mem.containsKey(DISALLOW_PATH_LIST)) {
this.denyPathList = new LinkedList<String>(); this.denyPathList = new LinkedList<String>();
final String csPl = UTF8.String(this.mem.get(DISALLOW_PATH_LIST)); final String csPl = UTF8.String(this.mem.get(DISALLOW_PATH_LIST));
@ -89,12 +91,12 @@ public class RobotsTxtEntry {
this.allowPathList = new LinkedList<String>(); this.allowPathList = new LinkedList<String>();
} }
this.agentName = this.mem.containsKey(AGENT_NAME) ? UTF8.String(this.mem.get(AGENT_NAME)) : null; this.agentName = this.mem.containsKey(AGENT_NAME) ? UTF8.String(this.mem.get(AGENT_NAME)) : null;
} }
protected RobotsTxtEntry( protected RobotsTxtEntry(
final MultiProtocolURI theURL, final MultiProtocolURI theURL,
final List<String> allowPathList, final List<String> allowPathList,
final List<String> disallowPathList, final List<String> disallowPathList,
final Date loadedDate, final Date loadedDate,
final Date modDate, final Date modDate,
final String eTag, final String eTag,
@ -103,12 +105,12 @@ public class RobotsTxtEntry {
final String agentName final String agentName
) { ) {
if (theURL == null) throw new IllegalArgumentException("The url is missing"); if (theURL == null) throw new IllegalArgumentException("The url is missing");
this.hostName = RobotsTxt.getHostPort(theURL).toLowerCase(); this.hostName = RobotsTxt.getHostPort(theURL).toLowerCase();
this.allowPathList = new LinkedList<String>(); this.allowPathList = new LinkedList<String>();
this.denyPathList = new LinkedList<String>(); this.denyPathList = new LinkedList<String>();
this.agentName = agentName; this.agentName = agentName;
this.mem = new LinkedHashMap<String, byte[]>(10); this.mem = new LinkedHashMap<String, byte[]>(10);
this.mem.put(HOST_NAME, UTF8.getBytes(this.hostName)); this.mem.put(HOST_NAME, UTF8.getBytes(this.hostName));
if (loadedDate != null) this.mem.put(LOADED_DATE, UTF8.getBytes(Long.toString(loadedDate.getTime()))); if (loadedDate != null) this.mem.put(LOADED_DATE, UTF8.getBytes(Long.toString(loadedDate.getTime())));
@ -117,92 +119,92 @@ public class RobotsTxtEntry {
if (sitemap != null) this.mem.put(SITEMAP, UTF8.getBytes(sitemap)); if (sitemap != null) this.mem.put(SITEMAP, UTF8.getBytes(sitemap));
if (crawlDelayMillis > 0) this.mem.put(CRAWL_DELAY_MILLIS, UTF8.getBytes(Long.toString(crawlDelayMillis))); if (crawlDelayMillis > 0) this.mem.put(CRAWL_DELAY_MILLIS, UTF8.getBytes(Long.toString(crawlDelayMillis)));
if (agentName != null) this.mem.put(AGENT_NAME, UTF8.getBytes(agentName)); if (agentName != null) this.mem.put(AGENT_NAME, UTF8.getBytes(agentName));
if (allowPathList != null && !allowPathList.isEmpty()) { if (allowPathList != null && !allowPathList.isEmpty()) {
this.allowPathList.addAll(allowPathList); this.allowPathList.addAll(allowPathList);
final StringBuilder pathListStr = new StringBuilder(allowPathList.size() * 30); final StringBuilder pathListStr = new StringBuilder(allowPathList.size() * 30);
for (String element : allowPathList) { for (final String element : allowPathList) {
pathListStr.append(element) pathListStr.append(element)
.append(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR); .append(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR);
} }
this.mem.put(ALLOW_PATH_LIST, UTF8.getBytes(pathListStr.substring(0,pathListStr.length()-1))); this.mem.put(ALLOW_PATH_LIST, UTF8.getBytes(pathListStr.substring(0,pathListStr.length()-1)));
} }
if (disallowPathList != null && !disallowPathList.isEmpty()) { if (disallowPathList != null && !disallowPathList.isEmpty()) {
this.denyPathList.addAll(disallowPathList); this.denyPathList.addAll(disallowPathList);
final StringBuilder pathListStr = new StringBuilder(disallowPathList.size() * 30); final StringBuilder pathListStr = new StringBuilder(disallowPathList.size() * 30);
for (String element : disallowPathList) { for (final String element : disallowPathList) {
pathListStr.append(element) pathListStr.append(element)
.append(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR); .append(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR);
} }
this.mem.put(DISALLOW_PATH_LIST, UTF8.getBytes(pathListStr.substring(0, pathListStr.length()-1))); this.mem.put(DISALLOW_PATH_LIST, UTF8.getBytes(pathListStr.substring(0, pathListStr.length()-1)));
} }
} }
protected String getHostName() { protected String getHostName() {
return this.hostName; return this.hostName;
} }
protected String getAgentName() { protected String getAgentName() {
return this.agentName; return this.agentName;
} }
protected Map<String, byte[]> getMem() { protected Map<String, byte[]> getMem() {
if (!this.mem.containsKey(HOST_NAME)) this.mem.put(HOST_NAME, UTF8.getBytes(this.hostName)); if (!this.mem.containsKey(HOST_NAME)) this.mem.put(HOST_NAME, UTF8.getBytes(this.hostName));
return this.mem; return this.mem;
} }
@Override @Override
public String toString() { public String toString() {
final StringBuilder str = new StringBuilder(6000); final StringBuilder str = new StringBuilder(6000);
str.append((this.hostName == null) ? "null" : this.hostName).append(": "); str.append((this.hostName == null) ? "null" : this.hostName).append(": ");
if (this.mem != null) str.append(this.mem.toString()); if (this.mem != null) str.append(this.mem.toString());
return str.toString(); return str.toString();
} }
/** /**
* get the sitemap url * get the sitemap url
* @return the sitemap url or null if no sitemap url is given * @return the sitemap url or null if no sitemap url is given
*/ */
public MultiProtocolURI getSitemap() { public MultiProtocolURI getSitemap() {
String url = this.mem.containsKey(SITEMAP)? UTF8.String(this.mem.get(SITEMAP)): null; final String url = this.mem.containsKey(SITEMAP)? UTF8.String(this.mem.get(SITEMAP)): null;
if (url == null) return null; if (url == null) return null;
try { try {
return new MultiProtocolURI(url); return new MultiProtocolURI(url);
} catch (MalformedURLException e) { } catch (final MalformedURLException e) {
return null; return null;
} }
} }
protected Date getLoadedDate() { protected Date getLoadedDate() {
if (this.mem.containsKey(LOADED_DATE)) { if (this.mem.containsKey(LOADED_DATE)) {
return new Date(ByteArray.parseDecimal(this.mem.get(LOADED_DATE))); return new Date(ByteArray.parseDecimal(this.mem.get(LOADED_DATE)));
} }
return null; return null;
} }
protected void setLoadedDate(final Date newLoadedDate) { protected void setLoadedDate(final Date newLoadedDate) {
if (newLoadedDate != null) { if (newLoadedDate != null) {
this.mem.put(LOADED_DATE, UTF8.getBytes(Long.toString(newLoadedDate.getTime()))); this.mem.put(LOADED_DATE, UTF8.getBytes(Long.toString(newLoadedDate.getTime())));
} }
} }
protected Date getModDate() { protected Date getModDate() {
if (this.mem.containsKey(MOD_DATE)) { if (this.mem.containsKey(MOD_DATE)) {
return new Date(ByteArray.parseDecimal(this.mem.get(MOD_DATE))); return new Date(ByteArray.parseDecimal(this.mem.get(MOD_DATE)));
} }
return null; return null;
} }
protected String getETag() { protected String getETag() {
if (this.mem.containsKey(ETAG)) { if (this.mem.containsKey(ETAG)) {
return ASCII.String(this.mem.get(ETAG)); return ASCII.String(this.mem.get(ETAG));
} }
return null; return null;
} }
protected long getCrawlDelayMillis() { protected long getCrawlDelayMillis() {
if (this.mem.containsKey(CRAWL_DELAY_MILLIS)) try { if (this.mem.containsKey(CRAWL_DELAY_MILLIS)) try {
return ByteArray.parseDecimal(this.mem.get(CRAWL_DELAY_MILLIS)); return ByteArray.parseDecimal(this.mem.get(CRAWL_DELAY_MILLIS));
@ -214,26 +216,38 @@ public class RobotsTxtEntry {
} catch (final NumberFormatException e) { } catch (final NumberFormatException e) {
return 0; return 0;
} }
return 0; return 0;
} }
public boolean isDisallowed(MultiProtocolURI subpathURL) { public boolean isDisallowed(final MultiProtocolURI subpathURL) {
String path = subpathURL.getFile(); String path = subpathURL.getFile();
if ((this.mem == null) || (this.denyPathList.isEmpty())) return false; if (this.mem == null) {
this.info = "no robots file available";
return false;
}
if (this.denyPathList.isEmpty()) {
this.info = "no entry in robots.txt";
return false;
}
// if the path is null or empty we set it to / // if the path is null or empty we set it to /
if ((path == null) || (path.length() == 0)) path = "/"; if (path == null || path.length() == 0) path = "/";
// escaping all occurences of ; because this char is used as special char in the Robots DB // escaping all occurences of ; because this char is used as special char in the Robots DB
else path = RobotsTxt.ROBOTS_DB_PATH_SEPARATOR_MATCHER.matcher(path).replaceAll("%3B"); else path = RobotsTxt.ROBOTS_DB_PATH_SEPARATOR_MATCHER.matcher(path).replaceAll("%3B");
for (String element : this.denyPathList) { for (final String element : this.denyPathList) {
// disallow rule // disallow rule
if (path.startsWith(element)) { if (path.startsWith(element)) {
this.info = "path '" + path + "' starts with '" + element + "' from deny path list";
return true; return true;
} }
} }
this.info = "path '" + path + "' does not start with any element from deny path list";
return false; return false;
} }
/**
 * Returns the explanation recorded by the most recent isDisallowed call.
 * Until isDisallowed has run, this is the field's initial state
 * (the map-based constructor sets it to the empty string).
 *
 * @return the current info text
 */
public String getInfo() {
    return info;
}
} }
Loading…
Cancel
Save