diff --git a/.classpath b/.classpath
index 9088d90ba..95ff3dba7 100644
--- a/.classpath
+++ b/.classpath
@@ -49,5 +49,7 @@
+
+
diff --git a/addon/YaCy.app/Contents/Info.plist b/addon/YaCy.app/Contents/Info.plist
index 268bfeb26..da5ae6bde 100644
--- a/addon/YaCy.app/Contents/Info.plist
+++ b/addon/YaCy.app/Contents/Info.plist
@@ -50,6 +50,8 @@
$JAVAROOT/lib/commons-logging-1.1.1.jar
$JAVAROOT/lib/fontbox-1.6.0.jar
$JAVAROOT/lib/geronimo-stax-api_1.0_spec-1.0.1.jar
+ $JAVAROOT/lib/htmllexer.jar
+ $JAVAROOT/lib/htmlparser.jar
$JAVAROOT/lib/httpclient-4.2.jar
$JAVAROOT/lib/httpcore-4.2.jar
$JAVAROOT/lib/httpmime-4.2.jar
diff --git a/build.xml b/build.xml
index beae51d7c..602b769f7 100644
--- a/build.xml
+++ b/build.xml
@@ -168,6 +168,8 @@
+
+
diff --git a/lib/htmllexer.jar b/lib/htmllexer.jar
new file mode 100644
index 000000000..0a0fc4df3
Binary files /dev/null and b/lib/htmllexer.jar differ
diff --git a/lib/htmlparser.jar b/lib/htmlparser.jar
new file mode 100644
index 000000000..fad7a207b
Binary files /dev/null and b/lib/htmlparser.jar differ
diff --git a/source/de/anomic/http/server/ServerSideIncludes.java b/source/de/anomic/http/server/ServerSideIncludes.java
index 087c2e128..53673206f 100644
--- a/source/de/anomic/http/server/ServerSideIncludes.java
+++ b/source/de/anomic/http/server/ServerSideIncludes.java
@@ -74,7 +74,7 @@ public class ServerSideIncludes {
}
}
- private static void writeContent(String path, final OutputStream out, final String authorization, final String requesthost, final RequestHeader requestHeader) {
+ public static void writeContent(String path, final OutputStream out, final String authorization, final String requesthost, final RequestHeader requestHeader) {
// check if there are arguments in path string
String args = "";
final int argpos = path.indexOf('?');
diff --git a/source/net/yacy/interaction/AugmentHtmlStream.java b/source/net/yacy/interaction/AugmentHtmlStream.java
index ddb7aeb81..426c731c6 100644
--- a/source/net/yacy/interaction/AugmentHtmlStream.java
+++ b/source/net/yacy/interaction/AugmentHtmlStream.java
@@ -1,21 +1,430 @@
package net.yacy.interaction;
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringReader;
+import java.net.URLEncoder;
import java.nio.charset.Charset;
+import net.yacy.yacy;
import net.yacy.cora.protocol.RequestHeader;
+import net.yacy.cora.protocol.http.HTTPClient;
+import net.yacy.document.Document;
+import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI;
+import net.yacy.kelondro.logging.Log;
+import net.yacy.search.Switchboard;
+
+import org.htmlparser.Tag;
+import org.htmlparser.Text;
+import org.htmlparser.tags.LinkTag;
+import org.htmlparser.util.NodeList;
+import org.htmlparser.visitors.NodeVisitor;
+
+import de.anomic.http.server.ServerSideIncludes;
public class AugmentHtmlStream {
-
+
+ static RequestHeader globalrequestHeader;
+
+    /**
+     * NodeVisitor that gives every visited tag without an id attribute a
+     * generated one, and logs the href attribute of every link tag.
+     * <p>
+     * The generated attribute value is {@code "sci<n>"} (including the double
+     * quotes), where n is a counter that is incremented for each tag that
+     * receives an id; the first generated id uses n = 1.
+     */
+    private static class VisitorAddUniqueID extends NodeVisitor {
+
+        /** number of ids generated so far */
+        private int counter;
+
+        public VisitorAddUniqueID() {
+            // set the field directly instead of calling an overridable setter from the constructor
+            this.counter = 0;
+        }
+
+        /**
+         * Adds a generated id to the tag if it has none, then logs the href
+         * attribute if the tag is a link.
+         *
+         * @param tag the tag currently visited
+         */
+        @Override
+        public void visitTag(Tag tag) {
+            if (tag.getAttribute("id") == null) {
+                this.counter++;
+                // the quotes are passed as part of the attribute value
+                tag.setAttribute("id", "\"sci" + this.counter + "\"");
+            }
+
+            if (tag instanceof org.htmlparser.tags.LinkTag) {
+                // Link
+                Log.logInfo("AUGMENTATION", tag.getAttribute("href"));
+            }
+        }
+
+        /**
+         * Intentionally empty: this visitor only processes tags.
+         *
+         * @param string the text node currently visited
+         */
+        @Override
+        public void visitStringNode(Text string) {
+        }
+
+        /**
+         * @param counter new value of the id counter
+         */
+        public void setCounter(int counter) {
+            this.counter = counter;
+        }
+
+        /**
+         * @return the number of ids generated so far
+         */
+        public int getCounter() {
+            return this.counter;
+        }
+
+    }
+
+    /**
+     * NodeVisitor meant to inspect elements for useful text.
+     * <p>
+     * Both callbacks currently do nothing. Two earlier experiments are
+     * disabled: appending the rendered "interactionparts/scibutton.html"
+     * snippet as a text node to every tag, and replacing "und" by "KIT" in
+     * text nodes that have a parent.
+     */
+    private static class VisitorText extends NodeVisitor {
+
+        /** counter exposed through the accessors below; not touched by the callbacks */
+        private int counter = 0;
+
+        public VisitorText() {
+            super();
+        }
+
+        /**
+         * No-op placeholder for tag inspection.
+         *
+         * @param tag the tag currently visited
+         */
+        @Override
+        public void visitTag(Tag tag) {
+            // nothing to do yet
+        }
+
+        /**
+         * No-op placeholder for text inspection.
+         *
+         * @param string the text node currently visited
+         */
+        @Override
+        public void visitStringNode(Text string) {
+            // nothing to do yet
+        }
+
+        /**
+         * @return the current counter value
+         */
+        public int getCounter() {
+            return this.counter;
+        }
+
+        /**
+         * @param counter new counter value
+         */
+        public void setCounter(int counter) {
+            this.counter = counter;
+        }
+
+    }
+
+    /**
+     * Posts a document as a single form field to an external web service
+     * (used for the REFLECT service) and returns the response body.
+     *
+     * @param url       address of the web service
+     * @param fieldname name of the form field that carries the document
+     * @param data      the document text; sent URL-encoded as UTF-8
+     * @return the response bytes converted to a String with the platform
+     *         default charset, or {@code null} if the client delivered no bytes
+     * @throws IOException if the request cannot be encoded or the HTTP exchange fails
+     */
+    private static String processExternal(String url, String fieldname,
+            String data) throws IOException {
+        // use the field name given by the caller instead of a hard-coded "document="
+        final String postdata = URLEncoder.encode(fieldname, "UTF-8") + "="
+                + URLEncoder.encode(data, "UTF-8");
+        // encode once with an explicit charset so that the announced length is
+        // exactly the number of bytes in the stream
+        final byte[] body = postdata.getBytes("UTF-8");
+
+        final HTTPClient client = new HTTPClient();
+        try {
+            final byte[] result = client.POSTbytes(url, new ByteArrayInputStream(body), body.length);
+            if (result != null) {
+                return new String(result);
+            }
+        } finally {
+            // always release the client, also when the request failed
+            client.finish();
+        }
+        return null;
+    }
+
+    /**
+     * Renders an internal path into a String by letting
+     * {@link ServerSideIncludes#writeContent} write it into a memory buffer.
+     *
+     * @param path          the internal path to render
+     * @param requestHeader header of the current request; its authorization
+     *                      entry is handed on to the include writer
+     * @return the buffered output converted with the platform default charset
+     */
+    private static String loadInternal(String path, RequestHeader requestHeader) {
+        final String authorization = requestHeader.get(RequestHeader.AUTHORIZATION);
+        final ByteArrayOutputStream rendered = new ByteArrayOutputStream();
+        // TODO: ip
+        ServerSideIncludes.writeContent(path, rendered, authorization, "127.0.0.1", requestHeader);
+        return rendered.toString();
+    }
+
+ /**
+ * Adds a DOCTYPE to the page if necessary, decided by looking at the first
+ * line of the input only.
+ *
+ * NOTE(review): the condition and the prepended literal inside the inner
+ * if appear truncated in this patch text - confirm against the checked-in
+ * file before relying on the exact DOCTYPE string.
+ *
+ * @param data the web page markup
+ * @return the web page with a leading DOCTYPE definition; the input is
+ * returned unchanged when it has no first line or reading it fails
+ */
+ private static String processAddDoctype(String data) {
+
+ // default: hand back the input untouched
+ String result = data;
+
+ BufferedReader reader = new BufferedReader(new StringReader(data));
+
+ try {
+ String firstline = reader.readLine();
+
+ if (firstline != null) {
+ if (!firstline.startsWith("\n"
+ + data;
+ }
+ }
+ } catch (IOException e1) {
+ // ignored: result still holds the unmodified input
+
+ }
+
+ return result;
+
+ }
+
+    /**
+     * Loads a snippet from a text file in the directory
+     * {@code htroot/interaction/parts} below {@code yacy.homedir}.
+     * <p>
+     * The lines of the file are concatenated without their line terminators.
+     * Reading is best effort: if an I/O error occurs, a warning is logged and
+     * whatever was read up to that point is returned.
+     *
+     * @param part file name of the snippet inside the parts directory
+     * @return the concatenated lines of the file; empty if nothing could be read
+     */
+    private static String loadPart(String part) {
+        final String partPath = yacy.homedir + File.separatorChar + "htroot"
+                + File.separatorChar + "interaction" + File.separatorChar
+                + "parts" + File.separatorChar + part;
+        // StringBuilder avoids re-copying the whole text for every line
+        final StringBuilder result = new StringBuilder();
+        BufferedReader in = null;
+        try {
+            in = new BufferedReader(new FileReader(partPath));
+            String str;
+            while ((str = in.readLine()) != null) {
+                result.append(str);
+            }
+        } catch (IOException e) {
+            Log.logWarning("AUGMENTATION", "cannot read part " + partPath + ": " + e.getMessage());
+        } finally {
+            // close the reader also when reading failed half-way
+            if (in != null) {
+                try {
+                    in.close();
+                } catch (IOException ignored) {
+                    // a failing close cannot be handled in a useful way here
+                }
+            }
+        }
+
+        return result.toString();
+    }
+
public static StringBuffer process (StringBuffer data, Charset charset, DigestURI url, RequestHeader requestHeader) {
-
+
+ globalrequestHeader = requestHeader;
+
+ Switchboard sb = Switchboard.getSwitchboard();
+
boolean augmented = false;
-
+
String Doc = data.toString();
-
+
+ // Send document to REFLECT (http://www.reflect.ws/REST_API.html)
+ if (sb.getConfigBool("augmentation.reflect", false) == true) {
+ try {
+
+ Doc = processExternal("http://reflect.ws/REST/GetHTML",
+ "document", Doc);
+ Log.logInfo("AUGMENTATION", "reflected " + url);
+ augmented = true;
+ } catch (Exception e) {
+
+ }
+ }
+
+ // Add DOCTYPE if not present.
+ // This is required for IE to render position:absolute correctly.
+
+ if (sb.getConfigBool("augmentation.addDoctype", true) == true) {
+ Doc = processAddDoctype(Doc);
+ augmented = true;
+ }
+
+
+ if (sb.getConfigBool("augmentation.reparse", true) == true) {
+
+ NodeList list = new NodeList();
+
+ // Fill NodeList with parsed Document
+ try {
+
+ org.htmlparser.Parser par = new org.htmlparser.Parser();
+
+ par.setInputHTML(Doc);
+
+ list = par.parse(null);
+
+ Log.logInfo ("AUGMENTATION", url.toString());
+
+ } catch (Exception e) {
+ }
+
+ // Add Unique ID to every node element which has no id yet.
+ // This allows consistent interaction between client (browser) and
+ // back-end (data store) by providing "position awareness" in the
+ // document.
+ if (sb.getConfigBool("augmentation.reparse.adduniqueid", true) == true) {
+ try {
+
+ NodeVisitor visitorAddUniqueID = new AugmentHtmlStream.VisitorAddUniqueID();
+ list.visitAllNodesWith(visitorAddUniqueID);
+
+ } catch (Exception e) {
+ }
+ }
+
+ // Inspect on text tags
+
+ try {
+
+ NodeVisitor visitorText = new AugmentHtmlStream.VisitorText();
+ list.visitAllNodesWith(visitorText);
+
+ } catch (Exception e) {
+ }
+
+ String SCI_GUID = "";
+
+ String SCI_GUID_DOI = "";
+ String SCI_GUID_PMID = "";
+
+ String SCI_TITLE = "";
+ String SCI_CREATOR = "";
+ String SCI_DESCRIPTION = "";
+ String SCI_IDENTIFIER = "";
+
+ String SCI_WHITELIST = "";
+
+ String SCI_URL = "";
+
+ String SCI_HASH = "";
+
+ SCI_URL = url.toString();
+
+ // System.out.println("Starting augmentation for " + url);
+ // System.out.println("Content: " + Doc);
+
+ if (!(list == null)) {
+
+ // DOCUMENT IS MANIPULABLE BY HTML REWRITER
+
+ // SO SEND IT TO YACY PARSER
+
+ Document document = null;
+
+ try {
+ final StringReader stringReader = new StringReader(Doc);
+ InputStream inputStream = new InputStream() {
+
+ @Override
+ public int read() throws IOException {
+ return stringReader.read();
+ }
+ };
+
+ document = Document.mergeDocuments(
+ url,
+ "text/html",
+ TextParser.parseSource(url, "text/html", null, data.length(), inputStream));
+
+ } catch (Exception e) {
+
+ }
+
+ if (document != null) {
+
+ if (document.dc_format() == "text/html") {
+
+ SCI_TITLE = document.dc_title();
+ SCI_CREATOR = document.dc_creator();
+ SCI_DESCRIPTION = document.dc_description();
+ SCI_IDENTIFIER = document.dc_identifier();
+
+ }
+
+ }
+
+ SCI_HASH = "" + url.hashCode();
+
+ // ADD AUGMENTED HEADER INFORMATION
+
+ NodeList header = list.extractAllNodesThatMatch(
+ new org.htmlparser.filters.NodeClassFilter(
+ org.htmlparser.tags.HeadTag.class), true);
+
+ org.htmlparser.util.SimpleNodeIterator iterHeader = header
+ .elements();
+
+ while (iterHeader.hasMoreNodes()) {
+ org.htmlparser.tags.HeadTag ht = ((org.htmlparser.tags.HeadTag) iterHeader
+ .nextNode());
+
+ NodeList headchildren = ht.getChildren();
+
+ headchildren.add(new org.htmlparser.nodes.TextNode(loadInternal("interactionparts/interaction.html", requestHeader)));
+
+ augmented = true;
+
+ ht.setChildren(headchildren);
+ }
+
+ // ADD AUGMENTED BODY INFORMATION
+
+ NodeList body = list.extractAllNodesThatMatch(
+ new org.htmlparser.filters.NodeClassFilter(
+ org.htmlparser.tags.BodyTag.class), true);
+
+ org.htmlparser.util.SimpleNodeIterator iterBody = body
+ .elements();
+
+ while (iterBody.hasMoreNodes()) {
+
+ org.htmlparser.tags.BodyTag bt = ((org.htmlparser.tags.BodyTag) iterBody
+ .nextNode());
+
+ NodeList bodychildren = bt.getChildren();
+
+
+
+ // ADD AUGMENTED INFO
+
+ org.htmlparser.tags.Div sci_aug = new org.htmlparser.tags.Div();
+
+ sci_aug.setTagName("div");
+
+ sci_aug.setAttribute("id", "sciety_augmented");
+ sci_aug.setAttribute("style",
+ "visibility: hidden; position: absolute; overflow: hidden;");
+
+ org.htmlparser.util.NodeList childr = new org.htmlparser.util.NodeList();
+
+
+ sci_aug.setChildren(childr);
+
+ org.htmlparser.tags.Div sci_aug_endtag = new org.htmlparser.tags.Div();
+
+ sci_aug_endtag.setTagName("/div");
+
+ sci_aug.setEndTag(sci_aug_endtag);
+
+ bodychildren.add(sci_aug);
+
+ bt.setChildren(bodychildren);
+
+ augmented = true;
+
+ }
+
+ Doc = list.toHtml(true);
+
+ augmented = true;
+
+ } // not list = null
+
+ } // reparse
+
if (augmented) {
-
+
return (new StringBuffer (Doc));
} else {
return (data);
diff --git a/source/net/yacy/yacy.java b/source/net/yacy/yacy.java
index 34658285f..5c75b74f1 100644
--- a/source/net/yacy/yacy.java
+++ b/source/net/yacy/yacy.java
@@ -136,6 +136,7 @@ public final class yacy {
* {@link yacy#startup(String, long, long)} method.
*/
private static Switchboard sb = null;
+ public static String homedir;
/**
* Starts up the whole application. Sets up all datastructures and starts
@@ -168,6 +169,8 @@ public final class yacy {
System.err.println("Error creating DATA-directory in " + dataHome.toString() + " . Please check your write-permission for this folder. YaCy will now terminate.");
System.exit(-1);
}
+
+ homedir = appHome.toString();
// setting up logging
f = new File(dataHome, "DATA/LOG/");