diff --git a/.classpath b/.classpath index 9088d90ba..95ff3dba7 100644 --- a/.classpath +++ b/.classpath @@ -49,5 +49,7 @@ + + diff --git a/addon/YaCy.app/Contents/Info.plist b/addon/YaCy.app/Contents/Info.plist index 268bfeb26..da5ae6bde 100644 --- a/addon/YaCy.app/Contents/Info.plist +++ b/addon/YaCy.app/Contents/Info.plist @@ -50,6 +50,8 @@ $JAVAROOT/lib/commons-logging-1.1.1.jar $JAVAROOT/lib/fontbox-1.6.0.jar $JAVAROOT/lib/geronimo-stax-api_1.0_spec-1.0.1.jar + $JAVAROOT/lib/htmllexer.jar + $JAVAROOT/lib/htmlparser.jar $JAVAROOT/lib/httpclient-4.2.jar $JAVAROOT/lib/httpcore-4.2.jar $JAVAROOT/lib/httpmime-4.2.jar diff --git a/build.xml b/build.xml index beae51d7c..602b769f7 100644 --- a/build.xml +++ b/build.xml @@ -168,6 +168,8 @@ + + diff --git a/lib/htmllexer.jar b/lib/htmllexer.jar new file mode 100644 index 000000000..0a0fc4df3 Binary files /dev/null and b/lib/htmllexer.jar differ diff --git a/lib/htmlparser.jar b/lib/htmlparser.jar new file mode 100644 index 000000000..fad7a207b Binary files /dev/null and b/lib/htmlparser.jar differ diff --git a/source/de/anomic/http/server/ServerSideIncludes.java b/source/de/anomic/http/server/ServerSideIncludes.java index 087c2e128..53673206f 100644 --- a/source/de/anomic/http/server/ServerSideIncludes.java +++ b/source/de/anomic/http/server/ServerSideIncludes.java @@ -74,7 +74,7 @@ public class ServerSideIncludes { } } - private static void writeContent(String path, final OutputStream out, final String authorization, final String requesthost, final RequestHeader requestHeader) { + public static void writeContent(String path, final OutputStream out, final String authorization, final String requesthost, final RequestHeader requestHeader) { // check if there are arguments in path string String args = ""; final int argpos = path.indexOf('?'); diff --git a/source/net/yacy/interaction/AugmentHtmlStream.java b/source/net/yacy/interaction/AugmentHtmlStream.java index ddb7aeb81..426c731c6 100644 --- a/source/net/yacy/interaction/AugmentHtmlStream.java +++ b/source/net/yacy/interaction/AugmentHtmlStream.java @@ -1,21 +1,430 @@ package net.yacy.interaction; +import java.io.BufferedReader; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.StringReader; +import java.net.URLEncoder; import java.nio.charset.Charset; +import net.yacy.yacy; import net.yacy.cora.protocol.RequestHeader; +import net.yacy.cora.protocol.http.HTTPClient; +import net.yacy.document.Document; +import net.yacy.document.TextParser; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.logging.Log; +import net.yacy.search.Switchboard; + +import org.htmlparser.Tag; +import org.htmlparser.Text; +import org.htmlparser.tags.LinkTag; +import org.htmlparser.util.NodeList; +import org.htmlparser.visitors.NodeVisitor; + +import de.anomic.http.server.ServerSideIncludes; public class AugmentHtmlStream { - + + static RequestHeader globalrequestHeader; + + /** + * creates a NodeVisitor which assigns a unique ID to every node + * + * @return customized NodeVisitor + */ + private static class VisitorAddUniqueID extends NodeVisitor { + + private int counter; + + public VisitorAddUniqueID() { + this.setCounter(0); + } + + @Override + public void visitTag(Tag tag) { + if (tag.getAttribute("id") == null) { + this.setCounter(this.getCounter() + 1); + tag.setAttribute("id", "\"sci" + this.getCounter() + "\""); + } + + if (tag instanceof org.htmlparser.tags.LinkTag) { + // Link + Log.logInfo("AUGMENTATION", tag.getAttribute("href")); + + LinkTag lt = (LinkTag)tag; + + } + + } + + @Override + public void visitStringNode(Text string) { + + } + + public void setCounter(int counter) { + this.counter = counter; + } + + public int getCounter() { + return this.counter; + } + + } + + /** + * creates a NodeVisitor which inspects the element if it contains useful + * text + * + * @return customized NodeVisitor + */ + private static class VisitorText extends NodeVisitor { + + private int counter; + + public VisitorText() { + this.setCounter(0); + } + + @Override + public void visitTag(Tag tag) { + +// tag.setText(tag.getText()+" augmented"); + +// Node node = new org.htmlparser.nodes.TextNode(loadInternal("interactionparts/scibutton.html", globalrequestHeader)); +// NodeList nl = tag.getChildren(); +// nl.add (node); +// tag.setChildren(nl); + + } + + @Override + public void visitStringNode(Text string) { + +// if (string.getParent() != null) { +// +// string.setText(string +// .getText() +// .replaceAll("und", +// "KIT")); +// +// +// } + } + + public void setCounter(int counter) { + this.counter = counter; + } + + public int getCounter() { + return this.counter; + } + + } + + /** + * send web page to external REFLECT web service + * + * @return the web page with integrated REFLECT elements + */ + private static String processExternal(String url, String fieldname, + String data) throws IOException { + final HTTPClient client = new HTTPClient(); + try { + StringBuilder postdata = new StringBuilder(); + postdata.append("document="); + postdata.append(URLEncoder.encode(data, "UTF-8")); + InputStream in = new ByteArrayInputStream(postdata.toString() + .getBytes()); + byte[] result = client.POSTbytes(url, in, postdata.length()); + if (result != null) { + return new String(result); + } + } finally { + client.finish(); + } + return null; + } + + private static String loadInternal(String path, RequestHeader requestHeader) { + ByteArrayOutputStream buffer = new ByteArrayOutputStream(); + String realmProp = requestHeader.get(RequestHeader.AUTHORIZATION); + ServerSideIncludes.writeContent(path, buffer, realmProp, "127.0.0.1", requestHeader); // TODO: ip + return buffer.toString(); + } + + /** + * add DOCTYPE if necessary + * + * @return the web page with a leading DOCTYPE definition + */ + private static String processAddDoctype(String data) { + + String result = data; + + BufferedReader reader = new BufferedReader(new StringReader(data)); + + try { + String firstline = reader.readLine(); + + if (firstline != null) { + if (!firstline.startsWith("\n" + + data; + } + } + } catch (IOException e1) { + + } + + return result; + + } + + /** + * load snippet from resource text file + * + * @return text from resource text file + */ + private static String loadPart(String part) { + String result = ""; + try { + BufferedReader in = new BufferedReader(new FileReader(yacy.homedir + File.separatorChar + "htroot" + + File.separatorChar + "interaction" + File.separatorChar + + "parts" + File.separatorChar + part)); + String str; + while ((str = in.readLine()) != null) { + result += str; + } + in.close(); + } catch (IOException e) { + } + + return result; + } + public static StringBuffer process (StringBuffer data, Charset charset, DigestURI url, RequestHeader requestHeader) { - + + globalrequestHeader = requestHeader; + + Switchboard sb = Switchboard.getSwitchboard(); + boolean augmented = false; - + String Doc = data.toString(); - + + // Send document to REFLECT (http://www.reflect.ws/REST_API.html) + if (sb.getConfigBool("augmentation.reflect", false) == true) { + try { + + Doc = processExternal("http://reflect.ws/REST/GetHTML", + "document", Doc); + Log.logInfo("AUGMENTATION", "reflected " + url); + augmented = true; + } catch (Exception e) { + + } + } + + // Add DOCTYPE if not present. + // This is required for IE to render position:absolute correctly. + + if (sb.getConfigBool("augmentation.addDoctype", true) == true) { + Doc = processAddDoctype(Doc); + augmented = true; + } + + + if (sb.getConfigBool("augmentation.reparse", true) == true) { + + NodeList list = new NodeList(); + + // Fill NodeList with parsed Document + try { + + org.htmlparser.Parser par = new org.htmlparser.Parser(); + + par.setInputHTML(Doc); + + list = par.parse(null); + + Log.logInfo ("AUGMENTATION", url.toString()); + + } catch (Exception e) { + } + + // Add Unique ID to every node element which has no id yet. + // This allows consistent interaction between client (browser) and + // back-end (data store) by providing "position awareness" in the + // document. + if (sb.getConfigBool("augmentation.reparse.adduniqueid", true) == true) { + try { + + NodeVisitor visitorAddUniqueID = new AugmentHtmlStream.VisitorAddUniqueID(); + list.visitAllNodesWith(visitorAddUniqueID); + + } catch (Exception e) { + } + } + + // Inspect on text tags + + try { + + NodeVisitor visitorText = new AugmentHtmlStream.VisitorText(); + list.visitAllNodesWith(visitorText); + + } catch (Exception e) { + } + + String SCI_GUID = ""; + + String SCI_GUID_DOI = ""; + String SCI_GUID_PMID = ""; + + String SCI_TITLE = ""; + String SCI_CREATOR = ""; + String SCI_DESCRIPTION = ""; + String SCI_IDENTIFIER = ""; + + String SCI_WHITELIST = ""; + + String SCI_URL = ""; + + String SCI_HASH = ""; + + SCI_URL = url.toString(); + + // System.out.println("Starting augmentation for " + url); + // System.out.println("Content: " + Doc); + + if (!(list == null)) { + + // DOCUMENT IS MANIPULABLE BY HTML REWRITER + + // SO SEND IT TO YACY PARSER + + Document document = null; + + try { + final StringReader stringReader = new StringReader(Doc); + InputStream inputStream = new InputStream() { + + @Override + public int read() throws IOException { + return stringReader.read(); + } + }; + + document = Document.mergeDocuments( + url, + "text/html", + TextParser.parseSource(url, "text/html", null, data.length(), inputStream)); + + } catch (Exception e) { + + } + + if (document != null) { + + if (document.dc_format() == "text/html") { + + SCI_TITLE = document.dc_title(); + SCI_CREATOR = document.dc_creator(); + SCI_DESCRIPTION = document.dc_description(); + SCI_IDENTIFIER = document.dc_identifier(); + + } + + } + + SCI_HASH = "" + url.hashCode(); + + // ADD AUGMENTED HEADER INFORMATION + + NodeList header = list.extractAllNodesThatMatch( + new org.htmlparser.filters.NodeClassFilter( + org.htmlparser.tags.HeadTag.class), true); + + org.htmlparser.util.SimpleNodeIterator iterHeader = header + .elements(); + + while (iterHeader.hasMoreNodes()) { + org.htmlparser.tags.HeadTag ht = ((org.htmlparser.tags.HeadTag) iterHeader + .nextNode()); + + NodeList headchildren = ht.getChildren(); + + headchildren.add(new org.htmlparser.nodes.TextNode(loadInternal("interactionparts/interaction.html", requestHeader))); + + augmented = true; + + ht.setChildren(headchildren); + } + + // ADD AUGMENTED BODY INFORMATION + + NodeList body = list.extractAllNodesThatMatch( + new org.htmlparser.filters.NodeClassFilter( + org.htmlparser.tags.BodyTag.class), true); + + org.htmlparser.util.SimpleNodeIterator iterBody = body + .elements(); + + while (iterBody.hasMoreNodes()) { + + org.htmlparser.tags.BodyTag bt = ((org.htmlparser.tags.BodyTag) iterBody + .nextNode()); + + NodeList bodychildren = bt.getChildren(); + + + + // ADD AUGMENTED INFO + + org.htmlparser.tags.Div sci_aug = new org.htmlparser.tags.Div(); + + sci_aug.setTagName("div"); + + sci_aug.setAttribute("id", "sciety_augmented"); + sci_aug.setAttribute("style", + "visibility: hidden; position: absolute; overflow: hidden;"); + + org.htmlparser.util.NodeList childr = new org.htmlparser.util.NodeList(); + + + sci_aug.setChildren(childr); + + org.htmlparser.tags.Div sci_aug_endtag = new org.htmlparser.tags.Div(); + + sci_aug_endtag.setTagName("/div"); + + sci_aug.setEndTag(sci_aug_endtag); + + bodychildren.add(sci_aug); + + bt.setChildren(bodychildren); + + augmented = true; + + } + + Doc = list.toHtml(true); + + augmented = true; + + } // not list = null + + } // reparse + if (augmented) { - + return (new StringBuffer (Doc)); } else { return (data); diff --git a/source/net/yacy/yacy.java b/source/net/yacy/yacy.java index 34658285f..5c75b74f1 100644 --- a/source/net/yacy/yacy.java +++ b/source/net/yacy/yacy.java @@ -136,6 +136,7 @@ public final class yacy { * {@link yacy#startup(String, long, long)} method. */ private static Switchboard sb = null; + public static String homedir; /** * Starts up the whole application. Sets up all datastructures and starts @@ -168,6 +169,8 @@ public final class yacy { System.err.println("Error creating DATA-directory in " + dataHome.toString() + " . Please check your write-permission for this folder. YaCy will now terminate."); System.exit(-1); } + + homedir = appHome.toString(); // setting up logging f = new File(dataHome, "DATA/LOG/");