diff --git a/source/net/yacy/data/wiki/WikiCode.java b/source/net/yacy/data/wiki/WikiCode.java index ab4323866..1d5248713 100644 --- a/source/net/yacy/data/wiki/WikiCode.java +++ b/source/net/yacy/data/wiki/WikiCode.java @@ -107,7 +107,9 @@ public class WikiCode extends AbstractWikiParser implements WikiParser { private static final String WIKI_CLOSE_LINK = "]]"; private static final String WIKI_OPEN_LINK = "[["; + /** Wiki template inclusion closing tag */ private static final String WIKI_CLOSE_METADATA = "}}"; + /** Wiki template inclusion opening tag */ private static final String WIKI_OPEN_METADATA = "{{"; private static final String WIKI_CLOSE_EXTERNAL_LINK = "]"; private static final String WIKI_OPEN_EXTERNAL_LINK = "["; @@ -127,6 +129,8 @@ public class WikiCode extends AbstractWikiParser implements WikiParser { private static final char SIX = '6'; private static final char WIKI_FORMATTED = ' '; private static final char WIKI_INDENTION = ':'; + /** Wiki template parameter separator */ + private static final char WIKI_METADATA_PARAMETER_SEPARATOR = '|'; private static final int LEN_WIKI_CLOSE_PRE_ESCAPED = WIKI_CLOSE_PRE_ESCAPED.length(); private static final int LEN_WIKI_OPEN_PRE_ESCAPED = WIKI_OPEN_PRE_ESCAPED.length(); @@ -140,6 +144,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser { private static final int LEN_WIKI_HR_LINE = WIKI_HR_LINE.length(); private static final int LEN_PIPE_ESCAPED = PIPE_ESCAPED.length(); private static final int LEN_WIKI_OPEN_METADATA = WIKI_OPEN_METADATA.length(); + private static final int LEN_WIKI_CLOSE_METADATA = WIKI_CLOSE_METADATA.length(); /** List of properties which can be used in tables. 
*/ private final static String[] TABLE_PROPERTIES = {"rowspan", "colspan", "vspace", "hspace", "cellspacing", "cellpadding", "border"}; @@ -1042,70 +1047,137 @@ public class WikiCode extends AbstractWikiParser implements WikiParser { /** - * Process line with geo coordinate metadata - * @param line of wiki text - * @return line with geo coordinate formatted to be recogizeable by parser + * Process template inclusions in line, possibly with geo coordinate metadata + * @param line line of wiki text + * @return cleaned text with any geo coordinates formatted to be recognizable by parser + * @see <a href="https://en.wikipedia.org/wiki/Wikipedia:Transclusion">Wikipedia:Transclusion</a> */ - private static String processMetadata(String line) { - int p, q, s = 0; - while ((p = line.indexOf(WIKI_OPEN_METADATA, s)) >= 0 && (q = line.indexOf(WIKI_CLOSE_METADATA, p + 1)) >= 0) { - s = q; // continue with next position - final String a = line.substring(p + LEN_WIKI_OPEN_METADATA, q); - if (a.toLowerCase().startsWith("coordinate")) { - // parse Geographical Coordinates as described in - // http://en.wikipedia.org/wiki/Wikipedia:Manual_of_Style_%28dates_and_numbers%29#Geographical_coordinates - // looks like: - // {{Coord|57|18|22.5|N|4|27|32.7|W|display=title}} - // however, such information does not appear as defined above but as: - // {{coordinate|NS=52.205944|EW=0.117593|region=GB-CAM|type=landmark}} - // {{coordinate|NS=43/50/29/N|EW=73/23/17/W|type=landmark|region=US-NY}} - // and if passed through this parser: - // {{Coordinate |NS 45/37/43.0/N |EW. 07/58/41.0/E |type=landmark |region=IT-BI}} ## means: degree/minute/second - // {{Coordinate |NS 51.48994 |EW. 
7.33249 |type=landmark |region=DE-NW}} - final String b[] = a.split("\\|"); - float lon = Float.NaN, lat = Float.NaN; // degree - float lonm = 0.0f, latm = 0.0f; // minutes (including sec as fraction) - String lono = "E", lato = "N"; - String name = ""; - try { - for (final String c : b) { - if (c.toLowerCase().startsWith("name=")) { - name = c.substring(5); - } - if (c.toUpperCase().startsWith("NS=")) { - final String d[] = c.substring(3).split("/"); - if (d.length == 1) {float l = Float.parseFloat(d[0]); if (l < 0) {lato = "S"; l = -l;} lat = (float) Math.floor(l); latm = 60.0f * (l - lat);} - else if (d.length > 1) { //format: NS deg/min/sec/N - lat = Float.parseFloat(d[0]); // degree - if (!d[1].isEmpty()) latm = Float.parseFloat(d[1]); // minutes - if (d.length >= 3 && !d[2].isEmpty()) {latm += (Float.parseFloat(d[2]) / 60.0f);} // sec (check empty because format found "45/10//N" ) - if (d[d.length - 1].toUpperCase().equals("S")) lato = "S"; - } - } - if (c.toUpperCase().startsWith("EW=")) { - final String d[] = c.substring(3).split("/"); - if (d.length == 1) {float l = Float.parseFloat(d[0]); if (l < 0) {lono = "W"; l = -l;} lon = (float) Math.floor(l); lonm = 60.0f * (l - lon);} - else if (d.length > 1) { - lon = Float.parseFloat(d[0]); - if (!d[1].isEmpty()) lonm = Float.parseFloat(d[1]); - if (d.length >= 3 && !d[2].isEmpty()) {lonm += (Float.parseFloat(d[2]) / 60.0f);} - if (d[d.length-1].toUpperCase().equals("W")) {lono = "W";} - } - } - } - } catch (NumberFormatException nsExcept) { - // catch parseFloat exception (may still happen if wiki code contains expressions) - continue; - } - if (!Float.isNaN(lon) && !Float.isNaN(lat)) { - // replace this with a format that the html parser can understand - line = line.substring(0, p) + (name.length() > 0 ? 
(" " + name) : "") + " " + lato + " " + lat + "\u00B0 " + latm + "'" + lono + " " + lon + "\u00B0 " + lonm + "' " + line.substring(q + WIKI_CLOSE_METADATA.length()); - s = p; - continue; - } + protected static String processMetadata(final String line) { + StringBuilder processedLine = new StringBuilder(line); + int openIndex, closeIndex, fromIndex = 0; + while ((openIndex = processedLine.indexOf(WIKI_OPEN_METADATA, fromIndex)) >= 0) { + closeIndex = processedLine.indexOf(WIKI_CLOSE_METADATA, openIndex + LEN_WIKI_OPEN_METADATA); + /* Closing tag position : handle eventually nested tags */ + int nextOpenIndex = processedLine.indexOf(WIKI_OPEN_METADATA, openIndex + LEN_WIKI_OPEN_METADATA); + while(nextOpenIndex >= 0 && nextOpenIndex < closeIndex) { + closeIndex = processedLine.indexOf(WIKI_CLOSE_METADATA, closeIndex + LEN_WIKI_CLOSE_METADATA); + if(closeIndex < 0) { + /* Parent closing mark is missing: likely a multi-line template inclusion */ + break; + } + nextOpenIndex = processedLine.indexOf(WIKI_OPEN_METADATA, nextOpenIndex + LEN_WIKI_OPEN_METADATA); } + if(closeIndex > 0) { + final String content = processedLine.substring(openIndex + LEN_WIKI_OPEN_METADATA, closeIndex); + if (content.toLowerCase().startsWith("coordinate")) { + // parse Geographical Coordinates as described in + // http://en.wikipedia.org/wiki/Wikipedia:Manual_of_Style_%28dates_and_numbers%29#Geographical_coordinates + // looks like: + // {{Coord|57|18|22.5|N|4|27|32.7|W|display=title}} + // however, such information does not appear as defined above but as: + // {{coordinate|NS=52.205944|EW=0.117593|region=GB-CAM|type=landmark}} + // {{coordinate|NS=43/50/29/N|EW=73/23/17/W|type=landmark|region=US-NY}} + // and if passed through this parser: + // {{Coordinate |NS 45/37/43.0/N |EW. 07/58/41.0/E |type=landmark |region=IT-BI}} ## means: degree/minute/second + // {{Coordinate |NS 51.48994 |EW. 
7.33249 |type=landmark |region=DE-NW}} + final String b[] = content.split("\\|"); + float lon = Float.NaN, lat = Float.NaN; // degree + float lonm = 0.0f, latm = 0.0f; // minutes (including sec as fraction) + String lono = "E", lato = "N"; + String name = ""; + try { + for (final String c : b) { + if (c.toLowerCase().startsWith("name=")) { + name = c.substring(5); + } + if (c.toUpperCase().startsWith("NS=")) { + final String d[] = c.substring(3).split("/"); + if (d.length == 1) {float l = Float.parseFloat(d[0]); if (l < 0) {lato = "S"; l = -l;} lat = (float) Math.floor(l); latm = 60.0f * (l - lat);} + else if (d.length > 1) { //format: NS deg/min/sec/N + lat = Float.parseFloat(d[0]); // degree + if (!d[1].isEmpty()) latm = Float.parseFloat(d[1]); // minutes + if (d.length >= 3 && !d[2].isEmpty()) {latm += (Float.parseFloat(d[2]) / 60.0f);} // sec (check empty because format found "45/10//N" ) + if (d[d.length - 1].toUpperCase().equals("S")) lato = "S"; + } + } + if (c.toUpperCase().startsWith("EW=")) { + final String d[] = c.substring(3).split("/"); + if (d.length == 1) {float l = Float.parseFloat(d[0]); if (l < 0) {lono = "W"; l = -l;} lon = (float) Math.floor(l); lonm = 60.0f * (l - lon);} + else if (d.length > 1) { + lon = Float.parseFloat(d[0]); + if (!d[1].isEmpty()) lonm = Float.parseFloat(d[1]); + if (d.length >= 3 && !d[2].isEmpty()) {lonm += (Float.parseFloat(d[2]) / 60.0f);} + if (d[d.length-1].toUpperCase().equals("W")) {lono = "W";} + } + } + } + } catch (NumberFormatException nsExcept) { + // catch parseFloat exception (may still happen if wiki code contains expressions) + processedLine.delete(closeIndex, closeIndex + LEN_WIKI_CLOSE_METADATA); + processedLine.delete(openIndex, openIndex + LEN_WIKI_OPEN_METADATA); + fromIndex = openIndex; + continue; + } + if (!Float.isNaN(lon) && !Float.isNaN(lat)) { + // replace this with a format that the html parser can understand + final String htmlCoord = (name.length() > 0 ? 
(" " + name) : "") + + WIKI_FORMATTED +" " + lato + " " + lat + "\u00B0 " + latm + "'" + lono + " " + lon + "\u00B0 " + lonm + "'" + WIKI_FORMATTED; + processedLine.replace(openIndex, closeIndex + LEN_WIKI_CLOSE_METADATA, htmlCoord); + + /* Set next position to openIndex as some parameters can still contain nested template inclusion tags */ + fromIndex = openIndex; + continue; + } + fromIndex = closeIndex; // continue with next position + } else { + String processedContent; + /* Any other template inclusion : only remove opening and closing tag and parameter separators */ + int nestedOpenTagIndex = content.indexOf(WIKI_OPEN_METADATA); + int lastNestedCloseTagIndex = content.lastIndexOf(WIKI_CLOSE_METADATA); + if(nestedOpenTagIndex >= 0 && lastNestedCloseTagIndex > 0) { + processedContent = WIKI_FORMATTED + content.substring(0, nestedOpenTagIndex).replace(WIKI_METADATA_PARAMETER_SEPARATOR, ' ').replace('=', ' ') + + content.substring(nestedOpenTagIndex, lastNestedCloseTagIndex) + + content.substring(lastNestedCloseTagIndex).replace(WIKI_METADATA_PARAMETER_SEPARATOR, ' ').replace('=', ' ') + WIKI_FORMATTED; + fromIndex = openIndex; // continue with next nested position + } else { + /* No nested tag : we can now replace parameter separators with spaces in all remaining content */ + processedContent = WIKI_FORMATTED + content.replace(WIKI_METADATA_PARAMETER_SEPARATOR, ' ').replace('=', ' ') + WIKI_FORMATTED; + fromIndex = openIndex + processedContent.length(); // continue with next position + } + processedLine.replace(openIndex, closeIndex + LEN_WIKI_CLOSE_METADATA, processedContent); + } + } else { + /* Multi-line template inclusion : only remove opening tag and parameter separators until eventually first nested tag */ + int nestedOpenTagIndex = processedLine.indexOf(WIKI_OPEN_METADATA, openIndex + LEN_WIKI_OPEN_METADATA); + if(nestedOpenTagIndex >= 0) { + processedLine.replace(openIndex, nestedOpenTagIndex, + WIKI_FORMATTED + + processedLine.substring(openIndex + 
LEN_WIKI_OPEN_METADATA, nestedOpenTagIndex) + .replace(WIKI_METADATA_PARAMETER_SEPARATOR, ' ').replace('=', ' ')); + fromIndex = openIndex; + } else { + processedLine.replace(openIndex, processedLine.length(), WIKI_FORMATTED + + processedLine.substring(openIndex + LEN_WIKI_OPEN_METADATA).replace(WIKI_METADATA_PARAMETER_SEPARATOR, ' ').replace('=', ' ')); + break; + } + } + } - return line; + + /* Handle any eventual multi-line template remaining closing tags */ + fromIndex = 0; + while ((closeIndex = processedLine.indexOf(WIKI_CLOSE_METADATA, fromIndex)) >= 0) { + processedLine.replace(fromIndex, closeIndex, processedLine.substring(fromIndex, closeIndex).replace(WIKI_METADATA_PARAMETER_SEPARATOR, ' ').replace('=', ' ')); + processedLine.delete(closeIndex, closeIndex + LEN_WIKI_CLOSE_METADATA); + fromIndex = closeIndex; + } + + /* Handle any eventual multi-line template remaining parameter lines */ + String result = processedLine.toString(); + if(result.matches("^\\s*\\" + WIKI_METADATA_PARAMETER_SEPARATOR + "\\s*[^\\-\\}\\|].*")) { + result = result.replace(WIKI_METADATA_PARAMETER_SEPARATOR, ' ').replace('=', ' '); + } + return result; } private class TableOfContent { diff --git a/test/java/net/yacy/data/wiki/WikiCodeTest.java b/test/java/net/yacy/data/wiki/WikiCodeTest.java index e7d3c49dd..1bb7c2fec 100644 --- a/test/java/net/yacy/data/wiki/WikiCodeTest.java +++ b/test/java/net/yacy/data/wiki/WikiCodeTest.java @@ -1,6 +1,7 @@ package net.yacy.data.wiki; import org.junit.Test; + import static org.junit.Assert.*; @@ -10,16 +11,18 @@ public class WikiCodeTest { * test geo location metadata convert */ @Test - public void testProcessMetadata() { + public void testProcessMetadataCoordinates() { String[] testmeta = new String[]{ "{{coordinate|NS=52.205944|EW=0.117593|region=GB-CAM|type=landmark}}", // decimal N-E location "{{coordinate|NS=43/50/29/N|EW=73/23/17/W|type=landmark|region=US-NY}}", // N-W location "{{Coordinate |text=DMS |NS=50/7/49/N |EW=6/8/09/E 
|type=landmark |region=BE-WLG |name=Monument des trois Frontières}}", "{{Coordinate |text=DMS |NS= 49.047169|EW=7.899148|region=DE-RP |type=landmark |name=Europadenkmal (Rheinland-Pfalz)}}", + "{{Coordinate |text=DMS |NS= 49.047169|EW=7.899148|region=DE-RP |type=landmark |name={{de}}Europadenkmal (Rheinland-Pfalz)}}",// with nested language template "{{coordinate|NS=0.00000|EW=0.117593}}", // testing equator coord - "{{coordinate|NS=-10.00000|EW=-10.10000}}" // testing S-E location + "{{coordinate|NS=-10.00000|EW=-10.10000}}", // testing S-W location + "{{coordinate|NS=12a5|EW=-10.10000}}" // testing malformed coordinates value }; WikiCode wc = new WikiCode(); @@ -27,10 +30,88 @@ public class WikiCodeTest { String result = wc.transform("http://wiki:8080",testmeta[i]); System.out.println(testmeta[i] + " --> " + result); // simply check if replacement took place, if no coordinate recognized original string is just html encoded - assertFalse(result.contains("#124;")); // simple check - result not containing char code for "{", + assertFalse(result.contains("#123;")); // simple check - result not containing char code for "{", assertFalse(result.contains("#125;")); // simple check - result not containing char code for "}" } } + + /** + * Test multi-line template inclusion processing + */ + @Test + public void testTransformMultilineTemplateInclusion() { + String wikitext = "{{Infobox|Example\n" + + "\n" + + "| name = Example\n" + + "| category = [[Infobox Examples|Example]]\n" + + "\n" + + "| website = {{URL|http://example.com}}\n" + + "}}"; + WikiCode wc = new WikiCode(); + String result = wc.transform("http://wiki:8080", wikitext); + System.out.println(wikitext + " --> " + result); + assertFalse(result.contains("#123;")); // simple check - result not containing char code for "{", + assertFalse(result.contains("#125;")); // simple check - result not containing char code for "}" + } + + /** + * Test single line template inclusion processing + */ + @Test + public void 
testProcessMetadataTransclusion() { + final String[] wikitexts = new String[]{ + "{{Like}}", // most simple template inclusion + "{{Stochastic processes}}", // page name including space + "{{:Stochastic processes}}", // page inclusion with implicit namespace + "{{WP:Assume good faith}}", // page inclusion from Wikipedia namespace + "{{Pagename|parameter1|parameter2|parameter3}}", // with unnamed parameters + "{{Pagename|parameter1=value1|parameter2=value2|parameter3=value3}}", // with named parameters + "{{Template|This is the title text|This is a custom warning line}}", // with parameters including spaces + "{{Special:Recentchangeslinked/General}}", // subpage inclusion + "{{Template1}} text {{Template2}} {{Template3|parameter value1|param2}}", // multiple templates on the same line + "{{Template|[[Page]]}}", // with link parameter + "{{Template|parameter1={{en}}value1|parameter2}}", // nested template inclusion + "{{Template|parameter1={{en|param1|param2=val2}}value1}}", // nested template with parameters inclusion + "{{Template", // Multi-line template inclusion beginning + "simple text {{Template", // Multi-line template inclusion beginning with text before + "{{Template|parameter1={{en}} value1", // Multi-line template inclusion beginning with nested tag + "{{Template|parameter1={{subTemplate", // Multi-line nested template inclusion + "|parameter", // Multi-line template inclusion unnamed parameter line + "|parameter=value", // Multi-line template inclusion named parameter line + "|parameter={{subTemplate|param1|param2}}value", // Multi-line template inclusion with nested template inclusion + "|[[Page]]", // Multi-line template inclusion with unnamed link parameter + "|parameter=[[Page]]", // Multi-line template inclusion with named link parameter + "}}", // Multi-line template inclusion closing + "|lastParameter}}", // Multi-line template inclusion closing with unnamed parameter + "|lastParameter=value}}", // Multi-line template inclusion closing with named 
parameter + "|lastParameter={{en}}value}}", // Multi-line template inclusion closing with nested tag + "}}}}" // Multi-line nested template inclusion closing + }; + + for (String wikitext : wikitexts) { + String result = WikiCode.processMetadata(wikitext); + System.out.println(wikitext + " --> " + result); + // simply check if replacement took place + assertFalse(result.contains("{")); + assertFalse(result.contains("|")); + assertFalse(result.contains("=")); + assertFalse(result.contains("}")); + } + + final String[] wikitextsNotToModify = new String[]{ + "", // empty string + "Simple text", + "
Simple preformatted text
", + "[[Page]]", // link + "{|", // table start + "|-", // new table line + "||", // table cell divider + "|}", // table end + }; + for (String wikitext : wikitextsNotToModify) { + assertEquals("Text sould not have been modified", wikitext, WikiCode.processMetadata(wikitext)); + } + } /** * test header wiki markup