From 5a3d5cb92cfeb3ada006ff5c12238d5f7be0b9c2 Mon Sep 17 00:00:00 2001 From: luccioman Date: Tue, 16 Apr 2019 10:25:48 +0200 Subject: [PATCH] Upgraded Solr config files with the ones provided by Solr release Fixes #292 --- defaults/solr/elevate.xml | 24 ++++++++++++--------- defaults/solr/lang/contractions_fr.txt | 6 ++++++ defaults/solr/lang/stopwords_da.txt | 2 ++ defaults/solr/lang/stopwords_de.txt | 2 ++ defaults/solr/lang/stopwords_es.txt | 2 ++ defaults/solr/lang/stopwords_fi.txt | 2 ++ defaults/solr/lang/stopwords_fr.txt | 5 ++++- defaults/solr/lang/stopwords_hu.txt | 2 ++ defaults/solr/lang/stopwords_it.txt | 2 ++ defaults/solr/lang/stopwords_nl.txt | 2 ++ defaults/solr/lang/stopwords_no.txt | 2 ++ defaults/solr/lang/stopwords_pt.txt | 2 ++ defaults/solr/lang/stopwords_ru.txt | 2 ++ defaults/solr/lang/stopwords_sv.txt | 2 ++ defaults/solr/lang/userdict_ja.txt | 29 ++++++++++++++++++++++++++ defaults/solr/synonyms.txt | 2 +- 16 files changed, 76 insertions(+), 12 deletions(-) create mode 100644 defaults/solr/lang/userdict_ja.txt diff --git a/defaults/solr/elevate.xml b/defaults/solr/elevate.xml index 25d5cebe4..2c09ebed6 100644 --- a/defaults/solr/elevate.xml +++ b/defaults/solr/elevate.xml @@ -24,15 +24,19 @@ --> - - - - - - - - - - + + diff --git a/defaults/solr/lang/contractions_fr.txt b/defaults/solr/lang/contractions_fr.txt index 722db5883..f1bba51b2 100644 --- a/defaults/solr/lang/contractions_fr.txt +++ b/defaults/solr/lang/contractions_fr.txt @@ -7,3 +7,9 @@ qu n s j +d +c +jusqu +quoiqu +lorsqu +puisqu diff --git a/defaults/solr/lang/stopwords_da.txt b/defaults/solr/lang/stopwords_da.txt index a3ff5fe12..42e6145b9 100644 --- a/defaults/solr/lang/stopwords_da.txt +++ b/defaults/solr/lang/stopwords_da.txt @@ -4,6 +4,8 @@ | Also see http://www.opensource.org/licenses/bsd-license.html | - Encoding was converted to UTF-8. | - This notice was added. 
+ | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" | A Danish stop word list. Comments begin with vertical bar. Each stop | word is at the start of a line. diff --git a/defaults/solr/lang/stopwords_de.txt b/defaults/solr/lang/stopwords_de.txt index f77038418..86525e7ae 100644 --- a/defaults/solr/lang/stopwords_de.txt +++ b/defaults/solr/lang/stopwords_de.txt @@ -4,6 +4,8 @@ | Also see http://www.opensource.org/licenses/bsd-license.html | - Encoding was converted to UTF-8. | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" | A German stop word list. Comments begin with vertical bar. Each stop | word is at the start of a line. diff --git a/defaults/solr/lang/stopwords_es.txt b/defaults/solr/lang/stopwords_es.txt index 2db147600..487d78c8d 100644 --- a/defaults/solr/lang/stopwords_es.txt +++ b/defaults/solr/lang/stopwords_es.txt @@ -4,6 +4,8 @@ | Also see http://www.opensource.org/licenses/bsd-license.html | - Encoding was converted to UTF-8. | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" | A Spanish stop word list. Comments begin with vertical bar. Each stop | word is at the start of a line. diff --git a/defaults/solr/lang/stopwords_fi.txt b/defaults/solr/lang/stopwords_fi.txt index addad798c..4372c9a05 100644 --- a/defaults/solr/lang/stopwords_fi.txt +++ b/defaults/solr/lang/stopwords_fi.txt @@ -4,6 +4,8 @@ | Also see http://www.opensource.org/licenses/bsd-license.html | - Encoding was converted to UTF-8. | - This notice was added. 
+ | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" | forms of BE diff --git a/defaults/solr/lang/stopwords_fr.txt b/defaults/solr/lang/stopwords_fr.txt index c00837ea9..749abae68 100644 --- a/defaults/solr/lang/stopwords_fr.txt +++ b/defaults/solr/lang/stopwords_fr.txt @@ -4,6 +4,8 @@ | Also see http://www.opensource.org/licenses/bsd-license.html | - Encoding was converted to UTF-8. | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" | A French stop word list. Comments begin with vertical bar. Each stop | word is at the start of a line. @@ -167,7 +169,8 @@ eussent | Later additions (from Jean-Christophe Deschamps) ceci | this -celà  | that +cela | that +celà | that cet | this cette | this ici | here diff --git a/defaults/solr/lang/stopwords_hu.txt b/defaults/solr/lang/stopwords_hu.txt index 1a96f1db6..37526da8a 100644 --- a/defaults/solr/lang/stopwords_hu.txt +++ b/defaults/solr/lang/stopwords_hu.txt @@ -4,6 +4,8 @@ | Also see http://www.opensource.org/licenses/bsd-license.html | - Encoding was converted to UTF-8. | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" | Hungarian stop word list | prepared by Anna Tordai diff --git a/defaults/solr/lang/stopwords_it.txt b/defaults/solr/lang/stopwords_it.txt index 4cb5b0891..1219cc773 100644 --- a/defaults/solr/lang/stopwords_it.txt +++ b/defaults/solr/lang/stopwords_it.txt @@ -4,6 +4,8 @@ | Also see http://www.opensource.org/licenses/bsd-license.html | - Encoding was converted to UTF-8. | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" | An Italian stop word list. Comments begin with vertical bar. Each stop | word is at the start of a line. 
diff --git a/defaults/solr/lang/stopwords_nl.txt b/defaults/solr/lang/stopwords_nl.txt index f4d61f509..47a2aeacf 100644 --- a/defaults/solr/lang/stopwords_nl.txt +++ b/defaults/solr/lang/stopwords_nl.txt @@ -4,6 +4,8 @@ | Also see http://www.opensource.org/licenses/bsd-license.html | - Encoding was converted to UTF-8. | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" | A Dutch stop word list. Comments begin with vertical bar. Each stop | word is at the start of a line. diff --git a/defaults/solr/lang/stopwords_no.txt b/defaults/solr/lang/stopwords_no.txt index e76f36e69..a7a2c28ba 100644 --- a/defaults/solr/lang/stopwords_no.txt +++ b/defaults/solr/lang/stopwords_no.txt @@ -4,6 +4,8 @@ | Also see http://www.opensource.org/licenses/bsd-license.html | - Encoding was converted to UTF-8. | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" | A Norwegian stop word list. Comments begin with vertical bar. Each stop | word is at the start of a line. diff --git a/defaults/solr/lang/stopwords_pt.txt b/defaults/solr/lang/stopwords_pt.txt index 276c1b446..acfeb01af 100644 --- a/defaults/solr/lang/stopwords_pt.txt +++ b/defaults/solr/lang/stopwords_pt.txt @@ -4,6 +4,8 @@ | Also see http://www.opensource.org/licenses/bsd-license.html | - Encoding was converted to UTF-8. | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" | A Portuguese stop word list. Comments begin with vertical bar. Each stop | word is at the start of a line. diff --git a/defaults/solr/lang/stopwords_ru.txt b/defaults/solr/lang/stopwords_ru.txt index 643076934..55271400c 100644 --- a/defaults/solr/lang/stopwords_ru.txt +++ b/defaults/solr/lang/stopwords_ru.txt @@ -4,6 +4,8 @@ | Also see http://www.opensource.org/licenses/bsd-license.html | - Encoding was converted to UTF-8. | - This notice was added. 
+ | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" | a russian stop word list. comments begin with vertical bar. each stop | word is at the start of a line. diff --git a/defaults/solr/lang/stopwords_sv.txt b/defaults/solr/lang/stopwords_sv.txt index 22bddfd8c..096f87f67 100644 --- a/defaults/solr/lang/stopwords_sv.txt +++ b/defaults/solr/lang/stopwords_sv.txt @@ -4,6 +4,8 @@ | Also see http://www.opensource.org/licenses/bsd-license.html | - Encoding was converted to UTF-8. | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" | A Swedish stop word list. Comments begin with vertical bar. Each stop | word is at the start of a line. diff --git a/defaults/solr/lang/userdict_ja.txt b/defaults/solr/lang/userdict_ja.txt new file mode 100644 index 000000000..6f0368e4d --- /dev/null +++ b/defaults/solr/lang/userdict_ja.txt @@ -0,0 +1,29 @@ +# +# This is a sample user dictionary for Kuromoji (JapaneseTokenizer) +# +# Add entries to this file in order to override the statistical model in terms +# of segmentation, readings and part-of-speech tags. Notice that entries do +# not have weights since they are always used when found. This is by-design +# in order to maximize ease-of-use. +# +# Entries are defined using the following CSV format: +# <text>,<token 1> ... <token n>,<reading 1> ... <reading n>,<part-of-speech tag> +# +# Notice that a single half-width space separates tokens and readings, and +# that the number tokens and readings must match exactly. +# +# Also notice that multiple entries with the same <text> is undefined. +# +# Whitespace only lines are ignored. Comments are not allowed on entry lines. 
+# + +# Custom segmentation for kanji compounds +日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞 +関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞 + +# Custom segmentation for compound katakana +トートバッグ,トート バッグ,トート バッグ,かずカナ名詞 +ショルダーバッグ,ショルダー バッグ,ショルダー バッグ,かずカナ名詞 + +# Custom reading for former sumo wrestler +朝青龍,朝青龍,アサショウリュウ,カスタム人名 diff --git a/defaults/solr/synonyms.txt b/defaults/solr/synonyms.txt index 7f7212830..eab4ee875 100644 --- a/defaults/solr/synonyms.txt +++ b/defaults/solr/synonyms.txt @@ -21,7 +21,7 @@ fooaaa,baraaa,bazaaa GB,gib,gigabyte,gigabytes MB,mib,megabyte,megabytes Television, Televisions, TV, TVs -#notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming +#notice we use "gib" instead of "GiB" so any WordDelimiterGraphFilter coming #after us won't split it into two words. # Synonym mappings can be used for spelling correction too