source: trunk/dports/java/apache-solr/files/patch-solr-ja.diff @ 87591

Last change on this file since 87591 was 87591, checked in by hum@…, 9 years ago

apache-solr: update to 3.5.0.

File size: 8.0 KB
RevLine 
[87591]1--- example/solr-ja/conf/schema.xml.orig        2011-11-22 22:02:40.000000000 +0900
2+++ example/solr-ja/conf/schema.xml     2011-11-27 00:08:15.000000000 +0900
3@@ -469,6 +469,92 @@
[83880]4     See http://wiki.apache.org/solr/SpatialSearch
5    -->
6     <fieldtype name="geohash" class="solr.GeoHashField"/>
7+
8+    <!-- configuration for japanese text, using a morphological analyzer
9+      Most possibilities for customization are specified here in the schema.
10+
11+      Note: you can set the default query operator to be OR, AND, or PHRASE:
12+       OR: Use these defaults (autoGeneratePhraseQueries="false", <solrQueryParser defaultOperator="OR"/>)
13+           In this case Solr works like it does with the English language. The default query is OR,
14+           but documents that contain more of the query terms get a special boost. You can probably
15+           use a less aggressive stopwords/stoptags in this case, and it's probably a good idea to use
16+           enablePositionIncrements=true, so that if a user puts a query in quotes, they get a much more
17+           exact phrase query.
18+       AND: Set autoGeneratePhraseQueries=false, but set <solrQueryParser defaultOperator="AND"/> in
19+           your schema.xml. Note if you do this, you should use a more aggressive stopwords/stoptags
20+           list (at least at query-time), otherwise a document might not match simply because it does
21+           not contain a prefix or particle. As in the above case, it's probably a good idea to use
22+           enablePositionIncrements=true for explicit phrase queries from the user.
23+       PHRASE: Set autoGeneratePhraseQueries=true. If you do this, you should probably use both a very
24+           aggressive stopwords list, and you should probably also set enablePositionIncrements=false
25+           everywhere.  Otherwise, even documents that contain the query's phrase in exact order will
26+           not match because of slightly different grammatical structure.
27+    -->
28+    <fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
29+      <analyzer>
30+        <!-- map characters before the tokenizer:
31+             Optionally, instead of the JapaneseWidthFilterFactory, you can choose to do the width
32+             mappings before the text is sent to the tokenizer.
33+        <charFilter class="solr.MappingCharFilterFactory" mapping="@gosen_path@/conf/mapping-japanese.txt"/>
34+        -->
35+
36+        <!-- morphological tokenizer: sets the SURFACE form as the token, but also sets these attributes:
37+             BasicFormAttribute, ConjugationAttribute, PartOfSpeechAttribute, PronunciationsAttribute,
38+             ReadingsAttribute, and SentenceStartAttribute.
39+        -->
40+        <tokenizer class="solr.JapaneseTokenizerFactory"/>
41+
42+        <!-- normalizes CJK width differences:
43+             1. Folds fullwidth ASCII variants into the equivalent basic latin
44+             2. Folds halfwidth Katakana variants into the equivalent kana
45+
46+             Note: alternatively you can use a MappingCharFilter before the tokenizer for this, but please note
47+             that mapping characters can change how Sen tokenizes text.
48+        -->
49+        <filter class="solr.JapaneseWidthFilterFactory"/>
50+
51+        <!-- the punctuation filter removes all-punctuation tokens based on Unicode properties.
52+             punctuation tokens are tagged as "unknown", and it's better to do this than to remove
53+             tokens with an unknown pos (as they might be valuable!). Because this punctuation
54+             usually signifies a phrase or sentence boundary, enablePositionIncrements can be
55+             used to prevent phrase queries from matching across natural phrase/sentence boundaries -->
56+        <filter class="solr.JapanesePunctuationFilterFactory" enablePositionIncrements="true"/>
57+
58+        <!-- this is a part-of-speech based stopfilter, it removes any tokens that have a certain
59+             part of speech. you can set enablePositionIncrements for tighter phrase queries -->
60+        <filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="@gosen_path@/conf/stoptags_ja.txt" enablePositionIncrements="true"/>
61+       
62+        <!-- a standard stopfilter, to specify specific stopwords. -->
63+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="@gosen_path@/conf/stopwords_ja.txt" enablePositionIncrements="true"/>
64+
65+        <!-- alternatively, instead of using a part-of-speech based stopfilter, you can use a
66+             part-of-speech based keepfilter: specifying only the parts of speech you wish to index.
67+             anything else will be removed. HOWEVER: this could be a little dangerous, because if
68+             we upgrade ipadic they might add some new tags (the tags are fairly specific), and suddenly
69+             things that you were indexing before are no longer being indexed. It's recommended to
70+             use the part-of-speech based stopfilter above if at all possible, for safety.
71+        <filter class="solr.JapanesePartOfSpeechKeepFilterFactory" tags="@gosen_path@/conf/keeptags_ja.txt" enablePositionIncrements="true"/>
72+        -->
73+
74+        <!-- before any stemming/lemmatization, you can protect words from being modified by specifying
75+             a protwords.txt.
76+        <filter class="solr.KeywordMarkerFilterFactory" protected="@gosen_path@/conf/protwords_ja.txt" ignoreCase="false"/>
77+
78+             or you can also supply a custom stem dictionary for inflected forms (tab separated). No
79+             further stemming/lemmatization will modify this.
80+        <filter class="solr.StemmerOverrideFilterFactory" dictionary="dictionary.txt" ignoreCase="false"/>
81+        -->
82+
83+        <!-- the basic form filter converts inflected verbs and adjectives to their dictionary citation form. -->
84+        <filter class="solr.JapaneseBasicFormFilterFactory"/>
85+
86+        <!-- this filter heuristically normalizes katakana forms with a final prolonged sound mark -->
87+        <filter class="solr.JapaneseKatakanaStemFilterFactory"/>
88+
89+        <!-- you might want to lowercase for any english text content you have -->
90+        <filter class="solr.LowerCaseFilterFactory"/>
91+      </analyzer>
92+    </fieldType>
93  </types>
94 
95 
[87591]96@@ -534,7 +620,7 @@
[83880]97 
98    <!-- catchall field, containing all other searchable text fields (implemented
99         via copyField further on in this schema  -->
100-   <field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/>
101+   <field name="text" type="text_ja" indexed="true" stored="false" multiValued="true"/>
102 
103    <!-- catchall text field that indexes tokens both normally and in reverse for efficient
104         leading wildcard queries. -->
[87591]105--- example/solr-ja/conf/solrconfig.xml.orig    2011-11-22 22:02:40.000000000 +0900
106+++ example/solr-ja/conf/solrconfig.xml 2011-11-27 00:08:15.000000000 +0900
107@@ -86,6 +86,7 @@
[83880]108        is found that matches, it will be ignored
109     -->
110   <lib dir="../../contrib/clustering/lib/" />
111+  <lib dir="@gosen_path@/lib/" />
[87591]112   <lib dir="/total/crap/dir/ignored" />
113 
[83880]114   <!-- an exact path can be used to specify a specific file.  This
[87591]115@@ -791,6 +792,7 @@
[83880]116        <str name="wt">velocity</str>
117 
118        <str name="v.template">browse</str>
119+       <str name="v.properties">velocity.properties</str>
120        <str name="v.layout">layout</str>
121        <str name="title">Solritas</str>
122 
123--- example/solr-ja/conf/velocity/head.vm.orig  2011-09-03 23:57:07.000000000 +0900
124+++ example/solr-ja/conf/velocity/head.vm       2011-09-05 23:56:41.000000000 +0900
125@@ -32,7 +32,7 @@
126            extraParams:{
127              'terms.prefix': function() { return $("\#q").val();},
128              'terms.sort': 'count',
129-             'terms.fl': 'name',
130+             'terms.fl': 'text',
131              'wt': 'velocity',
132              'v.template': 'suggest'
133            }
134--- example/solr-ja/conf/velocity/suggest.vm.orig       2011-09-03 23:57:07.000000000 +0900
135+++ example/solr-ja/conf/velocity/suggest.vm    2011-09-05 23:57:16.000000000 +0900
136@@ -1,3 +1,3 @@
137-#foreach($t in $response.response.terms.name)
138+#foreach($t in $response.response.terms.text)
139 $t.key
140 #end
141\ No newline at end of file
Note: See TracBrowser for help on using the repository browser.