Ticket #31155: patch-solr-ja.diff

File patch-solr-ja.diff, 8.0 KB (added by humem (humem), 13 years ago)
  • example/solr-ja/conf/schema.xml

    old new  
    467467    See http://wiki.apache.org/solr/SpatialSearch
    468468   -->
    469469    <fieldtype name="geohash" class="solr.GeoHashField"/>
     470
     471    <!-- configuration for japanese text, using a morphological analyzer
     472      Most possibilities for customization are specified here in the schema.
     473
     474      Note: you can set the default query operator to be OR, AND, or PHRASE:
     475       OR: Use these defaults (autoGeneratePhraseQueries="false", <solrQueryParser defaultOperator="OR"/>).
     476           In this case Solr works like it does with the English language. The default query is OR,
     477           but documents that contain more of the query terms get a special boost. You can probably
     478           use a less aggressive stopwords/stoptags in this case, and it's probably a good idea to use
     479           enablePositionIncrements=true, so that if a user puts a query in quotes, they get a much more
     480           exact phrase query.
     481       AND: Set autoGeneratePhraseQueries=false, but set <solrQueryParser defaultOperator="AND"/> in
     482           your schema.xml. Note if you do this, you should use a more aggressive stopwords/stoptags
     483           list (at least at query-time), otherwise a document might not match simply because it does
     484           not contain a prefix or particle. As in the above case, it's probably a good idea to use
     485           enablePositionIncrements=true for explicit phrase queries from the user.
     486       PHRASE: Set autoGeneratePhraseQueries=true. If you do this, you should probably use both a very
     487           aggressive stopwords list, and you should probably also set enablePositionIncrements=false
     488           everywhere.  Otherwise, even documents that contain the query's phrase in exact order will
     489           not match because of slightly different grammatical structure.
     490    -->
     491    <fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
     492      <analyzer>
     493        <!-- map characters before the tokenizer:
     494             Optionally, instead of the JapaneseWidthFilterFactory, you can choose to do the width
     495             mappings before the text is sent to the tokenizer.
     496        <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-japanese.txt"/>
     497        -->
     498
     499        <!-- morphological tokenizer: sets the SURFACE form as the token, but also sets these attributes:
     500             BasicFormAttribute, ConjugationAttribute, PartOfSpeechAttribute, PronunciationsAttribute,
     501             ReadingsAttribute, and SentenceStartAttribute.
     502        -->
     503        <tokenizer class="solr.JapaneseTokenizerFactory"/>
     504
     505        <!-- normalizes CJK width differences:
     506             1. Folds fullwidth ASCII variants into the equivalent basic latin
     507             2. Folds halfwidth Katakana variants into the equivalent kana
     508
     509             Note: alternatively you can use a MappingCharFilter before the tokenizer for this, but please note
     510             that mapping characters can change how Sen tokenizes text.
     511        -->
     512        <filter class="solr.JapaneseWidthFilterFactory"/>
     513
     514        <!-- the punctuation filter removes all-punctuation tokens based on Unicode properties.
     515             punctuation tokens are tagged as "unknown", and it's better to do this than to remove
     516             tokens with an unknown pos (as they might be valuable!). Because this punctuation
     517             usually signifies a phrase or sentence boundary, enablePositionIncrements can be
     518             used to prevent phrase queries from matching across natural phrase/sentence boundaries -->
     519        <filter class="solr.JapanesePunctuationFilterFactory" enablePositionIncrements="true"/>
     520
     521        <!-- this is a part-of-speech based stopfilter: it removes any tokens that have a certain
     522             part of speech. you can set enablePositionIncrements for tighter phrase queries -->
     523        <filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="stoptags_ja.txt" enablePositionIncrements="true"/>
     524       
     525        <!-- a standard stopfilter, to specify specific stopwords. -->
     526        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords_ja.txt" enablePositionIncrements="true"/>
     527
     528        <!-- alternatively, instead of using a part-of-speech based stopfilter, you can use a
     529             part-of-speech based keepfilter: specifying only the parts of speech you wish to index.
     530             anything else will be removed. HOWEVER: this could be a little dangerous, because if
     531             we upgrade ipadic they might add some new tags (the tags are fairly specific), and suddenly
     532             things that you were indexing before are no longer being indexed. It's recommended to
     533             use the part-of-speech based stopfilter above if at all possible, for safety.
     534        <filter class="solr.JapanesePartOfSpeechKeepFilterFactory" tags="keeptags_ja.txt" enablePositionIncrements="true"/>
     535        -->
     536
     537        <!-- before any stemming/lemmatization, you can protect words from being modified by specifying
     538             a protwords.txt.
     539        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords_ja.txt" ignoreCase="false"/>
     540
     541             or you can also supply a custom stem dictionary for inflected forms (tab separated). No
     542             further stemming/lemmatization will modify this.
     543        <filter class="solr.StemmerOverrideFilterFactory" dictionary="dictionary.txt" ignoreCase="false"/>
     544        -->
     545
     546        <!-- the basic form filter converts inflected verbs and adjectives to their dictionary citation form. -->
     547        <filter class="solr.JapaneseBasicFormFilterFactory"/>
     548
     549        <!-- this filter heuristically normalizes katakana forms with a final prolonged sound mark -->
     550        <filter class="solr.JapaneseKatakanaStemFilterFactory"/>
     551
     552        <!-- you might want to lowercase for any english text content you have -->
     553        <filter class="solr.LowerCaseFilterFactory"/>
     554      </analyzer>
     555    </fieldType>
    470556 </types>
    471557
    472558
     
    533619
    534620   <!-- catchall field, containing all other searchable text fields (implemented
    535621        via copyField further on in this schema  -->
    536    <field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/>
     622   <field name="text" type="text_ja" indexed="true" stored="false" multiValued="true"/>
    537623
    538624   <!-- catchall text field that indexes tokens both normally and in reverse for efficient
    539625        leading wildcard queries. -->
  • example/solr-ja/conf/solrconfig.xml

    old new  
    8080       is found that matches, it will be ignored
    8181    -->
    8282  <lib dir="../../contrib/clustering/lib/" />
     83  <lib dir="../../contrib/lucene-gosen/lib/" />
    8384  <lib dir="/total/crap/dir/ignored" />
    8485  <!-- an exact path can be used to specify a specific file.  This
    8586       will cause a serious error to be logged if it can't be loaded.
     
    780781       <str name="wt">velocity</str>
    781782
    782783       <str name="v.template">browse</str>
     784       <str name="v.properties">velocity.properties</str>
    783785       <str name="v.layout">layout</str>
    784786       <str name="title">Solritas</str>
    785787
  • example/solr-ja/conf/velocity/head.vm

    old new  
    3232           extraParams:{
    3333             'terms.prefix': function() { return $("\#q").val();},
    3434             'terms.sort': 'count',
    35              'terms.fl': 'name',
     35             'terms.fl': 'text',
    3636             'wt': 'velocity',
    3737             'v.template': 'suggest'
    3838           }
  • example/solr-ja/conf/velocity/suggest.vm

    old new  
    1 #foreach($t in $response.response.terms.name)
     1#foreach($t in $response.response.terms.text)
    22$t.key
    33#end
     4 No newline at end of file