source: trunk/dports/java/apache-solr/files/patch-solr-ja.diff @ 87591

Last change on this file since 87591 was 87591, checked in by hum@…, 9 years ago

apache-solr: update to 3.5.0.

File size: 8.0 KB
  • example/solr-ja/conf/schema.xml

    old new  
    469469    See
    470470   -->
    471471    <fieldtype name="geohash" class="solr.GeoHashField"/>
     473    <!-- configuration for japanese text, using a morphological analyzer
     474      Most possibilities for customization are specified here in the schema.
     476      Note: you can set the default query operator to be OR, AND, or PHRASE:
     477       OR: Use these defaults (autoGeneratePhraseQueries="false", <solrQueryParser defaultOperator="OR"/>)
     478           In this case Solr works like it does with the English language. The default query is OR,
     479           but documents that contain more of the query terms get a special boost. You can probably
     480           use a less aggressive stopwords/stoptags in this case, and it's probably a good idea to use
     481           enablePositionIncrements=true, so that if a user puts a query in quotes, they get a much more
     482           exact phrase query.
     483       AND: Set autoGeneratePhraseQueries=false, but set <solrQueryParser defaultOperator="AND"/> in
     484           your schema.xml. Note if you do this, you should use a more aggressive stopwords/stoptags
     485           list (at least at query-time), otherwise a document might not match simply because it does
     486           not contain a prefix or particle. As in the above case, it's probably a good idea to use
     487           enablePositionIncrements=true for explicit phrase queries from the user.
     488       PHRASE: Set autoGeneratePhraseQueries=true. If you do this, you should probably use a very
     489           aggressive stopwords list, and you should probably also set enablePositionIncrements=false
     490           everywhere.  Otherwise, even documents that contain the query's phrase in exact order will
     491           not match because of slightly different grammatical structure.
     492    -->
     493    <fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
     494      <analyzer>
     495        <!-- map characters before the tokenizer:
     496             Optionally, instead of the JapaneseWidthFactory, you can choose to do the width
     497             mappings before the text is sent to the tokenizer.
     498        <charFilter class="solr.MappingCharFilterFactory" mapping="@gosen_path@/conf/mapping-japanese.txt"/>
     499        -->
     501        <!-- morphological tokenizer: sets the SURFACE form as the token, but also sets these attributes:
     502             BasicFormAttribute, ConjugationAttribute, PartOfSpeechAttribute, PronunciationsAttribute,
     503             ReadingsAttribute, and SentenceStartAttribute.
     504        -->
     505        <tokenizer class="solr.JapaneseTokenizerFactory"/>
     507        <!-- normalizes CJK width differences:
     508             1. Folds fullwidth ASCII variants into the equivalent basic latin
     509             2. Folds halfwidth Katakana variants into the equivalent kana
     511             Note: alternatively you can use a MappingCharFilter before the tokenizer for this, but please note
     512             that mapping characters can change how Sen tokenizes text.
     513        -->
     514        <filter class="solr.JapaneseWidthFilterFactory"/>
     516        <!-- the punctuation filter removes all-punctuation tokens based on Unicode properties.
     517             punctuation tokens are tagged as "unknown", and it's better to do this than to remove
     518             tokens with an unknown pos (as they might be valuable!). Because this punctuation
     519             usually signifies a phrase or sentence boundary, enablePositionIncrements can be
     520             used to prevent phrase queries from matching across natural phrase/sentence boundaries -->
     521        <filter class="solr.JapanesePunctuationFilterFactory" enablePositionIncrements="true"/>
     523        <!-- this is a part-of-speech based stopfilter, it removes any tokens that have a certain
     524             part of speech. you can set enablePositionIncrements for tighter phrase queries -->
     525        <filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="@gosen_path@/conf/stoptags_ja.txt" enablePositionIncrements="true"/>
     527        <!-- a standard stopfilter, to specify specific stopwords. -->
     528        <filter class="solr.StopFilterFactory" ignoreCase="true" words="@gosen_path@/conf/stopwords_ja.txt" enablePositionIncrements="true"/>
     530        <!-- alternatively, instead of using a part-of-speech based stopfilter, you can use a
     531             part-of-speech based keepfilter: specifying only the parts of speech you wish to index.
     532             anything else will be removed. HOWEVER: this could be a little dangerous, because if
     533             we upgrade ipadic they might add some new tags (the tags are fairly specific), and suddenly
     534             things that you were indexing before are no longer being indexed. It's recommended to
     535             use the part-of-speech based stopfilter above if at all possible, for safety.
     536        <filter class="solr.JapanesePartOfSpeechKeepFilterFactory" tags="@gosen_path@/conf/keeptags_ja.txt" enablePositionIncrements="true"/>
     537        -->
     539        <!-- before any stemming/lemmatization, you can protect words from being modified by specifying
     540             a protwords.txt.
     541        <filter class="solr.KeywordMarkerFilterFactory" protected="@gosen_path@/conf/protwords_ja.txt" ignoreCase="false"/>
     543             or you can also supply a custom stem dictionary for inflected forms (tab separated). No
     544             further stemming/lemmatization will modify this.
     545        <filter class="solr.StemmerOverrideFilterFactory" dictionary="dictionary.txt" ignoreCase="false"/>
     546        -->
     548        <!-- the basic form filter converts inflected verbs and adjectives to their dictionary citation form. -->
     549        <filter class="solr.JapaneseBasicFormFilterFactory"/>
     551        <!-- this filter heuristically normalizes katakana forms with a final prolonged sound mark -->
     552        <filter class="solr.JapaneseKatakanaStemFilterFactory"/>
     554        <!-- you might want to lowercase for any english text content you have -->
     555        <filter class="solr.LowerCaseFilterFactory"/>
     556      </analyzer>
     557    </fieldType>
    472558 </types>
    535621   <!-- catchall field, containing all other searchable text fields (implemented
    536622        via copyField further on in this schema)  -->
    537    <field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/>
     623   <field name="text" type="text_ja" indexed="true" stored="false" multiValued="true"/>
    539625   <!-- catchall text field that indexes tokens both normally and in reverse for efficient
    540626        leading wildcard queries. -->
  • example/solr-ja/conf/solrconfig.xml

    old new  
    8686       is found that matches, it will be ignored
    8787    -->
    8888  <lib dir="../../contrib/clustering/lib/" />
     89  <lib dir="@gosen_path@/lib/" />
    8990  <lib dir="/total/crap/dir/ignored" />
    9192  <!-- an exact path can be used to specify a specific file.  This
    791792       <str name="wt">velocity</str>
    793794       <str name="v.template">browse</str>
     795       <str name=""></str>
    794796       <str name="v.layout">layout</str>
    795797       <str name="title">Solritas</str>
  • example/solr-ja/conf/velocity/head.vm

    old new  
    3232           extraParams:{
    3333             'terms.prefix': function() { return $("\#q").val();},
    3434             'terms.sort': 'count',
    35              'terms.fl': 'name',
     35             'terms.fl': 'text',
    3636             'wt': 'velocity',
    3737             'v.template': 'suggest'
    3838           }
  • example/solr-ja/conf/velocity/suggest.vm

    old new  
    1 #foreach($t in $response.response.terms.name)
     1#foreach($t in $response.response.terms.text)
     4 No newline at end of file
Note: See TracBrowser for help on using the repository browser.