Changeset 91889 for trunk/dports/java


Ignore:
Timestamp:
Apr 12, 2012, 4:20:30 PM (7 years ago)
Author:
hum@…
Message:

apache-solr: update to 3.6.0; remove the dependency on lucene-gosen in ja variant.

Location:
trunk/dports/java/apache-solr
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/dports/java/apache-solr/Portfile

    r88853 r91889  
    55
    66name                apache-solr
    7 version             3.5.0
     7version             3.6.0
    88categories          java textproc
    99platforms           darwin
     
    1818master_sites        apache:lucene/solr/${version}/
    1919extract.suffix      .tgz
    20 checksums           rmd160  365d4b27753375ea3a39b9d42c06b80dce474731 \
    21                     sha256  804f3ba9d1296f81388605a79538b7362355693fbdd03b7b2dbf9a706bf1d1d0
     20checksums           rmd160  f54aee9e0d0196e7e96408b40ae025fc69478229 \
     21                    sha256  3acac4323ba3dbfa153d8ef01f156bab9b0eccf1b1f1f03e91b8b6739d3dc6c6
    2222
    2323# set the destination paths.
     
    5555set solr_home_ja    ${solr_home}-ja
    5656
    57 variant ja description {Add Japanese settings with lucene-gosen} {
    58     depends_run-append  port:lucene-gosen
     57variant ja description {Add Japanese settings} {
    5958    # create Japanese solr home 'solr-ja'.
    6059    post-extract {
     
    6867    }
    6968    post-destroot {
    70         # set the lucene-gosen configuration path to config files.
    71         foreach config {schema.xml solrconfig.xml} {
    72             reinplace "s|@gosen_path@|${java_basepath}/lucene-gosen|g" \
    73                 ${destroot}${solr_home_ja}/conf/${config}
    74         }
    7569        # install a property file for UTF-8 encoding.
    7670        copy ${filespath}/velocity.properties ${destroot}${solr_home_ja}/conf
  • trunk/dports/java/apache-solr/files/patch-solr-ja.diff

    r87591 r91889  
    1 --- example/solr-ja/conf/schema.xml.orig        2011-11-22 22:02:40.000000000 +0900
    2 +++ example/solr-ja/conf/schema.xml     2011-11-27 00:08:15.000000000 +0900
    3 @@ -469,6 +469,92 @@
    4      See http://wiki.apache.org/solr/SpatialSearch
    5     -->
    6      <fieldtype name="geohash" class="solr.GeoHashField"/>
    7 +
    8 +    <!-- configuration for japanese text, using a morphological analyzer
    9 +      Most possibilities for customization are specified here in the schema.
    10 +
    11 +      Note: you can set the default query operator to be OR, AND, or PHRASE:
    12 +       OR: Use these defaults (autoGeneratePhraseQueries="false", <solrQueryParser defaultOperator="OR"/>)
    13 +           In this case Solr works like it does with the English language. The default query is OR,
    14 +           but documents that contain more of the query terms get a special boost. You can probably
    15 +           use a less aggressive stopwords/stoptags in this case, and it's probably a good idea to use
    16 +           enablePositionIncrements=true, so that if a user puts a query in quotes, they get a much more
    17 +           exact phrase query.
    18 +       AND: Set autoGeneratePhraseQueries=false, but set <solrQueryParser defaultOperator="AND"/> in
    19 +           your schema.xml. Note if you do this, you should use a more aggressive stopwords/stoptags
    20 +           list (at least at query-time), otherwise a document might not match simply because it does
    21 +           not contain a prefix or particle. As in the above case, it's probably a good idea to use
    22 +           enablePositionIncrements=true for explicit phrase queries from the user.
    23 +       PHRASE: Set autoGeneratePhraseQueries=true. If you do this, you should probably use both a very
    24 +           aggressive stopwords list, and you should probably also set enablePositionIncrements=false
    25 +           everywhere.  Otherwise, even documents that contain the query's phrase in exact order will
    26 +           not match because of slightly different grammatical structure.
    27 +    -->
    28 +    <fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
    29 +      <analyzer>
    30 +        <!-- map characters before the tokenizer:
    31 +             Optionally, instead of the JapaneseWidthFactory, you can choose to do the width
    32 +             mappings before the text is sent to the tokenizer.
    33 +        <charFilter class="solr.MappingCharFilterFactory" mapping="@gosen_path@/conf/mapping-japanese.txt"/>
    34 +        -->
    35 +
    36 +        <!-- morphological tokenizer: sets the SURFACE form as the token, but also sets these attributes:
    37 +             BasicFormAttribute, ConjugationAttribute, PartOfSpeechAttribute, PronunciationsAttribute,
    38 +             ReadingsAttribute, and SentenceStartAttribute.
    39 +        -->
    40 +        <tokenizer class="solr.JapaneseTokenizerFactory"/>
    41 +
    42 +        <!-- normalizes CJK width differences:
    43 +             1. Folds fullwidth ASCII variants into the equivalent basic latin
    44 +             2. Folds halfwidth Katakana variants into the equivalent kana
    45 +
    46 +             Note: alternatively you can use a MappingCharFilter before the tokenizer for this, but please note
    47 +             that mapping characters can change how Sen tokenizes text.
    48 +        -->
    49 +        <filter class="solr.JapaneseWidthFilterFactory"/>
    50 +
    51 +        <!-- the punctuation filter removes all-punctuation tokens based on Unicode properties.
    52 +             punctuation tokens are tagged as "unknown", and it's better to do this than to remove
    53 +             tokens with an unknown pos (as they might be valuable!). Because this punctuation
    54 +             usually signifies a phrase or sentence boundary, enablePositionIncrements can be
    55 +             used to prevent phrase queries from matching across natural phrase/sentence boundaries -->
    56 +        <filter class="solr.JapanesePunctuationFilterFactory" enablePositionIncrements="true"/>
    57 +
    58 +        <!-- this is a part-of-speech based stopfilter, it removes any tokens that have a certain
    59 +             part of speech. you can set enablePositionIncrements for tighter phrase queries -->
    60 +        <filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="@gosen_path@/conf/stoptags_ja.txt" enablePositionIncrements="true"/>
    61 +       
    62 +        <!-- a standard stopfilter, to specify specific stopwords. -->
    63 +        <filter class="solr.StopFilterFactory" ignoreCase="true" words="@gosen_path@/conf/stopwords_ja.txt" enablePositionIncrements="true"/>
    64 +
    65 +        <!-- alternatively, instead of using a part-of-speech based stopfilter, you can use a
    66 +             part-of-speech based keepfilter: specifying only the parts of speech you wish to index.
    67 +             anything else will be removed. HOWEVER: this could be a little dangerous, because if
    68 +             we upgrade ipadic they might add some new tags (the tags are fairly specific), and suddenly
    69 +             things that you were indexing before are no longer being indexed. It's recommended to
    70 +             use the part-of-speech based stopfilter above if at all possible, for safety.
    71 +        <filter class="solr.JapanesePartOfSpeechKeepFilterFactory" tags="@gosen_path@/conf/keeptags_ja.txt" enablePositionIncrements="true"/>
    72 +        -->
    73 +
    74 +        <!-- before any stemming/lemmatization, you can protect words from being modified by specifying
    75 +             a protwords.txt.
    76 +        <filter class="solr.KeywordMarkerFilterFactory" protected="@gosen_path@/conf/protwords_ja.txt" ignoreCase="false"/>
    77 +
    78 +             or you can also supply a custom stem dictionary for inflected forms (tab separated). No
    79 +             further stemming/lemmatization will modify this.
    80 +        <filter class="solr.StemmerOverrideFilterFactory" dictionary="dictionary.txt" ignoreCase="false"/>
    81 +        -->
    82 +
    83 +        <!-- the basic form filter converts inflected verbs and adjectives to their dictionary citation form. -->
    84 +        <filter class="solr.JapaneseBasicFormFilterFactory"/>
    85 +
    86 +        <!-- this filter heuristically normalizes katakana forms with a final prolonged sound mark -->
    87 +        <filter class="solr.JapaneseKatakanaStemFilterFactory"/>
    88 +
    89 +        <!-- you might want to lowercase for any english text content you have -->
    90 +        <filter class="solr.LowerCaseFilterFactory"/>
    91 +      </analyzer>
    92 +    </fieldType>
    93   </types>
    94  
    95  
    96 @@ -534,7 +620,7 @@
     1--- example/solr-ja/conf/schema.xml.orig        2012-03-31 01:07:12.000000000 +0900
     2+++ example/solr-ja/conf/schema.xml     2012-04-13 00:51:09.000000000 +0900
     3@@ -931,7 +931,7 @@
    974 
    985    <!-- catchall field, containing all other searchable text fields (implemented
     
    10310    <!-- catchall text field that indexes tokens both normally and in reverse for efficient
    10411         leading wildcard queries. -->
    105 --- example/solr-ja/conf/solrconfig.xml.orig    2011-11-22 22:02:40.000000000 +0900
    106 +++ example/solr-ja/conf/solrconfig.xml 2011-11-27 00:08:15.000000000 +0900
    107 @@ -86,6 +86,7 @@
    108         is found that matches, it will be ignored
    109      -->
    110    <lib dir="../../contrib/clustering/lib/" />
    111 +  <lib dir="@gosen_path@/lib/" />
    112    <lib dir="/total/crap/dir/ignored" />
    113  
    114    <!-- an exact path can be used to specify a specific file.  This
    115 @@ -791,6 +792,7 @@
     12--- example/solr-ja/conf/solrconfig.xml.orig    2012-03-31 01:07:12.000000000 +0900
     13+++ example/solr-ja/conf/solrconfig.xml 2012-04-13 00:51:09.000000000 +0900
     14@@ -798,6 +798,7 @@
    11615        <str name="wt">velocity</str>
    11716 
Note: See TracChangeset for help on using the changeset viewer.