Ignore:
Timestamp:
Dec 25, 2014, 11:59:20 AM (5 years ago)
Author:
hum@…
Message:

word2vec: update to 20140915; disable mt variant; add livecheck

Location:
trunk/dports/textproc/word2vec
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/dports/textproc/word2vec/Portfile

    r123335 r130058  
      5   5
      6   6   name                word2vec
      7     - version             20131218
      8     - revision            1
          7 + version             20140915
      9   8   categories          textproc
     10   9   maintainers         hum openmaintainer
     
     24  23   fetch.type          svn
     25  24   svn.url             http://word2vec.googlecode.com/svn/trunk
     26     - svn.revision        37
         25 + svn.revision        41
     27  26   worksrcdir          trunk
         27 +
         28 + depends_run         port:wget
     28  29
     29  30   patchfiles          patch-malloc.diff \
     
     34  35   variant universal   {}
     35  36
     36     - configure.optflags  -O2
         37 + configure.optflags  -O3
     37  38
     38  39   build.args          CC="${configure.cc}" \
     
     64  65   }
     65  66
     66     - variant mt description {Apply multiple threads patch} {
     67     -     distfiles-append    word2vec.local.tgz:mt
     68     -     master_sites-append http://www.chokkan.org/software/word2vec-multi/:mt
     69     -     checksums           rmd160  5c9092531f1c4d8f5482359e9d78f847adcd260c \
     70     -                         sha256  57476a59f3f485ee5ada7214caf67fcbfa53f78283a7e85c5b6c764a96171844
     71     -     post-patch {
     72     -         system -W ${worksrcpath} "patch -p1 < ${workpath}/word2vec.local/word2vec.local.patch"
     73     -     }
     74     - }
         67 + variant mt description {disabled: Apply multiple threads patch} {}
     75  68
     76     - default_variants    +mt
     77     -
     78     - livecheck.type      none
         69 + livecheck.type      regex
         70 + livecheck.url       https://code.google.com/p/word2vec/source/list
         71 + livecheck.version   ${svn.revision}
         72 + livecheck.regex     r(\\d+)
  • trunk/dports/textproc/word2vec/files/patch-demo.diff

    r117348 r130058  
    1 --- demo-analogy.sh.orig        2014-02-22 20:36:04.000000000 +0900
    2 +++ demo-analogy.sh     2014-02-22 20:27:27.000000000 +0900
    3 @@ -1,11 +1,13 @@
    4 -make
    5 +#!/bin/sh
    6  if [ ! -e text8 ]; then
    7 -  wget http://mattmahoney.net/dc/text8.zip -O text8.gz
    8 -  gzip -d text8.gz -f
    9 +  curl -O http://mattmahoney.net/dc/text8.zip
    10 +  unzip text8.zip
     1--- demo-analogy.sh.orig        2014-09-07 01:54:27.000000000 +0900
     2+++ demo-analogy.sh     2014-12-24 22:55:24.000000000 +0900
     3@@ -7,5 +7,5 @@
     4 echo Note that for the word analogy to perform well, the model should be trained on much larger data set
     5 echo Example input: paris france berlin
     6 echo ---------------------------------------------------------------------------------------------------
     7-time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
     8-./word-analogy vectors.bin
     9+time @EXECDIR@/word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
     10+@EXECDIR@/word-analogy vectors.bin
     11--- demo-classes.sh.orig        2014-09-07 01:54:27.000000000 +0900
     12+++ demo-classes.sh     2014-12-24 22:57:00.000000000 +0900
     13@@ -3,6 +3,6 @@
     14   wget http://mattmahoney.net/dc/text8.zip -O text8.gz
     15   gzip -d text8.gz -f
    11 16  fi
    12  echo -----------------------------------------------------------------------------------------------------
    13  echo Note that for the word analogy to perform well, the models should be trained on much larger data sets
    14  echo Example input: paris france berlin
    15  echo -----------------------------------------------------------------------------------------------------
    16 -time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
    17 -./word-analogy vectors.bin
    18 +if [ ! -e vectors.bin ]; then
    19 +  time @EXECDIR@/word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
    20 +fi
    21 +@EXECDIR@/word-analogy vectors.bin
    22 --- demo-classes.sh.orig        2014-02-22 20:36:09.000000000 +0900
    23 +++ demo-classes.sh     2014-02-22 20:22:53.000000000 +0900
    24 @@ -1,8 +1,8 @@
    25 -make
    26 +#!/bin/sh
    27  if [ ! -e text8 ]; then
    28 -  wget http://mattmahoney.net/dc/text8.zip -O text8.gz
    29 -  gzip -d text8.gz -f
    30 +  curl -O http://mattmahoney.net/dc/text8.zip
    31 +  unzip text8.zip
    32  fi
    33 -time ./word2vec -train text8 -output classes.txt -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -classes 500
    34 +time @EXECDIR@/word2vec -train text8 -output classes.txt -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -classes 500
     17-time ./word2vec -train text8 -output classes.txt -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -iter 15 -classes 500
     18+time @EXECDIR@/word2vec -train text8 -output classes.txt -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -iter 15 -classes 500
    35 19  sort classes.txt -k 2 -n > classes.sorted.txt
    36 20  echo The word classes were saved to file classes.sorted.txt
    37 --- demo-phrase-accuracy.sh.orig        2014-02-22 20:36:25.000000000 +0900
    38 +++ demo-phrase-accuracy.sh     2014-02-22 20:29:40.000000000 +0900
    39 @@ -1,12 +1,14 @@
    40 -make
    41 +#!/bin/sh
    42  if [ ! -e text8 ]; then
    43 -  wget http://mattmahoney.net/dc/text8.zip -O text8.gz
    44 -  gzip -d text8.gz -f
    45 +  curl -O http://mattmahoney.net/dc/text8.zip
    46 +  unzip text8.zip
     21--- demo-phrase-accuracy.sh.orig        2014-09-07 01:54:27.000000000 +0900
     22+++ demo-phrase-accuracy.sh     2014-12-24 22:57:51.000000000 +0900
     23@@ -4,8 +4,8 @@
     24   gzip -d news.2012.en.shuffled.gz -f
    47 25  fi
    48  echo ----------------------------------------------------------------------------------------------------------------
    49  echo Note that the accuracy and coverage of the test set questions is going to be low with this small training corpus
    50  echo To achieve better accuracy, larger training set is needed
    51  echo ----------------------------------------------------------------------------------------------------------------
    52 -time ./word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2 -min-count 3
    53 -time ./word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 -min-count 3
    54 -./compute-accuracy vectors-phrase.bin <questions-phrases.txt
    55 +if [ ! -e vectors-phrase.bin ]; then
    56 +  time @EXECDIR@/word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2 -min-count 3
    57 +  time @EXECDIR@/word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 -min-count 3
    58 +fi
    59 +@EXECDIR@/compute-accuracy vectors-phrase.bin < @EXDIR@/questions-phrases.txt
    60 --- demo-phrases.sh.orig        2014-02-22 20:36:17.000000000 +0900
    61 +++ demo-phrases.sh     2014-02-22 20:30:19.000000000 +0900
    62 @@ -1,8 +1,10 @@
    63 -make
    64 +#!/bin/sh
    65  if [ ! -e text8 ]; then
    66 -  wget http://mattmahoney.net/dc/text8.zip -O text8.gz
    67 -  gzip -d text8.gz -f
    68 +  curl -O http://mattmahoney.net/dc/text8.zip
    69 +  unzip text8.zip
     26 sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0
     27-time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2
     28-time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2
     29+time @EXECDIR@/word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2
     30+time @EXECDIR@/word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2
     31 tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1
     32-time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15
     33-./compute-accuracy vectors-phrase.bin < questions-phrases.txt
     34+time @EXECDIR@/word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15
     35+@EXECDIR@/compute-accuracy vectors-phrase.bin < questions-phrases.txt
     36--- demo-phrases.sh.orig        2014-09-07 01:54:27.000000000 +0900
     37+++ demo-phrases.sh     2014-12-24 22:58:20.000000000 +0900
     38@@ -4,8 +4,8 @@
     39   gzip -d news.2012.en.shuffled.gz -f
    70 40  fi
    71 -time ./word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2
    72 -time ./word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
     41 sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0
     42-time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2
     43-time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2
     44+time @EXECDIR@/word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2
     45+time @EXECDIR@/word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2
     46 tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1
     47-time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15
    73 48 -./distance vectors-phrase.bin
    74 \ No newline at end of file
    75 +if [ ! -e vectors-phrase.bin ]; then
    76 +  time @EXECDIR@/word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2
    77 +  time @EXECDIR@/word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
    78 +fi
     49+time @EXECDIR@/word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15
    79 50 +@EXECDIR@/distance vectors-phrase.bin
    80 --- demo-word-accuracy.sh.orig  2014-02-22 20:36:32.000000000 +0900
    81 +++ demo-word-accuracy.sh       2014-02-22 20:31:16.000000000 +0900
    82 @@ -1,8 +1,10 @@
    83 -make
    84 +#!/bin/sh
    85  if [ ! -e text8 ]; then
    86 -  wget http://mattmahoney.net/dc/text8.zip -O text8.gz
    87 -  gzip -d text8.gz -f
    88 +  curl -O http://mattmahoney.net/dc/text8.zip
    89 +  unzip text8.zip
     51--- demo-word-accuracy.sh.orig  2014-09-07 01:54:27.000000000 +0900
     52+++ demo-word-accuracy.sh       2014-12-24 22:58:49.000000000 +0900
     53@@ -3,6 +3,6 @@
     54   wget http://mattmahoney.net/dc/text8.zip -O text8.gz
     55   gzip -d text8.gz -f
    90 56  fi
    91 -time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
     57-time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
    92 58 -./compute-accuracy vectors.bin 30000 < questions-words.txt
    93 59 -# to compute accuracy with the full vocabulary, use: ./compute-accuracy vectors.bin < questions-words.txt
    94 +if [ ! -e vectors.bin ]; then
    95 +  time @EXECDIR@/word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
    96 +fi
    97 +@EXECDIR@/compute-accuracy vectors.bin 30000 < @EXDIR@/questions-words.txt
    98 +# to compute accuracy with the full vocabulary, use: @EXECDIR@/compute-accuracy vectors.bin < @EXDIR@/questions-words.txt
    99 --- demo-word.sh.orig   2014-02-22 20:36:47.000000000 +0900
    100 +++ demo-word.sh        2014-02-22 20:31:57.000000000 +0900
    101 @@ -1,7 +1,9 @@
    102 -make
    103 +#!/bin/sh
    104  if [ ! -e text8 ]; then
    105 -  wget http://mattmahoney.net/dc/text8.zip -O text8.gz
    106 -  gzip -d text8.gz -f
    107 +  curl -O http://mattmahoney.net/dc/text8.zip
    108 +  unzip text8.zip
     60+time @EXECDIR@/word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
     61+@EXECDIR@/compute-accuracy vectors.bin 30000 < questions-words.txt
     62+# to compute accuracy with the full vocabulary, use: @EXECDIR@/compute-accuracy vectors.bin < questions-words.txt
     63--- demo-word.sh.orig   2014-09-07 01:54:27.000000000 +0900
     64+++ demo-word.sh        2014-12-24 22:59:00.000000000 +0900
     65@@ -3,5 +3,5 @@
     66   wget http://mattmahoney.net/dc/text8.zip -O text8.gz
     67   gzip -d text8.gz -f
    109 68  fi
    110 -time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
     69-time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
    111 70 -./distance vectors.bin
    112 \ No newline at end of file
    113 +if [ ! -e vectors.bin ]; then
    114 +  time @EXECDIR@/word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
    115 +fi
     71+time @EXECDIR@/word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
    116 72 +@EXECDIR@/distance vectors.bin
Note: See TracChangeset for help on using the changeset viewer.