
Commit c63d291: moving to v0.4 julia
Parent: 921719d

11 files changed (+717, -717)

.travis.yml (+1, -1)

@@ -14,4 +14,4 @@ before_install:
   - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
 script:
   - julia -e 'versioninfo(); Pkg.init(); Pkg.clone("https://github.com/saltpork/Stage.jl"); Pkg.clone("https://github.com/mit-nlp/Ollam.jl"); Pkg.clone(pwd())'
-  - cd test; julia --color runtests.jl
+  - cd test; julia --color=yes runtests.jl

REQUIRE (+1, -1)

@@ -1,4 +1,4 @@
-julia 0.3 0.4-
+julia 0.4-
 DataStructures
 Devectorize
 Iterators
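
For context, my reading of the REQUIRE version-spec format (not part of the commit): entries list alternating lower and upper bounds, and a trailing "-" admits pre-release builds, so this hunk drops 0.3 support and makes the 0.4 pre-releases the new floor:

    julia 0.3 0.4-    # old: allow [0.3, 0.4-), i.e. only the 0.3 series
    julia 0.4-        # new: allow [0.4-, ...), i.e. 0.4 pre-releases and later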

src/constants.jl (+675, -676)

Large diff not rendered.

src/features.jl (+12, -11)

@@ -16,14 +16,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import Base.norm
 export ngrams, count, tfnorm, sparse_count, norm, znorm, ngram_iterator, ngrams!

 immutable NgramStringIterator
-  string          :: String
+  string          :: AbstractString
   order           :: Int32
   truncated_start :: Bool
 end
-type StringPosition
+type AbstractStringPosition
   start :: Int32
   fin   :: Int32
   nth   :: Int32
@@ -35,9 +36,9 @@ function start(ngi :: NgramStringIterator)
     for i = 1:(ngi.order-1) # necessary because strings are indexed to bytes, not characters
       idx = nextind(ngi.string, idx)
     end
-    return StringPosition(1, idx, ngi.order)
+    return AbstractStringPosition(1, idx, ngi.order)
   else
-    return StringPosition(1, 1, 1)
+    return AbstractStringPosition(1, 1, 1)
   end
 end

@@ -61,11 +62,11 @@
 # -------------------------------------------------------------------------------------------------------------------------
 # feature extractors
 # -------------------------------------------------------------------------------------------------------------------------
-make_string(words :: String, b, e) = SubString(words, b, e)
+make_string(words :: AbstractString, b, e) = SubString(words, b, e)
 make_string(words :: Array, b, e) = join(words[b:e], " ")

 function ngrams(words::Array; order = 2, truncated_start = false)
-  ret = String[]
+  ret = AbstractString[]

   if !truncated_start
     for o = 1:min(order - 1, length(words))
@@ -81,19 +82,19 @@ function ngrams(words::Array; order = 2, truncated_start = false)
   return ret
 end

-function ngrams(words::String; order = 2, truncated_start = false)
-  ret = String[]
+function ngrams(words :: AbstractString; order = 2, truncated_start = false)
+  ret = AbstractString[]
   return ngrams!(ret, words, order = order, truncated_start = truncated_start)
 end

-function ngrams!(ret :: Array, words :: String; order = 2, truncated_start = false)
+function ngrams!(ret :: Array, words :: AbstractString; order = 2, truncated_start = false)
   for x in ngram_iterator(words, order = order, truncated_start = truncated_start)
     push!(ret, x)
   end
   return ret
 end

-ngram_iterator(words :: String; order = 2, truncated_start = false) = NgramStringIterator(words, order, truncated_start)
+ngram_iterator(words :: AbstractString; order = 2, truncated_start = false) = NgramStringIterator(words, order, truncated_start)

 # -------------------------------------------------------------------------------------------------------------------------
 # feature vector operations
@@ -107,7 +108,7 @@ function sparse_count(text, bkg)
 end

 function dict_count(tokens)
-  map = DefaultDict{String,Int32}()
+  map = DefaultDict{AbstractString,Int32}()
   for w in tokens
     map[w] += 1
   end
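
For background (not part of the commit): Julia 0.4 renamed the abstract string type String to AbstractString, with concrete types such as ASCIIString and UTF8String beneath it, which is why every signature above widens. A minimal sketch of the byte-versus-character indexing that the iterator's nextind calls handle, in 0.4-era syntax with a hypothetical helper char_bigrams:

    # Hypothetical helper: character bigrams over any AbstractString.
    # Strings index by byte, so nextind/endof walk character start positions.
    function char_bigrams(s :: AbstractString)
        ret = AbstractString[]
        i = 1
        while i < endof(s)
            j = nextind(s, i)              # byte index of the next character
            push!(ret, SubString(s, i, j))
            i = j
        end
        return ret
    end

    char_bigrams("héllo")   # ["hé", "él", "ll", "lo"], multi-byte safe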

src/models.jl (+7, -7)

@@ -19,13 +19,13 @@
 export make_background, stats, vocab_size, apply, BKG

 type BKG
-  dict  :: Associative{String, Int32}
-  index :: Array{String}
+  dict  :: Associative{AbstractString, Int32}
+  index :: Array{AbstractString}
   stats :: Vector{Float64}
 end
 vocab_size(bkg::BKG) = length(bkg.index)
-getindex(bkg::BKG, token :: String) = bkg.dict[token]
-stats(bkg::BKG, s::String) = bkg.stats[bkg[s]]
+getindex(bkg::BKG, token :: AbstractString) = bkg.dict[token]
+stats(bkg::BKG, s::AbstractString) = bkg.stats[bkg[s]]

 function tfnorm(stats; cutoff = 1e10, squash :: Function = log)
   for i = 1:length(stats)
@@ -43,7 +43,7 @@ function apply(bkg::BKG, counts)
 end

 function make_background(features; mincount = 1, prune = 0.0, unk = true, norm = stats -> min(1.0 ./ stats, 1e10), logger = Log(STDERR))
-  dict = DefaultDict(String, Int32, 0)
+  dict = DefaultDict(AbstractString, Int32, 0)

   @timer logger "building background dictionary" begin
     # Count
@@ -67,8 +67,8 @@ function make_background(features; mincount = 1, prune = 0.0, unk = true, norm =
   end # timer

   # index
-  index = (String)[unk_token]
-  rev   = DefaultDict(String, Int32, 1)
+  index = (AbstractString)[unk_token]
+  rev   = DefaultDict(AbstractString, Int32, 1)
   rev[unk_token] = 1
   i = 2
   @timer logger "building index" begin
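
As context (not part of the commit): Associative was the 0.4-era abstract dictionary type, and the three-argument DefaultDict(K, V, default) form above comes from DataStructures.jl. A small counting sketch under those assumptions:

    using DataStructures

    # DefaultDict(K, V, default): a missing key springs into existence
    # with the default value, so counting needs no haskey checks.
    counts = DefaultDict(AbstractString, Int32, 0)
    for w in ["a", "b", "a"]
        counts[w] += 1
    end
    counts["a"]   # 2; counts["zzz"] returns (and stores) 0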

src/readers.jl (+7, -7)

@@ -21,12 +21,12 @@ export read_tweets, read_usenet, filelines, zopen
 # -------------------------------------------------------------------------------------------------------------------------
 # Basic utilities
 # -------------------------------------------------------------------------------------------------------------------------
-function zopen(fn :: String)
+function zopen(fn :: AbstractString)
   return ismatch(r"^.*\.gz$", fn) ? gzopen(fn) : open(fn)
 end

 type FileLines
-  name :: String
+  name :: AbstractString
 end

 start(itr :: FileLines) = zopen(itr.name)
@@ -45,15 +45,15 @@ eltype(itr :: FileLines) = ByteString


 # get a file line iterator from a file name, open with gzip as needed
-filelines(fn :: String) = FileLines(fn)
+filelines(fn :: AbstractString) = FileLines(fn)
 streamlines(f) = eachline(f) # convenience

 # -------------------------------------------------------------------------------------------------------------------------
 # Text format readers
 # -------------------------------------------------------------------------------------------------------------------------

 # read collection of tweets from a file
-function read_tweets(fn :: String; stopList=english_stoplist, header=news_email_header, tokenizer=tenglish_tokenizer, limit=-1, keepFn=x->true, lg=STDERR)
+function read_tweets(fn :: AbstractString; stopList=english_stoplist, header=news_email_header, tokenizer=tenglish_tokenizer, limit=-1, keepFn=x->true, lg=STDERR)
   ret   = Dict{String, Float32}[]
   rlat  = Float32[]
   rlong = Float32[]
@@ -78,8 +78,8 @@ function read_tweets(fn :: String; stopList=english_stoplist, header=news_email_

   # validate text
   for c in text
-    if '\ud800' <= c <= '\udfff' || '\U10ffff' < c
-      valid = false
+    if 0xd800 <= c <= 0xdfff || 0x10ffff < c # same check made by isvalid(Char,ch) and deprecated is_valid_char
+      valid = false
     end
   end

@@ -124,7 +124,7 @@ function read_tweets(fn :: String; stopList=english_stoplist, header=news_email_
 end

 # usenet/email single document reader -- 20ng
-function read_usenet(fn :: String; stopList=english_stoplist, header=news_email_header, tokenizer=english_tokenizer, lg=STDERR)
+function read_usenet(fn :: AbstractString; stopList=english_stoplist, header=news_email_header, tokenizer=english_tokenizer, lg=STDERR)
   ignore = false
   @info lg @sprintf("reading: %s", fn)
   vec = Dict{String, Float32}()
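
Worth noting (not part of the commit): the switch to raw code-point comparisons is presumably because surrogate Char literals such as '\ud800' are no longer accepted in 0.4; per the hunk's own comment, the test matches isvalid(Char, ch). A minimal sketch of the validation loop:

    # Reject UTF-16 surrogates (0xd800:0xdfff) and points past U+10FFFF.
    # Char compares directly against integers in 0.4-era Julia.
    is_bad_char(c) = 0xd800 <= c <= 0xdfff || 0x10ffff < c

    valid = true
    for c in "some tweet text"
        if is_bad_char(c)
            valid = false
        end
    end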

src/tc.jl (+1, -1)

@@ -62,7 +62,7 @@ function tc_train(text, truth, preprocess::Function; cutoff = 1e10, mincount = 2
   mapper = iteration_method == :eager ? map : lazy_map

   # define class index
-  classes = Dict{String, Int32}()
+  classes = Dict{AbstractString, Int32}()
   i = 1
   @timer logger "indexing truth" for t in truth
     if !(t in keys(classes))

src/tokenizers.jl (+6, -6)

@@ -27,11 +27,11 @@ const url_pattern = r"http://[^\s]*"
 const hashtag_pattern = r"^#.*$"
 const mention_pattern = r"^@.*$"

-function replace_html_entities(s :: String)
+function replace_html_entities(s :: AbstractString)
   replace(s, r"&[^;]+?;", s -> s in keys(html_entity_table) ? html_entity_table[s] : s)
 end

-function pattern_replace(w :: String)
+function pattern_replace(w :: AbstractString)
   if ismatch(r"^[+-]?\p{Sc}\d+([.,]\d+)*$", w) return "--currency--"
   elseif ismatch(r"^[+-]?\d+([.,]\d+)*%$", w) return "--percent--"
   elseif ismatch(r"^[+-]?\d+([.,]\d+)*$", w) return "--number--"
@@ -44,14 +44,14 @@ function pattern_replace(w :: String)
   end
 end

-function prereplace(sent :: String)
+function prereplace(sent :: AbstractString)
   r = replace(sent, r"n't\b", " not")
   r = replace(r, r"'s\b", " s's")
   r = replace(r, r"'d\b", " d'd")
 end


-function english_tokenizer(s :: String)
+function english_tokenizer(s :: AbstractString)
   return [
     begin
       m = match(punct_word, w)
@@ -61,7 +61,7 @@ function english_tokenizer(s :: String)
   ]
 end

-function twenglish_tokenizer(s :: String)
+function twenglish_tokenizer(s :: AbstractString)
   return [
     begin
       m = match(r"^(\p{P}*)(.*?)\p{P}*$", w)
@@ -71,7 +71,7 @@ function twenglish_tokenizer(s :: String)
   ]
 end

-function twenglish_cleaner(tw :: String; urls = true, hashtags = true, mentions = true)
+function twenglish_cleaner(tw :: AbstractString; urls = true, hashtags = true, mentions = true)
   ctw = replace(normalize_string(tw, :NFKC), default_space, " ")
   ctw = urls ? replace(ctw, url_pattern, "\u0030\u20E3") : ctw

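Side note (not part of the commit): pattern_replace above collapses currency, percent, and number tokens into placeholder symbols before classification. A stripped-down sketch of the idea, using 0.4-era ismatch and a hypothetical name normalize_token:

    # Collapse numeric-looking tokens into placeholder classes so the
    # downstream model sees "--number--" instead of every distinct literal.
    normalize_token(w :: AbstractString) =
        ismatch(r"^[+-]?\d+([.,]\d+)*%$", w) ? "--percent--" :
        ismatch(r"^[+-]?\d+([.,]\d+)*$", w)  ? "--number--"  : w

    map(normalize_token, ["price", "3.5%", "1,200"])
    # ["price", "--percent--", "--number--"]
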
test/lid.jl

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ bkgmodel, fextractor, model = tc_train(train, train_truth, lid_iterating_tokeniz
1515
trainer = (fvs, truth, init_model) -> train_mira(fvs, truth, init_model, iterations = 20, k = 2, C = 0.01, average = true),
1616
iteration_method = :eager)
1717

18-
confmat = DefaultDict(String, DefaultDict{String, Int32}, () -> DefaultDict(String, Int32, 0))
18+
confmat = DefaultDict(AbstractString, DefaultDict{AbstractString, Int32}, () -> DefaultDict(AbstractString, Int32, 0))
1919
res = test_classification(model, lazy_map(x -> fextractor(lid_iterating_tokenizer(x)), test), test_truth, record = (t, h) -> confmat[t][h] += 1) * 100.0
2020
@info @sprintf("mira test set error rate: %7.3f", res)
2121
print_confusion_matrix(confmat)
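
For context (not part of the commit): the nested DefaultDict above is a sparse confusion matrix; the outer dictionary maps a truth label to an inner counter defaulting to zero. A usage sketch assuming DataStructures.jl:

    using DataStructures

    # confmat[truth][hypothesis] += 1 needs no initialization because
    # both levels default-construct on first access.
    confmat = DefaultDict(AbstractString, DefaultDict{AbstractString, Int32},
                          () -> DefaultDict(AbstractString, Int32, 0))
    confmat["en"]["de"] += 1   # one English item hypothesized as German
    confmat["en"]["en"] += 1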

test/runtests.jl (+3, -3)

@@ -125,7 +125,7 @@ end
 # -------------------------------------------------------------------------------------------------------------------------
 # feature vector tests
 # -------------------------------------------------------------------------------------------------------------------------
-lines = (Array{String})[]
+lines = (Array{AbstractString})[]
 for l in filelines("data/test.txt")
   tokens = split(strip(l), r"\s+")
   push!(lines, tokens)
@@ -141,8 +141,8 @@ bkg = make_background(lines, mincount = 2)
 @expect stats(bkg, unk_token) == 19.0

 @info "bkg[c] = $(stats(bkg, "c"))"
-@expect sparse_count(lines[1], bkg) == sparsevec((Int64=>Float64)[ bkg["a"] => 1.0, bkg["b"] => 1.0, bkg["c"] => 1.0], vocab_size(bkg))
-@expect sparse_count(lines[end], bkg) == sparsevec((Int64=>Float64)[ bkg[unk_token] => 1.0 ], vocab_size(bkg))
+@expect sparse_count(lines[1], bkg) == sparsevec(Dict{Int64,Float64}( bkg["a"] => 1.0, bkg["b"] => 1.0, bkg["c"] => 1.0), vocab_size(bkg))
+@expect sparse_count(lines[end], bkg) == sparsevec(Dict{Int64,Float64}( bkg[unk_token] => 1.0 ), vocab_size(bkg))

 @info "sparse[c] = $(sparse_count(lines[1], bkg)[2])"
 @expect norm(sparse_count(lines[1], bkg), bkg)[2] == 3.166666666666667
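
For background (not part of the commit): the typed-dict literal (K=>V)[...] was deprecated in Julia 0.4 in favor of the Dict{K,V}(...) constructor over pairs, which is exactly the rewrite in this hunk. Side by side:

    # Julia 0.3 style, deprecated in 0.4:
    #   d = (Int64 => Float64)[1 => 1.0, 2 => 2.0]
    # Julia 0.4 replacement:
    d = Dict{Int64,Float64}(1 => 1.0, 2 => 2.0)

    # sparsevec accepts such an index => value dict plus a length,
    # as the updated @expect lines use:
    sparsevec(d, 10)   # 10-element sparse vector with nonzeros at 1 and 2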

test/topic.jl (+3, -3)

@@ -12,8 +12,8 @@ function text(fn)
 end

 function getinstances(dir)
-  docs  = String[]
-  truth = String[]
+  docs  = AbstractString[]
+  truth = AbstractString[]

   for t in filter(d -> d != "." && d != "..", readdir(dir))
     for d in filter(d -> d != "." && d != "..", readdir("$dir/$t"))
@@ -52,7 +52,7 @@ test, test_truth = getinstances("20ng/test")
 bkgmodel, fextractor, model = tc_train(train, train_truth, tokenize_file, mincount = 2, cutoff = 1e10,
                                        trainer = (fvs, truth, init_model) -> train_mira(fvs, truth, init_model, iterations = 20, k = 19, C = 0.01, average = true),
                                        iteration_method = :eager)
-confmat = DefaultDict(String, DefaultDict{String, Int32}, () -> DefaultDict(String, Int32, 0))
+confmat = DefaultDict(AbstractString, DefaultDict{AbstractString, Int32}, () -> DefaultDict(AbstractString, Int32, 0))
 res = test_classification(model, lazy_map(x -> fextractor(tokenize_file(x)), test), test_truth,
                           record = (t, h) -> confmat[t][h] += 1) * 100.0
 @info @sprintf("mira test set error rate: %7.3f", res)
