update to ngrams #2

Open: wants to merge 4 commits into base: master
2 changes: 1 addition & 1 deletion .travis.yml
@@ -14,4 +14,4 @@ before_install:
   - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
 script:
   - julia -e 'versioninfo(); Pkg.init(); Pkg.clone("https://github.com/saltpork/Stage.jl"); Pkg.clone("https://github.com/mit-nlp/Ollam.jl"); Pkg.clone(pwd())'
-  - cd test; julia --color runtests.jl
+  - cd test; julia --color=yes runtests.jl
2 changes: 1 addition & 1 deletion REQUIRE
@@ -1,4 +1,4 @@
-julia 0.3 0.4-
+julia 0.4
 DataStructures
 Devectorize
 Iterators
1,351 changes: 675 additions & 676 deletions src/constants.jl

Large diffs are not rendered by default.

56 changes: 32 additions & 24 deletions src/features.jl
@@ -16,54 +16,63 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import Base.norm
 export ngrams, count, tfnorm, sparse_count, norm, znorm, ngram_iterator, ngrams!

 immutable NgramStringIterator
-    string :: String
-    order  :: Int32
+    string :: AbstractString
+    order  :: Int32
     truncated_start :: Bool
 end
-type StringPosition
-    start :: Int32
-    fin   :: Int32
-    nth   :: Int32
+type AbstractStringPosition
+    start :: Int32
+    fin   :: Int32
+    nth   :: Int32
 end

 function start(ngi :: NgramStringIterator)
     if ngi.truncated_start
         idx = 1
-        for i = 1:(ngi.order-1)
+        for i = 1:(ngi.order-1) # necessary because strings are indexed to bytes, not characters
             idx = nextind(ngi.string, idx)
         end
-        return StringPosition(1, idx, ngi.order)
+        return AbstractStringPosition(1, idx, ngi.order)
     else
-        return StringPosition(1, 1, 1)
+        return AbstractStringPosition(1, 1, 1)
     end
 end

-done(ngi :: NgramStringIterator, position) = position.fin > endof(ngi.string)
+done(ngi :: NgramStringIterator, position) = position.nth > ngi.order || position.fin > endof(ngi.string)
 function next(ngi :: NgramStringIterator, position)
     str = make_string(ngi.string, position.start, position.fin)
-    if position.nth >= ngi.order
-        position.start = nextind(ngi.string, position.start)
+    if position.fin >= endof(ngi.string)
+        position.start = 0
+        position.fin   = 1
+        for i = 1:position.nth-1
+            position.fin = nextind(ngi.string, position.fin)
+        end
+        position.nth += 1
     end
-    position.nth += 1
-    position.fin = nextind(ngi.string, position.fin)
+    position.start = nextind(ngi.string, position.start)
+    position.fin   = nextind(ngi.string, position.fin)
     return str, position
 end
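
For reference, a minimal sketch of how the revised iterator is meant to be consumed (assuming the module wires start/done/next into Base's iteration protocol; the input string and expected output are illustrative, not taken from the PR):

    # Sketch: collect every n-gram of "abc" up to order 2. With the new
    # done/next, a full pass at one order is followed by a restart at the
    # front of the string with the next higher order.
    grams = AbstractString[]
    for g in ngram_iterator("abc", order = 2)
        push!(grams, g)
    end
    # grams == ["a", "b", "c", "ab", "bc"]
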
 # -------------------------------------------------------------------------------------------------------------------------
 # feature extractors
 # -------------------------------------------------------------------------------------------------------------------------
-make_string(words :: String, b, e) = SubString(words, b, e)
+make_string(words :: AbstractString, b, e) = SubString(words, b, e)
 make_string(words :: Array, b, e) = join(words[b:e], " ")

 function ngrams(words::Array; order = 2, truncated_start = false)
-    ret = String[]
+    ret = AbstractString[]

     if !truncated_start
-        for wi = 1:min(order - 1, length(words))
-            push!(ret, make_string(words, 1, wi))
+        for o = 1:min(order - 1, length(words))
+            for wi = 1:length(words)-(o-1)
+                push!(ret, make_string(words, wi, wi + o - 1))
+            end
         end
     end

@@ -73,19 +82,19 @@ function ngrams(words::Array; order = 2, truncated_start = false)
     return ret
 end
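
A quick sanity check of the new nested loop (a sketch; the elided middle of the function still appends the full-order n-grams, which the expected value below assumes):

    # Sketch: the old loop pushed only the leading prefixes (here just
    # "the"); the new loop emits every lower-order n-gram at every offset.
    ngrams(["the", "quick", "fox"], order = 2)
    # expected: ["the", "quick", "fox", "the quick", "quick fox"]
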

-function ngrams(words::String; order = 2, truncated_start = false)
-    ret = String[]
+function ngrams(words :: AbstractString; order = 2, truncated_start = false)
+    ret = AbstractString[]
     return ngrams!(ret, words, order = order, truncated_start = truncated_start)
 end

-function ngrams!(ret :: Array, words :: String; order = 2, truncated_start = false)
+function ngrams!(ret :: Array, words :: AbstractString; order = 2, truncated_start = false)
     for x in ngram_iterator(words, order = order, truncated_start = truncated_start)
         push!(ret, x)
     end
     return ret
 end

-ngram_iterator(words :: String; order = 2, truncated_start = false) = NgramStringIterator(words, order, truncated_start)
+ngram_iterator(words :: AbstractString; order = 2, truncated_start = false) = NgramStringIterator(words, order, truncated_start)

 # -------------------------------------------------------------------------------------------------------------------------
 # feature vector operations
@@ -98,9 +107,8 @@ function sparse_count(text, bkg)
     return vec
 end

-
 function dict_count(tokens)
-    map = DefaultDict{String,Int32}()
+    map = DefaultDict{AbstractString,Int32}()
     for w in tokens
         map[w] += 1
     end
17 changes: 9 additions & 8 deletions src/models.jl
@@ -16,16 +16,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-export make_background, stats, vocab_size, apply
+import Main.apply
+export make_background, stats, vocab_size, apply, BKG

 type BKG
-    dict  :: Associative{String, Int32}
-    index :: Array{String}
+    dict  :: Associative{AbstractString, Int32}
+    index :: Array{AbstractString}
     stats :: Vector{Float64}
 end
 vocab_size(bkg::BKG) = length(bkg.index)
-getindex(bkg::BKG, token :: String) = bkg.dict[token]
-stats(bkg::BKG, s::String) = bkg.stats[bkg[s]]
+getindex(bkg::BKG, token :: AbstractString) = bkg.dict[token]
+stats(bkg::BKG, s::AbstractString) = bkg.stats[bkg[s]]

 function tfnorm(stats; cutoff = 1e10, squash :: Function = log)
     for i = 1:length(stats)
@@ -43,7 +44,7 @@ function apply(bkg::BKG, counts)
 end

 function make_background(features; mincount = 1, prune = 0.0, unk = true, norm = stats -> min(1.0 ./ stats, 1e10), logger = Log(STDERR))
-    dict = DefaultDict(String, Int32, 0)
+    dict = DefaultDict(AbstractString, Int32, 0)

     @timer logger "building background dictionary" begin
         # Count
@@ -67,8 +68,8 @@
     end # timer

     # index
-    index = (String)[unk_token]
-    rev   = DefaultDict(String, Int32, 1)
+    index = (AbstractString)[unk_token]
+    rev   = DefaultDict(AbstractString, Int32, 1)
     rev[unk_token] = 1
     i = 2
     @timer logger "building index" begin
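
For context, a hedged sketch of how BKG is built and queried after these changes (the documents are invented; passing an iterable of token arrays mirrors how tc_train calls make_background via flatten_iter):

    # Sketch: build a background model over tokenized documents, then ask
    # for the vocabulary size and a token's normalized statistic.
    docs = Any[["the", "cat", "sat"], ["the", "dog", "ran"]]
    bkg  = make_background(docs, mincount = 1)
    vocab_size(bkg)   # vocabulary size, including the unk token
    stats(bkg, "the") # normalized statistic for a known token
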
22 changes: 15 additions & 7 deletions src/readers.jl
@@ -21,12 +21,12 @@ export read_tweets, read_usenet, filelines, zopen
 # -------------------------------------------------------------------------------------------------------------------------
 # Basic utilities
 # -------------------------------------------------------------------------------------------------------------------------
-function zopen(fn :: String)
+function zopen(fn :: AbstractString)
     return ismatch(r"^.*\.gz$", fn) ? gzopen(fn) : open(fn)
 end

 type FileLines
-    name :: String
+    name :: AbstractString
 end

 start(itr :: FileLines) = zopen(itr.name)
@@ -45,15 +45,23 @@ eltype(itr :: FileLines) = ByteString


 # get a file line iterator from a file name, open with gzip as needed
-filelines(fn :: String) = FileLines(fn)
+filelines(fn :: AbstractString) = FileLines(fn)
 streamlines(f) = eachline(f) # convenience

+#function getfile(name)
+#    file = joinpath(savedir, name)
+#    if !isfile(file)
+#        file = download(urlbase*name, file)
+#    end
+#    file
+#end
+
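The reader utilities are easiest to see in use; a small sketch with a hypothetical file name (gzip handling comes from the ".gz" suffix match in zopen):

    # Sketch: lazily iterate the lines of a possibly-gzipped file.
    for line in filelines("tweets.txt.gz") # hypothetical path
        # each line is produced by reading the opened stream
        println(strip(line))
    end
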
 # -------------------------------------------------------------------------------------------------------------------------
 # Text format readers
 # -------------------------------------------------------------------------------------------------------------------------

 # read collection of tweets from a file
-function read_tweets(fn :: String; stopList=english_stoplist, header=news_email_header, tokenizer=tenglish_tokenizer, limit=-1, keepFn=x->true, lg=STDERR)
+function read_tweets(fn :: AbstractString; stopList=english_stoplist, header=news_email_header, tokenizer=tenglish_tokenizer, limit=-1, keepFn=x->true, lg=STDERR)
     ret   = Dict{String, Float32}[]
     rlat  = Float32[]
     rlong = Float32[]
@@ -78,8 +86,8 @@

     # validate text
     for c in text
-        if '\ud800' <= c <= '\udfff' || '\U10ffff' < c
-            valid = false
+        if 0xd800 <= c <= 0xdfff || 0x10ffff < c # same check made by isvalid(Char, ch) and the deprecated is_valid_char
+            valid = false
         end
     end

@@ -124,7 +132,7 @@
 # usenet/email single document reader -- 20ng
-function read_usenet(fn :: String; stopList=english_stoplist, header=news_email_header, tokenizer=english_tokenizer, lg=STDERR)
+function read_usenet(fn :: AbstractString; stopList=english_stoplist, header=news_email_header, tokenizer=english_tokenizer, lg=STDERR)
     ignore = false
     @info lg @sprintf("reading: %s", fn)
     vec = Dict{String, Float32}()
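
The rewritten validity test above compares raw integer codepoints rather than character literals; per the in-diff comment, it matches what isvalid(Char, x) checks on 0.4. A sketch of that reading (codepoint_ok is a hypothetical helper, not part of the PR):

    # Sketch: reject UTF-16 surrogates and anything beyond U+10FFFF,
    # mirroring the check inside read_tweets.
    codepoint_ok(c::Char) = !(0xd800 <= UInt32(c) <= 0xdfff || UInt32(c) > 0x10ffff)
    codepoint_ok('a')          # true
    isvalid(Char, UInt32('a')) # the equivalent library check
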
42 changes: 39 additions & 3 deletions src/tc.jl
@@ -49,20 +49,26 @@ end
 # -------------------------------------------------------------------------------------------------------------------------
 # training for text classifiers
 # -------------------------------------------------------------------------------------------------------------------------
-function tc_features(text, bkgmodel)
+function tc_features(text::Array, bkgmodel)
     counts = sparse_count(text, bkgmodel)
     counts /= sum(counts)
     return apply(bkgmodel, counts)
 end

-function tc_train(text, truth, preprocess::Function; cutoff = 1e10, mincount = 2, prune = 0.0,
+function tc_features{T<:Array,N}(text::Array{T,N}, bkgmodel)
+    counts = sparse_count(flatten(text), bkgmodel)
+    counts /= sum(counts)
+    return apply(bkgmodel, counts)
+end
+
+function tc_train(text::Array, truth, preprocess::Function; cutoff = 1e10, mincount = 2, prune = 0.0,
                   iteration_method = :lazy,
                   trainer = (fvs, truth, init_model) -> train_mira(fvs, truth, init_model, iterations = 3, average = true),
                   logger = Log(STDERR))
     mapper = iteration_method == :eager ? map : lazy_map

     # define class index
-    classes = Dict{String, Int32}()
+    classes = Dict{AbstractString, Int32}()
     i = 1
     @timer logger "indexing truth" for t in truth
         if !(t in keys(classes))
@@ -81,3 +87,33 @@

     return bkgmodel, ptext -> tc_features(ptext, bkgmodel), model
 end
+
+function tc_train{T<:Array,N}(text::Array{T,N}, truth, preprocess::Function; cutoff = 1e10, mincount = 2, prune = 0.0,
+                  iteration_method = :lazy,
+                  trainer = (fvs, truth, init_model) -> train_mira(fvs, truth, init_model, iterations = 3, average = true),
+                  logger = Log(STDERR))
+    mapper = iteration_method == :eager ? map : lazy_map
+
+    # define class index
+    classes = Dict{AbstractString, Int32}()
+    i = 1
+    @timer logger "indexing truth" for t in truth
+        if !(t in keys(classes))
+            classes[t] = i
+            i += 1
+        end
+    end
+
+    # prep model
+    @timer logger "preprocessing input" preprocessed_text = mapper((x) -> mapper(preprocess, x), text)
+    @timer logger "making background model" bkgmodel = make_background(flatten_iter(preprocessed_text), mincount = mincount, prune = prune,
+                                                                       norm = stats -> tfnorm(stats, squash = sqrt, cutoff = cutoff))
+    @timer logger "making feature vectors" fvs = mapper(text -> tc_features(text, bkgmodel), preprocessed_text)
+    @timer logger "initializing model" init_model = LinearModel(classes, vocab_size(bkgmodel))
+    @timer logger "training final model" model = trainer(fvs, truth, init_model)
+
+    return bkgmodel, ptext -> tc_features(ptext, bkgmodel), model
+end
+
+flatten(a) = mapreduce(x -> isa(x, Array) ? flatten(x) : x, vcat, [], a)
+flatten_iter(a) = map((x) -> flatten(x), a)
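
Since the nested-array tc_train hinges on it, a quick sketch of what flatten produces (outputs in comments are expected values, not captured REPL output):

    # Sketch: flatten recursively splices nested arrays into one vector,
    # so a corpus of token-array documents can feed one background model.
    flatten(Any[["a", "b"], ["c"]])   # expected: ["a", "b", "c"]
    flatten(Any[[["a"]], ["b", "c"]]) # expected: ["a", "b", "c"] (any depth)
    flatten_iter(Any[[["a"], ["b"]]]) # flattens each element, keeping the outer structure
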
12 changes: 6 additions & 6 deletions src/tokenizers.jl
@@ -27,11 +27,11 @@ const url_pattern = r"http://[^\s]*"
 const hashtag_pattern = r"^#.*$"
 const mention_pattern = r"^@.*$"

-function replace_html_entities(s :: String)
+function replace_html_entities(s :: AbstractString)
     replace(s, r"&[^;]+?;", s -> s in keys(html_entity_table) ? html_entity_table[s] : s)
 end

-function pattern_replace(w :: String)
+function pattern_replace(w :: AbstractString)
     if ismatch(r"^[+-]?\p{Sc}\d+([.,]\d+)*$", w) return "--currency--"
     elseif ismatch(r"^[+-]?\d+([.,]\d+)*%$", w) return "--percent--"
     elseif ismatch(r"^[+-]?\d+([.,]\d+)*$", w) return "--number--"
@@ -44,14 +44,14 @@ function pattern_replace(w :: String)
     end
 end

-function prereplace(sent :: String)
+function prereplace(sent :: AbstractString)
     r = replace(sent, r"n't\b", " not")
     r = replace(r, r"'s\b", " s's")
     r = replace(r, r"'d\b", " d'd")
 end


-function english_tokenizer(s :: String)
+function english_tokenizer(s :: AbstractString)
     return [
         begin
             m = match(punct_word, w)
@@ -61,7 +61,7 @@ function english_tokenizer(s :: String)
     ]
 end

-function twenglish_tokenizer(s :: String)
+function twenglish_tokenizer(s :: AbstractString)
     return [
         begin
             m = match(r"^(\p{P}*)(.*?)\p{P}*$", w)
@@ -71,7 +71,7 @@ function twenglish_tokenizer(s :: String)
     ]
 end

-function twenglish_cleaner(tw :: String; urls = true, hashtags = true, mentions = true)
+function twenglish_cleaner(tw :: AbstractString; urls = true, hashtags = true, mentions = true)
     ctw = replace(normalize_string(tw, :NFKC), default_space, " ")
     ctw = urls ? replace(ctw, url_pattern, "\u0030\u20E3") : ctw
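
A few hedged examples of what pattern_replace normalizes, derived from the regexes above (outputs in comments are expected values; calling it directly assumes the function is reachable from your namespace):

    # Sketch: token-level normalizations performed by pattern_replace.
    pattern_replace("\$5.00") # expected: "--currency--"
    pattern_replace("75%")    # expected: "--percent--"
    pattern_replace("1,234")  # expected: "--number--"
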
4 changes: 2 additions & 2 deletions test/lid.jl
@@ -15,11 +15,11 @@
                                        trainer = (fvs, truth, init_model) -> train_mira(fvs, truth, init_model, iterations = 20, k = 2, C = 0.01, average = true),
                                        iteration_method = :eager)

-confmat = DefaultDict(String, DefaultDict{String, Int32}, () -> DefaultDict(String, Int32, 0))
+confmat = DefaultDict(AbstractString, DefaultDict{AbstractString, Int32}, () -> DefaultDict(AbstractString, Int32, 0))
 res = test_classification(model, lazy_map(x -> fextractor(lid_iterating_tokenizer(x)), test), test_truth, record = (t, h) -> confmat[t][h] += 1) * 100.0
 @info @sprintf("mira test set error rate: %7.3f", res)
 print_confusion_matrix(confmat)
-@expect abs(res - 0.596) < 0.01
+@expect abs(res - 0.700) < 0.01

 # List specific errors
 # for (text, t) in zip(test, test_truth)
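
For readers unfamiliar with the nested DefaultDict, a sketch of how the confusion matrix fills in (labels invented):

    # Sketch: confmat[truth][hypothesis] cells spring into existence on
    # first access, so the record callback can bump them directly.
    record = (t, h) -> confmat[t][h] += 1
    record("en", "en"); record("en", "de")
    # confmat["en"]["de"] is now 1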