From a2708ce5688339dd1b974215e6523fb71d2b32c6 Mon Sep 17 00:00:00 2001 From: Elizabeth Salesky Date: Fri, 7 Aug 2015 16:34:29 -0400 Subject: [PATCH 1/4] - export BKG, ngrams to return all n-grams for all orders with truncated_start=false --- src/features.jl | 6 ++++-- src/models.jl | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/features.jl b/src/features.jl index e21ddcb..ffd0241 100644 --- a/src/features.jl +++ b/src/features.jl @@ -62,8 +62,10 @@ function ngrams(words::Array; order = 2, truncated_start = false) ret = String[] if !truncated_start - for wi = 1:min(order - 1, length(words)) - push!(ret, make_string(words, 1, wi)) + for o = 1:min(order - 1, length(words)) + for wi = 1:length(words)-(o-1) + push!(ret, make_string(words, wi, wi + o - 1)) + end end end diff --git a/src/models.jl b/src/models.jl index 7d67db4..0e9eb2d 100644 --- a/src/models.jl +++ b/src/models.jl @@ -16,7 +16,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -export make_background, stats, vocab_size, apply +export make_background, stats, vocab_size, apply, BKG type BKG dict :: Associative{String, Int32} From 921719d7c348125caeb1515c695537c58587c2d9 Mon Sep 17 00:00:00 2001 From: Elizabeth Salesky Date: Tue, 11 Aug 2015 10:55:24 -0400 Subject: [PATCH 2/4] - applied ngrams change to Strings, updated tests --- src/features.jl | 27 ++++++++++++++++----------- test/lid.jl | 2 +- test/runtests.jl | 38 ++++++++++++++++++-------------------- 3 files changed, 35 insertions(+), 32 deletions(-) diff --git a/src/features.jl b/src/features.jl index ffd0241..1358f8d 100644 --- a/src/features.jl +++ b/src/features.jl @@ -20,19 +20,19 @@ export ngrams, count, tfnorm, sparse_count, norm, znorm, ngram_iterator, ngrams! 
immutable NgramStringIterator string :: String - order :: Int32 + order :: Int32 truncated_start :: Bool end type StringPosition - start :: Int32 - fin :: Int32 - nth :: Int32 + start :: Int32 + fin :: Int32 + nth :: Int32 end function start(ngi :: NgramStringIterator) if ngi.truncated_start idx = 1 - for i = 1:(ngi.order-1) + for i = 1:(ngi.order-1) #necessary because strings are indexed to bytes, not characters idx = nextind(ngi.string, idx) end return StringPosition(1, idx, ngi.order) @@ -41,14 +41,20 @@ function start(ngi :: NgramStringIterator) end end -done(ngi :: NgramStringIterator, position) = position.fin > endof(ngi.string) +done(ngi :: NgramStringIterator, position) = position.nth > ngi.order || position.fin > endof(ngi.string) function next(ngi :: NgramStringIterator, position) str = make_string(ngi.string, position.start, position.fin) - if position.nth >= ngi.order - position.start = nextind(ngi.string, position.start) + + if position.fin >= endof(ngi.string) + position.start = 0 + position.fin = 1 + for i = 1:position.nth-1 + position.fin = nextind(ngi.string, position.fin) + end + position.nth += 1 end - position.nth += 1 - position.fin = nextind(ngi.string, position.fin) + position.start = nextind(ngi.string, position.start) + position.fin = nextind(ngi.string, position.fin) return str, position end @@ -100,7 +106,6 @@ function sparse_count(text, bkg) return vec end - function dict_count(tokens) map = DefaultDict{String,Int32}() for w in tokens diff --git a/test/lid.jl b/test/lid.jl index d573d21..f8ec04f 100644 --- a/test/lid.jl +++ b/test/lid.jl @@ -19,7 +19,7 @@ confmat = DefaultDict(String, DefaultDict{String, Int32}, () -> DefaultDict(Stri res = test_classification(model, lazy_map(x -> fextractor(lid_iterating_tokenizer(x)), test), test_truth, record = (t, h) -> confmat[t][h] += 1) * 100.0 @info @sprintf("mira test set error rate: %7.3f", res) print_confusion_matrix(confmat) -@expect abs(res - 0.596) < 0.01 +@expect abs(res - 0.700) < 0.01 # List specific errors # for (text, t) in zip(test, test_truth) diff --git a/test/runtests.jl b/test/runtests.jl index 273086a..fe09d0f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -52,10 +52,10 @@ end # feature extraction # ------------------------------------------------------------------------------------------------------------------------- # ngrams from arrays -@expect ngrams(["a", "b", "c"], order = 3) == ["a", "a b", "a b c"] +@expect ngrams(["a", "b", "c"], order = 3) == ["a", "b", "c", "a b", "b c", "a b c"] @expect ngrams(["a", "b", "c"], order = 3, truncated_start = true) == ["a b c"] -@expect ngrams(["a", "b", "c"], order = 2) == ["a", "a b", "b c"] +@expect ngrams(["a", "b", "c"], order = 2) == ["a", "b", "c", "a b", "b c"] @expect ngrams(["a", "b", "c"], order = 2, truncated_start = true) == ["a b", "b c"] @expect ngrams(["a", "b", "c"], order = 1) == ["a", "b", "c"] @@ -65,59 +65,59 @@ end @expect ngrams(["a"], order = 3, truncated_start = true) == [] # ngrams from strings -@expect ngrams("abc", order = 3) == ["a", "ab", "abc"] +@expect ngrams("abc", order = 3) == ["a", "b", "c", "ab", "bc", "abc"] @expect ngrams("abc", order = 3, truncated_start = true) == ["abc"] -@expect ngrams("abc", order = 2) == ["a", "ab", "bc"] +@expect ngrams("abc", order = 2) == ["a", "b", "c", "ab", "bc"] @expect ngrams("abc", order = 2, truncated_start = true) == ["ab", "bc"] @expect ngrams("abc", order = 1) == ["a", "b", "c"] @expect ngrams("abc", order = 1, truncated_start = true) == ["a", "b", "c"] @expect ngrams("a", order 
= 3) == ["a"] -@expect ngrams("ab", order = 3) == ["a", "ab"] -@expect ngrams("abcd", order = 3) == ["a", "ab", "abc", "bcd"] +@expect ngrams("ab", order = 3) == ["a", "b", "ab"] +@expect ngrams("abcd", order = 3) == ["a", "b", "c", "d", "ab", "bc", "cd", "abc", "bcd"] @expect ngrams("a", order = 3, truncated_start = true) == [] @expect ngrams("ab", order = 3, truncated_start = true) == [] @expect ngrams("abcd", order = 3, truncated_start = true) == ["abc", "bcd"] @expect ngrams("是的", order = 1) == ["是", "的"] -@expect ngrams("是的", order = 2) == ["是", "是的"] -@expect ngrams("是的", order = 3) == ["是", "是的"] +@expect ngrams("是的", order = 2) == ["是", "的", "是的"] +@expect ngrams("是的", order = 3) == ["是", "的", "是的"] @expect ngrams("是的", order = 3, truncated_start = true) == [] @expect ngrams("陇陇*", order = 1) == ["陇", "陇", "*"] -@expect ngrams("陇陇*", order = 2) == ["陇", "陇陇", "陇*"] -@expect ngrams("陇陇*", order = 3) == ["陇", "陇陇", "陇陇*"] +@expect ngrams("陇陇*", order = 2) == ["陇", "陇", "*", "陇陇", "陇*"] +@expect ngrams("陇陇*", order = 3) == ["陇", "陇", "*", "陇陇", "陇*", "陇陇*"] @expect ngrams("陇陇*", order = 3, truncated_start = true) == ["陇陇*"] @expect ngrams("", order = 1) == [] # ngram iterator -@expect collect(ngram_iterator("abc", order = 3)) == ["a", "ab", "abc"] +@expect collect(ngram_iterator("abc", order = 3)) == ["a", "b", "c", "ab", "bc", "abc"] @expect collect(ngram_iterator("abc", order = 3, truncated_start = true)) == ["abc"] -@expect collect(ngram_iterator("abc", order = 2)) == ["a", "ab", "bc"] +@expect collect(ngram_iterator("abc", order = 2)) == ["a", "b", "c", "ab", "bc"] @expect collect(ngram_iterator("abc", order = 2, truncated_start = true)) == ["ab", "bc"] @expect collect(ngram_iterator("abc", order = 1)) == ["a", "b", "c"] @expect collect(ngram_iterator("abc", order = 1, truncated_start = true)) == ["a", "b", "c"] @expect collect(ngram_iterator("a", order = 3)) == ["a"] -@expect collect(ngram_iterator("ab", order = 3)) == ["a", "ab"] -@expect collect(ngram_iterator("abcd", order = 3)) == ["a", "ab", "abc", "bcd"] +@expect collect(ngram_iterator("ab", order = 3)) == ["a", "b", "ab"] +@expect collect(ngram_iterator("abcd", order = 3)) == ["a", "b", "c", "d", "ab", "bc", "cd", "abc", "bcd"] @expect collect(ngram_iterator("a", order = 3, truncated_start = true)) == [] @expect collect(ngram_iterator("ab", order = 3, truncated_start = true)) == [] @expect collect(ngram_iterator("abcd", order = 3, truncated_start = true)) == ["abc", "bcd"] @expect collect(ngram_iterator("是的", order = 1)) == ["是", "的"] -@expect collect(ngram_iterator("是的", order = 2)) == ["是", "是的"] -@expect collect(ngram_iterator("是的", order = 3)) == ["是", "是的"] +@expect collect(ngram_iterator("是的", order = 2)) == ["是", "的", "是的"] +@expect collect(ngram_iterator("是的", order = 3)) == ["是", "的", "是的"] @expect collect(ngram_iterator("是的", order = 3, truncated_start = true)) == [] @expect collect(ngram_iterator("陇陇*", order = 1)) == ["陇", "陇", "*"] -@expect collect(ngram_iterator("陇陇*", order = 2)) == ["陇", "陇陇", "陇*"] -@expect collect(ngram_iterator("陇陇*", order = 3)) == ["陇", "陇陇", "陇陇*"] +@expect collect(ngram_iterator("陇陇*", order = 2)) == ["陇", "陇", "*", "陇陇", "陇*"] +@expect collect(ngram_iterator("陇陇*", order = 3)) == ["陇", "陇", "*", "陇陇", "陇*", "陇陇*"] @expect collect(ngram_iterator("陇陇*", order = 3, truncated_start = true)) == ["陇陇*"] @expect collect(ngram_iterator("", order = 1)) == [] @@ -150,5 +150,3 @@ bkg = make_background(lines, mincount = 2) include("lid.jl") include("topic.jl") - - From 
0e662148ecd851c06b822227fa3269e95d57e0b7 Mon Sep 17 00:00:00 2001
From: Elizabeth Salesky
Date: Tue, 5 Jan 2016 15:34:04 -0500
Subject: [PATCH 3/4] - moving to v0.4 julia

---
 .travis.yml       |    2 +-
 REQUIRE           |    2 +-
 src/constants.jl  | 1351 ++++++++++++++++++++++-----------------
 src/features.jl   |   23 +-
 src/models.jl     |   15 +-
 src/readers.jl    |   14 +-
 src/tc.jl         |    2 +-
 src/tokenizers.jl |   12 +-
 test/lid.jl       |    2 +-
 test/runtests.jl  |    6 +-
 test/topic.jl     |    6 +-
 11 files changed, 718 insertions(+), 717 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 59ec666..c1af4f9 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -14,4 +14,4 @@ before_install:
  - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
 script:
  - julia -e 'versioninfo(); Pkg.init(); Pkg.clone("https://github.com/saltpork/Stage.jl"); Pkg.clone("https://github.com/mit-nlp/Ollam.jl"); Pkg.clone(pwd())'
- - cd test; julia --color runtests.jl
+ - cd test; julia --color=yes runtests.jl
diff --git a/REQUIRE b/REQUIRE
index 1859b86..1fab71a 100644
--- a/REQUIRE
+++ b/REQUIRE
@@ -1,4 +1,4 @@
-julia 0.3 0.4-
+julia 0.4-
 DataStructures
 Devectorize
 Iterators
diff --git a/src/constants.jl b/src/constants.jl
index 7111d47..5b804f7 100644
--- a/src/constants.jl
+++ b/src/constants.jl
@@ -22,687 +22,686 @@ export default_space, english_stoplist, news_email_header, unk_token, html_entit
 const default_space = r"[\s\p{Zs}]+"

 # PHP's table
-const html_entity_table = [
- "&nbsp;" => " ",
- "&iexcl;" => "¡",
- "&cent;" => "¢",
- "&pound;" => "£",
- "&curren;" => "¤",
- "&yen;" => "¥",
- "&brvbar;" => "¦",
- "&sect;" => "§",
- "&uml;" => "¨",
- "&copy;" => "©",
- "&ordf;" => "ª",
- "&laquo;" => "«",
- "&not;" => "¬",
- "&shy;" => "­",
- "&reg;" => "®",
- "&macr;" => "¯",
- "&deg;" => "°",
- "&plusmn;" => "±",
- "&sup2;" => "²",
- "&sup3;" => "³",
- "&acute;" => "´",
- "&micro;" => "µ",
- "&para;" => "¶",
- "&middot;" => "·",
- "&cedil;" => "¸",
- "&sup1;" => "¹",
- "&ordm;" => "º",
- "&raquo;" => "»",
- "&frac14;" => "¼",
- "&frac12;" => "½",
- "&frac34;" => "¾",
- "&iquest;" => "¿",
- "&Agrave;" => "À",
- "&Aacute;" => "Á",
- "&Acirc;" => "Â",
- "&Atilde;" => "Ã",
- "&Auml;" => "Ä",
- "&Aring;" => "Å",
- "&AElig;" => "Æ",
- "&Ccedil;" => "Ç",
- "&Egrave;" => "È",
- "&Eacute;" => "É",
- "&Ecirc;" => "Ê",
- "&Euml;" => "Ë",
- "&Igrave;" => "Ì",
- "&Iacute;" => "Í",
- "&Icirc;" => "Î",
- "&Iuml;" => "Ï",
- "&ETH;" => "Ð",
- "&Ntilde;" => "Ñ",
- "&Ograve;" => "Ò",
- "&Oacute;" => "Ó",
- "&Ocirc;" => "Ô",
- "&Otilde;" => "Õ",
- "&Ouml;" => "Ö",
- "&times;" => "×",
- "&Oslash;" => "Ø",
- "&Ugrave;" => "Ù",
- "&Uacute;" => "Ú",
- "&Ucirc;" => "Û",
- "&Uuml;" => "Ü",
- "&Yacute;" => "Ý",
- "&THORN;" => "Þ",
- "&szlig;" => "ß",
- "&agrave;" => "à",
- "&aacute;" => "á",
- "&acirc;" => "â",
- "&atilde;" => "ã",
- "&auml;" => "ä",
- "&aring;" => "å",
- "&aelig;" => "æ",
- "&ccedil;" => "ç",
- "&egrave;" => "è",
- "&eacute;" => "é",
- "&ecirc;" => "ê",
- "&euml;" => "ë",
- "&igrave;" => "ì",
- "&iacute;" => "í",
- "&icirc;" => "î",
- "&iuml;" => "ï",
- "&eth;" => "ð",
- "&ntilde;" => "ñ",
- "&ograve;" => "ò",
- "&oacute;" => "ó",
- "&ocirc;" => "ô",
- "&otilde;" => "õ",
- "&ouml;" => "ö",
- "&divide;" => "÷",
- "&oslash;" => "ø",
- "&ugrave;" => "ù",
- "&uacute;" => "ú",
- "&ucirc;" => "û",
- "&uuml;" => "ü",
- "&yacute;" => "ý",
- "&thorn;" => "þ",
- "&yuml;" => "ÿ",
- "&amp;" => "&",
- "&quot;" => "\"",
- "&lt;" => "<",
- "&gt;" => ">",
-]
+const html_entity_table = Dict(
+ "&nbsp;" => " ",
+ "&iexcl;" => "¡",
+ "&cent;" => "¢",
+ "&pound;" => "£",
+ "&curren;" => "¤",
+ "&yen;" => "¥",
+ "&brvbar;" => "¦",
+ "&sect;" => "§",
+ "&uml;" => "¨",
+ "&copy;" => "©",
+ "&ordf;" => "ª",
+ "&laquo;" => "«",
+ "&not;" => "¬",
+ "&shy;" => "­",
+ "&reg;" => "®",
+ "&macr;" => "¯",
+ "&deg;" => "°",
+ "&plusmn;" => "±",
+ "&sup2;" => "²",
+ "&sup3;" => "³",
+ "&acute;" => "´",
+ "&micro;" => "µ",
+ "&para;" => "¶",
+ "&middot;" => "·",
+ "&cedil;" => "¸",
+ "&sup1;" => "¹",
+ "&ordm;" => "º",
+ "&raquo;" => "»",
+ "&frac14;" => "¼",
+ "&frac12;" => "½",
+ "&frac34;" => "¾",
+ "&iquest;" => "¿",
+ "&Agrave;" => "À",
+ "&Aacute;" => "Á",
+ "&Acirc;" => "Â",
+ "&Atilde;" => "Ã",
+ "&Auml;" => "Ä",
+ "&Aring;" => "Å",
+ "&AElig;" => "Æ",
+ "&Ccedil;" => "Ç",
+ "&Egrave;" => "È",
+ "&Eacute;" => "É",
+ "&Ecirc;" => "Ê",
+ "&Euml;" => "Ë",
+ "&Igrave;" => "Ì",
+ "&Iacute;" => "Í",
+ "&Icirc;" => "Î",
+ "&Iuml;" => "Ï",
+ "&ETH;" => "Ð",
+ "&Ntilde;" => "Ñ",
+ "&Ograve;" => "Ò",
+ "&Oacute;" => "Ó",
+ "&Ocirc;" => "Ô",
+ "&Otilde;" => "Õ",
+ "&Ouml;" => "Ö",
+ "&times;" => "×",
+ "&Oslash;" => "Ø",
+ "&Ugrave;" => "Ù",
+ "&Uacute;" => "Ú",
+ "&Ucirc;" => "Û",
+ "&Uuml;" => "Ü",
+ "&Yacute;" => "Ý",
+ "&THORN;" => "Þ",
+ "&szlig;" => "ß",
+ "&agrave;" => "à",
+ "&aacute;" => "á",
+ "&acirc;" => "â",
+ "&atilde;" => "ã",
+ "&auml;" => "ä",
+ "&aring;" => "å",
+ "&aelig;" => "æ",
+ "&ccedil;" => "ç",
+ "&egrave;" => "è",
+ "&eacute;" => "é",
+ "&ecirc;" => "ê",
+ "&euml;" => "ë",
+ "&igrave;" => "ì",
+ "&iacute;" => "í",
+ "&icirc;" => "î",
+ "&iuml;" => "ï",
+ "&eth;" => "ð",
+ "&ntilde;" => "ñ",
+ "&ograve;" => "ò",
+ "&oacute;" => "ó",
+ "&ocirc;" => "ô",
+ "&otilde;" => "õ",
+ "&ouml;" => "ö",
+ "&divide;" => "÷",
+ "&oslash;" => "ø",
+ "&ugrave;" => "ù",
+ "&uacute;" => "ú",
+ "&ucirc;" => "û",
+ "&uuml;" => "ü",
+ "&yacute;" => "ý",
+ "&thorn;" => "þ",
+ "&yuml;" => "ÿ",
+ "&amp;" => "&",
+ "&quot;" => "\"",
+ "&lt;" => "<",
+ "&gt;" => ">",
+ )

 # -------------------------------------------------------------------------------------------------------------------------
 # From Lewis04's SMART stop list
 # -------------------------------------------------------------------------------------------------------------------------
-const english_stoplist = Set(String["a",
- "a's",
- "able",
- "about",
- "above",
- "according",
- "accordingly",
- "across",
- "actually",
- "after",
- "afterwards",
- "again",
- "against",
- "ain't",
- "all",
- "allow",
- "allows",
- "almost",
- "alone",
- "along",
- "already",
- "also",
- "although",
- "always",
- "am",
- "among",
- "amongst",
- "an",
- "and",
- "another",
- "any",
- "anybody",
- "anyhow",
- "anyone",
- "anything",
- "anyway",
- "anyways",
- "anywhere",
- "apart",
- "appear",
- "appreciate",
- "appropriate",
- "are",
- "aren't",
- "around",
- "as",
- "aside",
- "ask",
- "asking",
- "associated",
- "at",
- "available",
- "away",
- "awfully",
- "b",
- "be",
- "became",
- "because",
- "become",
- "becomes",
- "becoming",
- "been",
- "before",
- "beforehand",
- "behind",
- "being",
- "believe",
- "below",
- "beside",
- "besides",
- "best",
- "better",
- "between",
- "beyond",
- "both",
- "brief",
- "but",
- "by",
- "c",
- "c'mon",
- "c's",
- "came",
- "can",
- "can't",
- "cannot",
- "cant",
- "cause",
- "causes",
- "certain",
- "certainly",
- "changes",
- "clearly",
- "co",
- "com",
- "come",
- "comes",
- "concerning",
- "consequently",
- "consider",
- "considering",
- "contain",
- "containing",
- "contains",
- "corresponding",
- "could",
- "couldn't",
- "course",
- "currently",
- "d",
- "definitely",
- "described",
- "despite",
- "did",
- "didn't",
- "different",
- "do",
- "does",
- "doesn't",
- "doing",
- "don't",
- "done",
- "down",
- "downwards",
- "during",
- "e",
- "each",
- "edu",
- "eg",
- "eight",
- "either",
- "else",
- "elsewhere",
- "enough",
- "entirely",
- "especially",
- "et",
- "etc",
- "even",
- "ever",
- "every",
- "everybody",
- "everyone",
- "everything",
- "everywhere",
- "ex",
- "exactly",
- "example",
- "except",
- "f",
- "far",
- "few",
- "fifth",
- "first",
- "five",
- "followed",
- "following",
- "follows",
- "for",
- "former",
- "formerly",
- "forth",
- "four",
- "from",
- "further",
- "furthermore",
- "g",
- "get",
- "gets",
- "getting",
- "given",
- "gives",
- "go",
- "goes",
- "going",
- "gone",
- "got",
- "gotten",
- "greetings",
- "h",
- "had",
- "hadn't",
- "happens",
- "hardly",
- "has",
- "hasn't",
- "have",
- "haven't",
- "having",
- "he",
- "he's",
- "hello",
- "help",
- "hence",
- "her",
- "here",
- "here's",
- "hereafter",
- "hereby",
- "herein",
- "hereupon",
- "hers",
- "herself",
- "hi",
- "him",
- "himself",
- "his",
- "hither",
- "hopefully",
- "how",
- "howbeit",
- "however",
- "i",
- "i'd",
- "i'll",
- "i'm",
- "i've",
- "ie",
- "if",
"ignored", - "immediate", - "in", - "inasmuch", - "inc", - "indeed", - "indicate", - "indicated", - "indicates", - "inner", - "insofar", - "instead", - "into", - "inward", - "is", - "isn't", - "it", - "it'd", - "it'll", - "it's", - "its", - "itself", - "j", - "just", - "k", - "keep", - "keeps", - "kept", - "know", - "knows", - "known", - "l", - "last", - "lately", - "later", - "latter", - "latterly", - "least", - "less", - "lest", - "let", - "let's", - "like", - "liked", - "likely", - "little", - "look", - "looking", - "looks", - "ltd", - "m", - "mainly", - "many", - "may", - "maybe", - "me", - "mean", - "meanwhile", - "merely", - "might", - "more", - "moreover", - "most", - "mostly", - "much", - "must", - "my", - "myself", - "n", - "name", - "namely", - "nd", - "near", - "nearly", - "necessary", - "need", - "needs", - "neither", - "never", - "nevertheless", - "new", - "next", - "nine", - "no", - "nobody", - "non", - "none", - "noone", - "nor", - "normally", - "not", - "nothing", - "novel", - "now", - "nowhere", - "o", - "obviously", - "of", - "off", - "often", - "oh", - "ok", - "okay", - "old", - "on", - "once", - "one", - "ones", - "only", - "onto", - "or", - "other", - "others", - "otherwise", - "ought", - "our", - "ours", - "ourselves", - "out", - "outside", - "over", - "overall", - "own", - "p", - "particular", - "particularly", - "per", - "perhaps", - "placed", - "please", - "plus", - "possible", - "presumably", - "probably", - "provides", - "q", - "que", - "quite", - "qv", - "r", - "rather", - "rd", - "re", - "really", - "reasonably", - "regarding", - "regardless", - "regards", - "relatively", - "respectively", - "right", - "s", - "said", - "same", - "saw", - "say", - "saying", - "says", - "second", - "secondly", - "see", - "seeing", - "seem", - "seemed", - "seeming", - "seems", - "seen", - "self", - "selves", - "sensible", - "sent", - "serious", - "seriously", - "seven", - "several", - "shall", - "she", - "should", - "shouldn't", - "since", - "six", - "so", - "some", - "somebody", - "somehow", - "someone", - "something", - "sometime", - "sometimes", - "somewhat", - "somewhere", - "soon", - "sorry", - "specified", - "specify", - "specifying", - "still", - "sub", - "such", - "sup", - "sure", - "t", - "t's", - "take", - "taken", - "tell", - "tends", - "th", - "than", - "thank", - "thanks", - "thanx", - "that", - "that's", - "thats", - "the", - "their", - "theirs", - "them", - "themselves", - "then", - "thence", - "there", - "there's", - "thereafter", - "thereby", - "therefore", - "therein", - "theres", - "thereupon", - "these", - "they", - "they'd", - "they'll", - "they're", - "they've", - "think", - "third", - "this", - "thorough", - "thoroughly", - "those", - "though", - "three", - "through", - "throughout", - "thru", - "thus", - "to", - "together", - "too", - "took", - "toward", - "towards", - "tried", - "tries", - "truly", - "try", - "trying", - "twice", - "two", - "u", - "un", - "under", - "unfortunately", - "unless", - "unlikely", - "until", - "unto", - "up", - "upon", - "us", - "use", - "used", - "useful", - "uses", - "using", - "usually", - "uucp", - "v", - "value", - "various", - "very", - "via", - "viz", - "vs", - "w", - "want", - "wants", - "was", - "wasn't", - "way", - "we", - "we'd", - "we'll", - "we're", - "we've", - "welcome", - "well", - "went", - "were", - "weren't", - "what", - "what's", - "whatever", - "when", - "whence", - "whenever", - "where", - "where's", - "whereafter", - "whereas", - "whereby", - "wherein", - "whereupon", - "wherever", - "whether", - "which", 
- "while", - "whither", - "who", - "who's", - "whoever", - "whole", - "whom", - "whose", - "why", - "will", - "willing", - "wish", - "with", - "within", - "without", - "won't", - "wonder", - "would", - "would", - "wouldn't", - "x", - "y", - "yes", - "yet", - "you", - "you'd", - "you'll", - "you're", - "you've", - "your", - "yours", - "yourself", - "yourselves", - "z", - "zero", - "--number--", - ]) +const english_stoplist = Set(AbstractString["a", + "a's", + "able", + "about", + "above", + "according", + "accordingly", + "across", + "actually", + "after", + "afterwards", + "again", + "against", + "ain't", + "all", + "allow", + "allows", + "almost", + "alone", + "along", + "already", + "also", + "although", + "always", + "am", + "among", + "amongst", + "an", + "and", + "another", + "any", + "anybody", + "anyhow", + "anyone", + "anything", + "anyway", + "anyways", + "anywhere", + "apart", + "appear", + "appreciate", + "appropriate", + "are", + "aren't", + "around", + "as", + "aside", + "ask", + "asking", + "associated", + "at", + "available", + "away", + "awfully", + "b", + "be", + "became", + "because", + "become", + "becomes", + "becoming", + "been", + "before", + "beforehand", + "behind", + "being", + "believe", + "below", + "beside", + "besides", + "best", + "better", + "between", + "beyond", + "both", + "brief", + "but", + "by", + "c", + "c'mon", + "c's", + "came", + "can", + "can't", + "cannot", + "cant", + "cause", + "causes", + "certain", + "certainly", + "changes", + "clearly", + "co", + "com", + "come", + "comes", + "concerning", + "consequently", + "consider", + "considering", + "contain", + "containing", + "contains", + "corresponding", + "could", + "couldn't", + "course", + "currently", + "d", + "definitely", + "described", + "despite", + "did", + "didn't", + "different", + "do", + "does", + "doesn't", + "doing", + "don't", + "done", + "down", + "downwards", + "during", + "e", + "each", + "edu", + "eg", + "eight", + "either", + "else", + "elsewhere", + "enough", + "entirely", + "especially", + "et", + "etc", + "even", + "ever", + "every", + "everybody", + "everyone", + "everything", + "everywhere", + "ex", + "exactly", + "example", + "except", + "f", + "far", + "few", + "fifth", + "first", + "five", + "followed", + "following", + "follows", + "for", + "former", + "formerly", + "forth", + "four", + "from", + "further", + "furthermore", + "g", + "get", + "gets", + "getting", + "given", + "gives", + "go", + "goes", + "going", + "gone", + "got", + "gotten", + "greetings", + "h", + "had", + "hadn't", + "happens", + "hardly", + "has", + "hasn't", + "have", + "haven't", + "having", + "he", + "he's", + "hello", + "help", + "hence", + "her", + "here", + "here's", + "hereafter", + "hereby", + "herein", + "hereupon", + "hers", + "herself", + "hi", + "him", + "himself", + "his", + "hither", + "hopefully", + "how", + "howbeit", + "however", + "i", + "i'd", + "i'll", + "i'm", + "i've", + "ie", + "if", + "ignored", + "immediate", + "in", + "inasmuch", + "inc", + "indeed", + "indicate", + "indicated", + "indicates", + "inner", + "insofar", + "instead", + "into", + "inward", + "is", + "isn't", + "it", + "it'd", + "it'll", + "it's", + "its", + "itself", + "j", + "just", + "k", + "keep", + "keeps", + "kept", + "know", + "knows", + "known", + "l", + "last", + "lately", + "later", + "latter", + "latterly", + "least", + "less", + "lest", + "let", + "let's", + "like", + "liked", + "likely", + "little", + "look", + "looking", + "looks", + "ltd", + "m", + "mainly", + "many", + "may", + "maybe", + "me", 
+ "mean", + "meanwhile", + "merely", + "might", + "more", + "moreover", + "most", + "mostly", + "much", + "must", + "my", + "myself", + "n", + "name", + "namely", + "nd", + "near", + "nearly", + "necessary", + "need", + "needs", + "neither", + "never", + "nevertheless", + "new", + "next", + "nine", + "no", + "nobody", + "non", + "none", + "noone", + "nor", + "normally", + "not", + "nothing", + "novel", + "now", + "nowhere", + "o", + "obviously", + "of", + "off", + "often", + "oh", + "ok", + "okay", + "old", + "on", + "once", + "one", + "ones", + "only", + "onto", + "or", + "other", + "others", + "otherwise", + "ought", + "our", + "ours", + "ourselves", + "out", + "outside", + "over", + "overall", + "own", + "p", + "particular", + "particularly", + "per", + "perhaps", + "placed", + "please", + "plus", + "possible", + "presumably", + "probably", + "provides", + "q", + "que", + "quite", + "qv", + "r", + "rather", + "rd", + "re", + "really", + "reasonably", + "regarding", + "regardless", + "regards", + "relatively", + "respectively", + "right", + "s", + "said", + "same", + "saw", + "say", + "saying", + "says", + "second", + "secondly", + "see", + "seeing", + "seem", + "seemed", + "seeming", + "seems", + "seen", + "self", + "selves", + "sensible", + "sent", + "serious", + "seriously", + "seven", + "several", + "shall", + "she", + "should", + "shouldn't", + "since", + "six", + "so", + "some", + "somebody", + "somehow", + "someone", + "something", + "sometime", + "sometimes", + "somewhat", + "somewhere", + "soon", + "sorry", + "specified", + "specify", + "specifying", + "still", + "sub", + "such", + "sup", + "sure", + "t", + "t's", + "take", + "taken", + "tell", + "tends", + "th", + "than", + "thank", + "thanks", + "thanx", + "that", + "that's", + "thats", + "the", + "their", + "theirs", + "them", + "themselves", + "then", + "thence", + "there", + "there's", + "thereafter", + "thereby", + "therefore", + "therein", + "theres", + "thereupon", + "these", + "they", + "they'd", + "they'll", + "they're", + "they've", + "think", + "third", + "this", + "thorough", + "thoroughly", + "those", + "though", + "three", + "through", + "throughout", + "thru", + "thus", + "to", + "together", + "too", + "took", + "toward", + "towards", + "tried", + "tries", + "truly", + "try", + "trying", + "twice", + "two", + "u", + "un", + "under", + "unfortunately", + "unless", + "unlikely", + "until", + "unto", + "up", + "upon", + "us", + "use", + "used", + "useful", + "uses", + "using", + "usually", + "uucp", + "v", + "value", + "various", + "very", + "via", + "viz", + "vs", + "w", + "want", + "wants", + "was", + "wasn't", + "way", + "we", + "we'd", + "we'll", + "we're", + "we've", + "welcome", + "well", + "went", + "were", + "weren't", + "what", + "what's", + "whatever", + "when", + "whence", + "whenever", + "where", + "where's", + "whereafter", + "whereas", + "whereby", + "wherein", + "whereupon", + "wherever", + "whether", + "which", + "while", + "whither", + "who", + "who's", + "whoever", + "whole", + "whom", + "whose", + "why", + "will", + "willing", + "wish", + "with", + "within", + "without", + "won't", + "wonder", + "would", + "would", + "wouldn't", + "x", + "y", + "yes", + "yet", + "you", + "you'd", + "you'll", + "you're", + "you've", + "your", + "yours", + "yourself", + "yourselves", + "z", + "zero", + "--number--", + ]) const news_email_header = r"^((to|from|subject|organization|keywords|summary|lines|distribution|reply-to|nntp-posting-host|disclaimer|x-newsreader|article-i.d.):|(-|\|).*$|-+$|in 
article.*writes:$|in article.*$|.* writes:$)" const unk_token = "--UNK--" - diff --git a/src/features.jl b/src/features.jl index 1358f8d..32854dd 100644 --- a/src/features.jl +++ b/src/features.jl @@ -16,14 +16,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import Base.norm export ngrams, count, tfnorm, sparse_count, norm, znorm, ngram_iterator, ngrams! immutable NgramStringIterator - string :: String + string :: AbstractString order :: Int32 truncated_start :: Bool end -type StringPosition +type AbstractStringPosition start :: Int32 fin :: Int32 nth :: Int32 @@ -35,9 +36,9 @@ function start(ngi :: NgramStringIterator) for i = 1:(ngi.order-1) #necessary because strings are indexed to bytes, not characters idx = nextind(ngi.string, idx) end - return StringPosition(1, idx, ngi.order) + return AbstractStringPosition(1, idx, ngi.order) else - return StringPosition(1, 1, 1) + return AbstractStringPosition(1, 1, 1) end end @@ -61,11 +62,11 @@ end # ------------------------------------------------------------------------------------------------------------------------- # feature extractors # ------------------------------------------------------------------------------------------------------------------------- -make_string(words :: String, b, e) = SubString(words, b, e) +make_string(words :: AbstractString, b, e) = SubString(words, b, e) make_string(words :: Array, b, e) = join(words[b:e], " ") function ngrams(words::Array; order = 2, truncated_start = false) - ret = String[] + ret = AbstractString[] if !truncated_start for o = 1:min(order - 1, length(words)) @@ -81,19 +82,19 @@ function ngrams(words::Array; order = 2, truncated_start = false) return ret end -function ngrams(words::String; order = 2, truncated_start = false) - ret = String[] +function ngrams(words :: AbstractString; order = 2, truncated_start = false) + ret = AbstractString[] return ngrams!(ret, words, order = order, truncated_start = truncated_start) end -function ngrams!(ret :: Array, words :: String; order = 2, truncated_start = false) +function ngrams!(ret :: Array, words :: AbstractString; order = 2, truncated_start = false) for x in ngram_iterator(words, order = order, truncated_start = truncated_start) push!(ret, x) end return ret end -ngram_iterator(words :: String; order = 2, truncated_start = false) = NgramStringIterator(words, order, truncated_start) +ngram_iterator(words :: AbstractString; order = 2, truncated_start = false) = NgramStringIterator(words, order, truncated_start) # ------------------------------------------------------------------------------------------------------------------------- # feature vector operations @@ -107,7 +108,7 @@ function sparse_count(text, bkg) end function dict_count(tokens) - map = DefaultDict{String,Int32}() + map = DefaultDict{AbstractString,Int32}() for w in tokens map[w] += 1 end diff --git a/src/models.jl b/src/models.jl index 0e9eb2d..92c9a90 100644 --- a/src/models.jl +++ b/src/models.jl @@ -16,16 +16,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import Main.apply export make_background, stats, vocab_size, apply, BKG type BKG - dict :: Associative{String, Int32} - index :: Array{String} + dict :: Associative{AbstractString, Int32} + index :: Array{AbstractString} stats :: Vector{Float64} end vocab_size(bkg::BKG) = length(bkg.index) -getindex(bkg::BKG, token :: String) = bkg.dict[token] -stats(bkg::BKG, s::String) = bkg.stats[bkg[s]] +getindex(bkg::BKG, token :: AbstractString) = bkg.dict[token] +stats(bkg::BKG, s::AbstractString) = bkg.stats[bkg[s]] function tfnorm(stats; cutoff = 1e10, squash :: Function = log) for i = 1:length(stats) @@ -43,7 +44,7 @@ function apply(bkg::BKG, counts) end function make_background(features; mincount = 1, prune = 0.0, unk = true, norm = stats -> min(1.0 ./ stats, 1e10), logger = Log(STDERR)) - dict = DefaultDict(String, Int32, 0) + dict = DefaultDict(AbstractString, Int32, 0) @timer logger "building background dictionary" begin # Count @@ -67,8 +68,8 @@ function make_background(features; mincount = 1, prune = 0.0, unk = true, norm = end # timer # index - index = (String)[unk_token] - rev = DefaultDict(String, Int32, 1) + index = (AbstractString)[unk_token] + rev = DefaultDict(AbstractString, Int32, 1) rev[unk_token] = 1 i = 2 @timer logger "building index" begin diff --git a/src/readers.jl b/src/readers.jl index aa8a5f7..ebc178a 100644 --- a/src/readers.jl +++ b/src/readers.jl @@ -21,12 +21,12 @@ export read_tweets, read_usenet, filelines, zopen # ------------------------------------------------------------------------------------------------------------------------- # Basic utilities # ------------------------------------------------------------------------------------------------------------------------- -function zopen(fn :: String) +function zopen(fn :: AbstractString) return ismatch(r"^.*\.gz$", fn) ? 
gzopen(fn) : open(fn) end type FileLines - name :: String + name :: AbstractString end start(itr :: FileLines) = zopen(itr.name) @@ -45,7 +45,7 @@ eltype(itr :: FileLines) = ByteString # get a file line iterator from a file name, open with gzip as needed -filelines(fn :: String) = FileLines(fn) +filelines(fn :: AbstractString) = FileLines(fn) streamlines(f) = eachline(f) # convenience # ------------------------------------------------------------------------------------------------------------------------- @@ -53,7 +53,7 @@ streamlines(f) = eachline(f) # convenience # ------------------------------------------------------------------------------------------------------------------------- # read collection of tweets from a file -function read_tweets(fn :: String; stopList=english_stoplist, header=news_email_header, tokenizer=tenglish_tokenizer, limit=-1, keepFn=x->true, lg=STDERR) +function read_tweets(fn :: AbstractString; stopList=english_stoplist, header=news_email_header, tokenizer=tenglish_tokenizer, limit=-1, keepFn=x->true, lg=STDERR) ret = Dict{String, Float32}[] rlat = Float32[] rlong = Float32[] @@ -78,8 +78,8 @@ function read_tweets(fn :: String; stopList=english_stoplist, header=news_email_ # validate text for c in text - if '\ud800' <= c <= '\udfff' || '\U10ffff' < c - valid = false + if 0xd800 <= c <= 0xdfff || 0x10ffff < c # same check made by isvalid(Char,ch) and deprecated is_valid_char + valid = false end end @@ -124,7 +124,7 @@ function read_tweets(fn :: String; stopList=english_stoplist, header=news_email_ end # usenet/email single document reader -- 20ng -function read_usenet(fn :: String; stopList=english_stoplist, header=news_email_header, tokenizer=english_tokenizer, lg=STDERR) +function read_usenet(fn :: AbstractString; stopList=english_stoplist, header=news_email_header, tokenizer=english_tokenizer, lg=STDERR) ignore = false @info lg @sprintf("reading: %s", fn) vec = Dict{String, Float32}() diff --git a/src/tc.jl b/src/tc.jl index c9f9b7d..0811aca 100644 --- a/src/tc.jl +++ b/src/tc.jl @@ -62,7 +62,7 @@ function tc_train(text, truth, preprocess::Function; cutoff = 1e10, mincount = 2 mapper = iteration_method == :eager ? map : lazy_map # define class index - classes = Dict{String, Int32}() + classes = Dict{AbstractString, Int32}() i = 1 @timer logger "indexing truth" for t in truth if !(t in keys(classes)) diff --git a/src/tokenizers.jl b/src/tokenizers.jl index 8289b2f..42fd260 100644 --- a/src/tokenizers.jl +++ b/src/tokenizers.jl @@ -27,11 +27,11 @@ const url_pattern = r"http://[^\s]*" const hashtag_pattern = r"^#.*$" const mention_pattern = r"^@.*$" -function replace_html_entities(s :: String) +function replace_html_entities(s :: AbstractString) replace(s, r"&[^;]+?;", s -> s in keys(html_entity_table) ? 
html_entity_table[s] : s) end -function pattern_replace(w :: String) +function pattern_replace(w :: AbstractString) if ismatch(r"^[+-]?\p{Sc}\d+([.,]\d+)*$", w) return "--currency--" elseif ismatch(r"^[+-]?\d+([.,]\d+)*%$", w) return "--percent--" elseif ismatch(r"^[+-]?\d+([.,]\d+)*$", w) return "--number--" @@ -44,14 +44,14 @@ function pattern_replace(w :: String) end end -function prereplace(sent :: String) +function prereplace(sent :: AbstractString) r = replace(sent, r"n't\b", " not") r = replace(r, r"'s\b", " s's") r = replace(r, r"'d\b", " d'd") end -function english_tokenizer(s :: String) +function english_tokenizer(s :: AbstractString) return [ begin m = match(punct_word, w) @@ -61,7 +61,7 @@ function english_tokenizer(s :: String) ] end -function twenglish_tokenizer(s :: String) +function twenglish_tokenizer(s :: AbstractString) return [ begin m = match(r"^(\p{P}*)(.*?)\p{P}*$", w) @@ -71,7 +71,7 @@ function twenglish_tokenizer(s :: String) ] end -function twenglish_cleaner(tw :: String; urls = true, hashtags = true, mentions = true) +function twenglish_cleaner(tw :: AbstractString; urls = true, hashtags = true, mentions = true) ctw = replace(normalize_string(tw, :NFKC), default_space, " ") ctw = urls ? replace(ctw, url_pattern, "\u0030\u20E3") : ctw diff --git a/test/lid.jl b/test/lid.jl index f8ec04f..d65e6a7 100644 --- a/test/lid.jl +++ b/test/lid.jl @@ -15,7 +15,7 @@ bkgmodel, fextractor, model = tc_train(train, train_truth, lid_iterating_tokeniz trainer = (fvs, truth, init_model) -> train_mira(fvs, truth, init_model, iterations = 20, k = 2, C = 0.01, average = true), iteration_method = :eager) -confmat = DefaultDict(String, DefaultDict{String, Int32}, () -> DefaultDict(String, Int32, 0)) +confmat = DefaultDict(AbstractString, DefaultDict{AbstractString, Int32}, () -> DefaultDict(AbstractString, Int32, 0)) res = test_classification(model, lazy_map(x -> fextractor(lid_iterating_tokenizer(x)), test), test_truth, record = (t, h) -> confmat[t][h] += 1) * 100.0 @info @sprintf("mira test set error rate: %7.3f", res) print_confusion_matrix(confmat) diff --git a/test/runtests.jl b/test/runtests.jl index fe09d0f..9a88fde 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -125,7 +125,7 @@ end # ------------------------------------------------------------------------------------------------------------------------- # feature vector tests # ------------------------------------------------------------------------------------------------------------------------- -lines = (Array{String})[] +lines = (Array{AbstractString})[] for l in filelines("data/test.txt") tokens = split(strip(l), r"\s+") push!(lines, tokens) @@ -141,8 +141,8 @@ bkg = make_background(lines, mincount = 2) @expect stats(bkg, unk_token) == 19.0 @info "bkg[c] = $(stats(bkg, "c"))" -@expect sparse_count(lines[1], bkg) == sparsevec((Int64=>Float64)[ bkg["a"] => 1.0, bkg["b"] => 1.0, bkg["c"] => 1.0], vocab_size(bkg)) -@expect sparse_count(lines[end], bkg) == sparsevec((Int64=>Float64)[ bkg[unk_token] => 1.0 ], vocab_size(bkg)) +@expect sparse_count(lines[1], bkg) == sparsevec(Dict{Int64,Float64}( bkg["a"] => 1.0, bkg["b"] => 1.0, bkg["c"] => 1.0), vocab_size(bkg)) +@expect sparse_count(lines[end], bkg) == sparsevec(Dict{Int64,Float64}( bkg[unk_token] => 1.0 ), vocab_size(bkg)) @info "sparse[c] = $(sparse_count(lines[1], bkg)[2])" @expect norm(sparse_count(lines[1], bkg), bkg)[2] == 3.166666666666667 diff --git a/test/topic.jl b/test/topic.jl index 141fcf3..6d32350 100644 --- a/test/topic.jl +++ b/test/topic.jl @@ -12,8 
+12,8 @@ function text(fn) end function getinstances(dir) - docs = String[] - truth = String[] + docs = AbstractString[] + truth = AbstractString[] for t in filter(d -> d != "." && d != "..", readdir(dir)) for d in filter(d -> d != "." && d != "..", readdir("$dir/$t")) @@ -52,7 +52,7 @@ test, test_truth = getinstances("20ng/test") bkgmodel, fextractor, model = tc_train(train, train_truth, tokenize_file, mincount = 2, cutoff = 1e10, trainer = (fvs, truth, init_model) -> train_mira(fvs, truth, init_model, iterations = 20, k = 19, C = 0.01, average = true), iteration_method = :eager) -confmat = DefaultDict(String, DefaultDict{String, Int32}, () -> DefaultDict(String, Int32, 0)) +confmat = DefaultDict(AbstractString, DefaultDict{AbstractString, Int32}, () -> DefaultDict(AbstractString, Int32, 0)) res = test_classification(model, lazy_map(x -> fextractor(tokenize_file(x)), test), test_truth, record = (t, h) -> confmat[t][h] += 1) * 100.0 @info @sprintf("mira test set error rate: %7.3f", res) From a8247b7b7ad698425df71ff7ecb6eeb0cd1d84d8 Mon Sep 17 00:00:00 2001 From: Elizabeth Salesky Date: Tue, 14 Jun 2016 11:24:29 -0400 Subject: [PATCH 4/4] - updating for ilr data (array{array}) (amended) --- REQUIRE | 2 +- src/readers.jl | 8 ++++++++ src/tc.jl | 40 ++++++++++++++++++++++++++++++++++++++-- test/topic.jl | 2 ++ 4 files changed, 49 insertions(+), 3 deletions(-) diff --git a/REQUIRE b/REQUIRE index 1fab71a..d7f65dd 100644 --- a/REQUIRE +++ b/REQUIRE @@ -1,4 +1,4 @@ -julia 0.4- +julia 0.4 DataStructures Devectorize Iterators diff --git a/src/readers.jl b/src/readers.jl index ebc178a..e4520c2 100644 --- a/src/readers.jl +++ b/src/readers.jl @@ -48,6 +48,14 @@ eltype(itr :: FileLines) = ByteString filelines(fn :: AbstractString) = FileLines(fn) streamlines(f) = eachline(f) # convenience +#function getfile(name) +# file = joinpath(savedir, name) +# if !isfile(file) +# file = download(urlbase*name, file) +# end +# file +#end + # ------------------------------------------------------------------------------------------------------------------------- # Text format readers # ------------------------------------------------------------------------------------------------------------------------- diff --git a/src/tc.jl b/src/tc.jl index 0811aca..33766b3 100644 --- a/src/tc.jl +++ b/src/tc.jl @@ -49,13 +49,19 @@ end # ------------------------------------------------------------------------------------------------------------------------- # training for text classifiers # ------------------------------------------------------------------------------------------------------------------------- -function tc_features(text, bkgmodel) +function tc_features(text::Array, bkgmodel) counts = sparse_count(text, bkgmodel) counts /= sum(counts) return apply(bkgmodel, counts) end -function tc_train(text, truth, preprocess::Function; cutoff = 1e10, mincount = 2, prune = 0.0, +function tc_features{T<:Array,N}(text::Array{T,N}, bkgmodel) + counts = sparse_count(flatten(text), bkgmodel) + counts /= sum(counts) + return apply(bkgmodel, counts) +end + +function tc_train(text::Array, truth, preprocess::Function; cutoff = 1e10, mincount = 2, prune = 0.0, iteration_method = :lazy, trainer = (fvs, truth, init_model) -> train_mira(fvs, truth, init_model, iterations = 3, average = true), logger = Log(STDERR)) @@ -81,3 +87,33 @@ function tc_train(text, truth, preprocess::Function; cutoff = 1e10, mincount = 2 return bkgmodel, ptext -> tc_features(ptext, bkgmodel), model end + +function tc_train{T<:Array,N}(text::Array{T,N}, truth, 
preprocess::Function; cutoff = 1e10, mincount = 2, prune = 0.0,
+                      iteration_method = :lazy,
+                      trainer = (fvs, truth, init_model) -> train_mira(fvs, truth, init_model, iterations = 3, average = true),
+                      logger = Log(STDERR))
+  mapper = iteration_method == :eager ? map : lazy_map
+
+  # define class index
+  classes = Dict{AbstractString, Int32}()
+  i = 1
+  @timer logger "indexing truth" for t in truth
+    if !(t in keys(classes))
+      classes[t] = i
+      i += 1
+    end
+  end
+
+  # prep model
+  @timer logger "preprocessing input" preprocessed_text = mapper((x) -> mapper(preprocess, x), text)
+  @timer logger "making background model" bkgmodel = make_background(flatten_iter(preprocessed_text), mincount = mincount, prune = prune,
+                                                                     norm = stats -> tfnorm(stats, squash = sqrt, cutoff = cutoff))
+  @timer logger "making feature vectors" fvs = mapper(text -> tc_features(text, bkgmodel), preprocessed_text)
+  @timer logger "initializing model" init_model = LinearModel(classes, vocab_size(bkgmodel))
+  @timer logger "training final model" model = trainer(fvs, truth, init_model)
+
+  return bkgmodel, ptext -> tc_features(ptext, bkgmodel), model
+end
+
+flatten(a) = mapreduce(x -> isa(x, Array) ? flatten(x) : x, vcat, [], a)
+flatten_iter(a) = map((x) -> flatten(x), a)
diff --git a/test/topic.jl b/test/topic.jl
index 6d32350..e4a218c 100644
--- a/test/topic.jl
+++ b/test/topic.jl
@@ -1,3 +1,5 @@
+using Text, Stage, Ollam, DataStructures
+
 function text(fn)
   res = ""
   for l in map(l -> chomp(l), eachline(`iconv -f latin1 -t utf8 $fn`))
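
Reviewer note (not part of the series): a minimal sketch of the ngrams behavior that patches 1/4 and 2/4 introduce, assembled directly from the expectations in test/runtests.jl. It assumes Text.jl with this series applied on a Julia 0.4-era setup; `ngrams` and `ngram_iterator` are the package's exported names.

using Text

# With the default truncated_start = false, ngrams now returns ALL n-grams
# of every order up to `order` (lowest order first), not just the
# truncated-start prefixes it returned before this series.
@assert ngrams(["a", "b", "c"], order = 2) == ["a", "b", "c", "a b", "b c"]
@assert ngrams("abcd", order = 3) == ["a", "b", "c", "d", "ab", "bc", "cd", "abc", "bcd"]

# With truncated_start = true, only full-order n-grams are emitted.
@assert ngrams("abcd", order = 3, truncated_start = true) == ["abc", "bcd"]

# The lazy iterator agrees with the eager version, including on multi-byte
# UTF-8 input -- which is why NgramStringIterator walks byte offsets with
# nextind() rather than incrementing indices directly.
@assert collect(ngram_iterator("是的", order = 2)) == ["是", "的", "是的"]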
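
Likewise, a sketch of the Array{Array} entry point that patch 4/4 adds for the ilr data: tc_train now also accepts an array of documents where each document is an array of segments, tokenizes each segment via `preprocess`, and flattens the per-document token arrays before counting features. The corpus, labels, and whitespace tokenizer below are hypothetical placeholders invented for illustration; the default trainer (Ollam's train_mira) is the one named in the patch itself.

using Text, Stage, Ollam

# Hypothetical two-document corpus: each document is an Array of segment
# strings, so after per-segment tokenization tc_train sees an Array{Array}
# and dispatches to the new flattening methods.
docs  = [["hello world", "hello foo"],   # document 1: two segments
         ["baz qux quux"]]               # document 2: one segment
truth = ["greeting", "nonsense"]         # made-up class labels

tokenize(s) = split(s, r"\s+")

bkgmodel, fextractor, model = tc_train(docs, truth, tokenize, mincount = 1)

# Feature vector for a new document of the same nested shape; tc_features
# flattens the token arrays before counting against the background model.
fv = fextractor(map(tokenize, ["hello bar"]))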