From a2708ce5688339dd1b974215e6523fb71d2b32c6 Mon Sep 17 00:00:00 2001 From: Elizabeth Salesky Date: Fri, 7 Aug 2015 16:34:29 -0400 Subject: [PATCH 1/4] - export BKG, ngrams to return all n-grams for all orders with truncated_start=false --- src/features.jl | 6 ++++-- src/models.jl | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/features.jl b/src/features.jl index e21ddcb..ffd0241 100644 --- a/src/features.jl +++ b/src/features.jl @@ -62,8 +62,10 @@ function ngrams(words::Array; order = 2, truncated_start = false) ret = String[] if !truncated_start - for wi = 1:min(order - 1, length(words)) - push!(ret, make_string(words, 1, wi)) + for o = 1:min(order - 1, length(words)) + for wi = 1:length(words)-(o-1) + push!(ret, make_string(words, wi, wi + o - 1)) + end end end diff --git a/src/models.jl b/src/models.jl index 7d67db4..0e9eb2d 100644 --- a/src/models.jl +++ b/src/models.jl @@ -16,7 +16,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -export make_background, stats, vocab_size, apply +export make_background, stats, vocab_size, apply, BKG type BKG dict :: Associative{String, Int32} From 921719d7c348125caeb1515c695537c58587c2d9 Mon Sep 17 00:00:00 2001 From: Elizabeth Salesky Date: Tue, 11 Aug 2015 10:55:24 -0400 Subject: [PATCH 2/4] - applied ngrams change to Strings, updated tests --- src/features.jl | 27 ++++++++++++++++----------- test/lid.jl | 2 +- test/runtests.jl | 38 ++++++++++++++++++-------------------- 3 files changed, 35 insertions(+), 32 deletions(-) diff --git a/src/features.jl b/src/features.jl index ffd0241..1358f8d 100644 --- a/src/features.jl +++ b/src/features.jl @@ -20,19 +20,19 @@ export ngrams, count, tfnorm, sparse_count, norm, znorm, ngram_iterator, ngrams! 
immutable NgramStringIterator string :: String - order :: Int32 + order :: Int32 truncated_start :: Bool end type StringPosition - start :: Int32 - fin :: Int32 - nth :: Int32 + start :: Int32 + fin :: Int32 + nth :: Int32 end function start(ngi :: NgramStringIterator) if ngi.truncated_start idx = 1 - for i = 1:(ngi.order-1) + for i = 1:(ngi.order-1) #necessary because strings are indexed to bytes, not characters idx = nextind(ngi.string, idx) end return StringPosition(1, idx, ngi.order) @@ -41,14 +41,20 @@ function start(ngi :: NgramStringIterator) end end -done(ngi :: NgramStringIterator, position) = position.fin > endof(ngi.string) +done(ngi :: NgramStringIterator, position) = position.nth > ngi.order || position.fin > endof(ngi.string) function next(ngi :: NgramStringIterator, position) str = make_string(ngi.string, position.start, position.fin) - if position.nth >= ngi.order - position.start = nextind(ngi.string, position.start) + + if position.fin >= endof(ngi.string) + position.start = 0 + position.fin = 1 + for i = 1:position.nth-1 + position.fin = nextind(ngi.string, position.fin) + end + position.nth += 1 end - position.nth += 1 - position.fin = nextind(ngi.string, position.fin) + position.start = nextind(ngi.string, position.start) + position.fin = nextind(ngi.string, position.fin) return str, position end @@ -100,7 +106,6 @@ function sparse_count(text, bkg) return vec end - function dict_count(tokens) map = DefaultDict{String,Int32}() for w in tokens diff --git a/test/lid.jl b/test/lid.jl index d573d21..f8ec04f 100644 --- a/test/lid.jl +++ b/test/lid.jl @@ -19,7 +19,7 @@ confmat = DefaultDict(String, DefaultDict{String, Int32}, () -> DefaultDict(Stri res = test_classification(model, lazy_map(x -> fextractor(lid_iterating_tokenizer(x)), test), test_truth, record = (t, h) -> confmat[t][h] += 1) * 100.0 @info @sprintf("mira test set error rate: %7.3f", res) print_confusion_matrix(confmat) -@expect abs(res - 0.596) < 0.01 +@expect abs(res - 0.700) < 0.01 # List specific errors # for (text, t) in zip(test, test_truth) diff --git a/test/runtests.jl b/test/runtests.jl index 273086a..fe09d0f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -52,10 +52,10 @@ end # feature extraction # ------------------------------------------------------------------------------------------------------------------------- # ngrams from arrays -@expect ngrams(["a", "b", "c"], order = 3) == ["a", "a b", "a b c"] +@expect ngrams(["a", "b", "c"], order = 3) == ["a", "b", "c", "a b", "b c", "a b c"] @expect ngrams(["a", "b", "c"], order = 3, truncated_start = true) == ["a b c"] -@expect ngrams(["a", "b", "c"], order = 2) == ["a", "a b", "b c"] +@expect ngrams(["a", "b", "c"], order = 2) == ["a", "b", "c", "a b", "b c"] @expect ngrams(["a", "b", "c"], order = 2, truncated_start = true) == ["a b", "b c"] @expect ngrams(["a", "b", "c"], order = 1) == ["a", "b", "c"] @@ -65,59 +65,59 @@ end @expect ngrams(["a"], order = 3, truncated_start = true) == [] # ngrams from strings -@expect ngrams("abc", order = 3) == ["a", "ab", "abc"] +@expect ngrams("abc", order = 3) == ["a", "b", "c", "ab", "bc", "abc"] @expect ngrams("abc", order = 3, truncated_start = true) == ["abc"] -@expect ngrams("abc", order = 2) == ["a", "ab", "bc"] +@expect ngrams("abc", order = 2) == ["a", "b", "c", "ab", "bc"] @expect ngrams("abc", order = 2, truncated_start = true) == ["ab", "bc"] @expect ngrams("abc", order = 1) == ["a", "b", "c"] @expect ngrams("abc", order = 1, truncated_start = true) == ["a", "b", "c"] @expect ngrams("a", order 
= 3) == ["a"] -@expect ngrams("ab", order = 3) == ["a", "ab"] -@expect ngrams("abcd", order = 3) == ["a", "ab", "abc", "bcd"] +@expect ngrams("ab", order = 3) == ["a", "b", "ab"] +@expect ngrams("abcd", order = 3) == ["a", "b", "c", "d", "ab", "bc", "cd", "abc", "bcd"] @expect ngrams("a", order = 3, truncated_start = true) == [] @expect ngrams("ab", order = 3, truncated_start = true) == [] @expect ngrams("abcd", order = 3, truncated_start = true) == ["abc", "bcd"] @expect ngrams("是的", order = 1) == ["是", "的"] -@expect ngrams("是的", order = 2) == ["是", "是的"] -@expect ngrams("是的", order = 3) == ["是", "是的"] +@expect ngrams("是的", order = 2) == ["是", "的", "是的"] +@expect ngrams("是的", order = 3) == ["是", "的", "是的"] @expect ngrams("是的", order = 3, truncated_start = true) == [] @expect ngrams("陇陇*", order = 1) == ["陇", "陇", "*"] -@expect ngrams("陇陇*", order = 2) == ["陇", "陇陇", "陇*"] -@expect ngrams("陇陇*", order = 3) == ["陇", "陇陇", "陇陇*"] +@expect ngrams("陇陇*", order = 2) == ["陇", "陇", "*", "陇陇", "陇*"] +@expect ngrams("陇陇*", order = 3) == ["陇", "陇", "*", "陇陇", "陇*", "陇陇*"] @expect ngrams("陇陇*", order = 3, truncated_start = true) == ["陇陇*"] @expect ngrams("", order = 1) == [] # ngram iterator -@expect collect(ngram_iterator("abc", order = 3)) == ["a", "ab", "abc"] +@expect collect(ngram_iterator("abc", order = 3)) == ["a", "b", "c", "ab", "bc", "abc"] @expect collect(ngram_iterator("abc", order = 3, truncated_start = true)) == ["abc"] -@expect collect(ngram_iterator("abc", order = 2)) == ["a", "ab", "bc"] +@expect collect(ngram_iterator("abc", order = 2)) == ["a", "b", "c", "ab", "bc"] @expect collect(ngram_iterator("abc", order = 2, truncated_start = true)) == ["ab", "bc"] @expect collect(ngram_iterator("abc", order = 1)) == ["a", "b", "c"] @expect collect(ngram_iterator("abc", order = 1, truncated_start = true)) == ["a", "b", "c"] @expect collect(ngram_iterator("a", order = 3)) == ["a"] -@expect collect(ngram_iterator("ab", order = 3)) == ["a", "ab"] -@expect collect(ngram_iterator("abcd", order = 3)) == ["a", "ab", "abc", "bcd"] +@expect collect(ngram_iterator("ab", order = 3)) == ["a", "b", "ab"] +@expect collect(ngram_iterator("abcd", order = 3)) == ["a", "b", "c", "d", "ab", "bc", "cd", "abc", "bcd"] @expect collect(ngram_iterator("a", order = 3, truncated_start = true)) == [] @expect collect(ngram_iterator("ab", order = 3, truncated_start = true)) == [] @expect collect(ngram_iterator("abcd", order = 3, truncated_start = true)) == ["abc", "bcd"] @expect collect(ngram_iterator("是的", order = 1)) == ["是", "的"] -@expect collect(ngram_iterator("是的", order = 2)) == ["是", "是的"] -@expect collect(ngram_iterator("是的", order = 3)) == ["是", "是的"] +@expect collect(ngram_iterator("是的", order = 2)) == ["是", "的", "是的"] +@expect collect(ngram_iterator("是的", order = 3)) == ["是", "的", "是的"] @expect collect(ngram_iterator("是的", order = 3, truncated_start = true)) == [] @expect collect(ngram_iterator("陇陇*", order = 1)) == ["陇", "陇", "*"] -@expect collect(ngram_iterator("陇陇*", order = 2)) == ["陇", "陇陇", "陇*"] -@expect collect(ngram_iterator("陇陇*", order = 3)) == ["陇", "陇陇", "陇陇*"] +@expect collect(ngram_iterator("陇陇*", order = 2)) == ["陇", "陇", "*", "陇陇", "陇*"] +@expect collect(ngram_iterator("陇陇*", order = 3)) == ["陇", "陇", "*", "陇陇", "陇*", "陇陇*"] @expect collect(ngram_iterator("陇陇*", order = 3, truncated_start = true)) == ["陇陇*"] @expect collect(ngram_iterator("", order = 1)) == [] @@ -150,5 +150,3 @@ bkg = make_background(lines, mincount = 2) include("lid.jl") include("topic.jl") - - From 
0e662148ecd851c06b822227fa3269e95d57e0b7 Mon Sep 17 00:00:00 2001
From: Elizabeth Salesky
Date: Tue, 5 Jan 2016 15:34:04 -0500
Subject: [PATCH 3/4] - moving to v0.4 julia

---
 .travis.yml       |    2 +-
 REQUIRE           |    2 +-
 src/constants.jl  | 1351 ++++++++++++++++++++++-----------------
 src/features.jl   |   23 +-
 src/models.jl     |   15 +-
 src/readers.jl    |   14 +-
 src/tc.jl         |    2 +-
 src/tokenizers.jl |   12 +-
 test/lid.jl       |    2 +-
 test/runtests.jl  |    6 +-
 test/topic.jl     |    6 +-
 11 files changed, 718 insertions(+), 717 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 59ec666..c1af4f9 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -14,4 +14,4 @@ before_install:
  - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
 script:
  - julia -e 'versioninfo(); Pkg.init(); Pkg.clone("https://github.com/saltpork/Stage.jl"); Pkg.clone("https://github.com/mit-nlp/Ollam.jl"); Pkg.clone(pwd())'
- - cd test; julia --color runtests.jl
+ - cd test; julia --color=yes runtests.jl
diff --git a/REQUIRE b/REQUIRE
index 1859b86..1fab71a 100644
--- a/REQUIRE
+++ b/REQUIRE
@@ -1,4 +1,4 @@
-julia 0.3 0.4-
+julia 0.4-
 DataStructures
 Devectorize
 Iterators
diff --git a/src/constants.jl b/src/constants.jl
index 7111d47..5b804f7 100644
--- a/src/constants.jl
+++ b/src/constants.jl
@@ -22,687 +22,686 @@ export default_space, english_stoplist, news_email_header, unk_token, html_entit
 const default_space = r"[\s\p{Zs}]+"

 # PHP's table
-const html_entity_table = [
- "&nbsp;" => " ",
- "&iexcl;" => "¡",
- "&cent;" => "¢",
- "&pound;" => "£",
- "&curren;" => "¤",
- "&yen;" => "¥",
- "&brvbar;" => "¦",
- "&sect;" => "§",
- "&uml;" => "¨",
- "&copy;" => "©",
- "&ordf;" => "ª",
- "&laquo;" => "«",
- "&not;" => "¬",
- "&shy;" => "­",
- "&reg;" => "®",
- "&macr;" => "¯",
- "&deg;" => "°",
- "&plusmn;" => "±",
- "&sup2;" => "²",
- "&sup3;" => "³",
- "&acute;" => "´",
- "&micro;" => "µ",
- "&para;" => "¶",
- "&middot;" => "·",
- "&cedil;" => "¸",
- "&sup1;" => "¹",
- "&ordm;" => "º",
- "&raquo;" => "»",
- "&frac14;" => "¼",
- "&frac12;" => "½",
- "&frac34;" => "¾",
- "&iquest;" => "¿",
- "&Agrave;" => "À",
- "&Aacute;" => "Á",
- "&Acirc;" => "Â",
- "&Atilde;" => "Ã",
- "&Auml;" => "Ä",
- "&Aring;" => "Å",
- "&AElig;" => "Æ",
- "&Ccedil;" => "Ç",
- "&Egrave;" => "È",
- "&Eacute;" => "É",
- "&Ecirc;" => "Ê",
- "&Euml;" => "Ë",
- "&Igrave;" => "Ì",
- "&Iacute;" => "Í",
- "&Icirc;" => "Î",
- "&Iuml;" => "Ï",
- "&ETH;" => "Ð",
- "&Ntilde;" => "Ñ",
- "&Ograve;" => "Ò",
- "&Oacute;" => "Ó",
- "&Ocirc;" => "Ô",
- "&Otilde;" => "Õ",
- "&Ouml;" => "Ö",
- "&times;" => "×",
- "&Oslash;" => "Ø",
- "&Ugrave;" => "Ù",
- "&Uacute;" => "Ú",
- "&Ucirc;" => "Û",
- "&Uuml;" => "Ü",
- "&Yacute;" => "Ý",
- "&THORN;" => "Þ",
- "&szlig;" => "ß",
- "&agrave;" => "à",
- "&aacute;" => "á",
- "&acirc;" => "â",
- "&atilde;" => "ã",
- "&auml;" => "ä",
- "&aring;" => "å",
- "&aelig;" => "æ",
- "&ccedil;" => "ç",
- "&egrave;" => "è",
- "&eacute;" => "é",
- "&ecirc;" => "ê",
- "&euml;" => "ë",
- "&igrave;" => "ì",
- "&iacute;" => "í",
- "&icirc;" => "î",
- "&iuml;" => "ï",
- "&eth;" => "ð",
- "&ntilde;" => "ñ",
- "&ograve;" => "ò",
- "&oacute;" => "ó",
- "&ocirc;" => "ô",
- "&otilde;" => "õ",
- "&ouml;" => "ö",
- "&divide;" => "÷",
- "&oslash;" => "ø",
- "&ugrave;" => "ù",
- "&uacute;" => "ú",
- "&ucirc;" => "û",
- "&uuml;" => "ü",
- "&yacute;" => "ý",
- "&thorn;" => "þ",
- "&yuml;" => "ÿ",
- "&amp;" => "&",
- "&quot;" => "\"",
- "&lt;" => "<",
- "&gt;" => ">",
-]
+const html_entity_table = Dict(
+ "&nbsp;" => " ",
+ "&iexcl;" => "¡",
+ "&cent;" => "¢",
+ "&pound;" => "£",
+ "&curren;" => "¤",
+ "&yen;" => "¥",
+ "&brvbar;" => "¦",
+ "&sect;" => "§",
+ "&uml;" => "¨",
+ "&copy;" => "©",
+ "&ordf;" => "ª",
+ "&laquo;" => "«",
+ "&not;" => "¬",
+ "&shy;" => "­",
+ "&reg;" => "®",
+ "&macr;" => "¯",
+ "&deg;" => "°",
+ "&plusmn;" => "±",
+ "&sup2;" => "²",
+ "&sup3;" => "³",
+ "&acute;" => "´",
+ "&micro;" => "µ",
+ "&para;" => "¶",
+ "&middot;" => "·",
+ "&cedil;" => "¸",
+ "&sup1;" => "¹",
+ "&ordm;" => "º",
+ "&raquo;" => "»",
+ "&frac14;" => "¼",
+ "&frac12;" => "½",
+ "&frac34;" => "¾",
+ "&iquest;" => "¿",
+ "&Agrave;" => "À",
+ "&Aacute;" => "Á",
+ "&Acirc;" => "Â",
+ "&Atilde;" => "Ã",
+ "&Auml;" => "Ä",
+ "&Aring;" => "Å",
+ "&AElig;" => "Æ",
+ "&Ccedil;" => "Ç",
+ "&Egrave;" => "È",
+ "&Eacute;" => "É",
+ "&Ecirc;" => "Ê",
+ "&Euml;" => "Ë",
+ "&Igrave;" => "Ì",
+ "&Iacute;" => "Í",
+ "&Icirc;" => "Î",
+ "&Iuml;" => "Ï",
+ "&ETH;" => "Ð",
+ "&Ntilde;" => "Ñ",
+ "&Ograve;" => "Ò",
+ "&Oacute;" => "Ó",
+ "&Ocirc;" => "Ô",
+ "&Otilde;" => "Õ",
+ "&Ouml;" => "Ö",
+ "&times;" => "×",
+ "&Oslash;" => "Ø",
+ "&Ugrave;" => "Ù",
+ "&Uacute;" => "Ú",
+ "&Ucirc;" => "Û",
+ "&Uuml;" => "Ü",
+ "&Yacute;" => "Ý",
+ "&THORN;" => "Þ",
+ "&szlig;" => "ß",
+ "&agrave;" => "à",
+ "&aacute;" => "á",
+ "&acirc;" => "â",
+ "&atilde;" => "ã",
+ "&auml;" => "ä",
+ "&aring;" => "å",
+ "&aelig;" => "æ",
+ "&ccedil;" => "ç",
+ "&egrave;" => "è",
+ "&eacute;" => "é",
+ "&ecirc;" => "ê",
+ "&euml;" => "ë",
+ "&igrave;" => "ì",
+ "&iacute;" => "í",
+ "&icirc;" => "î",
+ "&iuml;" => "ï",
+ "&eth;" => "ð",
+ "&ntilde;" => "ñ",
+ "&ograve;" => "ò",
+ "&oacute;" => "ó",
+ "&ocirc;" => "ô",
+ "&otilde;" => "õ",
+ "&ouml;" => "ö",
+ "&divide;" => "÷",
+ "&oslash;" => "ø",
+ "&ugrave;" => "ù",
+ "&uacute;" => "ú",
+ "&ucirc;" => "û",
+ "&uuml;" => "ü",
+ "&yacute;" => "ý",
+ "&thorn;" => "þ",
+ "&yuml;" => "ÿ",
+ "&amp;" => "&",
+ "&quot;" => "\"",
+ "&lt;" => "<",
+ "&gt;" => ">",
+ )

 # -------------------------------------------------------------------------------------------------------------------------
 # From Lewis04's SMART stop list
 # -------------------------------------------------------------------------------------------------------------------------
-const english_stoplist = Set(String["a",
- "a's",
- "able",
- "about",
- "above",
- "according",
- "accordingly",
- "across",
- "actually",
- "after",
- "afterwards",
- "again",
- "against",
- "ain't",
- "all",
- "allow",
- "allows",
- "almost",
- "alone",
- "along",
- "already",
- "also",
- "although",
- "always",
- "am",
- "among",
- "amongst",
- "an",
- "and",
- "another",
- "any",
- "anybody",
- "anyhow",
- "anyone",
- "anything",
- "anyway",
- "anyways",
- "anywhere",
- "apart",
- "appear",
- "appreciate",
- "appropriate",
- "are",
- "aren't",
- "around",
- "as",
- "aside",
- "ask",
- "asking",
- "associated",
- "at",
- "available",
- "away",
- "awfully",
- "b",
- "be",
- "became",
- "because",
- "become",
- "becomes",
- "becoming",
- "been",
- "before",
- "beforehand",
- "behind",
- "being",
- "believe",
- "below",
- "beside",
- "besides",
- "best",
- "better",
- "between",
- "beyond",
- "both",
- "brief",
- "but",
- "by",
- "c",
- "c'mon",
- "c's",
- "came",
- "can",
- "can't",
- "cannot",
- "cant",
- "cause",
- "causes",
- "certain",
- "certainly",
- "changes",
- "clearly",
- "co",
- "com",
- "come",
- "comes",
- "concerning",
- "consequently",
- "consider",
- "considering",
- "contain",
- "containing",
- "contains",
- "corresponding",
- "could",
- "couldn't",
- "course",
- "currently",
- "d",
- "definitely",
- "described",
- "despite",
- "did",
- "didn't",
- "different",
- "do",
- "does",
- "doesn't",
- "doing",
- "don't",
- "done",
- "down",
- "downwards",
- "during",
- "e",
- "each",
- "edu",
- "eg",
- "eight",
- "either",
- "else",
- "elsewhere",
- "enough",
- "entirely",
- "especially",
- "et",
- "etc",
- "even",
- "ever",
- "every",
- "everybody",
- "everyone",
- "everything",
- "everywhere",
- "ex",
- "exactly",
- "example",
- "except",
- "f",
- "far",
- "few",
- "fifth",
- "first",
- "five",
- "followed",
- "following",
- "follows",
- "for",
- "former",
- "formerly",
- "forth",
- "four",
- "from",
- "further",
- "furthermore",
- "g",
- "get",
- "gets",
- "getting",
- "given",
- "gives",
- "go",
- "goes",
- "going",
- "gone",
- "got",
- "gotten",
- "greetings",
- "h",
- "had",
- "hadn't",
- "happens",
- "hardly",
- "has",
- "hasn't",
- "have",
- "haven't",
- "having",
- "he",
- "he's",
- "hello",
- "help",
- "hence",
- "her",
- "here",
- "here's",
- "hereafter",
- "hereby",
- "herein",
- "hereupon",
- "hers",
- "herself",
- "hi",
- "him",
- "himself",
- "his",
- "hither",
- "hopefully",
- "how",
- "howbeit",
- "however",
- "i",
- "i'd",
- "i'll",
- "i'm",
- "i've",
- "ie",
- "if",
"ignored", - "immediate", - "in", - "inasmuch", - "inc", - "indeed", - "indicate", - "indicated", - "indicates", - "inner", - "insofar", - "instead", - "into", - "inward", - "is", - "isn't", - "it", - "it'd", - "it'll", - "it's", - "its", - "itself", - "j", - "just", - "k", - "keep", - "keeps", - "kept", - "know", - "knows", - "known", - "l", - "last", - "lately", - "later", - "latter", - "latterly", - "least", - "less", - "lest", - "let", - "let's", - "like", - "liked", - "likely", - "little", - "look", - "looking", - "looks", - "ltd", - "m", - "mainly", - "many", - "may", - "maybe", - "me", - "mean", - "meanwhile", - "merely", - "might", - "more", - "moreover", - "most", - "mostly", - "much", - "must", - "my", - "myself", - "n", - "name", - "namely", - "nd", - "near", - "nearly", - "necessary", - "need", - "needs", - "neither", - "never", - "nevertheless", - "new", - "next", - "nine", - "no", - "nobody", - "non", - "none", - "noone", - "nor", - "normally", - "not", - "nothing", - "novel", - "now", - "nowhere", - "o", - "obviously", - "of", - "off", - "often", - "oh", - "ok", - "okay", - "old", - "on", - "once", - "one", - "ones", - "only", - "onto", - "or", - "other", - "others", - "otherwise", - "ought", - "our", - "ours", - "ourselves", - "out", - "outside", - "over", - "overall", - "own", - "p", - "particular", - "particularly", - "per", - "perhaps", - "placed", - "please", - "plus", - "possible", - "presumably", - "probably", - "provides", - "q", - "que", - "quite", - "qv", - "r", - "rather", - "rd", - "re", - "really", - "reasonably", - "regarding", - "regardless", - "regards", - "relatively", - "respectively", - "right", - "s", - "said", - "same", - "saw", - "say", - "saying", - "says", - "second", - "secondly", - "see", - "seeing", - "seem", - "seemed", - "seeming", - "seems", - "seen", - "self", - "selves", - "sensible", - "sent", - "serious", - "seriously", - "seven", - "several", - "shall", - "she", - "should", - "shouldn't", - "since", - "six", - "so", - "some", - "somebody", - "somehow", - "someone", - "something", - "sometime", - "sometimes", - "somewhat", - "somewhere", - "soon", - "sorry", - "specified", - "specify", - "specifying", - "still", - "sub", - "such", - "sup", - "sure", - "t", - "t's", - "take", - "taken", - "tell", - "tends", - "th", - "than", - "thank", - "thanks", - "thanx", - "that", - "that's", - "thats", - "the", - "their", - "theirs", - "them", - "themselves", - "then", - "thence", - "there", - "there's", - "thereafter", - "thereby", - "therefore", - "therein", - "theres", - "thereupon", - "these", - "they", - "they'd", - "they'll", - "they're", - "they've", - "think", - "third", - "this", - "thorough", - "thoroughly", - "those", - "though", - "three", - "through", - "throughout", - "thru", - "thus", - "to", - "together", - "too", - "took", - "toward", - "towards", - "tried", - "tries", - "truly", - "try", - "trying", - "twice", - "two", - "u", - "un", - "under", - "unfortunately", - "unless", - "unlikely", - "until", - "unto", - "up", - "upon", - "us", - "use", - "used", - "useful", - "uses", - "using", - "usually", - "uucp", - "v", - "value", - "various", - "very", - "via", - "viz", - "vs", - "w", - "want", - "wants", - "was", - "wasn't", - "way", - "we", - "we'd", - "we'll", - "we're", - "we've", - "welcome", - "well", - "went", - "were", - "weren't", - "what", - "what's", - "whatever", - "when", - "whence", - "whenever", - "where", - "where's", - "whereafter", - "whereas", - "whereby", - "wherein", - "whereupon", - "wherever", - "whether", - "which", 
- "while", - "whither", - "who", - "who's", - "whoever", - "whole", - "whom", - "whose", - "why", - "will", - "willing", - "wish", - "with", - "within", - "without", - "won't", - "wonder", - "would", - "would", - "wouldn't", - "x", - "y", - "yes", - "yet", - "you", - "you'd", - "you'll", - "you're", - "you've", - "your", - "yours", - "yourself", - "yourselves", - "z", - "zero", - "--number--", - ]) +const english_stoplist = Set(AbstractString["a", + "a's", + "able", + "about", + "above", + "according", + "accordingly", + "across", + "actually", + "after", + "afterwards", + "again", + "against", + "ain't", + "all", + "allow", + "allows", + "almost", + "alone", + "along", + "already", + "also", + "although", + "always", + "am", + "among", + "amongst", + "an", + "and", + "another", + "any", + "anybody", + "anyhow", + "anyone", + "anything", + "anyway", + "anyways", + "anywhere", + "apart", + "appear", + "appreciate", + "appropriate", + "are", + "aren't", + "around", + "as", + "aside", + "ask", + "asking", + "associated", + "at", + "available", + "away", + "awfully", + "b", + "be", + "became", + "because", + "become", + "becomes", + "becoming", + "been", + "before", + "beforehand", + "behind", + "being", + "believe", + "below", + "beside", + "besides", + "best", + "better", + "between", + "beyond", + "both", + "brief", + "but", + "by", + "c", + "c'mon", + "c's", + "came", + "can", + "can't", + "cannot", + "cant", + "cause", + "causes", + "certain", + "certainly", + "changes", + "clearly", + "co", + "com", + "come", + "comes", + "concerning", + "consequently", + "consider", + "considering", + "contain", + "containing", + "contains", + "corresponding", + "could", + "couldn't", + "course", + "currently", + "d", + "definitely", + "described", + "despite", + "did", + "didn't", + "different", + "do", + "does", + "doesn't", + "doing", + "don't", + "done", + "down", + "downwards", + "during", + "e", + "each", + "edu", + "eg", + "eight", + "either", + "else", + "elsewhere", + "enough", + "entirely", + "especially", + "et", + "etc", + "even", + "ever", + "every", + "everybody", + "everyone", + "everything", + "everywhere", + "ex", + "exactly", + "example", + "except", + "f", + "far", + "few", + "fifth", + "first", + "five", + "followed", + "following", + "follows", + "for", + "former", + "formerly", + "forth", + "four", + "from", + "further", + "furthermore", + "g", + "get", + "gets", + "getting", + "given", + "gives", + "go", + "goes", + "going", + "gone", + "got", + "gotten", + "greetings", + "h", + "had", + "hadn't", + "happens", + "hardly", + "has", + "hasn't", + "have", + "haven't", + "having", + "he", + "he's", + "hello", + "help", + "hence", + "her", + "here", + "here's", + "hereafter", + "hereby", + "herein", + "hereupon", + "hers", + "herself", + "hi", + "him", + "himself", + "his", + "hither", + "hopefully", + "how", + "howbeit", + "however", + "i", + "i'd", + "i'll", + "i'm", + "i've", + "ie", + "if", + "ignored", + "immediate", + "in", + "inasmuch", + "inc", + "indeed", + "indicate", + "indicated", + "indicates", + "inner", + "insofar", + "instead", + "into", + "inward", + "is", + "isn't", + "it", + "it'd", + "it'll", + "it's", + "its", + "itself", + "j", + "just", + "k", + "keep", + "keeps", + "kept", + "know", + "knows", + "known", + "l", + "last", + "lately", + "later", + "latter", + "latterly", + "least", + "less", + "lest", + "let", + "let's", + "like", + "liked", + "likely", + "little", + "look", + "looking", + "looks", + "ltd", + "m", + "mainly", + "many", + "may", + "maybe", + "me", 
+ "mean", + "meanwhile", + "merely", + "might", + "more", + "moreover", + "most", + "mostly", + "much", + "must", + "my", + "myself", + "n", + "name", + "namely", + "nd", + "near", + "nearly", + "necessary", + "need", + "needs", + "neither", + "never", + "nevertheless", + "new", + "next", + "nine", + "no", + "nobody", + "non", + "none", + "noone", + "nor", + "normally", + "not", + "nothing", + "novel", + "now", + "nowhere", + "o", + "obviously", + "of", + "off", + "often", + "oh", + "ok", + "okay", + "old", + "on", + "once", + "one", + "ones", + "only", + "onto", + "or", + "other", + "others", + "otherwise", + "ought", + "our", + "ours", + "ourselves", + "out", + "outside", + "over", + "overall", + "own", + "p", + "particular", + "particularly", + "per", + "perhaps", + "placed", + "please", + "plus", + "possible", + "presumably", + "probably", + "provides", + "q", + "que", + "quite", + "qv", + "r", + "rather", + "rd", + "re", + "really", + "reasonably", + "regarding", + "regardless", + "regards", + "relatively", + "respectively", + "right", + "s", + "said", + "same", + "saw", + "say", + "saying", + "says", + "second", + "secondly", + "see", + "seeing", + "seem", + "seemed", + "seeming", + "seems", + "seen", + "self", + "selves", + "sensible", + "sent", + "serious", + "seriously", + "seven", + "several", + "shall", + "she", + "should", + "shouldn't", + "since", + "six", + "so", + "some", + "somebody", + "somehow", + "someone", + "something", + "sometime", + "sometimes", + "somewhat", + "somewhere", + "soon", + "sorry", + "specified", + "specify", + "specifying", + "still", + "sub", + "such", + "sup", + "sure", + "t", + "t's", + "take", + "taken", + "tell", + "tends", + "th", + "than", + "thank", + "thanks", + "thanx", + "that", + "that's", + "thats", + "the", + "their", + "theirs", + "them", + "themselves", + "then", + "thence", + "there", + "there's", + "thereafter", + "thereby", + "therefore", + "therein", + "theres", + "thereupon", + "these", + "they", + "they'd", + "they'll", + "they're", + "they've", + "think", + "third", + "this", + "thorough", + "thoroughly", + "those", + "though", + "three", + "through", + "throughout", + "thru", + "thus", + "to", + "together", + "too", + "took", + "toward", + "towards", + "tried", + "tries", + "truly", + "try", + "trying", + "twice", + "two", + "u", + "un", + "under", + "unfortunately", + "unless", + "unlikely", + "until", + "unto", + "up", + "upon", + "us", + "use", + "used", + "useful", + "uses", + "using", + "usually", + "uucp", + "v", + "value", + "various", + "very", + "via", + "viz", + "vs", + "w", + "want", + "wants", + "was", + "wasn't", + "way", + "we", + "we'd", + "we'll", + "we're", + "we've", + "welcome", + "well", + "went", + "were", + "weren't", + "what", + "what's", + "whatever", + "when", + "whence", + "whenever", + "where", + "where's", + "whereafter", + "whereas", + "whereby", + "wherein", + "whereupon", + "wherever", + "whether", + "which", + "while", + "whither", + "who", + "who's", + "whoever", + "whole", + "whom", + "whose", + "why", + "will", + "willing", + "wish", + "with", + "within", + "without", + "won't", + "wonder", + "would", + "would", + "wouldn't", + "x", + "y", + "yes", + "yet", + "you", + "you'd", + "you'll", + "you're", + "you've", + "your", + "yours", + "yourself", + "yourselves", + "z", + "zero", + "--number--", + ]) const news_email_header = r"^((to|from|subject|organization|keywords|summary|lines|distribution|reply-to|nntp-posting-host|disclaimer|x-newsreader|article-i.d.):|(-|\|).*$|-+$|in 
article.*writes:$|in article.*$|.* writes:$)" const unk_token = "--UNK--" - diff --git a/src/features.jl b/src/features.jl index 1358f8d..32854dd 100644 --- a/src/features.jl +++ b/src/features.jl @@ -16,14 +16,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import Base.norm export ngrams, count, tfnorm, sparse_count, norm, znorm, ngram_iterator, ngrams! immutable NgramStringIterator - string :: String + string :: AbstractString order :: Int32 truncated_start :: Bool end -type StringPosition +type AbstractStringPosition start :: Int32 fin :: Int32 nth :: Int32 @@ -35,9 +36,9 @@ function start(ngi :: NgramStringIterator) for i = 1:(ngi.order-1) #necessary because strings are indexed to bytes, not characters idx = nextind(ngi.string, idx) end - return StringPosition(1, idx, ngi.order) + return AbstractStringPosition(1, idx, ngi.order) else - return StringPosition(1, 1, 1) + return AbstractStringPosition(1, 1, 1) end end @@ -61,11 +62,11 @@ end # ------------------------------------------------------------------------------------------------------------------------- # feature extractors # ------------------------------------------------------------------------------------------------------------------------- -make_string(words :: String, b, e) = SubString(words, b, e) +make_string(words :: AbstractString, b, e) = SubString(words, b, e) make_string(words :: Array, b, e) = join(words[b:e], " ") function ngrams(words::Array; order = 2, truncated_start = false) - ret = String[] + ret = AbstractString[] if !truncated_start for o = 1:min(order - 1, length(words)) @@ -81,19 +82,19 @@ function ngrams(words::Array; order = 2, truncated_start = false) return ret end -function ngrams(words::String; order = 2, truncated_start = false) - ret = String[] +function ngrams(words :: AbstractString; order = 2, truncated_start = false) + ret = AbstractString[] return ngrams!(ret, words, order = order, truncated_start = truncated_start) end -function ngrams!(ret :: Array, words :: String; order = 2, truncated_start = false) +function ngrams!(ret :: Array, words :: AbstractString; order = 2, truncated_start = false) for x in ngram_iterator(words, order = order, truncated_start = truncated_start) push!(ret, x) end return ret end -ngram_iterator(words :: String; order = 2, truncated_start = false) = NgramStringIterator(words, order, truncated_start) +ngram_iterator(words :: AbstractString; order = 2, truncated_start = false) = NgramStringIterator(words, order, truncated_start) # ------------------------------------------------------------------------------------------------------------------------- # feature vector operations @@ -107,7 +108,7 @@ function sparse_count(text, bkg) end function dict_count(tokens) - map = DefaultDict{String,Int32}() + map = DefaultDict{AbstractString,Int32}() for w in tokens map[w] += 1 end diff --git a/src/models.jl b/src/models.jl index 0e9eb2d..92c9a90 100644 --- a/src/models.jl +++ b/src/models.jl @@ -16,16 +16,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import Main.apply export make_background, stats, vocab_size, apply, BKG type BKG - dict :: Associative{String, Int32} - index :: Array{String} + dict :: Associative{AbstractString, Int32} + index :: Array{AbstractString} stats :: Vector{Float64} end vocab_size(bkg::BKG) = length(bkg.index) -getindex(bkg::BKG, token :: String) = bkg.dict[token] -stats(bkg::BKG, s::String) = bkg.stats[bkg[s]] +getindex(bkg::BKG, token :: AbstractString) = bkg.dict[token] +stats(bkg::BKG, s::AbstractString) = bkg.stats[bkg[s]] function tfnorm(stats; cutoff = 1e10, squash :: Function = log) for i = 1:length(stats) @@ -43,7 +44,7 @@ function apply(bkg::BKG, counts) end function make_background(features; mincount = 1, prune = 0.0, unk = true, norm = stats -> min(1.0 ./ stats, 1e10), logger = Log(STDERR)) - dict = DefaultDict(String, Int32, 0) + dict = DefaultDict(AbstractString, Int32, 0) @timer logger "building background dictionary" begin # Count @@ -67,8 +68,8 @@ function make_background(features; mincount = 1, prune = 0.0, unk = true, norm = end # timer # index - index = (String)[unk_token] - rev = DefaultDict(String, Int32, 1) + index = (AbstractString)[unk_token] + rev = DefaultDict(AbstractString, Int32, 1) rev[unk_token] = 1 i = 2 @timer logger "building index" begin diff --git a/src/readers.jl b/src/readers.jl index aa8a5f7..ebc178a 100644 --- a/src/readers.jl +++ b/src/readers.jl @@ -21,12 +21,12 @@ export read_tweets, read_usenet, filelines, zopen # ------------------------------------------------------------------------------------------------------------------------- # Basic utilities # ------------------------------------------------------------------------------------------------------------------------- -function zopen(fn :: String) +function zopen(fn :: AbstractString) return ismatch(r"^.*\.gz$", fn) ? 
gzopen(fn) : open(fn) end type FileLines - name :: String + name :: AbstractString end start(itr :: FileLines) = zopen(itr.name) @@ -45,7 +45,7 @@ eltype(itr :: FileLines) = ByteString # get a file line iterator from a file name, open with gzip as needed -filelines(fn :: String) = FileLines(fn) +filelines(fn :: AbstractString) = FileLines(fn) streamlines(f) = eachline(f) # convenience # ------------------------------------------------------------------------------------------------------------------------- @@ -53,7 +53,7 @@ streamlines(f) = eachline(f) # convenience # ------------------------------------------------------------------------------------------------------------------------- # read collection of tweets from a file -function read_tweets(fn :: String; stopList=english_stoplist, header=news_email_header, tokenizer=tenglish_tokenizer, limit=-1, keepFn=x->true, lg=STDERR) +function read_tweets(fn :: AbstractString; stopList=english_stoplist, header=news_email_header, tokenizer=tenglish_tokenizer, limit=-1, keepFn=x->true, lg=STDERR) ret = Dict{String, Float32}[] rlat = Float32[] rlong = Float32[] @@ -78,8 +78,8 @@ function read_tweets(fn :: String; stopList=english_stoplist, header=news_email_ # validate text for c in text - if '\ud800' <= c <= '\udfff' || '\U10ffff' < c - valid = false + if 0xd800 <= c <= 0xdfff || 0x10ffff < c # same check made by isvalid(Char,ch) and deprecated is_valid_char + valid = false end end @@ -124,7 +124,7 @@ function read_tweets(fn :: String; stopList=english_stoplist, header=news_email_ end # usenet/email single document reader -- 20ng -function read_usenet(fn :: String; stopList=english_stoplist, header=news_email_header, tokenizer=english_tokenizer, lg=STDERR) +function read_usenet(fn :: AbstractString; stopList=english_stoplist, header=news_email_header, tokenizer=english_tokenizer, lg=STDERR) ignore = false @info lg @sprintf("reading: %s", fn) vec = Dict{String, Float32}() diff --git a/src/tc.jl b/src/tc.jl index c9f9b7d..0811aca 100644 --- a/src/tc.jl +++ b/src/tc.jl @@ -62,7 +62,7 @@ function tc_train(text, truth, preprocess::Function; cutoff = 1e10, mincount = 2 mapper = iteration_method == :eager ? map : lazy_map # define class index - classes = Dict{String, Int32}() + classes = Dict{AbstractString, Int32}() i = 1 @timer logger "indexing truth" for t in truth if !(t in keys(classes)) diff --git a/src/tokenizers.jl b/src/tokenizers.jl index 8289b2f..42fd260 100644 --- a/src/tokenizers.jl +++ b/src/tokenizers.jl @@ -27,11 +27,11 @@ const url_pattern = r"http://[^\s]*" const hashtag_pattern = r"^#.*$" const mention_pattern = r"^@.*$" -function replace_html_entities(s :: String) +function replace_html_entities(s :: AbstractString) replace(s, r"&[^;]+?;", s -> s in keys(html_entity_table) ? 
html_entity_table[s] : s) end -function pattern_replace(w :: String) +function pattern_replace(w :: AbstractString) if ismatch(r"^[+-]?\p{Sc}\d+([.,]\d+)*$", w) return "--currency--" elseif ismatch(r"^[+-]?\d+([.,]\d+)*%$", w) return "--percent--" elseif ismatch(r"^[+-]?\d+([.,]\d+)*$", w) return "--number--" @@ -44,14 +44,14 @@ function pattern_replace(w :: String) end end -function prereplace(sent :: String) +function prereplace(sent :: AbstractString) r = replace(sent, r"n't\b", " not") r = replace(r, r"'s\b", " s's") r = replace(r, r"'d\b", " d'd") end -function english_tokenizer(s :: String) +function english_tokenizer(s :: AbstractString) return [ begin m = match(punct_word, w) @@ -61,7 +61,7 @@ function english_tokenizer(s :: String) ] end -function twenglish_tokenizer(s :: String) +function twenglish_tokenizer(s :: AbstractString) return [ begin m = match(r"^(\p{P}*)(.*?)\p{P}*$", w) @@ -71,7 +71,7 @@ function twenglish_tokenizer(s :: String) ] end -function twenglish_cleaner(tw :: String; urls = true, hashtags = true, mentions = true) +function twenglish_cleaner(tw :: AbstractString; urls = true, hashtags = true, mentions = true) ctw = replace(normalize_string(tw, :NFKC), default_space, " ") ctw = urls ? replace(ctw, url_pattern, "\u0030\u20E3") : ctw diff --git a/test/lid.jl b/test/lid.jl index f8ec04f..d65e6a7 100644 --- a/test/lid.jl +++ b/test/lid.jl @@ -15,7 +15,7 @@ bkgmodel, fextractor, model = tc_train(train, train_truth, lid_iterating_tokeniz trainer = (fvs, truth, init_model) -> train_mira(fvs, truth, init_model, iterations = 20, k = 2, C = 0.01, average = true), iteration_method = :eager) -confmat = DefaultDict(String, DefaultDict{String, Int32}, () -> DefaultDict(String, Int32, 0)) +confmat = DefaultDict(AbstractString, DefaultDict{AbstractString, Int32}, () -> DefaultDict(AbstractString, Int32, 0)) res = test_classification(model, lazy_map(x -> fextractor(lid_iterating_tokenizer(x)), test), test_truth, record = (t, h) -> confmat[t][h] += 1) * 100.0 @info @sprintf("mira test set error rate: %7.3f", res) print_confusion_matrix(confmat) diff --git a/test/runtests.jl b/test/runtests.jl index fe09d0f..9a88fde 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -125,7 +125,7 @@ end # ------------------------------------------------------------------------------------------------------------------------- # feature vector tests # ------------------------------------------------------------------------------------------------------------------------- -lines = (Array{String})[] +lines = (Array{AbstractString})[] for l in filelines("data/test.txt") tokens = split(strip(l), r"\s+") push!(lines, tokens) @@ -141,8 +141,8 @@ bkg = make_background(lines, mincount = 2) @expect stats(bkg, unk_token) == 19.0 @info "bkg[c] = $(stats(bkg, "c"))" -@expect sparse_count(lines[1], bkg) == sparsevec((Int64=>Float64)[ bkg["a"] => 1.0, bkg["b"] => 1.0, bkg["c"] => 1.0], vocab_size(bkg)) -@expect sparse_count(lines[end], bkg) == sparsevec((Int64=>Float64)[ bkg[unk_token] => 1.0 ], vocab_size(bkg)) +@expect sparse_count(lines[1], bkg) == sparsevec(Dict{Int64,Float64}( bkg["a"] => 1.0, bkg["b"] => 1.0, bkg["c"] => 1.0), vocab_size(bkg)) +@expect sparse_count(lines[end], bkg) == sparsevec(Dict{Int64,Float64}( bkg[unk_token] => 1.0 ), vocab_size(bkg)) @info "sparse[c] = $(sparse_count(lines[1], bkg)[2])" @expect norm(sparse_count(lines[1], bkg), bkg)[2] == 3.166666666666667 diff --git a/test/topic.jl b/test/topic.jl index 141fcf3..6d32350 100644 --- a/test/topic.jl +++ b/test/topic.jl @@ -12,8 
+12,8 @@ function text(fn) end function getinstances(dir) - docs = String[] - truth = String[] + docs = AbstractString[] + truth = AbstractString[] for t in filter(d -> d != "." && d != "..", readdir(dir)) for d in filter(d -> d != "." && d != "..", readdir("$dir/$t")) @@ -52,7 +52,7 @@ test, test_truth = getinstances("20ng/test") bkgmodel, fextractor, model = tc_train(train, train_truth, tokenize_file, mincount = 2, cutoff = 1e10, trainer = (fvs, truth, init_model) -> train_mira(fvs, truth, init_model, iterations = 20, k = 19, C = 0.01, average = true), iteration_method = :eager) -confmat = DefaultDict(String, DefaultDict{String, Int32}, () -> DefaultDict(String, Int32, 0)) +confmat = DefaultDict(AbstractString, DefaultDict{AbstractString, Int32}, () -> DefaultDict(AbstractString, Int32, 0)) res = test_classification(model, lazy_map(x -> fextractor(tokenize_file(x)), test), test_truth, record = (t, h) -> confmat[t][h] += 1) * 100.0 @info @sprintf("mira test set error rate: %7.3f", res) From a8247b7b7ad698425df71ff7ecb6eeb0cd1d84d8 Mon Sep 17 00:00:00 2001 From: Elizabeth Salesky Date: Tue, 14 Jun 2016 11:24:29 -0400 Subject: [PATCH 4/4] - updating for ilr data (array{array}) (amended) --- REQUIRE | 2 +- src/readers.jl | 8 ++++++++ src/tc.jl | 40 ++++++++++++++++++++++++++++++++++++++-- test/topic.jl | 2 ++ 4 files changed, 49 insertions(+), 3 deletions(-) diff --git a/REQUIRE b/REQUIRE index 1fab71a..d7f65dd 100644 --- a/REQUIRE +++ b/REQUIRE @@ -1,4 +1,4 @@ -julia 0.4- +julia 0.4 DataStructures Devectorize Iterators diff --git a/src/readers.jl b/src/readers.jl index ebc178a..e4520c2 100644 --- a/src/readers.jl +++ b/src/readers.jl @@ -48,6 +48,14 @@ eltype(itr :: FileLines) = ByteString filelines(fn :: AbstractString) = FileLines(fn) streamlines(f) = eachline(f) # convenience +#function getfile(name) +# file = joinpath(savedir, name) +# if !isfile(file) +# file = download(urlbase*name, file) +# end +# file +#end + # ------------------------------------------------------------------------------------------------------------------------- # Text format readers # ------------------------------------------------------------------------------------------------------------------------- diff --git a/src/tc.jl b/src/tc.jl index 0811aca..33766b3 100644 --- a/src/tc.jl +++ b/src/tc.jl @@ -49,13 +49,19 @@ end # ------------------------------------------------------------------------------------------------------------------------- # training for text classifiers # ------------------------------------------------------------------------------------------------------------------------- -function tc_features(text, bkgmodel) +function tc_features(text::Array, bkgmodel) counts = sparse_count(text, bkgmodel) counts /= sum(counts) return apply(bkgmodel, counts) end -function tc_train(text, truth, preprocess::Function; cutoff = 1e10, mincount = 2, prune = 0.0, +function tc_features{T<:Array,N}(text::Array{T,N}, bkgmodel) + counts = sparse_count(flatten(text), bkgmodel) + counts /= sum(counts) + return apply(bkgmodel, counts) +end + +function tc_train(text::Array, truth, preprocess::Function; cutoff = 1e10, mincount = 2, prune = 0.0, iteration_method = :lazy, trainer = (fvs, truth, init_model) -> train_mira(fvs, truth, init_model, iterations = 3, average = true), logger = Log(STDERR)) @@ -81,3 +87,33 @@ function tc_train(text, truth, preprocess::Function; cutoff = 1e10, mincount = 2 return bkgmodel, ptext -> tc_features(ptext, bkgmodel), model end + +function tc_train{T<:Array,N}(text::Array{T,N}, truth, 
preprocess::Function; cutoff = 1e10, mincount = 2, prune = 0.0,
+                      iteration_method = :lazy,
+                      trainer = (fvs, truth, init_model) -> train_mira(fvs, truth, init_model, iterations = 3, average = true),
+                      logger = Log(STDERR))
+  mapper = iteration_method == :eager ? map : lazy_map
+
+  # define class index
+  classes = Dict{AbstractString, Int32}()
+  i = 1
+  @timer logger "indexing truth" for t in truth
+    if !(t in keys(classes))
+      classes[t] = i
+      i += 1
+    end
+  end
+
+  # prep model
+  @timer logger "preprocessing input" preprocessed_text = mapper((x) -> mapper(preprocess, x), text)
+  @timer logger "making background model" bkgmodel = make_background(flatten_iter(preprocessed_text), mincount = mincount, prune = prune,
+                                                                     norm = stats -> tfnorm(stats, squash = sqrt, cutoff = cutoff))
+  @timer logger "making feature vectors" fvs = mapper(text -> tc_features(text, bkgmodel), preprocessed_text)
+  @timer logger "initializing model" init_model = LinearModel(classes, vocab_size(bkgmodel))
+  @timer logger "training final model" model = trainer(fvs, truth, init_model)
+
+  return bkgmodel, ptext -> tc_features(ptext, bkgmodel), model
+end
+
+flatten(a) = mapreduce(x -> isa(x, Array) ? flatten(x) : x, vcat, [], a)
+flatten_iter(a) = map((x) -> flatten(x), a)
diff --git a/test/topic.jl b/test/topic.jl
index 6d32350..e4a218c 100644
--- a/test/topic.jl
+++ b/test/topic.jl
@@ -1,3 +1,5 @@
+using Text, Stage, Ollam, DataStructures
+
 function text(fn)
   res = ""
   for l in map(l -> chomp(l), eachline(`iconv -f latin1 -t utf8 $fn`))
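
Reviewer note (not part of the series): a minimal sketch of the ngrams behavior that patches 1/4 and 2/4 introduce, assembled directly from the expectations in test/runtests.jl. It assumes Text.jl with this series applied on a Julia 0.4-era setup; `ngrams` and `ngram_iterator` are the package's exported names.

using Text

# With the default truncated_start = false, ngrams now returns ALL n-grams
# of every order up to `order` (lowest order first), not just the
# truncated-start prefixes it returned before this series.
@assert ngrams(["a", "b", "c"], order = 2) == ["a", "b", "c", "a b", "b c"]
@assert ngrams("abcd", order = 3) == ["a", "b", "c", "d", "ab", "bc", "cd", "abc", "bcd"]

# With truncated_start = true, only full-order n-grams are emitted.
@assert ngrams("abcd", order = 3, truncated_start = true) == ["abc", "bcd"]

# The lazy iterator agrees with the eager version, including on multi-byte
# UTF-8 input -- which is why NgramStringIterator walks byte offsets with
# nextind() rather than incrementing indices directly.
@assert collect(ngram_iterator("是的", order = 2)) == ["是", "的", "是的"]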
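
Likewise, a sketch of the Array{Array} entry point that patch 4/4 adds for the ilr data: tc_train now also accepts an array of documents where each document is an array of segments, tokenizes each segment via `preprocess`, and flattens the per-document token arrays before counting features. The corpus, labels, and whitespace tokenizer below are hypothetical placeholders invented for illustration; the default trainer (Ollam's train_mira) is the one named in the patch itself.

using Text, Stage, Ollam

# Hypothetical two-document corpus: each document is an Array of segment
# strings, so after per-segment tokenization tc_train sees an Array{Array}
# and dispatches to the new flattening methods.
docs  = [["hello world", "hello foo"],   # document 1: two segments
         ["baz qux quux"]]               # document 2: one segment
truth = ["greeting", "nonsense"]         # made-up class labels

tokenize(s) = split(s, r"\s+")

bkgmodel, fextractor, model = tc_train(docs, truth, tokenize, mincount = 1)

# Feature vector for a new document of the same nested shape; tc_features
# flattens the token arrays before counting against the background model.
fv = fextractor(map(tokenize, ["hello bar"]))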