update to ngrams #2

Open: wants to merge 4 commits into base: master
2 changes: 1 addition & 1 deletion .travis.yml
@@ -14,4 +14,4 @@ before_install:
   - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
 script:
   - julia -e 'versioninfo(); Pkg.init(); Pkg.clone("https://github.com/saltpork/Stage.jl"); Pkg.clone("https://github.com/mit-nlp/Ollam.jl"); Pkg.clone(pwd())'
-  - cd test; julia --color runtests.jl
+  - cd test; julia --color=yes runtests.jl
2 changes: 1 addition & 1 deletion REQUIRE
@@ -1,4 +1,4 @@
-julia 0.3 0.4-
+julia 0.4
 DataStructures
 Devectorize
 Iterators
1,351 changes: 675 additions & 676 deletions src/constants.jl

Large diffs are not rendered by default.

56 changes: 32 additions & 24 deletions src/features.jl
@@ -16,54 +16,63 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import Base.norm
 export ngrams, count, tfnorm, sparse_count, norm, znorm, ngram_iterator, ngrams!

 immutable NgramStringIterator
-    string :: String
-    order  :: Int32
+    string :: AbstractString
+    order  :: Int32
     truncated_start :: Bool
 end
-type StringPosition
-    start :: Int32
-    fin   :: Int32
-    nth   :: Int32
+type AbstractStringPosition
+    start :: Int32
+    fin   :: Int32
+    nth   :: Int32
 end

 function start(ngi :: NgramStringIterator)
     if ngi.truncated_start
         idx = 1
-        for i = 1:(ngi.order-1)
+        for i = 1:(ngi.order-1) # necessary because strings are indexed to bytes, not characters
             idx = nextind(ngi.string, idx)
         end
-        return StringPosition(1, idx, ngi.order)
+        return AbstractStringPosition(1, idx, ngi.order)
     else
-        return StringPosition(1, 1, 1)
+        return AbstractStringPosition(1, 1, 1)
     end
 end

-done(ngi :: NgramStringIterator, position) = position.fin > endof(ngi.string)
+done(ngi :: NgramStringIterator, position) = position.nth > ngi.order || position.fin > endof(ngi.string)
 function next(ngi :: NgramStringIterator, position)
     str = make_string(ngi.string, position.start, position.fin)
-    if position.nth >= ngi.order
-        position.start = nextind(ngi.string, position.start)
+    if position.fin >= endof(ngi.string)
+        position.start = 0
+        position.fin   = 1
+        for i = 1:position.nth-1
+            position.fin = nextind(ngi.string, position.fin)
+        end
+        position.nth += 1
     end
-    position.nth += 1
-    position.fin = nextind(ngi.string, position.fin)
+    position.start = nextind(ngi.string, position.start)
+    position.fin   = nextind(ngi.string, position.fin)
     return str, position
 end
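
For reference, a minimal sketch of how the revised iterator is meant to be consumed (assuming the module wires start/done/next into Base's iteration protocol; the input string and expected output are illustrative, not taken from the PR):

    # Sketch: collect every n-gram of "abc" up to order 2. With the new
    # done/next, a full pass at one order is followed by a restart at the
    # front of the string with the next higher order.
    grams = AbstractString[]
    for g in ngram_iterator("abc", order = 2)
        push!(grams, g)
    end
    # grams == ["a", "b", "c", "ab", "bc"]
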
 # -------------------------------------------------------------------------------------------------------------------------
 # feature extractors
 # -------------------------------------------------------------------------------------------------------------------------
-make_string(words :: String, b, e) = SubString(words, b, e)
+make_string(words :: AbstractString, b, e) = SubString(words, b, e)
 make_string(words :: Array, b, e) = join(words[b:e], " ")

 function ngrams(words::Array; order = 2, truncated_start = false)
-    ret = String[]
+    ret = AbstractString[]

     if !truncated_start
-        for wi = 1:min(order - 1, length(words))
-            push!(ret, make_string(words, 1, wi))
+        for o = 1:min(order - 1, length(words))
+            for wi = 1:length(words)-(o-1)
+                push!(ret, make_string(words, wi, wi + o - 1))
+            end
         end
     end

@@ -73,19 +82,19 @@ function ngrams(words::Array; order = 2, truncated_start = false)
     return ret
 end
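
A quick sanity check of the new nested loop (a sketch; the elided middle of the function still appends the full-order n-grams, which the expected value below assumes):

    # Sketch: the old loop pushed only the leading prefixes (here just
    # "the"); the new loop emits every lower-order n-gram at every offset.
    ngrams(["the", "quick", "fox"], order = 2)
    # expected: ["the", "quick", "fox", "the quick", "quick fox"]
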

-function ngrams(words::String; order = 2, truncated_start = false)
-    ret = String[]
+function ngrams(words :: AbstractString; order = 2, truncated_start = false)
+    ret = AbstractString[]
     return ngrams!(ret, words, order = order, truncated_start = truncated_start)
 end

-function ngrams!(ret :: Array, words :: String; order = 2, truncated_start = false)
+function ngrams!(ret :: Array, words :: AbstractString; order = 2, truncated_start = false)
     for x in ngram_iterator(words, order = order, truncated_start = truncated_start)
         push!(ret, x)
     end
     return ret
 end

-ngram_iterator(words :: String; order = 2, truncated_start = false) = NgramStringIterator(words, order, truncated_start)
+ngram_iterator(words :: AbstractString; order = 2, truncated_start = false) = NgramStringIterator(words, order, truncated_start)

 # -------------------------------------------------------------------------------------------------------------------------
 # feature vector operations
@@ -98,9 +107,8 @@ function sparse_count(text, bkg)
     return vec
 end

-
 function dict_count(tokens)
-    map = DefaultDict{String,Int32}()
+    map = DefaultDict{AbstractString,Int32}()
     for w in tokens
         map[w] += 1
     end
17 changes: 9 additions & 8 deletions src/models.jl
@@ -16,16 +16,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-export make_background, stats, vocab_size, apply
+import Main.apply
+export make_background, stats, vocab_size, apply, BKG

 type BKG
-    dict  :: Associative{String, Int32}
-    index :: Array{String}
+    dict  :: Associative{AbstractString, Int32}
+    index :: Array{AbstractString}
     stats :: Vector{Float64}
 end
 vocab_size(bkg::BKG) = length(bkg.index)
-getindex(bkg::BKG, token :: String) = bkg.dict[token]
-stats(bkg::BKG, s::String) = bkg.stats[bkg[s]]
+getindex(bkg::BKG, token :: AbstractString) = bkg.dict[token]
+stats(bkg::BKG, s::AbstractString) = bkg.stats[bkg[s]]

 function tfnorm(stats; cutoff = 1e10, squash :: Function = log)
     for i = 1:length(stats)
@@ -43,7 +44,7 @@ function apply(bkg::BKG, counts)
 end

 function make_background(features; mincount = 1, prune = 0.0, unk = true, norm = stats -> min(1.0 ./ stats, 1e10), logger = Log(STDERR))
-    dict = DefaultDict(String, Int32, 0)
+    dict = DefaultDict(AbstractString, Int32, 0)

     @timer logger "building background dictionary" begin
         # Count
@@ -67,8 +68,8 @@
     end # timer

     # index
-    index = (String)[unk_token]
-    rev   = DefaultDict(String, Int32, 1)
+    index = (AbstractString)[unk_token]
+    rev   = DefaultDict(AbstractString, Int32, 1)
     rev[unk_token] = 1
     i = 2
     @timer logger "building index" begin
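
For context, a hedged sketch of how BKG is built and queried after these changes (the documents are invented; passing an iterable of token arrays mirrors how tc_train calls make_background via flatten_iter):

    # Sketch: build a background model over tokenized documents, then ask
    # for the vocabulary size and a token's normalized statistic.
    docs = Any[["the", "cat", "sat"], ["the", "dog", "ran"]]
    bkg  = make_background(docs, mincount = 1)
    vocab_size(bkg)   # vocabulary size, including the unk token
    stats(bkg, "the") # normalized statistic for a known token
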
22 changes: 15 additions & 7 deletions src/readers.jl
@@ -21,12 +21,12 @@ export read_tweets, read_usenet, filelines, zopen
 # -------------------------------------------------------------------------------------------------------------------------
 # Basic utilities
 # -------------------------------------------------------------------------------------------------------------------------
-function zopen(fn :: String)
+function zopen(fn :: AbstractString)
     return ismatch(r"^.*\.gz$", fn) ? gzopen(fn) : open(fn)
 end

 type FileLines
-    name :: String
+    name :: AbstractString
 end

 start(itr :: FileLines) = zopen(itr.name)
@@ -45,15 +45,23 @@ eltype(itr :: FileLines) = ByteString


 # get a file line iterator from a file name, open with gzip as needed
-filelines(fn :: String) = FileLines(fn)
+filelines(fn :: AbstractString) = FileLines(fn)
 streamlines(f) = eachline(f) # convenience

+#function getfile(name)
+#    file = joinpath(savedir, name)
+#    if !isfile(file)
+#        file = download(urlbase*name, file)
+#    end
+#    file
+#end
+
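The reader utilities are easiest to see in use; a small sketch with a hypothetical file name (gzip handling comes from the ".gz" suffix match in zopen):

    # Sketch: lazily iterate the lines of a possibly-gzipped file.
    for line in filelines("tweets.txt.gz") # hypothetical path
        # each line is produced by reading the opened stream
        println(strip(line))
    end
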
 # -------------------------------------------------------------------------------------------------------------------------
 # Text format readers
 # -------------------------------------------------------------------------------------------------------------------------

 # read collection of tweets from a file
-function read_tweets(fn :: String; stopList=english_stoplist, header=news_email_header, tokenizer=tenglish_tokenizer, limit=-1, keepFn=x->true, lg=STDERR)
+function read_tweets(fn :: AbstractString; stopList=english_stoplist, header=news_email_header, tokenizer=tenglish_tokenizer, limit=-1, keepFn=x->true, lg=STDERR)
     ret   = Dict{String, Float32}[]
     rlat  = Float32[]
     rlong = Float32[]
@@ -78,8 +86,8 @@

     # validate text
     for c in text
-        if '\ud800' <= c <= '\udfff' || '\U10ffff' < c
-            valid = false
+        if 0xd800 <= c <= 0xdfff || 0x10ffff < c # same check made by isvalid(Char, ch) and the deprecated is_valid_char
+            valid = false
         end
     end

@@ -124,7 +132,7 @@
 # usenet/email single document reader -- 20ng
-function read_usenet(fn :: String; stopList=english_stoplist, header=news_email_header, tokenizer=english_tokenizer, lg=STDERR)
+function read_usenet(fn :: AbstractString; stopList=english_stoplist, header=news_email_header, tokenizer=english_tokenizer, lg=STDERR)
     ignore = false
     @info lg @sprintf("reading: %s", fn)
     vec = Dict{String, Float32}()
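
The rewritten validity test above compares raw integer codepoints rather than character literals; per the in-diff comment, it matches what isvalid(Char, x) checks on 0.4. A sketch of that reading (codepoint_ok is a hypothetical helper, not part of the PR):

    # Sketch: reject UTF-16 surrogates and anything beyond U+10FFFF,
    # mirroring the check inside read_tweets.
    codepoint_ok(c::Char) = !(0xd800 <= UInt32(c) <= 0xdfff || UInt32(c) > 0x10ffff)
    codepoint_ok('a')          # true
    isvalid(Char, UInt32('a')) # the equivalent library check
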
42 changes: 39 additions & 3 deletions src/tc.jl
@@ -49,20 +49,26 @@ end
 # -------------------------------------------------------------------------------------------------------------------------
 # training for text classifiers
 # -------------------------------------------------------------------------------------------------------------------------
-function tc_features(text, bkgmodel)
+function tc_features(text::Array, bkgmodel)
     counts = sparse_count(text, bkgmodel)
     counts /= sum(counts)
     return apply(bkgmodel, counts)
 end

-function tc_train(text, truth, preprocess::Function; cutoff = 1e10, mincount = 2, prune = 0.0,
+function tc_features{T<:Array,N}(text::Array{T,N}, bkgmodel)
+    counts = sparse_count(flatten(text), bkgmodel)
+    counts /= sum(counts)
+    return apply(bkgmodel, counts)
+end
+
+function tc_train(text::Array, truth, preprocess::Function; cutoff = 1e10, mincount = 2, prune = 0.0,
                   iteration_method = :lazy,
                   trainer = (fvs, truth, init_model) -> train_mira(fvs, truth, init_model, iterations = 3, average = true),
                   logger = Log(STDERR))
     mapper = iteration_method == :eager ? map : lazy_map

     # define class index
-    classes = Dict{String, Int32}()
+    classes = Dict{AbstractString, Int32}()
     i = 1
     @timer logger "indexing truth" for t in truth
         if !(t in keys(classes))
@@ -81,3 +87,33 @@

     return bkgmodel, ptext -> tc_features(ptext, bkgmodel), model
 end
+
+function tc_train{T<:Array,N}(text::Array{T,N}, truth, preprocess::Function; cutoff = 1e10, mincount = 2, prune = 0.0,
+                  iteration_method = :lazy,
+                  trainer = (fvs, truth, init_model) -> train_mira(fvs, truth, init_model, iterations = 3, average = true),
+                  logger = Log(STDERR))
+    mapper = iteration_method == :eager ? map : lazy_map
+
+    # define class index
+    classes = Dict{AbstractString, Int32}()
+    i = 1
+    @timer logger "indexing truth" for t in truth
+        if !(t in keys(classes))
+            classes[t] = i
+            i += 1
+        end
+    end
+
+    # prep model
+    @timer logger "preprocessing input" preprocessed_text = mapper((x) -> mapper(preprocess, x), text)
+    @timer logger "making background model" bkgmodel = make_background(flatten_iter(preprocessed_text), mincount = mincount, prune = prune,
+                                                                       norm = stats -> tfnorm(stats, squash = sqrt, cutoff = cutoff))
+    @timer logger "making feature vectors" fvs = mapper(text -> tc_features(text, bkgmodel), preprocessed_text)
+    @timer logger "initializing model" init_model = LinearModel(classes, vocab_size(bkgmodel))
+    @timer logger "training final model" model = trainer(fvs, truth, init_model)
+
+    return bkgmodel, ptext -> tc_features(ptext, bkgmodel), model
+end
+
+flatten(a) = mapreduce(x -> isa(x, Array) ? flatten(x) : x, vcat, [], a)
+flatten_iter(a) = map((x) -> flatten(x), a)
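
Since the nested-array tc_train hinges on it, a quick sketch of what flatten produces (outputs in comments are expected values, not captured REPL output):

    # Sketch: flatten recursively splices nested arrays into one vector,
    # so a corpus of token-array documents can feed one background model.
    flatten(Any[["a", "b"], ["c"]])   # expected: ["a", "b", "c"]
    flatten(Any[[["a"]], ["b", "c"]]) # expected: ["a", "b", "c"] (any depth)
    flatten_iter(Any[[["a"], ["b"]]]) # flattens each element, keeping the outer structure
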
12 changes: 6 additions & 6 deletions src/tokenizers.jl
@@ -27,11 +27,11 @@ const url_pattern = r"http://[^\s]*"
 const hashtag_pattern = r"^#.*$"
 const mention_pattern = r"^@.*$"

-function replace_html_entities(s :: String)
+function replace_html_entities(s :: AbstractString)
     replace(s, r"&[^;]+?;", s -> s in keys(html_entity_table) ? html_entity_table[s] : s)
 end

-function pattern_replace(w :: String)
+function pattern_replace(w :: AbstractString)
     if ismatch(r"^[+-]?\p{Sc}\d+([.,]\d+)*$", w) return "--currency--"
     elseif ismatch(r"^[+-]?\d+([.,]\d+)*%$", w) return "--percent--"
     elseif ismatch(r"^[+-]?\d+([.,]\d+)*$", w) return "--number--"
@@ -44,14 +44,14 @@ function pattern_replace(w :: String)
     end
 end

-function prereplace(sent :: String)
+function prereplace(sent :: AbstractString)
     r = replace(sent, r"n't\b", " not")
     r = replace(r, r"'s\b", " s's")
     r = replace(r, r"'d\b", " d'd")
 end


-function english_tokenizer(s :: String)
+function english_tokenizer(s :: AbstractString)
     return [
         begin
             m = match(punct_word, w)
@@ -61,7 +61,7 @@ function english_tokenizer(s :: String)
     ]
 end

-function twenglish_tokenizer(s :: String)
+function twenglish_tokenizer(s :: AbstractString)
     return [
         begin
             m = match(r"^(\p{P}*)(.*?)\p{P}*$", w)
@@ -71,7 +71,7 @@ function twenglish_tokenizer(s :: String)
     ]
 end

-function twenglish_cleaner(tw :: String; urls = true, hashtags = true, mentions = true)
+function twenglish_cleaner(tw :: AbstractString; urls = true, hashtags = true, mentions = true)
     ctw = replace(normalize_string(tw, :NFKC), default_space, " ")
     ctw = urls ? replace(ctw, url_pattern, "\u0030\u20E3") : ctw
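
A few hedged examples of what pattern_replace normalizes, derived from the regexes above (outputs in comments are expected values; calling it directly assumes the function is reachable from your namespace):

    # Sketch: token-level normalizations performed by pattern_replace.
    pattern_replace("\$5.00") # expected: "--currency--"
    pattern_replace("75%")    # expected: "--percent--"
    pattern_replace("1,234")  # expected: "--number--"
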
4 changes: 2 additions & 2 deletions test/lid.jl
@@ -15,11 +15,11 @@
                                        trainer = (fvs, truth, init_model) -> train_mira(fvs, truth, init_model, iterations = 20, k = 2, C = 0.01, average = true),
                                        iteration_method = :eager)

-confmat = DefaultDict(String, DefaultDict{String, Int32}, () -> DefaultDict(String, Int32, 0))
+confmat = DefaultDict(AbstractString, DefaultDict{AbstractString, Int32}, () -> DefaultDict(AbstractString, Int32, 0))
 res = test_classification(model, lazy_map(x -> fextractor(lid_iterating_tokenizer(x)), test), test_truth, record = (t, h) -> confmat[t][h] += 1) * 100.0
 @info @sprintf("mira test set error rate: %7.3f", res)
 print_confusion_matrix(confmat)
-@expect abs(res - 0.596) < 0.01
+@expect abs(res - 0.700) < 0.01

 # List specific errors
 # for (text, t) in zip(test, test_truth)
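
For readers unfamiliar with the nested DefaultDict, a sketch of how the confusion matrix fills in (labels invented):

    # Sketch: confmat[truth][hypothesis] cells spring into existence on
    # first access, so the record callback can bump them directly.
    record = (t, h) -> confmat[t][h] += 1
    record("en", "en"); record("en", "de")
    # confmat["en"]["de"] is now 1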