
Commit c63d291: moving to v0.4 julia
Parent: 921719d

11 files changed (+717, -717)

.travis.yml (+1, -1)

@@ -14,4 +14,4 @@ before_install:
   - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
 script:
   - julia -e 'versioninfo(); Pkg.init(); Pkg.clone("https://github.com/saltpork/Stage.jl"); Pkg.clone("https://github.com/mit-nlp/Ollam.jl"); Pkg.clone(pwd())'
-  - cd test; julia --color runtests.jl
+  - cd test; julia --color=yes runtests.jl

REQUIRE (+1, -1)

@@ -1,4 +1,4 @@
-julia 0.3 0.4-
+julia 0.4-
 DataStructures
 Devectorize
 Iterators
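
For context, my reading of the REQUIRE version-spec format (not part of the commit): entries list alternating lower and upper bounds, and a trailing "-" admits pre-release builds, so this hunk drops 0.3 support and makes the 0.4 pre-releases the new floor:

    julia 0.3 0.4-    # old: allow [0.3, 0.4-), i.e. only the 0.3 series
    julia 0.4-        # new: allow [0.4-, ...), i.e. 0.4 pre-releases and later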

src/constants.jl (+675, -676)

Large diff not rendered.

src/features.jl (+12, -11)

@@ -16,14 +16,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import Base.norm
 export ngrams, count, tfnorm, sparse_count, norm, znorm, ngram_iterator, ngrams!

 immutable NgramStringIterator
-  string          :: String
+  string          :: AbstractString
   order           :: Int32
   truncated_start :: Bool
 end
-type StringPosition
+type AbstractStringPosition
   start :: Int32
   fin   :: Int32
   nth   :: Int32
@@ -35,9 +36,9 @@ function start(ngi :: NgramStringIterator)
     for i = 1:(ngi.order-1) # necessary because strings are indexed to bytes, not characters
       idx = nextind(ngi.string, idx)
     end
-    return StringPosition(1, idx, ngi.order)
+    return AbstractStringPosition(1, idx, ngi.order)
   else
-    return StringPosition(1, 1, 1)
+    return AbstractStringPosition(1, 1, 1)
   end
 end

@@ -61,11 +62,11 @@
 # -------------------------------------------------------------------------------------------------------------------------
 # feature extractors
 # -------------------------------------------------------------------------------------------------------------------------
-make_string(words :: String, b, e) = SubString(words, b, e)
+make_string(words :: AbstractString, b, e) = SubString(words, b, e)
 make_string(words :: Array, b, e) = join(words[b:e], " ")

 function ngrams(words::Array; order = 2, truncated_start = false)
-  ret = String[]
+  ret = AbstractString[]

   if !truncated_start
     for o = 1:min(order - 1, length(words))
@@ -81,19 +82,19 @@ function ngrams(words::Array; order = 2, truncated_start = false)
   return ret
 end

-function ngrams(words::String; order = 2, truncated_start = false)
-  ret = String[]
+function ngrams(words :: AbstractString; order = 2, truncated_start = false)
+  ret = AbstractString[]
   return ngrams!(ret, words, order = order, truncated_start = truncated_start)
 end

-function ngrams!(ret :: Array, words :: String; order = 2, truncated_start = false)
+function ngrams!(ret :: Array, words :: AbstractString; order = 2, truncated_start = false)
   for x in ngram_iterator(words, order = order, truncated_start = truncated_start)
     push!(ret, x)
   end
   return ret
 end

-ngram_iterator(words :: String; order = 2, truncated_start = false) = NgramStringIterator(words, order, truncated_start)
+ngram_iterator(words :: AbstractString; order = 2, truncated_start = false) = NgramStringIterator(words, order, truncated_start)

 # -------------------------------------------------------------------------------------------------------------------------
 # feature vector operations
@@ -107,7 +108,7 @@ function sparse_count(text, bkg)
 end

 function dict_count(tokens)
-  map = DefaultDict{String,Int32}()
+  map = DefaultDict{AbstractString,Int32}()
   for w in tokens
     map[w] += 1
   end
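
For background (not part of the commit): Julia 0.4 renamed the abstract string type String to AbstractString, with concrete types such as ASCIIString and UTF8String beneath it, which is why every signature above widens. A minimal sketch of the byte-versus-character indexing that the iterator's nextind calls handle, in 0.4-era syntax with a hypothetical helper char_bigrams:

    # Hypothetical helper: character bigrams over any AbstractString.
    # Strings index by byte, so nextind/endof walk character start positions.
    function char_bigrams(s :: AbstractString)
        ret = AbstractString[]
        i = 1
        while i < endof(s)
            j = nextind(s, i)              # byte index of the next character
            push!(ret, SubString(s, i, j))
            i = j
        end
        return ret
    end

    char_bigrams("héllo")   # ["hé", "él", "ll", "lo"], multi-byte safe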

src/models.jl (+7, -7)

@@ -19,13 +19,13 @@
 export make_background, stats, vocab_size, apply, BKG

 type BKG
-  dict  :: Associative{String, Int32}
-  index :: Array{String}
+  dict  :: Associative{AbstractString, Int32}
+  index :: Array{AbstractString}
   stats :: Vector{Float64}
 end
 vocab_size(bkg::BKG) = length(bkg.index)
-getindex(bkg::BKG, token :: String) = bkg.dict[token]
-stats(bkg::BKG, s::String) = bkg.stats[bkg[s]]
+getindex(bkg::BKG, token :: AbstractString) = bkg.dict[token]
+stats(bkg::BKG, s::AbstractString) = bkg.stats[bkg[s]]

 function tfnorm(stats; cutoff = 1e10, squash :: Function = log)
   for i = 1:length(stats)
@@ -43,7 +43,7 @@ function apply(bkg::BKG, counts)
 end

 function make_background(features; mincount = 1, prune = 0.0, unk = true, norm = stats -> min(1.0 ./ stats, 1e10), logger = Log(STDERR))
-  dict = DefaultDict(String, Int32, 0)
+  dict = DefaultDict(AbstractString, Int32, 0)

   @timer logger "building background dictionary" begin
     # Count
@@ -67,8 +67,8 @@ function make_background(features; mincount = 1, prune = 0.0, unk = true, norm =
   end # timer

   # index
-  index = (String)[unk_token]
-  rev   = DefaultDict(String, Int32, 1)
+  index = (AbstractString)[unk_token]
+  rev   = DefaultDict(AbstractString, Int32, 1)
   rev[unk_token] = 1
   i = 2
   @timer logger "building index" begin
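
As context (not part of the commit): Associative was the 0.4-era abstract dictionary type, and the three-argument DefaultDict(K, V, default) form above comes from DataStructures.jl. A small counting sketch under those assumptions:

    using DataStructures

    # DefaultDict(K, V, default): a missing key springs into existence
    # with the default value, so counting needs no haskey checks.
    counts = DefaultDict(AbstractString, Int32, 0)
    for w in ["a", "b", "a"]
        counts[w] += 1
    end
    counts["a"]   # 2; counts["zzz"] returns (and stores) 0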

src/readers.jl (+7, -7)

@@ -21,12 +21,12 @@ export read_tweets, read_usenet, filelines, zopen
 # -------------------------------------------------------------------------------------------------------------------------
 # Basic utilities
 # -------------------------------------------------------------------------------------------------------------------------
-function zopen(fn :: String)
+function zopen(fn :: AbstractString)
   return ismatch(r"^.*\.gz$", fn) ? gzopen(fn) : open(fn)
 end

 type FileLines
-  name :: String
+  name :: AbstractString
 end

 start(itr :: FileLines) = zopen(itr.name)
@@ -45,15 +45,15 @@ eltype(itr :: FileLines) = ByteString


 # get a file line iterator from a file name, open with gzip as needed
-filelines(fn :: String) = FileLines(fn)
+filelines(fn :: AbstractString) = FileLines(fn)
 streamlines(f) = eachline(f) # convenience

 # -------------------------------------------------------------------------------------------------------------------------
 # Text format readers
 # -------------------------------------------------------------------------------------------------------------------------

 # read collection of tweets from a file
-function read_tweets(fn :: String; stopList=english_stoplist, header=news_email_header, tokenizer=tenglish_tokenizer, limit=-1, keepFn=x->true, lg=STDERR)
+function read_tweets(fn :: AbstractString; stopList=english_stoplist, header=news_email_header, tokenizer=tenglish_tokenizer, limit=-1, keepFn=x->true, lg=STDERR)
   ret   = Dict{String, Float32}[]
   rlat  = Float32[]
   rlong = Float32[]
@@ -78,8 +78,8 @@ function read_tweets(fn :: String; stopList=english_stoplist, header=news_email_

   # validate text
   for c in text
-    if '\ud800' <= c <= '\udfff' || '\U10ffff' < c
-      valid = false
+    if 0xd800 <= c <= 0xdfff || 0x10ffff < c # same check made by isvalid(Char,ch) and deprecated is_valid_char
+      valid = false
     end
   end

@@ -124,7 +124,7 @@ function read_tweets(fn :: String; stopList=english_stoplist, header=news_email_
 end

 # usenet/email single document reader -- 20ng
-function read_usenet(fn :: String; stopList=english_stoplist, header=news_email_header, tokenizer=english_tokenizer, lg=STDERR)
+function read_usenet(fn :: AbstractString; stopList=english_stoplist, header=news_email_header, tokenizer=english_tokenizer, lg=STDERR)
   ignore = false
   @info lg @sprintf("reading: %s", fn)
   vec = Dict{String, Float32}()
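
Worth noting (not part of the commit): the switch to raw code-point comparisons is presumably because surrogate Char literals such as '\ud800' are no longer accepted in 0.4; per the hunk's own comment, the test matches isvalid(Char, ch). A minimal sketch of the validation loop:

    # Reject UTF-16 surrogates (0xd800:0xdfff) and points past U+10FFFF.
    # Char compares directly against integers in 0.4-era Julia.
    is_bad_char(c) = 0xd800 <= c <= 0xdfff || 0x10ffff < c

    valid = true
    for c in "some tweet text"
        if is_bad_char(c)
            valid = false
        end
    end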

src/tc.jl (+1, -1)

@@ -62,7 +62,7 @@ function tc_train(text, truth, preprocess::Function; cutoff = 1e10, mincount = 2
   mapper = iteration_method == :eager ? map : lazy_map

   # define class index
-  classes = Dict{String, Int32}()
+  classes = Dict{AbstractString, Int32}()
   i = 1
   @timer logger "indexing truth" for t in truth
     if !(t in keys(classes))

src/tokenizers.jl (+6, -6)

@@ -27,11 +27,11 @@ const url_pattern = r"http://[^\s]*"
 const hashtag_pattern = r"^#.*$"
 const mention_pattern = r"^@.*$"

-function replace_html_entities(s :: String)
+function replace_html_entities(s :: AbstractString)
   replace(s, r"&[^;]+?;", s -> s in keys(html_entity_table) ? html_entity_table[s] : s)
 end

-function pattern_replace(w :: String)
+function pattern_replace(w :: AbstractString)
   if ismatch(r"^[+-]?\p{Sc}\d+([.,]\d+)*$", w) return "--currency--"
   elseif ismatch(r"^[+-]?\d+([.,]\d+)*%$", w) return "--percent--"
   elseif ismatch(r"^[+-]?\d+([.,]\d+)*$", w) return "--number--"
@@ -44,14 +44,14 @@ function pattern_replace(w :: String)
   end
 end

-function prereplace(sent :: String)
+function prereplace(sent :: AbstractString)
   r = replace(sent, r"n't\b", " not")
   r = replace(r, r"'s\b", " s's")
   r = replace(r, r"'d\b", " d'd")
 end


-function english_tokenizer(s :: String)
+function english_tokenizer(s :: AbstractString)
   return [
     begin
       m = match(punct_word, w)
@@ -61,7 +61,7 @@ function english_tokenizer(s :: String)
   ]
 end

-function twenglish_tokenizer(s :: String)
+function twenglish_tokenizer(s :: AbstractString)
   return [
     begin
       m = match(r"^(\p{P}*)(.*?)\p{P}*$", w)
@@ -71,7 +71,7 @@ function twenglish_tokenizer(s :: String)
   ]
 end

-function twenglish_cleaner(tw :: String; urls = true, hashtags = true, mentions = true)
+function twenglish_cleaner(tw :: AbstractString; urls = true, hashtags = true, mentions = true)
   ctw = replace(normalize_string(tw, :NFKC), default_space, " ")
   ctw = urls ? replace(ctw, url_pattern, "\u0030\u20E3") : ctw

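Side note (not part of the commit): pattern_replace above collapses currency, percent, and number tokens into placeholder symbols before classification. A stripped-down sketch of the idea, using 0.4-era ismatch and a hypothetical name normalize_token:

    # Collapse numeric-looking tokens into placeholder classes so the
    # downstream model sees "--number--" instead of every distinct literal.
    normalize_token(w :: AbstractString) =
        ismatch(r"^[+-]?\d+([.,]\d+)*%$", w) ? "--percent--" :
        ismatch(r"^[+-]?\d+([.,]\d+)*$", w)  ? "--number--"  : w

    map(normalize_token, ["price", "3.5%", "1,200"])
    # ["price", "--percent--", "--number--"]
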
test/lid.jl

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ bkgmodel, fextractor, model = tc_train(train, train_truth, lid_iterating_tokeniz
1515
trainer = (fvs, truth, init_model) -> train_mira(fvs, truth, init_model, iterations = 20, k = 2, C = 0.01, average = true),
1616
iteration_method = :eager)
1717

18-
confmat = DefaultDict(String, DefaultDict{String, Int32}, () -> DefaultDict(String, Int32, 0))
18+
confmat = DefaultDict(AbstractString, DefaultDict{AbstractString, Int32}, () -> DefaultDict(AbstractString, Int32, 0))
1919
res = test_classification(model, lazy_map(x -> fextractor(lid_iterating_tokenizer(x)), test), test_truth, record = (t, h) -> confmat[t][h] += 1) * 100.0
2020
@info @sprintf("mira test set error rate: %7.3f", res)
2121
print_confusion_matrix(confmat)
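
For context (not part of the commit): the nested DefaultDict above is a sparse confusion matrix; the outer dictionary maps a truth label to an inner counter defaulting to zero. A usage sketch assuming DataStructures.jl:

    using DataStructures

    # confmat[truth][hypothesis] += 1 needs no initialization because
    # both levels default-construct on first access.
    confmat = DefaultDict(AbstractString, DefaultDict{AbstractString, Int32},
                          () -> DefaultDict(AbstractString, Int32, 0))
    confmat["en"]["de"] += 1   # one English item hypothesized as German
    confmat["en"]["en"] += 1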

test/runtests.jl (+3, -3)

@@ -125,7 +125,7 @@ end
 # -------------------------------------------------------------------------------------------------------------------------
 # feature vector tests
 # -------------------------------------------------------------------------------------------------------------------------
-lines = (Array{String})[]
+lines = (Array{AbstractString})[]
 for l in filelines("data/test.txt")
   tokens = split(strip(l), r"\s+")
   push!(lines, tokens)
@@ -141,8 +141,8 @@ bkg = make_background(lines, mincount = 2)
 @expect stats(bkg, unk_token) == 19.0

 @info "bkg[c] = $(stats(bkg, "c"))"
-@expect sparse_count(lines[1], bkg) == sparsevec((Int64=>Float64)[ bkg["a"] => 1.0, bkg["b"] => 1.0, bkg["c"] => 1.0], vocab_size(bkg))
-@expect sparse_count(lines[end], bkg) == sparsevec((Int64=>Float64)[ bkg[unk_token] => 1.0 ], vocab_size(bkg))
+@expect sparse_count(lines[1], bkg) == sparsevec(Dict{Int64,Float64}( bkg["a"] => 1.0, bkg["b"] => 1.0, bkg["c"] => 1.0), vocab_size(bkg))
+@expect sparse_count(lines[end], bkg) == sparsevec(Dict{Int64,Float64}( bkg[unk_token] => 1.0 ), vocab_size(bkg))

 @info "sparse[c] = $(sparse_count(lines[1], bkg)[2])"
 @expect norm(sparse_count(lines[1], bkg), bkg)[2] == 3.166666666666667
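
For background (not part of the commit): the typed-dict literal (K=>V)[...] was deprecated in Julia 0.4 in favor of the Dict{K,V}(...) constructor over pairs, which is exactly the rewrite in this hunk. Side by side:

    # Julia 0.3 style, deprecated in 0.4:
    #   d = (Int64 => Float64)[1 => 1.0, 2 => 2.0]
    # Julia 0.4 replacement:
    d = Dict{Int64,Float64}(1 => 1.0, 2 => 2.0)

    # sparsevec accepts such an index => value dict plus a length,
    # as the updated @expect lines use:
    sparsevec(d, 10)   # 10-element sparse vector with nonzeros at 1 and 2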

test/topic.jl (+3, -3)

@@ -12,8 +12,8 @@ function text(fn)
 end

 function getinstances(dir)
-  docs  = String[]
-  truth = String[]
+  docs  = AbstractString[]
+  truth = AbstractString[]

   for t in filter(d -> d != "." && d != "..", readdir(dir))
     for d in filter(d -> d != "." && d != "..", readdir("$dir/$t"))
@@ -52,7 +52,7 @@ test, test_truth = getinstances("20ng/test")
 bkgmodel, fextractor, model = tc_train(train, train_truth, tokenize_file, mincount = 2, cutoff = 1e10,
                                        trainer = (fvs, truth, init_model) -> train_mira(fvs, truth, init_model, iterations = 20, k = 19, C = 0.01, average = true),
                                        iteration_method = :eager)
-confmat = DefaultDict(String, DefaultDict{String, Int32}, () -> DefaultDict(String, Int32, 0))
+confmat = DefaultDict(AbstractString, DefaultDict{AbstractString, Int32}, () -> DefaultDict(AbstractString, Int32, 0))
 res = test_classification(model, lazy_map(x -> fextractor(tokenize_file(x)), test), test_truth,
                           record = (t, h) -> confmat[t][h] += 1) * 100.0
 @info @sprintf("mira test set error rate: %7.3f", res)
