-
Notifications
You must be signed in to change notification settings - Fork 13
Expand file tree
/
Copy pathword_tokenizer.rb
More file actions
55 lines (41 loc) · 1.49 KB
/
word_tokenizer.rb
File metadata and controls
55 lines (41 loc) · 1.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# -*- encoding : utf-8 -*-
# Rule-based word tokenizer mixin. Including (or extending with) this module
# provides #tokenize, which rewrites a string by running it through an ordered
# table of regexp substitutions that insert spaces around punctuation and
# other token boundaries.
module WordTokenizer
  # Ordered [pattern, replacement] pairs. #tokenize applies them top to bottom
  # with String#gsub!, so each rule sees the output of the rules before it.
  #
  # NOTE(review): several patterns write POSIX classes as a bare `[:space:]`,
  # `[:upper]`, `[:lower:]` instead of the nested form `[[:space:]]`. Outside
  # an enclosing bracket expression Ruby reads these as ordinary character
  # sets of the listed letters plus ':' (it warns about the duplicated range
  # under -w). They are kept byte-for-byte here because correcting them would
  # change tokenization output; confirm the intended behavior before fixing.
  @@tokenize_regexps = [
    # Uniform Quotes
    [/''|``|“|”/, '"'],
    # Separate punctuation (except for periods) from words.
    [/(^|[:space:])(')/u, '\1\2'],
    [/(?=[\("`{\[:;&#*@])(.)/, '\1 '],
    [/(.)(?=[?!\)";}\]*:@'])|(?=[\)}\]])(.)|(.)(?=[({\[])|((^|[:space:])-)(?=[^-])/u, '\1 '],
    # Treat double-hyphen as a single token.
    [/([^-])(--+)([^-])/, '\1 \2 \3'],
    [/([:space:]|^)(,)(?=(^[:space:]))/u, '\1\2 '],
    # Only separate a comma if a space follows.
    [/(.)(,)([:space:]|$)/u, '\1 \2\3'],
    # Combine dots separated by whitespace to be a single token.
    [/\.[:space:]\.[:space:]\./u, '...'],
    # Separate "No.6"
    [/(^[:upper]^[:lower:]\.)(\d+)/, '\1 \2'],
    # Md. or MD. for Ruby 1.8
    # NOTE(review): the pattern has no capture group, so '\1' expands to an
    # empty string and the matched text is removed — confirm this is intended.
    [/M[d|D]./, '\1'],
    # Separate words from ellipses
    [/([^\.]|^)(\.{2,})(.?)/, '\1 \2 \3'],
    # NOTE(review): the next two rules share one pattern and differ only in
    # where the replacement puts the space.
    [/(^|[:space:])(\.{2,})([^\.[:space:]])/u, '\1\2 \3'],
    [/(^|[:space:])(\.{2,})([^\.[:space:]])/u, '\1 \2\3'],
    ##### Some additional fixes.
    # Fix %, $, &
    [/(\d)%/, '\1 %'],
    [/\$(\.?\d)/, '$ \1'],
    [/(^[:lower:]^[:upper:])& (^[:lower:]^[:upper:])/u, '\1&\2'],
    [/(^[:lower:]^[:upper:]+)&(^[:lower:]^[:upper:]+)/u, '\1 & \2'],
    # Fix (n 't) -> ( n't)
    [/n 't( |$)/, " n't\\1"],
    [/N 'T( |$)/, " N'T\\1"],
    # Treebank tokenizer special words
    [/([Cc])annot/, '\1an not']
  ]

  # Applies every tokenization rule to +s+, modifying the string in place.
  #
  # @param s [String] text to process; it is mutated via String#gsub!, so a
  #   frozen string raises FrozenError
  # @return [String] the same +s+ object after all substitutions have run
  def tokenize(s)
    # gsub! returns nil when nothing matched, so its result is ignored and the
    # mutated receiver is returned explicitly instead of the rule table.
    @@tokenize_regexps.each { |pattern, replacement| s.gsub!(pattern, replacement) }
    s
  end
end