-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcount_3_grams.py
179 lines (138 loc) · 5.62 KB
/
count_3_grams.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
"""
Fast counting of 3-grams for short strings.
Quick benchmarking seems to show that pure Python code is faster when
for strings less that 1000 characters, and numpy versions is faster for
longer strings.
Very long strings would benefit from probabilistic counting (bloom
filter, count min sketch) as implemented eg in the "bounter" module.
Run unit tests with "pytest count_3_grams.py"
"""
###############################################################################
from collections import Counter
def number_of_unique_3grams(string):
""" Return the number of different tri-grams in a string
"""
# Pure Python implementation: no numpy
string = '$$' + string + '$$'
return len(list(zip(string, string[1:], string[2:])))
def number_of_3grams(string):
""" Return the number of different tri-grams in a string
"""
# Pure Python implementation: no numpy
string = '$$' + string + '$$'
return len(string) - 3 + 1
def get_unique_3grams(string):
""" Return the set of different tri-grams in a string
"""
# Pure Python implementation: no numpy
string = '$$' + string + '$$'
return set(zip(string, string[1:], string[2:]))
def get_3grams(string):
""" Return the set of different tri-grams in a string
"""
# Pure Python implementation: no numpy
string = '$$' + string + '$$'
return list(zip(string, string[1:], string[2:]))
def dictionary_of_3grams(strings):
""" Given a list of strings, return a dictionary with key
all the unique grams and with values a list of index indicating
the strings that contain the respective ngram.
"""
ngram_dict1 = {}
for i, string in enumerate(strings):
ngrams = get_3grams(string)
for ngram in ngrams:
ngram_dict1.setdefault(ngram, {})
ngram_dict1[ngram].setdefault(i, 0)
ngram_dict1[ngram][i] += 1
return ngram_dict1
def strings_length(strings):
""" Given a list of strings, return a vector with the respective lenght
of each string in the list
"""
strings_len = np.array([number_of_3grams(string)
for string in strings])
return strings_len
def ngram_similarity(string1, strings_len, gram_dict):
string1_ngrams = get_3grams(string1)
count_string1_ngrams = {s: string1_ngrams.count(s)
for s in set(string1_ngrams)}
samegrams = np.zeros(len(strings_len)).astype(float)
for gram in count_string1_ngrams:
try:
samegrams[list(gram_dict[gram]
.keys())] += np.minimum(np.array(list(gram_dict[gram]
.values())),
count_string1_ngrams[gram])
except KeyError:
continue
allgrams = number_of_3grams(string1) + strings_len - samegrams
similarity = samegrams/allgrams
return similarity
def number_of_common_3grams(string1, string2):
""" Return the number of common tri-grams in two strings
"""
# Pure Python implementation: no numpy
tri_grams = set(zip(string1, string1[1:], string1[2:]))
tri_grams = tri_grams.intersection(zip(string2, string2[1:],
string2[2:]))
return len(tri_grams)
def test_number_of_3grams():
# Small unit tests
assert number_of_3grams('abc') == 1
for i in range(1, 7):
assert number_of_3grams(i * 'aaa') == 1
assert number_of_3grams('abcdefghifk'[:i+2]) == i
def test_number_of_common_3grams():
# Small unit tests
for i in range(1, 7):
assert number_of_common_3grams(i * 'aaa', i * 'aaa') == 1
assert number_of_common_3grams('abcdefghifk'[:i+2],
'abcdefghifk'[:i+2]) == i
###############################################################################
# Numpy versions
import numpy as np
def number_of_3grams_np(string):
""" Return the number of different tri-grams in a string
"""
arr = np.frombuffer(string, dtype='S1')
# Hard coding the 3 gram
# Appending 3 shifted versions of the strings
arr = np.concatenate((arr[:-2, None], arr[1:-1, None], arr[2:, None]),
axis=1)
# "concatenate" strings
arr = arr.view('S3')
return np.unique(arr).size
def number_of_common_3grams_np(string1, string2):
""" Return the number of common tri-grams in two strings
"""
arr1 = np.frombuffer(string1, dtype='S1')
arr2 = np.frombuffer(string2, dtype='S1')
# Hard coding the 3 gram
# Appending 3 shifted versions of the strings
arr1 = np.concatenate((arr1[:-2, None], arr1[1:-1, None], arr1[2:, None]),
axis=1)
# "concatenate" strings
arr1 = arr1.view('S3')
arr2 = np.concatenate((arr2[:-2, None], arr2[1:-1, None], arr2[2:, None]),
axis=1)
arr2 = arr2.view('S3')
return np.lib.arraysetops.intersect1d(arr1, arr2).size
def test_number_of_3grams_np():
# Small unit tests
assert number_of_3grams_np('abc') == 1
for i in range(1, 7):
assert number_of_3grams_np(i * 'aaa') == 1
assert number_of_3grams_np('abcdefghifk'[:i+2]) == i
def test_number_of_common_3grams_np():
# Small unit tests
for i in range(1, 7):
assert number_of_common_3grams_np(i * 'aaa', i * 'aaa') == 1
assert number_of_common_3grams_np('abcdefghifk'[:i+2],
'abcdefghifk'[:i+2]) == i
###############################################################################
if __name__ == '__main__':
# Our two example strings
s1 = 'patricio'
s2 = 'patricia'
print(number_of_common_3grams(s1, s2))