-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocess_text.py
250 lines (216 loc) · 8.45 KB
/
process_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
import re
from pythainlp.util import normalize
from pythainlp.tokenize import sent_tokenize, word_tokenize
from pythainlp.util import num_to_thaiword
import fileinput
import sys
from multiprocessing import Pool
import argparse
import pythaispell
# Parameters from https://github.com/common-voice/sentence-collector/blob/main/server/lib/validation/languages/th.js
# Inclusive bounds on len(sentence); is_length_valid() rejects anything
# shorter than MIN_LENGTH or longer than MAX_LENGTH.
MIN_LENGTH = 6
MAX_LENGTH = 100
# Rejection rules: a sentence is invalid as soon as re.search() finds any
# "regex" in it; "error" is the message reported for that rule.
#
# Every pattern is a raw string so that regex escapes (\s, \., \1, ...) reach
# the `re` module untouched; `re` itself understands \uXXXX / \UXXXXXXXX.
CV_INVALIDATION = [{
    "regex": r'[0-9๐-๙]',
    "error": 'Sentence should not contain numbers',
}, {
    # `\\` is a literal backslash inside the class. The previous non-raw
    # spelling collapsed it into an escaped '#', silently dropping the
    # backslash from the set of forbidden symbols.
    "regex": r'[<>+*\\#@^\[\]()/\u0E2F\u0E46\u0E4F\u0E5A\u0E5B]',
    "error": 'Sentence should not contain symbols, including Paiyannoi and Maiyamok',
}, {
    "regex": r'[A-Za-z]',
    "error": 'Sentence should not contain latin alphabet characters',
}, {
    "regex": r'[ก-ฮ]\.[ก-ฮ]+\.',
    "error": 'Sentence should not contain abbreviations',
}, {
    "regex": r'(^|\s)[\u0E30\u0E32\u0E33\u0E45\u0E31\u0E34\u0E35\u0E36\u0E37\u0E4D\u0E47\u0E38\u0E39\u0E48\u0E49\u0E4A\u0E4B\u0E3A\u0E4C\u0E4D\u0E4E]',
    "error": 'Word should not start with unexpected characters, like follow vowel and tone mark',
}, {
    "regex": r'[\u0E40\u0E41\u0E42\u0E43\u0E44](\s|$)',
    "error": 'Word should not end with leading vowels',
}, {
    "regex": r'[\u0E40\u0E41\u0E42\u0E43\u0E44]{2}',
    "error": 'Sentence should not contain repeating lead vowels',
}, {
    "regex": r'[\u0E32\u0E33\u0E45]{2}',
    "error": 'Sentence should not contain repeating follow vowels',
}, {
    "regex": r'\u0E30{2}',
    "error": 'Sentence should not contain repeating Sara A',
}, {
    "regex": r'\u0E3A{2}|\u0E4C{2}|\u0E4D{2}|\u0E4E{2}',
    "error": 'Sentence should not contain repeating Phinthu / Thanthakhat / Nikhahit / Yamakkan',
}, {
    "regex": r'[\u0E31\u0E34\u0E35\u0E36\u0E37\u0E4D\u0E47]{2}',
    "error": 'Sentence should not contain repeating above vowels',
}, {
    "regex": r'[\u0E38\u0E39]{2}',
    "error": 'Sentence should not contain repeating below vowels',
}, {
    "regex": r'[\u0E48\u0E49\u0E4A\u0E4B]{2}',
    "error": 'Sentence should not contain repeating tone marks',
}, {
    "regex": r'[\u0E40\u0E41\u0E42\u0E43\u0E44\u0E30\u0E32\u0E33\u0E45][\u0E48\u0E49\u0E4A\u0E4B\u0E3A\u0E4C\u0E4D\u0E4E]',
    "error": 'Sentence should not contain invalid symbols after lead/follow vowels',
}, {
    "regex": r'[\u0E48\u0E49\u0E4A\u0E4B\u0E3A\u0E4C\u0E4D\u0E4E][\u0E31\u0E34\u0E35\u0E36\u0E37\u0E4D\u0E47\u0E38\u0E39]',
    "error": 'Sentence should not contain invalid symbols before above/below vowels',
}, {
    "regex": r'[\u0E33\u0E45][\u0E30]',
    "error": 'Sentence should not contain Sara A after Sara Am or Lakkhangyao',
}, {
    "regex": r'[\u0E30][\u0E32\u0E33\u0E45]',
    "error": 'Sentence should not contain Sara Aa, Sara Am or Lakkhangyao after Sara A',
}, {
    "regex": r'[\u200b\u200c\u2063\u0E01-\u0E4E]{71}',
    "error": 'Sentence should not contain more than 70 consonants and vowels running without a space',
}, {
    "regex": r"""[\u200b\u200c\u2063\u0E01-\u0E4E.,\-"'“”‘’\u0060?!:;]{81}""",
    "error": 'Sentence should not contain more than 80 characters running without a space',
}, {
    "regex": r'[\u200b\u200c\u2063ก-ฮ]{31}',
    "error": 'Sentence should not contain more than 30 consonants running without a space',
}, {
    # Must be raw: in a normal string literal "\1" is the octal escape for
    # chr(1), not a backreference, so this rule could never match.
    "regex": r'(.)\1{6}',
    "error": 'Sentence should not contain more than 7 of the same character in a row',
}, {
    # Python strings hold code points, not UTF-16 units, so surrogate-pair
    # alternatives (\ud83c[...], \ud83d[...], \ud83e[...]) never match real
    # emoji. The range U+1F000-U+1FBFF covers the code points those three
    # lead surrogates can encode.
    "regex": r'(\u00a9|\u00ae|[\u2000-\u3300]|[\u2580-\u27bf]|[\U0001F000-\U0001FBFF]|[\ue000-\uf8ff])',
    "error": 'Sentence should not contain emojis or other special Unicode symbols',
}]
# Additional rejection rules, in the same {"regex", "error"} shape as
# CV_INVALIDATION.
TP_INVALIDATION = [
    dict(
        regex='\u0E3A',
        error='Sentence should not contain Pinthu as its difficult to read',
    ),
    dict(
        # The same consonant three or more times in a row.
        regex=r"([ก-ฮ])\1{2,}",
        error='Sentence should not contain three repeating characters',
    ),
]
# Complete rule list checked by is_sentence_valid(): the CV_INVALIDATION
# rules first, then the TP_INVALIDATION rules.
INVALIDATION = CV_INVALIDATION + TP_INVALIDATION
def is_length_valid(s):
    """Return True when len(s) is between MIN_LENGTH and MAX_LENGTH inclusive."""
    return MIN_LENGTH <= len(s) <= MAX_LENGTH
def is_sentence_valid(s):
    """Check a sentence against the length limits and every INVALIDATION rule.

    The first failed check is reported on stdout (either "INVALID LENGTH"
    or the rule's error message, followed by the sentence) and False is
    returned. True is returned only when no check fails.
    """
    if not is_length_valid(s):
        print("INVALID LENGTH: " + s)
        return False

    # First rule whose pattern occurs anywhere in the sentence, if any.
    violated = next(
        (rule for rule in INVALIDATION if re.search(rule["regex"], s)),
        None,
    )
    if violated is not None:
        print(violated["error"] + ": " + s)
        return False
    return True
def is_spelling_valid(text):
    """Return True when pythaispell.spell() gives back `text` unchanged."""
    corrected = pythaispell.spell(text)
    return bool(text == corrected)
# Character class of everything remove_symbols() deletes, compiled once at
# import time rather than rebuilt on each call.
# Emoji ranges from
# https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python
_SYMBOLS_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing, shapes, misc symbols, arrows
    "\U00002702-\U000027B0"  # dingbats
    # NOTE: this very wide range also covers CJK and Hangul, not just emoji.
    "\U000024C2-\U0001F251"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"  # every supplementary-plane code point
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"  # zero width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # variation selector-16
    "\u3030"
    "●•★◆"  # bullet-like symbols
    "*"
    ","
    "]+"
)


def remove_symbols(text):
    """Return `text` with emoji, pictographs, bullets, '*' and ',' removed.

    Every run of characters matched by _SYMBOLS_RE is deleted; nothing is
    inserted in its place.
    """
    return _SYMBOLS_RE.sub("", text)
def remove_number_dot_space(text):
    """Delete list-style numbering such as "1. " from `text`.

    A match is one or more digits, a dot, and then either a single
    whitespace character or the end of the text; the whole match is removed.
    """
    numbering = re.compile(r"\d+\.(\s|$)")
    return numbering.sub("", text)
def remove_english_in_brackets(text):
    """Replace " (latin words) " with a single space.

    Only parentheses that contain just ASCII letters and spaces, and that
    have a whitespace character on both sides, are removed.
    """
    bracketed_latin = re.compile(r"\s\([a-zA-Z ]+\)\s")
    return bracketed_latin.sub(" ", text)
def normalize_text(text):
    """Return `text` after passing it through pythainlp.util.normalize."""
    normalized = normalize(text)
    return normalized
def repeat_last_word(text):
    """Return the tokens of `text` joined together with the last one repeated.

    `text` is split with word_tokenize(); the result is all tokens
    concatenated, followed by the final token once more. When the tokenizer
    yields no tokens (for example, for an empty string), `text` is returned
    unchanged instead of raising IndexError.
    """
    words = word_tokenize(text)
    if not words:
        return text
    return "".join(words) + words[-1]
def expand_maiyamok(text):
    """Expand each Maiyamok (ๆ) by repeating the word in front of it.

    For every "<text><optional whitespace>ๆ" occurrence, the preceding text
    is handed to repeat_last_word() and the result replaces the whole match,
    including the ๆ itself.
    """
    # Raw string: "\s" in a normal literal is an invalid escape sequence
    # (a SyntaxWarning on Python 3.12+).
    return re.sub(
        r"([^ๆ]+?)\s*ๆ",
        lambda match: repeat_last_word(match.group(1)),
        text,
    )
def strip_whitespace(text):
    """Trim `text` and collapse every run of whitespace to one space."""
    # split() with no argument already discards leading and trailing whitespace.
    return " ".join(text.split())
def split_sentence(text):
    """Split `text` into sentences and clean each one.

    Sentences come from sent_tokenize(); each cleanup step below is then
    applied to every sentence, in this order, before the next step starts.
    Returns the list of cleaned sentences.
    """
    cleanup_steps = (
        lambda s: s.replace("\n", " "),
        remove_symbols,
        remove_english_in_brackets,
        remove_number_dot_space,
        replace_percent,
        number_to_word,
        expand_maiyamok,
        strip_whitespace,
        lambda s: s.strip('"'),
    )
    sentences = sent_tokenize(text)
    for step in cleanup_steps:
        sentences = [step(sentence) for sentence in sentences]
    return sentences
def number_to_word(text):
    """Replace each run of digits with num_to_thaiword() of its integer value.

    Both ASCII digits and Thai digits (๐-๙) are recognised. Whitespace
    directly around the digits is part of the match and is dropped.
    """
    def _spell_out(match):
        return num_to_thaiword(int(match.group(1)))

    return re.sub(r"\s*([0-9๐-๙]+)\s*", _spell_out, text)
def remove_all_quotes(text):
    """Return `text` with every double-quote character removed."""
    return "".join(text.split('"'))
def replace_percent(text):
    """Return `text` with each '%' replaced by the Thai word เปอร์เซ็นต์."""
    return 'เปอร์เซ็นต์'.join(text.split('%'))
# One clock time such as "8.30" or "16:05": hour group, then minute group.
# The lookbehind stops a match from starting in the middle of a longer number.
_CLOCK = r"(?<!\d)(\d{1,2})[:.](\d{1,2})"
# A range such as "8.30-16.30 น.", including surrounding whitespace.
_TIME_RANGE_RE = re.compile(r"\s*" + _CLOCK + r"-+" + _CLOCK + r"\s?น\.\s*")
# A single time such as "8.30 น.", including surrounding whitespace.
_TIME_SINGLE_RE = re.compile(r"\s*" + _CLOCK + r"\s?น\.\s*")


def _clock_to_thaiword(hour, minute):
    """Spell one clock time as "<hour>นาฬิกา", plus "<minute>นาที" if non-zero."""
    words = num_to_thaiword(int(hour)) + "นาฬิกา"
    if int(minute):
        words += num_to_thaiword(int(minute)) + "นาที"
    return words


def replace_time(text):
    """Spell out clock times marked with "น." and return the resulting text.

    Ranges ("8.30-16.30 น.") become "<start>ถึง<end>"; single times
    ("8.30 น.") become the spelled-out time. Whitespace around a match is
    dropped, as number_to_word() does for plain numbers. Text containing no
    such time is returned unchanged.

    NOTE(review): the previous body was a debugging stub that returned the
    placeholders "match2" / "Match 3" instead of the text and emitted
    "blah"/"halb" for ranges. The formal 24-hour reading used here is an
    assumption about the intended output; confirm before relying on it.
    """
    # Ranges first, so the single-time pattern does not consume the end time.
    text = _TIME_RANGE_RE.sub(
        lambda m: _clock_to_thaiword(m.group(1), m.group(2))
        + "ถึง"
        + _clock_to_thaiword(m.group(3), m.group(4)),
        text,
    )
    return _TIME_SINGLE_RE.sub(
        lambda m: _clock_to_thaiword(m.group(1), m.group(2)),
        text,
    )
def second_split_sentence(sentences):
    """Split on single spaces and keep only the pieces that validate.

    Each piece that passes is_sentence_valid() is returned with all
    double-quote characters removed; pieces that fail are dropped.
    """
    # remove_all_quotes() leaves no '"' behind, so no further stripping is needed.
    return [
        remove_all_quotes(piece)
        for piece in sentences.split(" ")
        if is_sentence_valid(piece)
    ]
def pool_is_spelling_valid(text):
    """Return `text` if is_spelling_valid() accepts it, otherwise None."""
    return text if is_spelling_valid(text) else None
def main():
    """Extract valid sentences from an input file and write them to an output file.

    Usage: process_text.py INPUT OUTPUT

    Each line of INPUT is passed through split_sentence() in a worker pool.
    A sentence that passes is_sentence_valid() is kept as is; otherwise it is
    re-split with second_split_sentence() and the valid pieces are kept.
    The unique sentences are written to OUTPUT, one per line, in no
    particular order.
    """
    parser = argparse.ArgumentParser(
        description="Split text into sentences and keep the ones that pass validation."
    )
    parser.add_argument("input", help="Input text file, read as UTF-8")
    parser.add_argument("output", help="Output file, one sentence per line")
    args = parser.parse_args()

    # Explicit UTF-8: the platform default encoding may not be able to
    # decode Thai text.
    with open(args.input, "r", encoding="utf-8") as infile:
        lines = infile.read().split("\n")

    sentences = set()
    with Pool() as pool:
        for candidates in pool.imap_unordered(split_sentence, lines):
            for s in candidates:
                if is_sentence_valid(s):
                    sentences.add(s)
                    continue
                try:
                    sentences.update(second_split_sentence(s))
                except ValueError:
                    # Best effort: keep the sentence as is if re-splitting fails.
                    sentences.add(s)

    # TODO: optional spell-check pass, e.g. filtering `sentences` through
    # pool_is_spelling_valid().
    with open(args.output, "w", encoding="utf-8") as outfile:
        outfile.writelines(s + "\n" for s in sentences)


if __name__ == "__main__":
    main()