-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreduce_data.py
132 lines (90 loc) · 3.84 KB
/
reduce_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import re
import os
import csv
def readReplacements(file='./data/fb_news_comments_replacements.dat'):
    """Load the character replacement table stored in *file*.

    The file is read as UTF-8 text and interpreted as a flat sequence of
    character pairs: the character at every even offset is a source
    character and the character right after it is its replacement.  A NUL
    replacement is the on-disk marker for "delete this character" and is
    returned as the empty string.  A trailing unpaired character, if any,
    is ignored.

    Args:
        file: Path of the replacements file.

    Returns:
        A dict mapping each source character to its replacement (a single
        character, or ``''`` for deletion).  If a source character occurs
        more than once, the last pair wins.
    """
    # Use a distinct name for the handle so the ``file`` argument is not
    # shadowed inside the function.
    with open(file, 'r', encoding='utf-8') as handle:
        content = handle.read()

    # Strings slice directly; there is no need to explode them into a list.
    sources = content[::2]
    targets = content[1::2]
    return {
        source: '' if target == '\0' else target
        for source, target in zip(sources, targets)
    }
def saveReplacements(replacements, file='./data/fb_news_comments_replacements.dat'):
    """Write the replacement table to *file* as one flat run of characters.

    Entries are emitted in ascending key order, each as the key immediately
    followed by its value.  An empty string (key or value) is stored as a
    NUL character so that it still takes up a position in the output.
    The file is written as UTF-8 and any previous content is overwritten.
    """
    pieces = []
    for entry in sorted(replacements.items(), key=lambda pair: pair[0]):
        for text in entry:
            pieces.append('\0' if text == '' else text)

    with open(file, 'w', encoding='utf-8') as out:
        out.write(''.join(pieces))
def addReplacement(replacements, source, target):
    """Register ``source -> target`` in *replacements*, modifying it in place.

    Every entry whose current value equals *source* is redirected to
    *target* too, so replacements registered earlier that used to produce
    *source* now produce *target* directly.  Returns None.
    """
    replacements[source] = target
    # Only values of keys that already exist are rewritten, so the dict
    # never changes size while it is being walked.
    for key in replacements:
        if replacements[key] == source:
            replacements[key] = target
# replacements = readReplacements('./data/fb_news_comments_replacements.dat')
# addReplacement(replacements, '%', '')
# addReplacement(replacements, '(', '')
# addReplacement(replacements, ')', '')
# addReplacement(replacements, '%', '')
# addReplacement(replacements, '*', '')
# addReplacement(replacements, ':', '')
# addReplacement(replacements, ',', '')
# addReplacement(replacements, ':', '')
# addReplacement(replacements, ';', '')
# addReplacement(replacements, '=', '')
# addReplacement(replacements, '|', '')
# addReplacement(replacements, '^', '')
# addReplacement(replacements, '_', '')
# for c in list('ABCDEFGHIJKLMONPQRSTUVWXYZ'):
# addReplacement(replacements, c, c.lower())
# saveReplacements(replacements)
# Read the whole raw comment dump into memory as UTF-8 text.
# Plain 'r' is used on purpose: text mode already performs universal-newline
# translation, and the legacy 'U' flag was removed in Python 3.11, where
# open(..., 'rU') raises ValueError.
with open('./data/fb_news_comments_1000K.csv', 'r', encoding="utf-8") as file:
    content = file.read()

print("Fixing CSV file")
def removeComment(c):
    """Tell whether character *c* lies in one of the rejected code-point ranges.

    Returns True when ``ord(c)`` falls inside any of the inclusive ranges
    192-696, 902-8207 or 12354-127487, and False otherwise.
    """
    code_point = ord(c)
    rejected_ranges = ((192, 696), (902, 8207), (12354, 127487))
    return any(low <= code_point <= high for low, high in rejected_ranges)
# Wrap unquoted multi-line fields in double quotes so the csv module can
# parse them as a single field.
#   group 1: a timestamp (YYYY-MM-DDThh:mm:ss+zzzz), a numeric field and one
#            more field free of quotes and commas, each followed by a comma;
#   group 2: one or more lines containing neither quotes nor commas,
#            followed by a run of non-comma characters.
# No flags are passed: the pattern contains no '^'/'$' anchors and no '.',
# so MULTILINE and DOTALL had no effect, and passing count/flags
# positionally to re.sub is deprecated since Python 3.13.
content = re.sub(
    r'([0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}\+[0-9]{4},[0-9]+,[^",]+,)(([^\n",]*\n)+[^,]*)',
    lambda match: f'{match.group(1)}"{match.group(2)}"',
    content,
)

# Persist the repaired text to a temporary file for the CSV parsing step.
with open('./data/fb_news_comments_1000K_tmp.csv', 'w', encoding="utf-8") as file:
    file.write(content)

print("CSV file fixed")
# Parse the repaired temporary CSV and keep only the fourth column of each
# row.  Plain 'r' replaces the legacy 'rU' mode, which raises ValueError on
# Python 3.11+; text mode already translates newlines.
with open('./data/fb_news_comments_1000K_tmp.csv', 'r', encoding="utf-8") as file:
    reader = csv.reader(file, dialect=csv.unix_dialect)
    next(reader)  # skip the first row (presumably the header - confirm)
    # The reader is lazy, so the rows must be consumed while the file is open.
    comments = [row[3] for row in reader]

# The temporary file is no longer needed once its rows are in memory.
os.remove('./data/fb_news_comments_1000K_tmp.csv')

print("Initial comments: " + str(len(comments)))
# Keep comments longer than 100 characters that contain no character from
# the code-point ranges rejected by removeComment().
comments = [
    comment
    for comment in comments
    if len(comment) > 100 and not any(removeComment(x) for x in comment)
]
print("Comments after removing: " + str(len(comments)))
# c = ''.join(sorted(list(set([char for comment in comments for char in comment]))))
# with open('./chars.txt', 'w', encoding="utf-8") as file:
# for ch in c:
# file.write(ch.ljust(10) + str(ord(ch)).ljust(10) + '\n')
# exit()
print("Loading replacements")
replacements = readReplacements()
# Extra replacements applied on top of the stored table: a few punctuation
# characters become spaces, others are dropped entirely.
addReplacement(replacements, '=', ' ')
addReplacement(replacements, '*', ' ')
addReplacement(replacements, '|', ' ')
addReplacement(replacements, '_', ' ')
addReplacement(replacements, '^', '')
addReplacement(replacements, chr(10084), '')   # U+2764
addReplacement(replacements, chr(128577), '')  # U+1F641
addReplacement(replacements, chr(128578), '')  # U+1F642

total = len(comments)
print("Starting normalizing comments")

# Build the translation table once so every comment is rewritten in a single
# str.translate pass instead of a per-character Python loop.  Only
# single-character keys can ever match an individual character, so any other
# key is skipped (str.translate needs ordinal keys).
translation = {
    ord(source): target
    for source, target in replacements.items()
    if len(source) == 1
}
# Compiled once outside the loop.  No flags: MULTILINE has no effect on a
# pattern without anchors, and positional count/flags arguments to re.sub are
# deprecated since Python 3.13.
whitespace_run = re.compile(r'\s+')

fixedComments = []
for j, comment in enumerate(comments):
    # Apply the replacements, then drop any NUL characters that remain.
    comment = comment.translate(translation).replace('\0', '')
    # Collapse every whitespace run to one space and trim both ends.
    comment = whitespace_run.sub(' ', comment).strip()
    if comment:
        fixedComments.append(comment)
    # Progress report every 1000 processed comments, kept or not.
    if j % 1000 == 0:
        print(f'{j} / {total} ({j / total * 100}%)')

# Write the surviving comments, one per line.
comments = '\n'.join(fixedComments)
with open('./data/fb_news_comments.txt', 'w', encoding='utf-8') as file:
    file.write(comments)