translation_stemming.py
from nltk.corpus import stopwords
import nltk, string
import pandas as pd
import goslate
import re
from collections import Counter
import csv
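#NB: this script appears to target Python 2 (str/unicode encode-decode, csv files opened in 'wb' mode)
#the NLTK data used below must be downloaded once; a minimal sketch:
#nltk.download('stopwords')   #needed by stopwords.words('english')
#nltk.download('punkt')       #needed by nltk.word_tokenize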
#load the data
#the code that translates and normalises the Japanese tweets also works on Spanish;
#to adapt it, change the variable names (see the commented-out _es lines)
nazca_tweets_ja = pd.read_csv('tweets_jap.csv', header=0)
#nazca_tweets_es = pd.read_csv('tweets_es.csv', header=0)
nazca_tweets_eng = pd.read_csv('tweets_eng.csv', header=0)
#load goslate API
gs = goslate.Goslate()
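#goslate issues a live request to the Google Translate service on each call;
#a hypothetical example (actual output depends on the service):
#gs.translate(u'こんにちは', 'en')  #-> u'Hello'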
contents_ja = nazca_tweets_ja['content']
#contents_es = nazca_tweets_es['content']
contents_eng = nazca_tweets_eng['content']
#translate tweets' content
translated_tweets_ja = []
for content in contents_ja:
    translated = gs.translate(content, 'en')
    translated_tweets_ja.append(translated)
#encode the translations to UTF-8 to prevent errors when writing the csv files
translated_encoded_ja = []
for i in translated_tweets_ja:
    enc = i.encode('utf-8')
    translated_encoded_ja.append(enc)
#for english
content_encoded = []
for i in contents_eng:
    enc = str(i)
    content_encoded.append(enc)
nazca_tweets_ja['translated_tweets'] = translated_encoded_ja
#apply stemming
stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]
#normalize() is used below on the Japanese tweets; normalize_eng() is unused
def normalize(text):
    return stem_tokens(nltk.word_tokenize(text.translate(remove_punctuation_map)))
def normalize_eng(text):
    return stem_tokens(nltk.word_tokenize(text))
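#a quick illustration of the helpers above, on hypothetical input:
#normalize(u'translated tweets, stemmed!')  #-> [u'translat', u'tweet', u'stem']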
#the stopwords list is applied after translation, since NLTK has no equivalent Japanese list
stops = set(stopwords.words('english'))
def apply_stopwords(text_list):
    cleaned_content = []
    for text in text_list:
        new_text = []
        for w in text.split():
            if w not in stops:
                new_text.append(w)
        new_string = " ".join(new_text)
        cleaned_content.append(new_string)
    return cleaned_content
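#for example (hypothetical input):
#apply_stopwords(['this is a tweet about the nazca lines'])  #-> ['tweet nazca lines']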
#apply stopwords list
stopped_text_enc = apply_stopwords(translated_encoded_ja)
stopped_text = apply_stopwords(translated_tweets_ja)
nazca_tweets_ja['stopped_tweets'] = stopped_text_enc
stopped_text_eng1 = apply_stopwords(contents_eng)
stopped_text_eng2 = apply_stopwords(content_encoded)
#normalise texts
stopped_tweets_ja = nazca_tweets_ja['stopped_tweets']
#text in lower case
lowered_text = []
for i in stopped_text_enc:
    decoded_text = i.lower().decode('utf-8')
    lowered_text.append(decoded_text)
#stem text
normalised_ja = []
for tweet in lowered_text:
    stemmed_tweets = normalize(tweet)
    normalised_ja.append(stemmed_tweets)
#nltk.word_tokenize does not work on the English text, so punctuation is stripped manually instead
exclude = set(string.punctuation)
new_text = []
for i in stopped_text_eng2:
    s = ''.join(ch for ch in i if ch not in exclude)
    decoded_s = s.lower().decode('utf-8')
    new_text.append(decoded_s)
#stemming
nlist2 = []
for element in new_text:
    nlist2.append(re.split(r' ', element))
normalised_eng = []
for i in nlist2:
    stemmed_tweets = stem_tokens(i)
    normalised_eng.append(stemmed_tweets)
#add columns to dataframes
nazca_tweets_eng['normalised_tweets'] = normalised_eng
nazca_tweets_eng['stopwords_tweets'] = stopped_text_eng1
nazca_tweets_ja['normalised_tweets'] = normalised_ja
#write files
nazca_tweets_eng.to_csv('nazca_tweets_en.csv')
nazca_tweets_ja.to_csv('nazca_tweets_ja.csv')
#word frequencies
#texts have to be encoded back to UTF-8, otherwise the results cannot be written to csv
new_normalised_eng = []
for i in normalised_eng:
    new_tweet = []
    for j in i:
        enc_string = j.encode('utf-8')
        new_tweet.append(enc_string)
    new_normalised_eng.append(new_tweet)
eng_freq = Counter(word for sublist in new_normalised_eng for word in sublist)
#japanese
new_normalised_ja = []
for i in normalised_ja:
    new_tweet = []
    for j in i:
        enc_string = j.encode('utf-8')
        new_tweet.append(enc_string)
    new_normalised_ja.append(new_tweet)
ja_freq = Counter(word for sublist in new_normalised_ja for word in sublist)
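#Counter flattens the list of token lists into per-word counts, e.g.:
#Counter(w for sub in [['nazca', 'line'], ['nazca']] for w in sub)  #-> Counter({'nazca': 2, 'line': 1})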
#write the frequencies to files
writer = csv.writer(open('eng_freq.csv', 'wb'))
for key, value in eng_freq.items():
    writer.writerow([key, value])
writer = csv.writer(open('ja_freq.csv', 'wb'))
for key, value in ja_freq.items():
    writer.writerow([key, value])
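#an equivalent sketch using a with-block, so the file handle is closed explicitly:
#with open('eng_freq.csv', 'wb') as f:
#    csv.writer(f).writerows(eng_freq.items())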