import string
from nltk import TweetTokenizer, PorterStemmer
from nltk.corpus import stopwords
from document import Document
from parsers import UrlParser, EmojiParser, HashtagParser, NumbersParser
from parsers.Covid19Parser import Covid19Parser
from parsers.SubdivisionParser import SubdivisionParser


class Parse:
    def __init__(self, should_stem=False):
        self.tokenizer = TweetTokenizer()
        # Every printable single character except the digits (letters,
        # punctuation and whitespace) is treated as a stop word.
        single_chars = list(string.printable.replace("0123456789", ""))
        internal_stop_words = ['..', '.', '—', '️', '...', '”', '“', '…', '‘', '', 'he\'s', 'she\'s', 'rt',
                               'tl;dr', 'you\'re', 'i\'m', 'you\'ll', 'i\'ll',
                               'she\'ll', 'https', 'http', 't.co', 'he\'ll', '’'] + single_chars
        self.stop_words = set(stopwords.words('english')) | set(internal_stop_words)
        self.subdivisionParser = SubdivisionParser()
        self.covid19Parser = Covid19Parser()
        self.porter = PorterStemmer()
        self.should_stem = should_stem

    def parse_sentence(self, original_text, stop_word=True):
        """
        Tokenizes the text, removes stop words and normalizes the remaining tokens.
        :param original_text: the text to parse
        :param stop_word: boolean; passed as False when most of the text consists of
                          stop words, so that they are kept instead of dropped
        :return: the text as a list of tokens
        """
        text = original_text
        text_tokens_without_stopwords = []
        url_result = UrlParser.url_func(text)
        url_exist = url_result[0]
        if url_exist:
            if url_result[1]:
                # URL fragments are added as stop words; note that they accumulate
                # on the instance and affect subsequent calls as well.
                self.stop_words.update(url_result[1])
            text = url_result[2]
text_tokens = self.tokenizer.tokenize(text)
num_of_words = len(text_tokens)
        i = 0
        while i < num_of_words:
            word = text_tokens[i]
            lower_word = word.lower()
            # SubdivisionParser and Covid19Parser may consume several tokens,
            # so they also return an updated index.
            is_find, i, translated_word = self.subdivisionParser.parse_word(lower_word, i, text_tokens)
            if is_find:
                text_tokens_without_stopwords.append(translated_word)
                i += 1
                continue
            if self.is_stop_word(lower_word) and stop_word:
                i += 1
                continue
            is_find, i, translated_word = self.covid19Parser.parse_word(lower_word, i, text_tokens)
            if is_find:
                text_tokens_without_stopwords.append(translated_word)
                i += 1
                continue
            # Plain lower-case word: optionally stem it, then keep it unless it is a stop word.
            if word[0].islower():
                if self.should_stem:
                    word = self.porter.stem(word)
                if stop_word:
                    self.add_to_terms(text_tokens_without_stopwords, word, lower_word)
                else:
                    text_tokens_without_stopwords.append(word)
                i += 1
                continue
            # Translate emojis into a list of words.
            emoji_parser_result = EmojiParser.parse_word(word)
            if emoji_parser_result[0]:
                text_tokens_without_stopwords += emoji_parser_result[1]
                i += 1
                continue
            if word[0] == '#':
                # Split the hashtag into its component words.
                new_words, _ = HashtagParser.parse_hashtag(word)
                text_tokens_without_stopwords += new_words
            elif word[0] == '@':
                # Keep @mentions as they are.
                text_tokens_without_stopwords.append(word)
            elif NumbersParser.is_number(word):
                # A number may continue over the following tokens;
                # collect_trailing_number returns the merged number and the advanced index.
                word, i = NumbersParser.collect_trailing_number(i, text_tokens, word, text)
                if i < num_of_words - 1:
                    next_word = text_tokens[i + 1]
                    next_word_lower = next_word.lower()
                    # Merge the number with a following percent sign, K/M/B notation or fraction.
                    if NumbersParser.is_percentage(next_word_lower):
                        text_tokens_without_stopwords.append(NumbersParser.to_numeric(word) + '%')
                        i += 1
                    elif NumbersParser.is_number_notation(next_word_lower):
                        text_tokens_without_stopwords.append(word + NumbersParser.kmb_sign(next_word_lower))
                        i += 1
                    elif NumbersParser.is_fraction(next_word):
                        text_tokens_without_stopwords.append(word + " " + next_word)
                        i += 1
else:
text_tokens_without_stopwords.append(NumbersParser.normalize_number(word))
else:
text_tokens_without_stopwords.append(NumbersParser.normalize_number(word))
            elif NumbersParser.is_number_with_notation(word):
                text_tokens_without_stopwords.append(word.upper())
            else:
                # Only upper-case words reach this point; append them as they are.
                text_tokens_without_stopwords.append(word)
i += 1
        # If the whole text consists of stop words, parse it again while keeping them;
        # stop_word=False guarantees the recursion happens at most once.
        if len(text_tokens_without_stopwords) == 0 and stop_word:
            return self.parse_sentence(original_text, stop_word=False)
        return text_tokens_without_stopwords

    def parse_doc(self, doc_as_list):
        """
        Takes a tweet document as a list and breaks it into its different fields.
        :param doc_as_list: list representing the tweet.
        :return: Document object with the corresponding fields.
        """
tweet_id = doc_as_list[0]
tweet_date = doc_as_list[1]
full_text = doc_as_list[2]
url = doc_as_list[3]
retweet_text = doc_as_list[4]
retweet_url = doc_as_list[5]
quote_text = doc_as_list[6]
quote_url = doc_as_list[7]
        tokenized_text = self.parse_sentence(full_text)
        doc_length = len(tokenized_text)  # length after all text operations.
        # Count the number of occurrences of each term in the tweet.
        term_dict = {}
        for term in tokenized_text:
            term_dict[term] = term_dict.get(term, 0) + 1
document = Document(tweet_id, tweet_date, full_text, url, retweet_text, retweet_url, quote_text,
quote_url, term_dict, doc_length)
return document

    def is_stop_word(self, lower_word):
        return lower_word in self.stop_words

    def add_to_terms(self, text_tokens_without_stopwords, word, lower_word):
        # Append the original-cased word only if its lower-cased form is not a stop word.
        if not self.is_stop_word(lower_word):
            text_tokens_without_stopwords.append(word)
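

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module. It assumes the local
    # `document` and `parsers` packages are importable and that the NLTK 'stopwords'
    # corpus has been downloaded (e.g. via nltk.download('stopwords')).
    # The sample tweet and document fields below are made up for illustration.
    parser = Parse(should_stem=False)
    tokens = parser.parse_sentence("Cases rose 35 percent this week! #StayHome https://t.co/abc")
    print(tokens)

    # parse_doc expects the tweet as a list:
    # [tweet_id, tweet_date, full_text, url, retweet_text, retweet_url, quote_text, quote_url]
    sample_doc = ["123", "2020-07-01", "Cases rose 35 percent this week! #StayHome",
                  None, None, None, None, None]
    doc = parser.parse_doc(sample_doc)
    print(doc)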