tokenizer.py
import configparser

import spacy

config = configparser.ConfigParser()
config.read("./config/config.ini")


class Spacy_Tokenizer:
"""
The main tokenization function
"""
def __init__(self) -> None:
self.nlp_lang_model = spacy.load(config["SPACY_PARAM"]["LANG_MODEL"])
    @staticmethod
    def clean_word(word: str) -> str:
        """
        Normalize a single token.
        The rules below are based on a visual inspection of the tokenizer output.
        """
        # Drop tokens that consist of a single dash or apostrophe.
        if word in ["-", "’", "–", "−"]:
            return ""
        word = word.lower()
        # Strip hyphens, dashes and stray quotation marks inside the word.
        word = word.replace("-", "").replace("–", "").replace("”", "")
        if len(word) > 0:
            # If the word ends with an apostrophe, replace every apostrophe with "e".
            if (word[-1] == "’") or (word[-1] == "'"):
                word = word.replace("’", "e")
                word = word.replace("'", "e")
        return word
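
    # Illustrative examples of clean_word (inputs are hypothetical, outputs follow
    # directly from the rules above):
    #   clean_word("–")        -> ""        (standalone dash is dropped)
    #   clean_word("Covid-19") -> "covid19" (lowercased, hyphen removed)
    #   clean_word("l’")       -> "le"      (trailing apostrophe replaced with "e")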

    def _concatenate_article_text(self, article_data):
        """
        Concatenate the title, description and content into a single string.
        """
        return (
            article_data["article_content"]["title"]
            + " "
            + article_data["article_content"]["description"]
            + " "
            + article_data["article_content"]["content"]
        )

    def __call__(self, article_data):
        """
        Main entry point: build a single string for the whole article
        (title + description + content) and tokenize it.
        """
        article_main_text = self._concatenate_article_text(article_data=article_data)
        (
            _,
            all_words,
            verbs,
            nouns,
            adverbs,
            entities,
        ) = self.get_words_from_article(article_main_text)
        return all_words, verbs, nouns, adverbs, entities

    def tokenize_article(self, article_data):
        """
        Tokenize each field of a single article (title, description, content)
        in place and return the updated article; this eases the later analysis.
        """
        article_data["article_content"]["title"], *_ = self.get_words_from_article(
            article_data["article_content"]["title"]
        )
        # print(f'Tokenized title: {article_data["article_content"]["title"]}')
        (
            article_data["article_content"]["description"],
            *_,
        ) = self.get_words_from_article(article_data["article_content"]["description"])
        # print(f'Tokenized description: {article_data["article_content"]["description"]}')
        article_data["article_content"]["content"], *_ = self.get_words_from_article(
            article_data["article_content"]["content"]
        )
        # print(f'Tokenized content: {article_data["article_content"]["content"]}')
        return article_data

    def get_words_from_article(self, article_text):
        """
        Extract the tokens from the article text and count them by part of speech.
        """
        doc = self.nlp_lang_model(article_text)
        words = {}
        verbs = {}
        nouns = {}
        adverbs = {}
        entities = {}
        tokens_sequence = []
        # Count named entities as {entity_label: {entity_text: count}}.
        for entity in doc.ents:
            try:
                entities[entity.label_][entity.text.lower()] += 1
            except KeyError:
                try:
                    entities[entity.label_][entity.text.lower()] = 1
                except KeyError:
                    entities[entity.label_] = {entity.text.lower(): 1}
        for token in doc:
            # Skip uninformative parts of speech.
            if token.pos_ in ["SPACE", "PUNCT", "NUM", "SYM", "ADP", "DET"]:
                continue
            # Perform extra cleaning on the lemmatized word.
            final_token = self.clean_word(token.lemma_)
            if len(final_token) == 0:
                continue
            tokens_sequence.append(final_token)
            if token.pos_ in ["VERB"]:
                try:
                    verbs[final_token] += 1
                except KeyError:
                    verbs[final_token] = 1
            if token.pos_ in ["NOUN"]:
                try:
                    nouns[final_token] += 1
                except KeyError:
                    nouns[final_token] = 1
            if token.pos_ in ["ADV"]:
                try:
                    adverbs[final_token] += 1
                except KeyError:
                    adverbs[final_token] = 1
            try:
                words[final_token] += 1
            except KeyError:
                words[final_token] = 1
            # all_words.append(final_token)
        # return words, " ".join(all_words)
        tokens_sequence = " ".join(tokens_sequence)
        return tokens_sequence, words, verbs, nouns, adverbs, entities
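

# Minimal usage sketch (not part of the original module): the article below is a
# hypothetical example of the structure the tokenizer expects, and running it
# requires config["SPACY_PARAM"]["LANG_MODEL"] to name a spaCy model that is
# actually installed.
if __name__ == "__main__":
    tokenizer = Spacy_Tokenizer()
    # Hypothetical article following the nested structure read by the class.
    article = {
        "article_content": {
            "title": "Storm hits the coast",
            "description": "Heavy winds and rain expected overnight.",
            "content": "Authorities asked residents to stay indoors.",
        }
    }
    all_words, verbs, nouns, adverbs, entities = tokenizer(article)
    print(all_words)  # word -> count over the whole article
    print(entities)   # entity label -> {entity text -> count}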