-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutilities.py
More file actions
137 lines (110 loc) · 4.22 KB
/
utilities.py
File metadata and controls
137 lines (110 loc) · 4.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
"""
Utilities needed for a cleaner refactoring of the code
"""
import configurations
import logging
import os
import tqdm
import json
# Project configuration, loaded once when this module is imported.
# Read below for the data directories (CONFIG["DIR_PATH"]) and the
# file names to skip (CONFIG["FILES_TO_EXCLUDE"]["FILES"]).
CONFIG = configurations.get_config_file()
class DataLoader:
    """
    Context manager that streams the JSON articles stored in a data directory.

    The directory is ``CONFIG["DIR_PATH"]["TOKENIZED_DIR"]`` when
    ``use_tokenized`` is true and ``CONFIG["DIR_PATH"]["SRC_DIR"]`` otherwise.
    File names listed in ``CONFIG["FILES_TO_EXCLUDE"]["FILES"]`` are skipped.

    Usage:
        with DataLoader(limit=10) as articles:
            for article_data in articles:
                ...
    """

    def __init__(self, limit: int = -1, use_tokenized=False) -> None:
        """
        limit: maximum number of files to load; -1 (the default) loads all.
        use_tokenized: read from the tokenized directory instead of the
            source directory.
        """
        self.use_tokenized = use_tokenized
        directory_key = "TOKENIZED_DIR" if use_tokenized else "SRC_DIR"
        self.src_data_path = CONFIG["DIR_PATH"][directory_key]
        self.all_file_names = os.listdir(self.src_data_path)
        self.limit = limit
        self._get_clean_file_names()

    def __enter__(self):
        """Return a lazy iterator over the parsed articles."""
        print("Entered the DataLoader")
        return self._load_files_iterator()

    def __exit__(self, exc_type, exc_value, exc_traceback):
        """
        Release the loader state when the ``with`` block ends.

        Ordinary exceptions raised inside the block are still suppressed
        (existing callers rely on that best-effort behaviour), but they are
        now logged instead of vanishing silently. Exits that are not
        ``Exception`` subclasses (KeyboardInterrupt, SystemExit, ...) are
        never suppressed, so the user can still interrupt a long run.
        """
        del self.src_data_path
        del self.use_tokenized
        del self.all_file_names
        del self.limit
        del self.clean_file_names
        print("Exited the DataLoader")
        if exc_type is None:
            return False
        if not issubclass(exc_type, Exception):
            return False
        logging.error(
            "DataLoader block aborted by an exception",
            exc_info=(exc_type, exc_value, exc_traceback),
        )
        return True

    def _load_files_iterator(self):
        """
        Yield the parsed JSON content of every selected file, one at a time.

        Files that cannot be decoded are treated as corrupt: they are removed
        from disk and skipped. Files that cannot be opened or read are
        skipped without being removed.
        """
        for file_name in tqdm.tqdm(
            self.clean_file_names, desc="Processing files", colour="green"
        ):
            file_path = f"{self.src_data_path}/{file_name}"
            try:
                # The context manager guarantees the handle is closed.
                with open(file_path, "r") as file_handle:
                    article_data = json.load(file_handle)
            except ValueError:
                # json.JSONDecodeError and UnicodeDecodeError are both
                # ValueError subclasses: the content itself is corrupt.
                logging.warning("Removing corrupt file %s", file_path)
                os.remove(file_path)
                continue
            except OSError as error:
                # Unreadable (missing, a directory, permissions): not proof
                # of corruption, so the file is left in place.
                logging.warning("Skipping unreadable file %s: %s", file_path, error)
                continue
            yield article_data

    def _get_clean_file_names(self):
        """
        Fill ``self.clean_file_names`` with the non-excluded file names,
        keeping at most ``self.limit`` of them (no cap when limit is -1).
        """
        if self.limit == -1:
            print("All files will be loaded")
        else:
            print(f"{self.limit} files ONLY will be loaded")
        # Parsed once, as a set, instead of once per directory entry.
        files_to_exclude = set(
            parse_single_filter_list(CONFIG["FILES_TO_EXCLUDE"]["FILES"])
        )
        self.clean_file_names = []
        for file_name in self.all_file_names:
            # ">=" so that exactly `limit` names are kept, not limit + 1.
            if self.limit != -1 and len(self.clean_file_names) >= self.limit:
                break
            if file_name not in files_to_exclude:
                self.clean_file_names.append(file_name)
def parse_single_filter_list(value):
    """
    Split a comma-separated string of options into a list.

    A single trailing empty item (produced by a trailing comma, or by an
    empty input string) is dropped; every other item is returned untouched.
    """
    parts = value.split(",")
    return parts[:-1] if parts[-1] == "" else parts
def _allowed_filter_values(filters, filter_name):
    """Return the accepted values of one filter as a set of strings, or None when it is inactive."""
    raw_value = filters.get(filter_name)
    if not raw_value:
        return None
    if isinstance(raw_value, str):
        values = parse_single_filter_list(raw_value)
    else:
        # Already-split collection (list, tuple, set, ...).
        values = raw_value
    return {str(item) for item in values}


def apply_filter_value(article_data: dict, filters: dict):
    """
    Tell whether an article satisfies every active filter.

    article_data: article dictionary. An active date filter reads
        ``article_data["publication_date"]`` (keys ``year``, ``day``,
        ``month``); an active topic filter reads
        ``article_data["topic_category"]``.
    filters: maps a filter name (``years``, ``days``, ``months``, ``topics``)
        to the accepted values, given either as a comma-separated string or
        as an already-split collection. A missing or empty entry disables
        that filter. A ``words`` entry is accepted but currently ignored: no
        word filtering is applied.

    Returns True when the article passes all active filters, else False.
    Raises KeyError when a field required by an active filter is missing.
    """
    date_filters = (("years", "year"), ("days", "day"), ("months", "month"))
    for filter_name, date_field in date_filters:
        allowed = _allowed_filter_values(filters, filter_name)
        if allowed is None:
            continue
        # Compare as strings: parsed filter values are always str, while the
        # article field may hold a number.
        if str(article_data["publication_date"][date_field]) not in allowed:
            return False

    allowed_topics = _allowed_filter_values(filters, "topics")
    if allowed_topics is not None:
        if str(article_data["topic_category"]) not in allowed_topics:
            return False
    return True
def get_existing_links():
    """
    Collect the ``article_link`` of every article in the current data directory.

    Needed to recover from a failed connection.
    """
    collected_links = []
    with DataLoader() as articles:
        # Append one by one so the links gathered so far are kept even if
        # the loader ends the block early.
        for article in articles:
            collected_links.append(article["article_link"])
    return collected_links
def check_file_for_content(file_contents: dict):
    """
    Report whether an article file carries any content.

    Damaged or corrupted files can result from server problems, connection
    problems, an interrupted scraping run, etc.

    Returns False when ``file_contents["article_content"]["content"]`` is
    empty (corrupted content) and True otherwise (good content).
    STATUS: NOT-TESTED
    """
    content = file_contents["article_content"]["content"]
    return len(content) != 0