-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract_metadata.py
More file actions
65 lines (52 loc) · 2.01 KB
/
extract_metadata.py
File metadata and controls
65 lines (52 loc) · 2.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
"""
I want to extract the following:
1. The date
2. The topic/category to which this article belongs
Those two I will extract mainly from the URL.
"""
import json
import os
# Directory whose files main() lists and loads as article JSON.
SRC_DIR = "./data_3"
# Directory where main() writes the enriched articles and the topic counts.
TGT_DIR = "./data_4"
def main(src_dir=None, tgt_dir=None):
    """Annotate each article JSON file with its topic and publication date.

    Every file in the source directory, except the bookkeeping files
    ``all_links_recorded.json`` and ``all_topic_collected.json``, is loaded
    as JSON. Its ``"article_link"`` URL is split on ``"/"`` and two keys are
    added to the article:

    * ``"topic_category"``: segment 3 of the split URL.
    * ``"publication_date"``: a dict with ``"year"``, ``"month"`` and
      ``"day"`` taken from segments 5, 6 and 7.

    The enriched article is written under the same file name in the target
    directory. The number of articles seen per topic is written to
    ``all_topic_collected.json`` in the target directory and printed.

    An article whose URL has a topic segment but no full date is counted
    towards its topic and then skipped (nothing is written for it). An
    article whose URL is too short to even contain a topic segment is
    skipped without being counted.

    Args:
        src_dir: Directory to read articles from. Defaults to the
            module-level ``SRC_DIR``.
        tgt_dir: Directory to write results to; created if missing.
            Defaults to the module-level ``TGT_DIR``.

    Raises:
        KeyError: If an article file has no ``"article_link"`` key.
        json.JSONDecodeError: If an article file is not valid JSON.
    """
    # Resolve the defaults at call time so the module constants stay the
    # single source of truth for a plain ``main()`` call.
    src_dir = SRC_DIR if src_dir is None else src_dir
    tgt_dir = TGT_DIR if tgt_dir is None else tgt_dir
    bookkeeping_files = {"all_links_recorded.json", "all_topic_collected.json"}

    os.makedirs(tgt_dir, exist_ok=True)

    all_topics = {}
    for file_index, file_name in enumerate(os.listdir(src_dir)):
        if file_name in bookkeeping_files:
            continue
        print(file_index, " -->> ", file_name)

        src_path = os.path.join(src_dir, file_name)
        with open(src_path, "r", encoding="utf-8") as src_file:
            article_data = json.load(src_file)

        url_parts = article_data["article_link"].split("/")
        if len(url_parts) <= 3:
            # No topic segment at all: not an article URL.
            print("not a good article")
            continue

        topic = url_parts[3]
        all_topics[topic] = all_topics.get(topic, 0) + 1
        article_data["topic_category"] = topic

        # In rare cases the page is not really an article, so its URL does
        # not carry the year/month/day segments of the normal schema.
        date_parts = url_parts[5:8]
        if len(date_parts) < 3:
            print("not a good article")
            continue
        year, month, day = date_parts
        article_data["publication_date"] = {
            "year": year,
            "month": month,
            "day": day,
        }

        tgt_path = os.path.join(tgt_dir, file_name)
        with open(tgt_path, "w", encoding="utf-8") as tgt_file:
            json.dump(article_data, tgt_file)

    topics_path = os.path.join(tgt_dir, "all_topic_collected.json")
    with open(topics_path, "w", encoding="utf-8") as topics_file:
        json.dump(all_topics, topics_file)

    print(f"Nb of unique topics: {len(all_topics)}")
    for topic, count in all_topics.items():
        print(f"{topic} -->> {count}")
# Run the extraction only when this file is executed as a script.
if __name__ == "__main__":
    main()