-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathword_vectorization.py
More file actions
56 lines (42 loc) · 1.5 KB
/
word_vectorization.py
File metadata and controls
56 lines (42 loc) · 1.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
"""
Using different word vectorization technicques, what can tell about our documents?
1. What are the important words?
2. What the cluster of topics?
"""
import configparser
from sklearn.feature_extraction.text import TfidfVectorizer
from tokenization import apply_filter_value
import numpy as np
from collections.abc import Mapping, Sequence
import os
from typing import List
import json
config = configparser.ConfigParser()
config.read("./config/config.ini")
FILTERS = dict(config["FILTERS"])
def load_articles(directory: str) -> List[str]:
    """Load article body text from the JSON files in *directory*.

    Every file in the directory except the two bookkeeping files is
    parsed as JSON; articles that pass the module-level FILTERS have
    their content string collected.

    Args:
        directory: Path of the folder holding one JSON file per article.

    Returns:
        List of raw article content strings for every article that
        passed the filter.  (The original annotation said ``List[dict]``,
        but the function has always collected strings.)
    """
    # Non-article bookkeeping files that live alongside the dumps.
    skip = {"all_links_recorded.json", "all_topic_collected.json"}
    texts: List[str] = []
    for file_name in os.listdir(directory):
        if file_name in skip:
            continue
        # Context manager guarantees the handle is closed (the original
        # leaked file handles via json.load(open(...))).  JSON is UTF-8
        # by spec, so don't rely on the platform default encoding.
        with open(os.path.join(directory, file_name), "r", encoding="utf-8") as fh:
            article_data = json.load(fh)
        if apply_filter_value(article_data, filters=FILTERS):
            texts.append(article_data["article_content"]["content"])
    # len(texts) replaces the redundant hand-maintained counter.
    print(f"All articles: {len(texts)}")
    return texts
def tf_idf_transformation(documents: List[str]):
    """Vectorize *documents* with TF-IDF.

    Fix: the original computed the matrix and then fell through to
    ``pass``, returning ``None`` despite its ``-> np.ndarray``
    annotation.  The fitted result is now returned.

    Args:
        documents: Raw text documents to vectorize.

    Returns:
        scipy sparse matrix of shape (n_documents, n_features) holding
        TF-IDF weights; call ``.toarray()`` if a dense ``np.ndarray``
        is needed.
    """
    vectorizer = TfidfVectorizer()
    return vectorizer.fit_transform(documents)
def clustering(vectorized_documents: np.ndarray, documents_id: List[str]):
    """Cluster documents by their vector representations.

    TODO: not implemented yet — placeholder for the topic-clustering
    step (question 2 in the module docstring).

    Args:
        vectorized_documents: Document-term weight matrix.
        documents_id: Identifier for each row of the matrix.
    """
    pass
def main():
    """Entry point: load every filtered article from the source directory."""
    source_dir = config["DIR_PATH"]["SRC_DIR"]
    load_articles(directory=source_dir)


if __name__ == "__main__":
    main()