-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathword_vectorization.py
More file actions
56 lines (42 loc) · 1.5 KB
/
word_vectorization.py
File metadata and controls
56 lines (42 loc) · 1.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
"""
Using different word vectorization technicques, what can tell about our documents?
1. What are the important words?
2. What the cluster of topics?
"""
import configparser
from sklearn.feature_extraction.text import TfidfVectorizer
from tokenization import apply_filter_value
import numpy as np
from collections.abc import Mapping, Sequence
import os
from typing import List
import json
config = configparser.ConfigParser()
config.read("./config/config.ini")
FILTERS = dict(config["FILTERS"])
def load_articles(directory: str) -> List[str]:
    """Load article body text from the JSON files in *directory*.

    Every file in the directory except the two bookkeeping files is
    parsed as JSON; articles that pass the module-level FILTERS have
    their content string collected.

    Args:
        directory: Path of the folder holding one JSON file per article.

    Returns:
        List of raw article content strings for every article that
        passed the filter.  (The original annotation said ``List[dict]``,
        but the function has always collected strings.)
    """
    # Non-article bookkeeping files that live alongside the dumps.
    skip = {"all_links_recorded.json", "all_topic_collected.json"}
    texts: List[str] = []
    for file_name in os.listdir(directory):
        if file_name in skip:
            continue
        # Context manager guarantees the handle is closed (the original
        # leaked file handles via json.load(open(...))).  JSON is UTF-8
        # by spec, so don't rely on the platform default encoding.
        with open(os.path.join(directory, file_name), "r", encoding="utf-8") as fh:
            article_data = json.load(fh)
        if apply_filter_value(article_data, filters=FILTERS):
            texts.append(article_data["article_content"]["content"])
    # len(texts) replaces the redundant hand-maintained counter.
    print(f"All articles: {len(texts)}")
    return texts
def tf_idf_transformation(documents: List[str]):
    """Vectorize *documents* with TF-IDF.

    Fix: the original computed the matrix and then fell through to
    ``pass``, returning ``None`` despite its ``-> np.ndarray``
    annotation.  The fitted result is now returned.

    Args:
        documents: Raw text documents to vectorize.

    Returns:
        scipy sparse matrix of shape (n_documents, n_features) holding
        TF-IDF weights; call ``.toarray()`` if a dense ``np.ndarray``
        is needed.
    """
    vectorizer = TfidfVectorizer()
    return vectorizer.fit_transform(documents)
def clustering(vectorized_documents: np.ndarray, documents_id: List[str]):
    """Cluster documents by their vector representations.

    TODO: not implemented yet — placeholder for the topic-clustering
    step (question 2 in the module docstring).

    Args:
        vectorized_documents: Document-term weight matrix.
        documents_id: Identifier for each row of the matrix.
    """
    pass
def main():
    """Entry point: load every filtered article from the source directory."""
    source_dir = config["DIR_PATH"]["SRC_DIR"]
    load_articles(directory=source_dir)


if __name__ == "__main__":
    main()