# -*- coding: utf-8 -*-
"""
Created on Wed Sep 28 00:10:54 2022
@author: aq75iwit
"""
'''
Topic Modeling with LDA
References:
[1] LDA with Gensim: https://radimrehurek.com/gensim/models/ldamodel.html
'''
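# Pipeline overview (added summary of the steps below):
#   1. Load the pre-processed transcripts and keep the 2007-2012 training slice.
#   2. Lemmatize, remove stop words and tokenize every comment.
#   3. Build a Gensim dictionary and stream a bag-of-words corpus from it.
#   4. Train LdaMulticore models for several topic counts and save each one.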
# Import dependencies
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
import spacy
import warnings
import pandas as pd
import numpy as np
warnings.filterwarnings("ignore", category=DeprecationWarning)
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
stops = nlp.Defaults.stop_words
# Local data directory and the pre-processed transcript sample
PATH = 'D:/01_Diss_Data/'
sample = pd.read_pickle(PATH + 'data_transcripts_28_09_22.pkl')

# Restrict the training sample (train_1) to the years 2007-2012
dates_train_1 = ['2007', '2008', '2009', '2010', '2011', '2012']
train_1 = sample[sample['year'].isin(dates_train_1)].reset_index()
def prepare(data):
    """Collect the pre-processed transcripts from the input DataFrame."""
    docs = []
    for d in data['general_prepro']:
        docs.append(d)
    return docs
def normalize(docs, remove_stopwords):
    """Lemmatize every comment in every transcript; optionally drop stop words."""
    new_dat = []
    for trans in docs:
        new_trans = []
        for comment in trans:
            comment = nlp(comment)
            lemmatized = []
            for word in comment:
                lemma = word.lemma_.strip()
                if lemma:
                    if not remove_stopwords or lemma not in stops:
                        lemmatized.append(lemma)
            new_trans.append(" ".join(lemmatized))
        new_dat.append(new_trans)
    return new_dat
def tokenize(docs):
    """Tokenize each lemmatized comment with Gensim's simple_preprocess."""
    tokenized_docs = []
    for doc in docs:
        temp = []
        for d in doc:
            tokens = simple_preprocess(d, deacc=True)
            temp.append(tokens)
        tokenized_docs.append(temp)
    return tokenized_docs
def pre_dict(data):
    """Flatten the nested transcripts into a single list of token lists."""
    flat = []
    for t in data:
        for e in t:
            flat.append(e)
    return flat
# Pre-process input: stop-word removal, lemmatization and tokenization
docs = prepare(train_1)
lemmatized_docs = normalize(docs, remove_stopwords=True)
tokenized_docs = tokenize(lemmatized_docs)
corp = pre_dict(tokenized_docs)
# Mapping from word IDs to words
id2word = corpora.Dictionary(corp)

class MyCorpus(object):
    """Stream the bag-of-words corpus instead of materializing it in memory."""
    def __iter__(self):
        for text in corp:
            yield id2word.doc2bow(text)

corpus = MyCorpus()
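# Illustrative check (added, not part of the original pipeline): doc2bow turns a
# token list into a sparse bag-of-words vector of (token_id, count) pairs, which
# is the representation streamed to the LDA trainer below.
if len(corp) > 0:
    print('Example bag-of-words vector:', id2word.doc2bow(corp[0])[:10])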
num_top = [5, 10, 15, 20, 25, 30, 35, 40, 45]
for n in num_top:
    # Fit LDA model: see [1] for more details
    topic_model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                                           num_topics=n,
                                                           id2word=id2word,
                                                           workers=4,
                                                           chunksize=1000,
                                                           passes=25,
                                                           alpha=0.5,
                                                           iterations=1000,
                                                           random_state=4,
                                                           dtype=np.float64)
    topic_model.save(PATH + 'LDA_gen_' + str(n) + '.model')
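# Illustrative sketch (added, assumes the models saved above exist on disk):
# reload one of the trained models and inspect its topics. LdaMulticore.load,
# print_topics and get_document_topics are standard Gensim calls.
loaded_model = gensim.models.ldamulticore.LdaMulticore.load(PATH + 'LDA_gen_10.model')
for topic_id, top_words in loaded_model.print_topics(num_topics=10, num_words=10):
    print(topic_id, top_words)
# Topic distribution of a single document (here simply the first token list):
print(loaded_model.get_document_topics(id2word.doc2bow(corp[0])))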