---
title: "Searching the Super Topic Model - Update"
author: "Andreas Blaette, Christoph Leonhardt"
date: "`r Sys.Date()`"
bibliography: "`r path.expand('~/Lab/github/cookbook/references.bib')`"
editor_options:
  chunk_output_type: console
---
## Initialization and settings
```{r, eval = TRUE}
# Grant the JVM a large heap before rJava starts it - mallet keeps the
# full corpus in memory while fitting
options(java.parameters = "-Xmx28g")
cores_to_use <- parallel::detectCores() - 1L # leave one core for the system
outdir <- "~/Lab/tmp/" # target directory for binary models and *.rds files
```
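The `-Xmx28g` setting only takes effect if it is defined before rJava starts the JVM. As a quick sanity check (a minimal sketch, assuming rJava is installed), the effective heap limit can be queried from the running JVM; the reported value is typically slightly below the requested amount.

```{r heap_check, eval = FALSE}
# Query the maximum heap size from the running JVM; rJava maps Java's
# long return value to an R numeric
rJava::.jinit()
max_heap_gb <- rJava::J("java/lang/Runtime")$getRuntime()$maxMemory() / 1024^3
message("JVM max heap: ", round(max_heap_gb, 1), " GB")
```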
```{r, eval = TRUE}
library(fs)
library(polmineR) # get dev version, minimum v0.8.6.9008
library(biglda) # get dev version from PolMine/biglda, minimum v0.0.0.9006
library(purrr)
library(stringi)

# Download the mallet Java archive if it is not yet available
if (!mallet_is_installed()) mallet_install()
```
## Define stopwords
```{r stopwords, eval = TRUE}
# German stopwords in lowercase and capitalized form, as the corpus is
# not lowercased
stopwords <- c(tm::stopwords("de"), capitalize(tm::stopwords("de")))

# All corpus tokens that consist of punctuation only
punctuation_to_drop <- grep(
  "^[[:punct:]]+$",
  terms("GERMAPARL2", p_attribute = "word"),
  value = TRUE,
  perl = TRUE
)

# Combine with parliamentary boilerplate ("Beifall" = applause, "Damen" /
# "Herren" from salutations) and the spelling variants of "dass"
discard <- c(
  stopwords,
  punctuation_to_drop,
  c("dass", "Dass", "Damen", "Herren", "Beifall")
)
```
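A quick inspection of the resulting character vector helps to verify that it contains what is expected (a sketch; the counts depend on the corpus version):

```{r inspect_discard, eval = FALSE}
length(discard) # total number of terms to be discarded
head(punctuation_to_drop) # punctuation-only tokens found in the corpus
tail(discard) # the manually added terms
```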
```{r min_doc_length}
min_doc_length <- 50L # minimum number of tokens for a speech to be kept
```
## Get speeches as character vectors
```{r make_speeches, echo = FALSE, eval = FALSE}
speeches <- corpus("GERMAPARL2") %>%
  subset(as.integer(protocol_lp) %in% 18:21) %>% # legislative periods 18-21
  as.speeches(
    s_attribute_name = "speaker_name",
    gap = 50, # tokens allowed between passages of one speech; default is 500
    mc = cores_to_use
  ) %>%
  get_token_stream(p_attribute = "word", subset = {!word %in% discard}) %>%
  keep(function(x) length(x) >= min_doc_length) %>% # drop short documents
  sapply(stri_c, collapse = "\n") %>% # one string per speech
  discard(function(x) nchar(x) == 0L) # drop empty documents
```
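Before building the instance list, it is worth checking the result (a minimal sketch, assuming the chunk above has been evaluated):

```{r inspect_speeches, eval = FALSE}
length(speeches) # number of speeches retained
range(nchar(speeches)) # shortest and longest document in characters
cat(substr(speeches[[1]], 1, 200)) # peek into the first speech
```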
### Create Instance List
```{r make_instance_list}
instance_list <- as.instance_list(speeches)
rm(speeches); gc() # the character vector is no longer needed - free memory
```
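Assuming `as.instance_list()` returns an rJava reference to mallet's `InstanceList` (which inherits from `java.util.ArrayList`), the number of documents handed over to mallet can be checked directly (a sketch):

```{r inspect_instance_list, eval = FALSE}
instance_list$size() # should match length(speeches) from the previous step
```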
### Fitting Topic Models
```{r fit_models, eval = FALSE}
for (k in c(250, 300)){
  message("... fitting model for k: ", k)
  fname <- sprintf("%s_%s_%s.bin", "GERMAPARL2", Sys.Date(), k)
  outfile <- path.expand(fs::path(outdir, fname))
  lda <- BigTopicModel(n_topics = k, alpha_sum = 5.1, beta = 0.1)
  lda$addInstances(instance_list)
  lda$setNumThreads(cores_to_use)
  lda$setNumIterations(2000L)
  lda$setTopicDisplay(100L, 10L) # report top 10 words every 100 iterations
  lda$estimate()
  lda$write(rJava::.jnew("java/io/File", outfile)) # serialize model to disk
  rm(lda)
}
```
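To get a rough sense of convergence, the model log-likelihood can be reported before a fitted model is discarded. This is a sketch that assumes `BigTopicModel` inherits `modelLogLikelihood()` from mallet's `ParallelTopicModel`; the line would go right before `rm(lda)` in the loop above:

```{r loglikelihood, eval = FALSE}
# Hypothetical addition to the loop body, before rm(lda)
message("log-likelihood (k = ", k, "): ", lda$modelLogLikelihood())
```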
## Data transformation
```{r, eval = FALSE}
for (fname in Sys.glob(fs::path(outdir, "*.bin"))){
  message("... reading: ", fname)
  mallet_model <- mallet_load_topicmodel(fname) # load serialized mallet model
  message("... processing model")
  lda <- as_LDA(mallet_model) # convert to an R topic model object
  ldafile <- sprintf("%s.rds", tools::file_path_sans_ext(fname))
  message("... writing: ", ldafile)
  saveRDS(lda, file = ldafile)
}
```
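Assuming `as_LDA()` yields an object compatible with the topicmodels package, the saved models can be inspected with the usual accessors (a sketch):

```{r inspect_lda, eval = FALSE}
rds_files <- Sys.glob(fs::path(outdir, "*.rds"))
lda <- readRDS(rds_files[1])
topicmodels::get_terms(lda, k = 10) # top ten terms per topic
```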