forked from kshirley/LDAtools
Showing 4 changed files with 131 additions and 7 deletions.
@@ -1,4 +1,10 @@
 LDAviz
 ======
 
-R package to create interactive d3 visualization of LDA topic model
+R package for fitting and visualizing topic models. To install the package:
+
+```library(devtools); install_github("LDAviz", "cpsievert"); library(LDAviz)```
+
+To run the shiny web app:
+
+```library(shiny); runApp(system.file('shiny', 'hover', package='LDAviz'))```
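Below the README changes, a minimal end-to-end sketch of the advertised workflow, mirroring the examples bundled with the package (the AP data, `fitLDA`, and `getProbs` are all referenced in the package documentation; `fitLDA` can be slow, so the precomputed `APtopics` samples are loaded instead):

```r
library(LDAviz)
data(APinput)   # word.id, doc.id, and vocab for the AP corpus
data(APtopics)  # precomputed topic samples (output of fitLDA)
probs <- getProbs(word.id=APinput$word.id, doc.id=APinput$doc.id,
                  topic.id=APtopics$topics, vocab=APinput$vocab)
head(probs$theta.hat)  # distribution over topics for each document
```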
@@ -0,0 +1,117 @@
#' @title Compute topic-word and document-topic probability distribution matrices, and re-label topic indices
#'
#' @description This function assumes that the ordering of \code{word.id}, \code{doc.id}, and \code{topic.id} matters!
#' That is, the first element of \code{word.id} corresponds to the first element of \code{doc.id}, which corresponds
#' to the first element of \code{topic.id}. Similarly, the second element of \code{word.id} corresponds to the second
#' element of \code{doc.id}, which corresponds to the second element of \code{topic.id}, and so on. Also, the elements
#' of \code{vocab} are assumed to be ordered to correspond to the values of \code{word.id}, so that the first element
#' of \code{vocab} is the token with \code{word.id} equal to 1, the second element of \code{vocab} is the token with
#' \code{word.id} equal to 2, etc.
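#' For example (a hypothetical illustration of this convention, not data from the package): with
#' \code{word.id = c(1, 2, 1)}, \code{doc.id = c(1, 1, 2)}, and \code{topic.id = c(2, 1, 2)}, the first
#' token occurrence is \code{vocab[1]}, appears in document 1, and was assigned to topic 2.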
#'
#' @param word.id a numeric vector with the token id of each token occurrence in the data.
#' @param doc.id a numeric vector containing the document id number of each token occurrence in the data.
#' @param topic.id a numeric vector with the sampled topic assignment of each token occurrence in the data.
#' @param vocab a character vector of the unique words included in the corpus. The length of this vector should match the maximum value of \code{word.id}.
#' @param alpha Dirichlet hyperparameter. See \link{fitLDA}.
#' @param beta Dirichlet hyperparameter. See \link{fitLDA}.
#' @param sort.topics sorting criterion for topics. Supported methods are: "None" (the default) to keep the original
#' topic labels, "byDocs" to sort topics by the number of documents for which they are the most probable, or "byTerms"
#' to sort topics by the number of terms within each topic.
#' @param sort.terms sorting criterion for terms. Supported methods are: "none" (the default), "freq" for term frequency
#' (in the corpus), "distinct" for distinctiveness as defined by Chuang, Manning and Heer (2012), or "saliency" for
#' p(word)*distinctiveness.
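#' (Here the distinctiveness of a word \eqn{w} is the Kullback-Leibler divergence
#' \eqn{\sum_T P(T|w) \log(P(T|w)/P(T))} between the topic distribution conditional on the word and the
#' marginal topic distribution, matching the computation in the function body below.)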
#'
#' @return A list of two matrices and two vectors. The first matrix, \code{phi.hat}, contains the distribution over tokens for each topic,
#' where the rows correspond to topics. The second matrix, \code{theta.hat}, contains the distribution over topics for each document, where
#' the rows correspond to documents. The vector \code{topic.id} holds the sampled topics from the LDA fit, with topic indices re-labeled
#' according to the \code{sort.topics} criterion. The vector \code{main.topic} holds the most frequently assigned topic of each document
#' when \code{sort.topics = "byDocs"}, and is \code{NULL} otherwise.
#' @export
#' @examples
#' data(APinput)
#' # takes a while
#' \dontrun{o <- fitLDA(APinput$word.id, APinput$doc.id)}
#' data(APtopics) # load precomputed output instead, for demonstration purposes
#' probs <- getProbsOld(word.id=APinput$word.id, doc.id=APinput$doc.id, topic.id=APtopics$topics,
#'                      vocab=APinput$vocab, sort.terms="saliency")
#' head(probs$phi.hat[,1:5])
#' head(probs$theta.hat)
getProbsOld <- function(word.id=numeric(), doc.id=numeric(), topic.id=numeric(), vocab=character(),
                        alpha=0.01, beta=0.01, sort.topics=c("None", "byDocs", "byTerms"),
                        sort.terms=c("none", "freq", "distinct", "saliency")) {
  stopifnot(sort.topics[1] %in% c("None", "byDocs", "byTerms"))
  stopifnot(sort.terms[1] %in% c("none", "freq", "distinct", "saliency"))
  # a vector of length > 1 is only allowed when the full default vector was left untouched
  if (length(sort.topics) > 1 && !identical(sort.topics, c("None", "byDocs", "byTerms")))
    stop("Please enter only one topic sorting choice")
  if (length(sort.terms) > 1 && !identical(sort.terms, c("none", "freq", "distinct", "saliency")))
    stop("Please enter only one term sorting choice")
  N <- length(word.id)
  stopifnot(N == length(doc.id), N == length(topic.id))

  # compute the topic-word count matrix CTW by cross-tabulating topic and word ids
  df <- table(topic.id, word.id)
  k <- max(topic.id)  # number of topics
  W <- max(word.id)   # vocabulary size
  D <- max(doc.id)    # number of documents
  stopifnot(W == length(vocab))

  CTW <- matrix(0, k, W)
  CTW[, as.numeric(colnames(df))] <- df  # fill columns for words that actually occur

  # compute the document-topic count matrix CDT
  df <- table(doc.id, topic.id)
  CDT <- matrix(0, D, k)
  CDT[as.numeric(rownames(df)), ] <- df

  # posterior point estimates of phi.hat and theta.hat:
  # add the Dirichlet hyperparameters and normalize each row
  CTW.b <- CTW + beta
  phi.hat <- CTW.b/rowSums(CTW.b)
  CDT.a <- CDT + alpha
  theta.hat <- CDT.a/rowSums(CDT.a)
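  # e.g. (made-up counts, for illustration): with beta = 0.01, a topic whose token
  # counts are c(3, 0, 1) is smoothed to c(3.01, 0.01, 1.01) and divided by its row
  # sum 4.03, so unseen tokens get small but nonzero probability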

  # set informative dimnames on the two matrices
  rownames(phi.hat) <- rownames(phi.hat, do.NULL=FALSE, prefix="Topic")
  colnames(phi.hat) <- vocab
  colnames(theta.hat) <- colnames(theta.hat, do.NULL=FALSE, prefix="Topic")

  # sort topics (if requested)
  topic.o <- NULL
  if (sort.topics[1] == "byDocs") {
    # find the main (most frequently assigned) topic of each document;
    # this could equivalently be computed as apply(theta.hat, 1, which.max)
    main.topic <- max.col(CDT)
    # order the topics by the number of documents for which they are the main topic:
    main.topic.table <- table(main.topic)
    topic.o <- order(main.topic.table, decreasing=TRUE)
    main.topic <- match(main.topic, topic.o)
  }
  if (sort.topics[1] == "byTerms") {
    # order topics by their total number of assigned tokens
    topic.o <- order(rowSums(CTW), decreasing=TRUE)
  }
  if (!is.null(topic.o)) {
    # apply the new topic ordering consistently to phi, theta, and the sampled topics
    phi.hat <- phi.hat[topic.o, ]
    theta.hat <- theta.hat[, topic.o]
    topic.id <- match(topic.id, topic.o)
  }
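  # e.g. (hypothetical ordering): if topic.o = c(3, 1, 2), old topic 3 is re-labeled 1,
  # old topic 1 becomes 2, and old topic 2 becomes 3, since match() returns each
  # topic's position in the new ordering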
  # sort terms (if requested)
  term.o <- NULL
  if (sort.terms[1] != "none") {
    word.tab <- table(word.id)
    if (sort.terms[1] == "freq") {
      term.o <- order(word.tab, decreasing=TRUE)
    } else {
      if (sort.terms[1] %in% c("distinct", "saliency")) {
        topic.tab <- table(topic.id)
        pt <- topic.tab/sum(topic.tab)          # marginal topic probabilities P(T)
        t.w <- t(t(phi.hat)/colSums(phi.hat))   # conditional topic probabilities P(T|w)
        kernel <- t.w*log(t.w/as.vector(pt))    # Kullback-Leibler divergence terms
        distinct <- colSums(kernel)             # distinctiveness of each word
      }
      if (sort.terms[1] == "distinct") term.o <- order(distinct, decreasing=TRUE)
      if (sort.terms[1] == "saliency") {
        pw <- word.tab/sum(word.tab)            # marginal word probabilities P(w)
        saliency <- pw*distinct
        term.o <- order(saliency, decreasing=TRUE)
      }
    }
  }
  if (!is.null(term.o)) phi.hat <- phi.hat[, term.o]
  if (sort.topics[1] != "byDocs") main.topic <- NULL
  return(list(phi.hat=phi.hat, theta.hat=theta.hat, topic.id=topic.id, main.topic=main.topic))
}
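As a quick sanity check of the input conventions documented above, a minimal sketch with a hypothetical toy corpus (two documents, three vocabulary terms, two topics; the data are made up for illustration):

```r
word.id  <- c(1, 2, 1, 3, 2, 3)  # token occurrences, indexing into vocab
doc.id   <- c(1, 1, 1, 2, 2, 2)  # document of each token occurrence
topic.id <- c(1, 1, 2, 2, 2, 1)  # sampled topic of each token occurrence
vocab    <- c("apple", "banana", "cherry")

probs <- getProbsOld(word.id, doc.id, topic.id, vocab,
                     sort.topics="byDocs", sort.terms="saliency")
probs$phi.hat    # 2 x 3 topic-word matrix; each row sums to 1
probs$theta.hat  # 2 x 2 document-topic matrix; each row sums to 1
probs$main.topic # main topic of each document under the new labels
```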