Commit

Fix shiny app
cpsievert committed Sep 20, 2013
1 parent 4aa87d0 commit cd93d72
Showing 4 changed files with 131 additions and 7 deletions.
8 changes: 7 additions & 1 deletion README.md
@@ -1,4 +1,10 @@
 LDAviz
 ======
 
-R package to create interactive d3 visualization of LDA topic model
+R package for fitting and visualizing topic models. To install the package:
+
+```library(devtools); install_github("LDAviz", "cpsievert"); library(LDAviz)```
+
+To run the shiny web app:
+
+```library(shiny); runApp(system.file('shiny', 'hover', package='LDAviz'))```
117 changes: 117 additions & 0 deletions inst/shiny/hover/getProbsOld.R
@@ -0,0 +1,117 @@
#' @title Compute topic-word and document-topic probability distribution matrices, and re-label topic indices
#'
#' @description This function assumes the ordering of \code{word.id}, \code{doc.id}, and \code{topic.id} matters!
#' That is, the first element of \code{word.id} corresponds to the first element of \code{doc.id}, which corresponds to the first
#' element of \code{topic.id}. Similarly, the second element of \code{word.id} corresponds to the second element of \code{doc.id},
#' which corresponds to the second element of \code{topic.id} (and so on). Also, the elements of \code{vocab}
#' are assumed to correspond to the values of \code{word.id}, so that the first element of \code{vocab} is the token with \code{word.id}
#' equal to 1, the second element of \code{vocab} is the token with \code{word.id} equal to 2, etc.
#'
#' @param word.id a numeric vector with the token id of each token occurrence in the data.
#' @param doc.id a numeric vector containing the document id number of each token occurrence in the data.
#' @param topic.id a numeric vector with the topic assignment of each token occurrence in the data.
#' @param vocab a character vector of the unique words included in the corpus. The length of this vector should match the max value of \code{word.id}.
#' @param alpha Dirichlet hyperparameter. See \link{fitLDA}.
#' @param beta Dirichlet hyperparameter. See \link{fitLDA}.
#' @param sort.topics Sorting criterion for topics. Supported methods include: "byDocs" to sort topics by the
#' number of documents for which they are the most probable topic, or "byTerms" to sort topics by the total number of tokens assigned to each topic.
#' @param sort.terms Sorting criterion for terms. Supported methods include: "freq" for term frequency (in the corpus),
#' "distinct" for distinctiveness as defined by Chuang, Manning and Heer (2012), or "saliency" for p(word)*distinctiveness.
#'
#' @return A list containing two matrices and two vectors. The first matrix, \code{phi.hat}, contains the distribution over tokens for each topic,
#' where the rows correspond to topics. The second matrix, \code{theta.hat}, contains the distribution over topics for each document, where
#' the rows correspond to documents. The vector \code{topic.id} contains the sampled topics from the LDA fit,
#' with topic indices re-labeled according to the \code{sort.topics} argument. The vector \code{main.topic} gives the most
#' probable topic for each document when \code{sort.topics="byDocs"}, and is \code{NULL} otherwise.
#' @export
#' @examples
#' data(APinput)
#' #takes a while
#' \dontrun{o <- fitLDA(APinput$word.id, APinput$doc.id)}
#' data(APtopics) #load output instead for demonstration purposes
#' probs <- getProbsOld(word.id=APinput$word.id, doc.id=APinput$doc.id, topic.id=APtopics$topics,
#'                      vocab=APinput$vocab, sort.terms="saliency")
#' head(probs$phi.hat[,1:5])
#' head(probs$theta.hat)
#'


getProbsOld <- function(word.id=numeric(), doc.id=numeric(), topic.id=numeric(), vocab=character(),
                        alpha=0.01, beta=0.01, sort.topics=c("None", "byDocs", "byTerms"),
                        sort.terms=c("none", "freq", "distinct", "saliency")) {
  stopifnot(sort.topics[1] %in% c("None", "byDocs", "byTerms"))
  stopifnot(sort.terms[1] %in% c("none", "freq", "distinct", "saliency"))
  if (!all(sort.topics == c("None", "byDocs", "byTerms")) & length(sort.topics) > 1)
    stop("Please enter only one topic sorting choice")
  if (!all(sort.terms == c("none", "freq", "distinct", "saliency")) & length(sort.terms) > 1)
    stop("Please enter only one term sorting choice")
  N <- length(word.id)
  stopifnot(N == length(doc.id), N == length(topic.id))
  # compute phi, the matrix of topic-word probability distributions
  df <- table(topic.id, word.id)
  k <- max(topic.id)
  W <- max(word.id)
  D <- max(doc.id)
  stopifnot(W == length(vocab))

  CTW <- matrix(0, k, W)
  CTW[, as.numeric(colnames(df))] <- df
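  # (table() only has columns for word ids that actually occur, so its counts are
  # scattered into the matching columns of CTW and unseen word ids keep zero counts)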

  # compute theta, the matrix of document-topic probability distributions
  df <- table(doc.id, topic.id)
  CDT <- matrix(0, D, k)
  CDT[as.numeric(rownames(df)), ] <- df

  # compute posterior point estimates of phi.hat and theta.hat:
  CTW.b <- CTW + beta
  phi.hat <- CTW.b/apply(CTW.b, 1, sum)
  CDT.a <- CDT + alpha
  theta.hat <- CDT.a/apply(CDT.a, 1, sum)
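  # these are the standard posterior means under symmetric Dirichlet priors, i.e.
  # phi.hat[t, w] = (CTW[t, w] + beta) / (sum(CTW[t, ]) + W*beta) and
  # theta.hat[d, t] = (CDT[d, t] + alpha) / (sum(CDT[d, ]) + k*alpha)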

  # set relevant names for the two matrices
  rownames(phi.hat) <- rownames(phi.hat, do.NULL=FALSE, prefix="Topic")
  colnames(phi.hat) <- vocab
  colnames(theta.hat) <- colnames(theta.hat, do.NULL=FALSE, prefix="Topic")

  # sort topics (if necessary)
  topic.o <- NULL
  if (sort.topics[1] == "byDocs") {
    # compute the main topic discussed in each verbatim:
    # maxs <- apply(theta.hat, 1, which.max)
    main.topic <- max.col(CDT)
    # order the topics by the number of documents for which they are the main topic:
    main.topic.table <- table(main.topic)
    topic.o <- order(main.topic.table, decreasing=TRUE)
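    # relabel so that topic 1 is the most frequent main topic, topic 2 the next most frequent, and so on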
    main.topic <- match(main.topic, topic.o)
  }
  if (sort.topics[1] == "byTerms") {
    topic.o <- order(apply(CTW, 1, sum), decreasing=TRUE)
  }
  if (!is.null(topic.o)) {
    phi.hat <- phi.hat[topic.o, ]
    theta.hat <- theta.hat[, topic.o]
    topic.id <- match(topic.id, topic.o)
  }
  # sort terms (if necessary)
  term.o <- NULL
  if (sort.terms[1] != "none") {
    word.tab <- table(word.id)
    if (sort.terms[1] == "freq") {
      term.o <- order(word.tab, decreasing=TRUE)
    } else {
      if (sort.terms[1] %in% c("distinct", "saliency")) {
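        # Chuang, Manning & Heer (2012) define distinctiveness(w) as the KL divergence
        # KL(P(T|w) || P(T)) = sum over topics T of P(T|w) * log(P(T|w) / P(T)),
        # and saliency(w) = P(w) * distinctiveness(w); both are computed below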
        topic.tab <- table(topic.id)
        pt <- topic.tab/sum(topic.tab)
        t.w <- t(t(phi.hat)/apply(phi.hat, 2, sum)) # P(T|w)
        kernel <- t.w*log(t.w/as.vector(pt))
        distinct <- apply(kernel, 2, sum)
      }
      if (sort.terms[1] == "distinct") term.o <- order(distinct, decreasing=TRUE)
      if (sort.terms[1] == "saliency") {
        pw <- word.tab/sum(word.tab)
        saliency <- pw*distinct
        term.o <- order(saliency, decreasing=TRUE)
      }
    }
  }
  if (!is.null(term.o)) phi.hat <- phi.hat[, term.o]
  if (sort.topics[1] != "byDocs") main.topic <- NULL
  return(list(phi.hat=phi.hat, theta.hat=theta.hat, topic.id=topic.id, main.topic=main.topic))
}
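
As a quick illustration of the input format described in the roxygen block above, here is a minimal sketch on toy data (the values are made up for illustration, and it assumes getProbsOld has been sourced); the i-th elements of word.id, doc.id, and topic.id all refer to the same token occurrence:

```r
# hypothetical toy corpus: 2 documents, a 3-word vocabulary, 2 topics
vocab    <- c("apple", "banana", "cherry")
word.id  <- c(1, 2, 1, 3, 3, 2)   # vocab[word.id[i]] is the i-th token
doc.id   <- c(1, 1, 1, 2, 2, 2)   # document containing the i-th token
topic.id <- c(1, 1, 2, 2, 2, 1)   # topic sampled for the i-th token

probs <- getProbsOld(word.id, doc.id, topic.id, vocab,
                     sort.topics="byDocs", sort.terms="saliency")
probs$phi.hat     # 2 x 3 matrix of P(token | topic); rows sum to 1
probs$theta.hat   # 2 x 2 matrix of P(topic | document); rows sum to 1
probs$main.topic  # most probable topic for each document
```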
9 changes: 5 additions & 4 deletions inst/shiny/hover/server.R
@@ -3,6 +3,7 @@ library(LDAviz)
 library(proxy)
 library(reshape)
 library(plyr)
+source("getProbsOld.R") #Kenny made some changes to getProbs -- use the old version -- getProbsOld
 
 options(shiny.maxRequestSize=100*1024^2) #change default file upload size from 5MB to 100MB
 
@@ -68,10 +69,10 @@ shinyServer(function(input, output) {
 
   getMatrices <- reactive({
     stuff <- filter()
-    #getProbs(word.id=as.integer(stuff$words), doc.id=as.integer(stuff$docs), topic.id=as.integer(stuff$topics),
-    #         vocab=as.character(stuff$vocab), sort.topics="byDocs", sort.terms="saliency")
-    getProbs(word.id=as.integer(stuff$words), doc.id=as.integer(stuff$docs), topic.id=as.integer(stuff$topics),
-             vocab=as.character(stuff$vocab), sort.topics="byDocs", K=30)
+    getProbsOld(word.id=as.integer(stuff$words), doc.id=as.integer(stuff$docs), topic.id=as.integer(stuff$topics),
+                vocab=as.character(stuff$vocab), sort.topics="byDocs", sort.terms="saliency")
+    # getProbsOld(word.id=as.integer(stuff$words), doc.id=as.integer(stuff$docs), topic.id=as.integer(stuff$topics),
+    #             vocab=as.character(stuff$vocab), sort.topics="byDocs", K=30)
   })
 
   output$mdsDat <- reactive({
4 changes: 2 additions & 2 deletions inst/shiny/hover/ui.R
@@ -1,7 +1,7 @@
 library(shiny)
 library(LDAviz)
-#anytime a file under the assets folder is changed, ldatools must be reinstalled (to reflect the change)!
-addResourcePath('assets', system.file('shiny', 'hover', 'assets', package='ldatools'))
+#anytime a file under the assets folder is changed, LDAviz must be reinstalled (to reflect the change)!
+addResourcePath('assets', system.file('shiny', 'hover', 'assets', package='LDAviz'))
 
 #nice idea, but it doesn't adapt to uploaded data
 # data(input)
