added functions to preprocess; corrected terminology
kshirley committed May 6, 2014
1 parent d269baa commit 05e06b9
Showing 28 changed files with 1,316 additions and 483 deletions.
26 changes: 10 additions & 16 deletions DESCRIPTION
@@ -1,27 +1,21 @@
Package: LDAtools
Title: Tools to fit a topic model using Latent Dirichlet Allocation (LDA)
Version: 0.1
Authors@R: c(person("Carson", "Sievert", role = "aut", email = "[email protected]"),
person("Kenny", "Shirley", role = c("aut", "cre"), email = "[email protected]"))
Authors@R: c(person("Carson", "Sievert", role = "aut", email =
"[email protected]"), person("Kenny", "Shirley", role = c("aut", "cre"),
email = "[email protected]"))
Description: This package implements a collapsed Gibbs Sampler algorithm to fit
a topic model to a set of unstructured text documents. It contains three basic groups of
functions: (1) pre-processing of unstructured text, including
substitutions, tokenization, and stemming, (2) fitting the Latent Dirichlet
Allocation (LDA) topic model to training data and making model-based
predictions on test data, and (3) visualizing and summarizing the fitted
model.
a topic model to a set of unstructured text documents. It contains three
basic groups of functions: (1) pre-processing of unstructured text,
including substitutions, tokenization, and stemming, (2) fitting the Latent
Dirichlet Allocation (LDA) topic model to training data and making
model-based predictions on test data, and (3) visualizing and summarizing
the fitted model.
Depends:
R (>= 2.15),
SnowballC
License: MIT
License:
Imports:
foreach
License:
Suggests:
shiny
Collate:
'help.r'
'preprocess.R'
'fitLDA.R'
'postprocess.R'
'utils.R'
9 changes: 6 additions & 3 deletions NAMESPACE
@@ -1,19 +1,22 @@
export(KL)
export(bigram.table)
export(collapse.bigrams)
export(ent)
export(fitLDA)
export(flag.exact)
export(flag.partial)
export(getProbs)
export(jsviz)
export(lu)
export(norm)
export(normalize)
export(perplexity.bounds)
export(plotLoglik)
export(plotTokens)
export(predictLDA)
export(preprocess)
export(preprocess.newdocs)
export(remap.terms)
export(su)
export(sum.na)
export(token.rank)
export(topdocs)
useDynLib(LDAtools)
useDynLib(LDAviz)
4 changes: 2 additions & 2 deletions R/help.r
@@ -1,5 +1,5 @@
#' LDAviz
#' LDAtools
#'
#' @name LDAviz
#' @name LDAtools
#' @docType package
NULL
877 changes: 702 additions & 175 deletions R/preprocess.R

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions man/KL.Rd
@@ -2,7 +2,7 @@
\alias{KL}
\title{Compute symmetric version of Kullback-Leibler (KL) divergence between two categorical distributions}
\usage{
KL(x, y)
KL(x, y)
}
\arguments{
\item{x}{The vector of probabilities in the first
@@ -12,7 +12,7 @@
distribution}
}
\description{
Compute symmetric version of Kullback-Leibler (KL)
divergence between two categorical distributions
Compute symmetric version of Kullback-Leibler (KL)
divergence between two categorical distributions
}
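A minimal usage sketch for KL, assuming the two-argument call shown in the usage block above; the probability values are illustrative only:

p <- c(0.1, 0.4, 0.5)  # first categorical distribution
q <- c(0.3, 0.3, 0.4)  # second categorical distribution
KL(p, q)
KL(q, p)  # should equal KL(p, q), since this version of KL is symmetrized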

2 changes: 1 addition & 1 deletion man/LDAviz.Rd
@@ -4,6 +4,6 @@
\alias{LDAviz-package}
\title{LDAviz}
\description{
LDAviz
LDAviz
}

43 changes: 43 additions & 0 deletions man/bigram.table.Rd
@@ -0,0 +1,43 @@
\name{bigram.table}
\alias{bigram.table}
\title{Compute table of bigrams}
\usage{
bigram.table(term.id = integer(), doc.id = integer(), vocab = character(),
n = integer())
}
\arguments{
\item{term.id}{an integer vector containing the term ID
number of every token in the corpus. Should take values
between 1 and W, where W is the number of terms in the
vocabulary.}

\item{doc.id}{an integer vector containing the document
ID number of every token in the corpus. Should take
values between 1 and D, where D is the total number of
documents in the corpus.}

\item{vocab}{a character vector of length W, containing
the terms in the vocabulary. This vector must align with
\code{term.id}, such that a term.id of 1 indicates the
first element of \code{vocab}, a term.id of 2 indicates
the second element of \code{vocab}, etc.}

\item{n}{an integer specifying how large the bigram table
should be. The function will return the top n most
frequent bigrams. This argument is here because the
number of bigrams can be as large as W^2.}
}
\value{
a dataframe with three columns and \code{n} rows,
containing the bigrams (column 2), their frequencies
(column 3), and their rank in decreasing order of frequency
(column 1). The table is sorted by default in decreasing
order of frequency.
}
\description{
This function counts the bigrams in the data. It's based on
the vector of term IDs and document IDs -- that is, the
vocabulary has already been established, and this function
simply counts occurrences of consecutive terms in the data.
}
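A hedged sketch of how bigram.table might be called, using a toy corpus whose vocab, term.id, and doc.id vectors are made up for illustration; the hyphenated form of the returned bigram strings is an assumption based on the collapse.bigrams documentation below:

vocab   <- c("topic", "model", "text")  # W = 3 terms
term.id <- c(1, 2, 3, 1, 2)             # tokens: topic model text | topic model
doc.id  <- c(1, 1, 1, 2, 2)             # first three tokens in doc 1, last two in doc 2
bigram.table(term.id = term.id, doc.id = doc.id, vocab = vocab, n = 3)
# Expected: "topic-model" counted twice (once per document), "model-text" once,
# returned in decreasing order of frequency.
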
51 changes: 51 additions & 0 deletions man/collapse.bigrams.Rd
@@ -0,0 +1,51 @@
\name{collapse.bigrams}
\alias{collapse.bigrams}
\title{Replace specified bigrams with terms representing the bigrams}
\usage{
collapse.bigrams(bigrams = character(), doc.id = integer(),
term.id = integer(), vocab = character())
}
\arguments{
\item{bigrams}{A character vector, each element of which
is a bigram represented by two terms separated by a
hyphen, such as 'term1-term2'. Every consecutive
occurrence of 'term1' and 'term2' in the data will be
replaced by a single token representing this bigram.}

\item{doc.id}{an integer vector containing the document
ID number of every token in the corpus. Should take
values between 1 and D, where D is the total number of
documents in the corpus.}

\item{term.id}{an integer vector containing the term ID
number of every token in the corpus. Should take values
between 1 and W, where W is the number of terms in the
vocabulary.}

\item{vocab}{a character vector of length W, containing
the terms in the vocabulary. This vector must align with
\code{term.id}, such that a term.id of 1 indicates the
first element of \code{vocab}, a term.id of 2 indicates
the second element of \code{vocab}, etc.}
}
\value{
Returns a list of length three. The first element,
\code{new.vocab}, is a character vector containing the new
vocabulary. The second element, \code{new.term.id} is the
new vector of term ID numbers for all tokens in the data,
taking integer values from 1 to the length of the new
vocabulary. The third element is \code{new.doc.id}, which
is the new version of the document id vector. If any of the
specified bigrams were present in the data, then
\code{new.term.id} and \code{new.doc.id} will be shorter
vectors than the original \code{term.id} and \code{doc.id}
vectors.
}
\description{
After tokenization, use this function to replace all
occurrences of a given bigram with a single token
representing the bigram, and 'delete' the occurrences of
the two individual tokens that comprised the bigram (so
that it is still a generative model for text).
}
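
A hedged usage sketch, reusing the toy vectors from the bigram.table sketch above; the positional structure of the returned list follows the \value section, and the element roles noted in the comments are taken from that description:

out <- collapse.bigrams(bigrams = "topic-model", doc.id = doc.id,
                        term.id = term.id, vocab = vocab)
out[[1]]  # new.vocab: original terms plus one token representing "topic-model"
out[[2]]  # new.term.id: shorter than term.id wherever the bigram was collapsed
out[[3]]  # new.doc.id: document IDs realigned to the same (new) length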

15 changes: 12 additions & 3 deletions man/ent.Rd
@@ -2,16 +2,25 @@
\alias{ent}
\title{Entropy}
\usage{
ent(x)
ent(x)
}
\arguments{
\item{x}{a vector or a data frame or an array or NULL.}
\item{x}{a vector that contains either a normalized or
un-normalized vector of probabilities of a discrete
(categorical) distribution}
}
\description{
This function calculates entropy
This function calculates the entropy of a discrete
(categorical) probability distribution
}
\examples{
x <- c(1, 1, 2, 2, 3)
ent(x)
y <- c(1/9, 1/9, 2/9, 2/9, 3/9)
ent(y) # should be same as ent(x)
z <- c(1/3, 1/3, 1/6, 1/6)
ent(z)
z2 <- c(1/3, 1/3, 1/6, 1/6, 0)
ent(z2) # should be the same as ent(z), since this version of entropy assumes 0*log(0) = 0
}

46 changes: 23 additions & 23 deletions man/fitLDA.Rd
@@ -2,9 +2,8 @@
\alias{fitLDA}
\title{Fit LDA model via Gibbs sampler}
\usage{
fitLDA(word.id = integer(), doc.id = integer(), k = 10,
n.chains = 1, n.iter = 1000, topics.init = NULL,
alpha = 0.01, beta = 0.01)
fitLDA(word.id = integer(), doc.id = integer(), k = 10, n.chains = 1,
n.iter = 1000, topics.init = NULL, alpha = 0.01, beta = 0.01)
}
\arguments{
\item{word.id}{Unique token ID. Can be taken directly
@@ -31,34 +30,35 @@
\item{beta}{Dirichlet hyperparameter}
}
\value{
A list of length two. The first element is the sampled
latent topic value from the last iteration (for each
token). The second element is a vector with the
log-likelihood values for every iteration of the gibbs
sampler.
A list of length two. The first element is the sampled
latent topic value from the last iteration (for each
token). The second element is a vector with the
log-likelihood values for every iteration of the gibbs
sampler.
}
\description{
This function implements the Gibbs sampling method
described by Griffiths and Steyvers (2004). The Gibbs
sampler portion of the function is a call to C code. Note
that we only return the latent topic assignments (for
each token) from the last iteration. Thus, memory
limitations aren't really an issue. However, the run time
is O(num.chains*n.iter*N*k) where \code{n.chains} is
number of MCMC chains, \code{n.iter} is the number of
iterations, N is the total number of tokens in the data,
and k is the number of topics. It is possible to resume a
Gibbs sampler from a previous fit by using the topics
from that fit to initiate the next set of iterations
using \code{topics.init}.
This function implements the Gibbs sampling method
described by Griffiths and Steyvers (2004). The Gibbs
sampler portion of the function is a call to C code. Note
that we only return the latent topic assignments (for each
token) from the last iteration. Thus, memory limitations
aren't really an issue. However, the run time is
O(num.chains*n.iter*N*k) where \code{n.chains} is number of
MCMC chains, \code{n.iter} is the number of iterations, N
is the total number of tokens in the data, and k is the
number of topics. It is possible to resume a Gibbs sampler
from a previous fit by using the topics from that fit to
initiate the next set of iterations using
\code{topics.init}.
}
\examples{
data(APinput)
#takes a while
\dontrun{o <- fitLDA(APinput$word.id, APinput$doc.id, k=20)}
}
\references{
Griffiths and Steyvers (2004). Finding Scientific Topics.
Proceedings of the National Academy of Sciences.
Griffiths and Steyvers (2004). Finding Scientific Topics.
Proceedings of the National Academy of Sciences. 101:
5228-5235.
}
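A hedged sketch of resuming the sampler via topics.init, as described above; it assumes the first element of the returned list holds the final per-token topic assignments (per the \value section) and reuses the AP example data from the example block:

data(APinput)
fit1 <- fitLDA(APinput$word.id, APinput$doc.id, k = 20, n.iter = 500)
# Continue for another 500 iterations, starting from the last sampled topics of fit1
fit2 <- fitLDA(APinput$word.id, APinput$doc.id, k = 20, n.iter = 500,
               topics.init = fit1[[1]])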
55 changes: 28 additions & 27 deletions man/flag.exact.Rd
@@ -2,51 +2,52 @@
\alias{flag.exact}
\title{Flag the documents that exactly match a pre-specified list of strings}
\usage{
flag.exact(data, exact, D, verbose = FALSE,
quiet = FALSE)
flag.exact(data, exact, verbose = FALSE, quiet = FALSE)
}
\arguments{
\item{data}{a character vector containing the raw corpus.
Each element should correspond to a 'document'.}

\item{exact}{a character vector in which each element is
a string, phrase, or longer snippet of text that you wish
to discard (if the element(s) match the entire content of
a document).}
to discard, if the element matches the entire content of
a document.}

\item{D}{the original number of documents}
\item{verbose}{logical. Track the categories of exact
matches. For instance, if a document exactly matches the
third element of \code{exact}, then the corresponding
value returned will be 3.}

\item{verbose}{logical vector. Should a summary of the
documents being trimmed be reported?}
\item{quiet}{logical. Should a summary of the
preprocessing steps be printed to the screen?}
}
\value{
category an integer vector of the same length as
\code{data}, where 0 indicates that the document did not
match any of the strings in \code{match.exact}, and an
integer j = 1, ..., K that indicates that a document was
an exact match to the jth element of \code{match.exact}.
category an integer vector of the same length as
\code{data}, where, if verbose=TRUE, 0 indicates that the
document did not match any of the strings in \code{exact},
and an integer j = 1, ..., K indicates that a document was
an exact match to the jth element of \code{exact}, and if
verbose=FALSE, an indicator vector of whether the document
exactly matched any of the elements of \code{exact}
(without indicating which element it matched).
}
\description{
If there are certain (typically very short) documents
that occur frequently in your data, and you wish to
remove them from the data before you fit the LDA model,
this function can be used to flag those documents. It's a
trivial operation, but it's a useful reminder that users
should visually inspect their data before running LDA (so
as to throw out documents that don't require topic
modeling in the first place). An example in customer care
rep notes are the phrases "Not offered" (in reference to
deals that were specifically not offered to the customer
during the phone call) and "Too expensive" (in reference
to an offer that was declined because it was too
expensive).
If there are certain (typically very short) documents that
occur frequently in your data, and you wish to remove them
from the data before you fit the LDA model, this function
can be used to flag those documents. It's a trivial
operation, but it's a useful reminder that users should
visually inspect their data before running LDA (so as to
throw out documents that don't require topic modeling in
the first place).
}
\examples{
data <- c("bla bla bla", "foo", "bar", "text")
match.exact <- c("foo", "junk")
flag.exact(data, match.exact) # c(0, 2, 0, 0)
flag.exact(data, match.exact, verbose=FALSE, quiet=FALSE) # c(0, 1, 0, 0)
flag.exact(data, match.exact, verbose=TRUE, quiet=FALSE) # c(0, 2, 0, 0)
}
\seealso{
flag.partial
flag.partial
}
