google · tkaitchuck · Oct 6, 2015 · Oct 6, 2015 · Oct 7, 2015 · Oct 8, 2015
diff --git a/analysis/R/assoc.R b/analysis/R/assoc.R
@@ -0,0 +1,169 @@
+#!/usr/bin/env Rscript
+#
+# Copyright 2015 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Reads map files, report files, and RAPPOR parameters to run
+# an EM algorithm to estimate joint distribution over two variables one of
+# which is boolean (or Basic Rappor)
+#
+# Usage:
+#       $ ./assoc.R --inp <JSON file>
+#
+# Input: JSON file with the following fields
+#        "maps" for map files of each var
+#        "reports" for a list of reports
+#        "counts" for 2 way marginal counts, individual marginal counts 
+#                 respectively
+#        "params" for params file with RAPPOR params
+#        "results" for a file name into which results will be written
+#                 as comma separated values
+#
+# Output: A table with joint distribution to stdout and csv file with results
+
+library("jsonlite")
+library("optparse")
+library("reshape2")  # For "unrolling" joint results to csv file
+
+options(stringsAsFactors = FALSE)
+
+if(!interactive()) {
+  option_list <- list(
+    make_option(c("--inp"), default = "inp.json",
+                help = "JSON file with inputs for assoc.R"))
+  opts <- parse_args(OptionParser(option_list = option_list))
+}
+
+# source("analysis/R/encode.R")
+source("analysis/R/decode.R")
+# source("analysis/R/simulation.R")
+source("analysis/R/read_input.R")
+source("analysis/R/association.R")
+# source("tests/gen_counts.R")
+
+# Implements the EM algorithm on two variables
+# NOTE: var 2 assumed to be reported via Basic RAPPOR
+# TODO(pseudorandom): Remove this assumption on var 2.
+EMAlgBasic <- function(inp) {
+  ptm <- proc.time()
+  params <- ReadParameterFile(inp$params)
+  cat("Finished loading parameters.\n")
+  # Ensure sufficient maps as required by number of vars
+  stopifnot(inp$numvars == length(inp$maps))
+  # Correct map from ReadMapFile() for assoc analysis
+  map <- lapply(inp$maps, function(o)
+    CorrectMapForAssoc(LoadMapFile(o, params = params),
+                       params = params))
+
+  cat("Finished parsing map(s).\n")
+
+  # For Basic rappor, we need to setup a map file manually for var 2
+  # The map file has 2 components: 
+  # - an array of simple (0, 1) map to represent Basic rappor values for each
+  # cohort
+  # - a combined map (rmap) that collapses cohorts
+  m1 <- lapply(1:params$m, function(z) {
+    m <- sparseMatrix(c(1), c(2), dims = c(1, 2))
+    colnames(m) <- c("FALSE", "TRUE")
+    m
+  })
+  m2 <- sparseMatrix(1:params$m, rep(2, params$m))
+  colnames(m2) <- colnames(m1[[1]])
+  map[[2]]$map <- m1
+  map[[2]]$rmap <- m2
+
+  # Reports must be of the format
+  #     client name, cohort no, rappor bitstring 1, rappor bitstring 2, ...
+  # Reading to temporary data frame
+  tmp_reports_df <- read.csv(inp$reports,
+                             colClasses = c("character", "integer",
+                                            rep("character", inp$numvars)),
+                             header = TRUE)
+  # Ignore the first column
+  tmp_reports_df <- tmp_reports_df[,-1]
+
+  # TODO(pseudorandom): Do not assume var 2 is always boolean
+  # Fix which params$k is set to 1 based on which var is boolean
+  # params now is a tuple of params in the standard format as required by
+  # ComputeDistributionEM
+  params = list(params, params)
+  params[[2]]$k = 1
+
+  # Parsing tmp_reports_df
+  # ComputeDistributionEM allows for different sets of cohorts
+  # for each variable. Here, both sets of cohorts are identical
+  tmp_cohorts <- as.list(tmp_reports_df[1])$cohort
+  tmp_cohorts <- tmp_cohorts + 1  # 1 indexing for EM algorithm
+  # Expanded list of report cohorts
+  cohorts <- rep(list(tmp_cohorts), inp$numvars)
+  # Parse reports from tmp_reports_df cols 2, 3, ...
+  # into a reports array
+  reports <- lapply(1:inp$numvars, function(x) as.list(tmp_reports_df[x + 1]))
+
+  # Split ASCII strings into array of numerics (as required by assoc analysis)
+  reports <- lapply(1:inp$numvars, function(i) {
+    # apply the following function to each of reports[[1]] and reports[[2]]
+    # TODO(pseudorandom): Do not assume var 2 is always boolean in [[1]]
+    lapply(reports[[i]][[1]], function(x) {
+      # function splits strings and converts them to numeric values
+      # rev needed for endianness
+      rev(as.integer(strsplit(x, split = "")[[1]]))
+    })
+  })
+
+  cat("Finished parsing reports.\n")
+
+  # ignore_other = FALSE because in doing association with Basic RAPPOR, we're
+  # going to use the Other category to estimate no. of reports where the bit
+  # is unset
+  # marginals and estimate_var disabled
+  # verbose outputs timing information for efficiency analysis; set to inp$time
+  joint_dist <- ComputeDistributionEM(reports, cohorts, map,
+                                      ignore_other = FALSE,
+                                      params, marginals = NULL,
+                                      estimate_var = FALSE,
+                                      verbose = inp$time)
+  em <- joint_dist$fit
+  time_taken <- proc.time() - ptm
+  cat("Came here \n\n")
+  # Replace Other column name with FALSE. In Basic RAPPOR we use Other
+  # category to estimate no. of reports with unset bits (which are essentially
+  # FALSE)
+  colnames(em)[which(colnames(em) == "Other")] <- "FALSE"
+
+  # Unroll and write to results.csv
+  if(is.null(inp$results))
+    inp$results <- "results.csv"
+  write.csv(melt(em), file = inp$results, quote = FALSE,
+            row.names = FALSE)
+
+  print("EM Algorithm Results")
+  print(em[order(-rowSums(em)), order(-colSums(em))])
+  if(inp$time == TRUE)
+    print(time_taken)
+}
+
+main <- function(opts) {
+  inp <- fromJSON(opts$inp)
+  if(inp$algo == "EM")
+    EMAlgBasic(inp)
+  # Recommendation: Use EMAlg; TwoWayAlgBasic is still experimental
+  # and code is not completely checked in yet.
+  # if(inp$algo == "2Way")
+  #   TwoWayAlgBasic(inp)
+}
+
+if(!interactive()) {
+  main(opts)
+}
diff --git a/analysis/R/association.R b/analysis/R/association.R
@@ -17,6 +17,15 @@
 #     in RAPPOR. This contains the functions relevant to estimating joint
 #     distributions.
 
+library(parallel)  # mclapply
+
+# Wrapper function to print strings only if verbose flag is passed in
+PrintIfVerbose <- function(string, flag = FALSE) {
+  if(flag == TRUE) {
+    print(string)
+  }
+}
+
 GetOtherProbs <- function(counts, map, marginal, params) {
   # Computes the marginal for the "other" category.
   #
@@ -44,7 +53,7 @@ GetOtherProbs <- function(counts, map, marginal, params) {
   # Counts to remove from each cohort.
   top_counts <- ceiling(marginal$proportion * N / params$m)
   sum_top <- sum(top_counts)
-  candidate_map <- lapply(map, function(x) x[, candidate_strings])
+  candidate_map <- lapply(map, function(x) x[, candidate_strings, drop = FALSE])
 
   # Counts set by known strings without noise considerations.
   if (length(marginal) > 0) {
@@ -63,6 +72,10 @@ GetOtherProbs <- function(counts, map, marginal, params) {
   pstar <- (1 - f / 2) * p + (f / 2) * q
   top_counts_cohort <- (sum_top - top_counts_cohort) * pstar +
       top_counts_cohort * qstar
+
+  # Adjustment for basic rappor
+  if(nrow(top_counts_cohort) == 1) 
+    top_counts_cohort <- t(top_counts_cohort)
   top_counts_cohort <- cbind(sum_top, top_counts_cohort)
 
   # Counts set by the "other" category.
@@ -72,18 +85,20 @@ GetOtherProbs <- function(counts, map, marginal, params) {
   props_other[props_other > 1] <- 1
   props_other[is.nan(props_other)] <- 0
   props_other[is.infinite(props_other)] <- 0
+  # Adjustment for basic rappor
+  if(is.null(nrow(props_other)))
+    props_other <- t(props_other)
   as.list(as.data.frame(props_other))
 }
 
-GetCondProb <- function(report, candidate_strings, params, map,
-                        prob_other = NULL) {
+GetCondProb <- function(report, pstar, qstar, bit_indices, prob_other = NULL) {
   # Given the observed bit array, estimate P(report | true value).
   # Probabilities are estimated for all truth values.
   #
   # Args:
   #   report: a single observed RAPPOR report (binary vector of length k)
-  #   candidate_strings: vector of strings in the dictionary (i.e. not the
-  #       "other" category)
+  #   bit_indices: list of integer vectors of length h, specifying which bits
+  #   are set
   #   params: System parameters
   #   map: list of matrices encoding locations of hashes for each string
   #   prob_other: vector of length k, indicating how often each bit in the
@@ -93,22 +108,13 @@ GetCondProb <- function(report, candidate_strings, params, map,
   #   Conditional probability of report given each of the strings in
   #       candidate_strings
 
-  p <- params$p
-  q <- params$q
-  f <- params$f
   ones <- sum(report)
   zeros <- length(report) - ones
 
-  qstar <- (1 - f / 2) * q + (f / 2) * p
-  pstar <- (1 - f / 2) * p + (f / 2) * q
   probs <- ifelse(report == 1, pstar, 1 - pstar)
 
-  # Find the bits set by the candidate strings
-  inds <- lapply(candidate_strings, function(x)
-                 which(map[, x]))
-
   # Find the likelihood of report given each candidate string
-  prob_obs_vals <- sapply(inds, function(x) {
+  prob_obs_vals <- sapply(bit_indices, function(x) {
     prod(c(probs[-x], ifelse(report[x] == 1, qstar, 1 - qstar)))
   })
 
@@ -147,6 +153,7 @@ UpdatePij <- function(pij, cond_prob) {
   # Returns:
   #   Updated pijs from em algorithm (maximization)
 
+  # TODO: use mclapply, since it goes over all reports
   wcp <- lapply(cond_prob, function(x) {
     z <- x * pij
     z <- z / sum(z)
@@ -213,8 +220,20 @@ EM <- function(cond_prob, starting_pij = NULL, estimate_var = FALSE,
   if (nrow(pij[[1]]) > 0) {
     # Run EM
     for (i in 1:max_iter) {
+      if(i %% 100 == 0) {
+        cat(sprintf("EM iteration %d\n", i))
+      }
+      if (i == 1) {
+        ptm_iter <- proc.time()
+      }
       pij[[i + 1]] <- UpdatePij(pij[[i]], cond_prob)
       dif <- max(abs(pij[[i + 1]] - pij[[i]]))
+
+      # Timing information for debugging purposes
+      if (i == 1) {
+        PrintIfVerbose("ONE ITERATION", verbose)
+        PrintIfVerbose(proc.time() - ptm_iter, verbose)
+      }
       if (dif < epsilon) {
         break
       }
@@ -285,7 +304,8 @@ ComputeDistributionEM <- function(reports, report_cohorts,
                                   maps, ignore_other = FALSE,
                                   params,
                                   marginals = NULL,
-                                  estimate_var = FALSE) {
+                                  estimate_var = FALSE,
+                                  verbose = FALSE) {
   # Computes the distribution of num_variables variables, where
   #     num_variables is chosen by the client, using the EM algorithm.
   #
@@ -312,17 +332,24 @@ ComputeDistributionEM <- function(reports, report_cohorts,
   # Compute the counts for each variable and then do conditionals.
   joint_conditional = NULL
   found_strings <- list()
+  cd_for_reports <- list()
 
   for (j in (1:num_variables)) {
+    ptm <- proc.time()
     variable_report <- reports[[j]]
     variable_cohort <- report_cohorts[[j]]
     map <- maps[[j]]
 
     # Compute the probability of the "other" category
     variable_counts <- NULL
     if (is.null(marginals)) {
-      variable_counts <- ComputeCounts(variable_report, variable_cohort, params)
-      marginal <- Decode(variable_counts, map$rmap, params, quiet = TRUE)$fit
+      ptm2 <- proc.time()
+      variable_counts <- ComputeCounts(variable_report, variable_cohort, params[[j]])
+      marginal <- Decode(variable_counts, map$rmap, params[[j]],
+                         quiet = TRUE)$fit
+      print(marginal)
+      PrintIfVerbose("TIME IN MARGINALS", verbose)
+      PrintIfVerbose(proc.time() - ptm2, verbose)
       if (nrow(marginal) == 0) {
         return (NULL)
       }
@@ -332,38 +359,64 @@ ComputeDistributionEM <- function(reports, report_cohorts,
     found_strings[[j]] <- marginal$string
 
     if (ignore_other) {
-      prob_other <- vector(mode = "list", length = params$m)
+      prob_other <- vector(mode = "list", length = params[[j]]$m)
     } else {
       if (is.null(variable_counts)) {
         variable_counts <- ComputeCounts(variable_report, variable_cohort,
-                                         params)
+                                         params[[j]])
       }
       prob_other <- GetOtherProbs(variable_counts, map$map, marginal,
-                                  params)
+                                  params[[j]])
       found_strings[[j]] <- c(found_strings[[j]], "Other")
     }
 
+    p <- params[[j]]$p
+    q <- params[[j]]$q
+    f <- params[[j]]$f
+
+    pstar <- (1 - f/2) * p + (f / 2) * q
+    qstar <- (1 - f/2) * q + (f / 2) * p
+    candidate_strings <- rownames(marginal)
+
+    bit_indices_by_cohort <- lapply(1:params[[j]]$m, function(cohort) {
+        map_for_cohort <- map$map[[cohort]]
+        # Find the bits set by the candidate strings
+        bit_indices <- lapply(candidate_strings, function(x) {
+          which(map_for_cohort[, x])})
+        bit_indices
+      })
+
+    num_cores <- 10
+
+    cat("Computing conditional probabilities\n")
     # Get the joint conditional distribution
-    cond_report_dist <- lapply(seq(length(variable_report)), function(i) {
+    cond_report_dist <- mclapply(seq(length(variable_report)), function(i) {
+      if (i %% 100000 == 0) {
+        cat(sprintf("Processed %d reports ...\n", i))
+      }
       idx <- variable_cohort[i]
+      bit_indices <- bit_indices_by_cohort[[idx]]
       rep <- GetCondProb(variable_report[[i]],
-                         candidate_strings = rownames(marginal),
-                         params = params,
-                         map$map[[idx]],
+                         pstar, qstar, bit_indices,
                          prob_other[[idx]])
       rep
-    })
+    }, mc.cores = num_cores)
 
     # Update the joint conditional distribution of all variables
     joint_conditional <- UpdateJointConditional(cond_report_dist,
-                                                joint_conditional)
+                                              joint_conditional)
+    PrintIfVerbose("TIME IN COND_REPORT_DIST", verbose)
+    PrintIfVerbose(proc.time()-ptm, verbose)
   }
 
+  ptm <- proc.time()
   # Run expectation maximization to find joint distribution
   em <- EM(joint_conditional, epsilon = 10 ^ -6, verbose = FALSE,
            estimate_var = estimate_var)
+  PrintIfVerbose("TIME IN EM", verbose)
+  PrintIfVerbose(proc.time() - ptm, verbose)
   dimnames(em$est) <- found_strings
+
   # Return results in a usable format
   list(fit = em$est, sd = em$sd, em = em)
-
 }