This repository was archived by the owner on Sep 27, 2023. It is now read-only.
71 commits
dd546e9
Adding some more testing to assoc analysis.
ananthr Apr 29, 2015
62375c6
Adding assoctest.sh.
ananthr Apr 30, 2015
935309e
Assoctest.sh test suite.
ananthr May 7, 2015
17d3f1f
Modifying files involved with generating summary.
ananthr May 7, 2015
9ab52bb
Cleaning up assoctest.sh
ananthr May 7, 2015
69fe145
Cleaning up code and summary HTML.
ananthr May 7, 2015
deff9a2
Some small changes to help with test rig.
ananthr May 12, 2015
fa3cae1
Merge branch 'master' of github.com:google/rappor into feature/assoc-…
ananthr May 12, 2015
d9831c4
Moving from l1 distance to t.v. = l1/2.
ananthr May 12, 2015
6754f2d
Pushing small changes.
ananthr May 18, 2015
e1c9be8
Merge remote-tracking branch 'origin/constrained_lm' into feature/ass…
ananthr May 18, 2015
45ee2f8
Fixing inconsistencies in map objects.
ananthr May 18, 2015
c9484c5
Re-factoring regtest_spec.py to suit assoc better.
ananthr May 18, 2015
b61f251
Modifications to allow the use of the new decode.
ananthr May 19, 2015
a7e69eb
Updates on association test suite.
ananthr Jun 4, 2015
19d7f93
Adding a couple more specs to test.
ananthr Jun 4, 2015
d151791
Updating association to work with 3 variables.
ananthr Jun 9, 2015
0ed6ab6
Can we replace EM with 2-way marginal computations?
ananthr Jun 11, 2015
d37dcf0
Combining maps.
ananthr Jun 12, 2015
1a983db
More Decode code to support 2-way marginals.
ananthr Jun 13, 2015
669c500
Replacing EM with two-way marginals.
ananthr Jun 16, 2015
598abc2
Working on 2-way marginal code.
ananthr Jun 16, 2015
e293e67
Fixing some bugs.
ananthr Jun 16, 2015
85495da
Fixing a bug in assoctest.sh
ananthr Jun 16, 2015
f0e8272
More testing with 2-way marginals.
ananthr Jun 18, 2015
2672282
Edits.
ananthr Jun 18, 2015
b43aa87
Simulating noise directly.
ananthr Jun 19, 2015
ca9953e
Test suite updated to consider 2-way marginals.
ananthr Jun 19, 2015
f33b285
Small updates to test cases.
ananthr Jun 20, 2015
b3cd759
Making tests run sequentially.
ananthr Jun 20, 2015
c1c48cc
gen_assoc_reports.R to produce assoc. reports.
ananthr Jun 24, 2015
dd3dd83
Changes for running experiments with 2 way marginals.
ananthr Jun 25, 2015
e296f6b
Assoctest.sh test suite for experiments up.
ananthr Jun 25, 2015
53233ba
The test suite now can run two experiments simultaneously.
ananthr Jun 25, 2015
bfb257e
Now considering both experiments. Fixing small changes.
ananthr Jun 25, 2015
9fd30de
Small changes to assoc suite.
ananthr Jun 25, 2015
6733486
Adding analysis/R/alternative.R.
ananthr Jun 29, 2015
37a39ef
Merged changes from drop-QP branch.
ananthr Jun 29, 2015
3547cf3
Better summary in assoctest for experiments with 2-way marginals.
ananthr Jul 7, 2015
f9390ab
Tests run sequentially. Trying random projection.
ananthr Jul 8, 2015
e5435bb
Marginals constraints for LSEI.
ananthr Jul 9, 2015
edb44d6
Cleaning up expts in association.R
ananthr Jul 13, 2015
09f2258
Minor clean up.
ananthr Jul 13, 2015
55b18c3
Moving 2 way marginal code to its own file.
ananthr Jul 13, 2015
bcbacbe
Small changes to sum_bits_assoc.
ananthr Jul 13, 2015
b57375e
Merge branch 'master' into feature/assoc-demo
ananthr Jul 13, 2015
a155be8
Merging from master branch.
ananthr Jul 13, 2015
00b827b
Some code refactoring.
ananthr Jul 14, 2015
3e81261
Updated some documentation.
ananthr Jul 14, 2015
a67151b
A few code changes.
ananthr Jul 15, 2015
420b6a0
Small fixes, updates to assoctest.sh
ananthr Jul 15, 2015
8546101
Merge branch 'master' into feature/assoc-demo
ananthr Jul 20, 2015
f73aac4
Incorporating changes from master
ananthr Jul 21, 2015
ab78319
Added a test for gen_assoc_reports.R
ananthr Jul 21, 2015
feee5d8
Replaced regtest_spec.py from master branch.
ananthr Jul 21, 2015
7936fc9
Moving deprecated code to experimental directory
ananthr Jul 21, 2015
3deceee
A few fixes from code review.
ananthr Jul 23, 2015
c0ea8cf
Addressing more review comments.
ananthr Jul 23, 2015
870ee04
Adding sum_bits_assoc_test and fixing small error in assoctest.sh
ananthr Jul 23, 2015
964f8a9
Adding sum_bits_assoc_test.py
ananthr Jul 23, 2015
a4accc9
Added a couple more tests to sum_bits_assoc_test
ananthr Jul 23, 2015
e66ffd1
Adding compare_assoc.R instead of analyze_assoc_expt.R
ananthr Jul 23, 2015
75120b9
Code review changes
ananthr Jul 24, 2015
92590b8
Remove display of compare flag in results.
ananthr Jul 24, 2015
2108061
Reconciled with old decode.R for assoc pruning.
ananthr Jul 27, 2015
22fa769
Fixed expected_f_2way in sum bits assoc test
ananthr Jul 27, 2015
2174172
Wrapper for running quick analysis.
ananthr Jul 27, 2015
bda7275
Clean up in assoctest.sh
ananthr Jul 27, 2015
5e665da
Modifications to work with basic assocations.
ananthr Aug 5, 2015
45052b9
Rigging old EM code to work with Basic assoc.
ananthr Aug 6, 2015
bde82f4
params causes a bug
ananthr Sep 9, 2015
185 changes: 185 additions & 0 deletions analysis/R/assoc.R
@@ -0,0 +1,185 @@
#!/usr/bin/env Rscript
#
# Copyright 2015 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Reads map files, report files, count files, and RAPPOR parameters, and
# estimates the joint distribution over two or more variables, using a
# two-way-marginals algorithm and (optionally) an EM algorithm.
#
# Usage:
#   $ ./assoc.R --inp <JSON file>
#
# Input: JSON file with the following fields:
#   "numvars"  number of variables to associate
#   "maps"     a map file for each variable
#   "reports"  a CSV file of reports
#   "counts"   count files: the 2-way marginal counts followed by each
#              variable's individual marginal counts
#   "params"   a params file with the RAPPOR parameters
#   "csv_out"  a file name into which results are written as
#              comma-separated values
#   "also_em"  if true, also run the EM algorithm
#   "time"     if true, print timing information
#
# Output: a table with the estimated joint distribution, printed to stdout,
#         and a CSV file with the results
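#
# For illustration only, an input file might look like this (all file names
# and values below are hypothetical, not taken from the repository):
#
#   {
#     "numvars": 2,
#     "maps": ["var1_map.csv", "var2_map.csv"],
#     "reports": "reports.csv",
#     "counts": ["counts_2way.csv", "counts_var1.csv", "counts_var2.csv"],
#     "params": "params.csv",
#     "csv_out": "assoc_results.csv",
#     "also_em": false,
#     "time": false
#   }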

library("jsonlite")
library("optparse")

options(stringsAsFactors = FALSE)

if (!interactive()) {
  option_list <- list(
    make_option(c("--inp"), default = "inp.json",
                help = "JSON file with inputs for assoc.R"))
  opts <- parse_args(OptionParser(option_list = option_list))
}

source("analysis/R/decode2way.R")
source("analysis/R/encode.R")
source("analysis/R/decode.R")
source("analysis/R/simulation.R")
source("analysis/R/read_input.R")
source("analysis/R/association.R")
source("tests/gen_counts.R")
source("tests/compare_assoc.R") # For CombineMaps; it should be moved elsewhere

TwoWayAlg <- function(inp) {
  ptm <- proc.time()
  params <- ReadParameterFile(inp$params)
  # Ensure sufficient maps as required by number of vars
  # Correct map from ReadMapFile() for assoc analysis
  stopifnot(inp$numvars == length(inp$maps))
  map <- lapply(inp$maps, function(o)
    CorrectMapForAssoc(ReadMapFile(o, params = params),
                       params = params))

  # (2 way counts, marginal 1 counts, marginal 2 counts)
  counts <- lapply(1:3, function(i) ReadCountsFile(inp$counts[[i]]))

  # TODO: account for different parameters across different variables
  params2 <- params
  params2$k <- (params$k ** 2) * 4

  # Prune candidates
  fit <- lapply(1:2, function(i)
    Decode(counts[[i + 1]],
           map[[i]]$rmap,
           params, quick = FALSE)$fit)

  found_strings <- list(fit[[1]][, "string"], fit[[2]][, "string"])

  if (length(found_strings[[1]]) == 0 || length(found_strings[[2]]) == 0) {
    stop("No strings found in 1-way marginal.")
  }

  # Combine maps to feed into Decode2Way
  # Prune first to found_strings from Decode on 1-way counts
  pruned <- lapply(1:2, function(i)
    lapply(map[[i]]$map, function(z) z[, found_strings[[i]], drop = FALSE]))
  crmap <- CombineMaps(pruned[[1]], pruned[[2]])$crmap
  marginal <- Decode2Way(counts[[1]], crmap, params2, fit = fit)$fit

  # Reconstruct 2-way table from marginals
  ed <- matrix(0, nrow = length(found_strings[[1]]),
               ncol = length(found_strings[[2]]))
  colnames(ed) <- found_strings[[2]]
  rownames(ed) <- found_strings[[1]]
  for (cols in found_strings[[2]]) {
    for (rows in found_strings[[1]]) {
      ed[rows, cols] <- marginal[paste(rows, cols, sep = "x"), "Estimate"]
    }
  }
  ed[is.na(ed)] <- 0
  ed[ed < 0] <- 0

  time_taken <- proc.time() - ptm
  print("Two Way Algorithm Results")
  print(ed[order(-rowSums(ed)), order(-colSums(ed))])
  if (inp$time == TRUE)
    print(time_taken)
}

EMAlg <- function(inp) {
  ptm <- proc.time()
  params <- ReadParameterFile(inp$params)
  # Ensure sufficient maps as required by number of vars
  stopifnot(inp$numvars == length(inp$maps))
  # Correct map from ReadMapFile() for assoc analysis
  map <- lapply(inp$maps, function(o)
    CorrectMapForAssoc(LoadMapFile(o, params = params),
                       params = params))

  # For BASIC only
  m1 <- lapply(1:params$m, function(z) {
    m <- sparseMatrix(c(1), c(2), dims = c(1, 2))
    colnames(m) <- c("FALSE", "TRUE")
    m
  })
  m2 <- sparseMatrix(1:params$m, rep(2, params$m))
  colnames(m2) <- colnames(m1[[1]])
  map[[2]]$map <- m1
  map[[2]]$rmap <- m2

  # Reports must be of the format
  # client name, cohort no, rappor bitstring 1, rappor bitstring 2, ...
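  # For example (header names and values are hypothetical, not taken from
  # the repository):
  #   client,cohort,rappor1,rappor2
  #   c1,0,01100100,10010001
  #   c2,3,10110010,01001100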
  reportsObj <- read.csv(inp$reports,
                         colClasses = c("character", "integer",
                                        rep("character", inp$numvars)),
                         header = TRUE)
  # Ignore the first column
  reportsObj <- reportsObj[, -1]

  params <- list(params, params)
  params[[2]]$k <- 1

  # Parsing reportsObj
  # ComputeDistributionEM allows for different sets of cohorts
  # for each variable. Here, both sets of cohorts are identical
  co <- as.list(reportsObj[1])[[1]]
  co <- co + 1  # 1 indexing
  cohorts <- rep(list(co), inp$numvars)
  # Parse reports from reportsObj cols 2, 3, ...
  reports <- lapply(1:inp$numvars, function(x) as.list(reportsObj[x + 1]))

  # Split strings into bit arrays (as required by assoc analysis)
  reports <- lapply(1:inp$numvars, function(i) {
    # apply the following function to each of reports[[1]] and reports[[2]]
    lapply(reports[[i]][[1]], function(x) {
      # function splits strings and converts them to numeric values
      # rev needed for endianness
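      # e.g. the report string "0011" becomes c(1, 1, 0, 0)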
      rev(as.numeric(strsplit(x, split = "")[[1]]))
    })
  })

  joint_dist <- ComputeDistributionEM(reports, cohorts, map,
                                      ignore_other = FALSE,
                                      quick = TRUE,
                                      params, marginals = NULL,
                                      estimate_var = FALSE,
                                      verbose = inp$time)
  em <- joint_dist$fit
  time_taken <- proc.time() - ptm
  print("EM Algorithm Results")
  print(em[order(-rowSums(em)), order(-colSums(em))])
  if (inp$time == TRUE)
    print(time_taken)
}

main <- function(opts) {
  inp <- fromJSON(opts$inp)
  TwoWayAlg(inp)
  if (inp$also_em == TRUE)
    EMAlg(inp)
}

if (!interactive()) {
  main(opts)
}
48 changes: 37 additions & 11 deletions analysis/R/association.R
@@ -44,7 +44,7 @@ GetOtherProbs <- function(counts, map, marginal, params) {
# Counts to remove from each cohort.
top_counts <- ceiling(marginal$proportion * N / params$m)
sum_top <- sum(top_counts)
candidate_map <- lapply(map, function(x) x[, candidate_strings])
candidate_map <- lapply(map, function(x) x[, candidate_strings, drop = FALSE])

# Counts set by known strings without noise considerations.
if (length(marginal) > 0) {
@@ -63,6 +63,10 @@
pstar <- (1 - f / 2) * p + (f / 2) * q
top_counts_cohort <- (sum_top - top_counts_cohort) * pstar +
top_counts_cohort * qstar

# Adjustment for Basic RAPPOR
if (nrow(top_counts_cohort) == 1)
top_counts_cohort <- t(top_counts_cohort)
top_counts_cohort <- cbind(sum_top, top_counts_cohort)

# Counts set by the "other" category.
@@ -72,6 +76,9 @@
props_other[props_other > 1] <- 1
props_other[is.nan(props_other)] <- 0
props_other[is.infinite(props_other)] <- 0
# Adjustment for Basic RAPPOR
if (is.null(nrow(props_other)))
props_other <- t(props_other)
as.list(as.data.frame(props_other))
}

@@ -213,8 +220,15 @@ EM <- function(cond_prob, starting_pij = NULL, estimate_var = FALSE,
if (nrow(pij[[1]]) > 0) {
# Run EM
for (i in 1:max_iter) {
if (i == 1) {
ptm_iter <- proc.time()
}
pij[[i + 1]] <- UpdatePij(pij[[i]], cond_prob)
dif <- max(abs(pij[[i + 1]] - pij[[i]]))
if (i == 1) {
PrintIfVerbose("ONE ITERATION", verbose)
PrintIfVerbose(proc.time() - ptm_iter, verbose)
}
if (dif < epsilon) {
break
}
@@ -283,9 +297,10 @@ UpdateJointConditional <- function(cond_report_dist, joint_conditional = NULL) {

ComputeDistributionEM <- function(reports, report_cohorts,
maps, ignore_other = FALSE,
params,
params, quick = FALSE,
marginals = NULL,
estimate_var = FALSE) {
estimate_var = FALSE,
verbose = FALSE) {
# Computes the distribution of num_variables variables, where
# num_variables is chosen by the client, using the EM algorithm.
#
@@ -312,17 +327,23 @@
# Compute the counts for each variable and then do conditionals.
joint_conditional = NULL
found_strings <- list()
cd_for_reports <- list()

for (j in (1:num_variables)) {
ptm <- proc.time()
variable_report <- reports[[j]]
variable_cohort <- report_cohorts[[j]]
map <- maps[[j]]

# Compute the probability of the "other" category
variable_counts <- NULL
if (is.null(marginals)) {
variable_counts <- ComputeCounts(variable_report, variable_cohort, params)
marginal <- Decode(variable_counts, map$rmap, params, quiet = TRUE)$fit
ptm2 <- proc.time()
variable_counts <- ComputeCounts(variable_report, variable_cohort, params[[j]])
marginal <- Decode(variable_counts, map$rmap, params[[j]], quick,
quiet = TRUE)$fit
PrintIfVerbose("TIME IN MARGINALS", verbose)
PrintIfVerbose(proc.time() - ptm2, verbose)
if (nrow(marginal) == 0) {
return (NULL)
}
@@ -332,14 +353,14 @@
found_strings[[j]] <- marginal$string

if (ignore_other) {
prob_other <- vector(mode = "list", length = params$m)
prob_other <- vector(mode = "list", length = params[[j]]$m)
} else {
if (is.null(variable_counts)) {
variable_counts <- ComputeCounts(variable_report, variable_cohort,
params)
params[[j]])
}
prob_other <- GetOtherProbs(variable_counts, map$map, marginal,
params)
params[[j]])
found_strings[[j]] <- c(found_strings[[j]], "Other")
}

@@ -348,22 +369,27 @@
idx <- variable_cohort[i]
rep <- GetCondProb(variable_report[[i]],
candidate_strings = rownames(marginal),
params = params,
params = params[[j]],
map$map[[idx]],
prob_other[[idx]])
rep
})

# Update the joint conditional distribution of all variables
joint_conditional <- UpdateJointConditional(cond_report_dist,
joint_conditional)
joint_conditional)
PrintIfVerbose("TIME IN COND_REPORT_DIST", verbose)
PrintIfVerbose(proc.time() - ptm, verbose)
}

ptm <- proc.time()
# Run expectation maximization to find joint distribution
em <- EM(joint_conditional, epsilon = 10 ^ -6, verbose = FALSE,
estimate_var = estimate_var)
PrintIfVerbose("TIME IN EM", verbose)
PrintIfVerbose(proc.time() - ptm, verbose)
dimnames(em$est) <- found_strings

# Return results in a usable format
list(fit = em$est, sd = em$sd, em = em)

}
11 changes: 8 additions & 3 deletions analysis/R/decode.R
@@ -74,9 +74,14 @@ EstimateBloomCounts <- function(params, obs_counts) {

# Transform counts from absolute values to fractional, removing bias due to
# variability of reporting between cohorts.
ests <- apply(ests, 1, function(x) x / obs_counts[,1])
stds <- apply(variances^.5, 1, function(x) x / obs_counts[,1])

if (ncol(obs_counts) == 2) {
ests <- apply(t(ests), 1, function(x) x / obs_counts[,1])
stds <- apply(t(variances^.5), 1, function(x) x / obs_counts[,1])
} else {
ests <- apply((ests), 1, function(x) x / obs_counts[,1])
stds <- apply((variances^.5), 1, function(x) x / obs_counts[,1])
}

# Some estimates may be set to infinity, e.g. if f=1. We want to
# account for this possibility, and set the corresponding counts
# to 0.