From 2dba1e683d1ab63898d36a915b72fdebecc197bc Mon Sep 17 00:00:00 2001
From: Michel Rivas Hernandez
Date: Wed, 27 Sep 2017 15:13:11 +0100
Subject: [PATCH 1/4] Fixing association test samples generation

Adding column names to cohort maps
---
 tests/analyze_assoc.R | 18 ++++++++++--------
 tests/assoc_sim.R     | 12 ++++++------
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/tests/analyze_assoc.R b/tests/analyze_assoc.R
index 5d78806f..d49bed60 100755
--- a/tests/analyze_assoc.R
+++ b/tests/analyze_assoc.R
@@ -49,11 +49,11 @@
   opts <- parse_args(OptionParser(option_list = option_list))
 }
 
-source("../analysis/R/encode.R")
-source("../analysis/R/decode.R")
-source("../analysis/R/simulation.R")
-source("../analysis/R/read_input.R")
-source("../analysis/R/association.R")
+source("analysis/R/encode.R")
+source("analysis/R/decode.R")
+source("analysis/R/simulation.R")
+source("analysis/R/read_input.R")
+source("analysis/R/association.R")
 
 # This function processes the maps loaded using ReadMapFile
 # Association analysis requires a map object with a map
@@ -66,16 +66,18 @@ source("../analysis/R/association.R")
 #   params = data field with parameters
 # TODO(pseudorandom): move this functionality to ReadMapFile
 ProcessMap <- function(map, params) {
-  map$rmap <- map$map
+  map$all_cohorts_map <- map$map
   split_map <- function(i, map_struct) {
     numbits <- params$k
     indices <- which(as.matrix(
       map_struct[((i - 1) * numbits + 1):(i * numbits),]) == TRUE,
       arr.ind = TRUE)
-    sparseMatrix(indices[, "row"], indices[, "col"],
+    map_by_cohort <- sparseMatrix(indices[, "row"], indices[, "col"],
                  dims = c(numbits, max(indices[, "col"])))
+    colnames(map_by_cohort) <- colnames(map_struct)
+    map_by_cohort
   }
-  map$map <- lapply(1:params$m, function(i) split_map(i, map$rmap))
+  map$map_by_cohort <- lapply(1:params$m, function(i) split_map(i, map$all_cohorts_map))
   map
 }
 
diff --git a/tests/assoc_sim.R b/tests/assoc_sim.R
index 3ff1e5df..4536ff1a 100755
--- a/tests/assoc_sim.R
+++ b/tests/assoc_sim.R
@@ -51,11 +51,11 @@
   opts <- parse_args(OptionParser(option_list = option_list))
 }
 
-source("../analysis/R/encode.R")
-source("../analysis/R/decode.R")
-source("../analysis/R/simulation.R")
-source("../analysis/R/read_input.R")
-source("../analysis/R/association.R")
+source("analysis/R/encode.R")
+source("analysis/R/decode.R")
+source("analysis/R/simulation.R")
+source("analysis/R/read_input.R")
+source("analysis/R/association.R")
 
 # Read unique values of reports from a csv file
 # Inputs: filename. The file is expected to contain two rows of strings
@@ -144,7 +144,7 @@ SimulateReports <- function(N, uvals, params, unif,
   # Format:
   #   cohort, bloom filter var1, bloom filter var2
   reports <- lapply(1:2, function(i)
-    EncodeAll(samples[[i]], cohorts, map[[i]]$map, params))
+    EncodeAll(samples[[i]], cohorts, map[[i]]$all_cohorts_map, params))
 
   # Organize cohorts and reports into format
   write_matrix <- cbind(as.matrix(cohorts), as.matrix(lapply(reports[[1]],

From 2fb6ef282dc1d8b556cc19146f81db4c24eda618 Mon Sep 17 00:00:00 2001
From: Michel Rivas Hernandez
Date: Thu, 28 Sep 2017 16:27:08 +0100
Subject: [PATCH 2/4] Fixing generation of association reports

---
 tests/assoc_sim.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/assoc_sim.R b/tests/assoc_sim.R
index 4536ff1a..09001923 100755
--- a/tests/assoc_sim.R
+++ b/tests/assoc_sim.R
@@ -144,7 +144,7 @@ SimulateReports <- function(N, uvals, params, unif,
   # Format:
   #   cohort, bloom filter var1, bloom filter var2
   reports <- lapply(1:2, function(i)
-    EncodeAll(samples[[i]], cohorts, map[[i]]$all_cohorts_map, params))
+    EncodeAll(samples[[i]], cohorts, map[[i]]$map_by_cohort, params))
 
   # Organize cohorts and reports into format
   write_matrix <- cbind(as.matrix(cohorts), as.matrix(lapply(reports[[1]],

From c760be62fac7f307de5f2df76e1b705c50cc4b8c Mon Sep 17 00:00:00 2001
From: Michel Rivas Hernandez
Date: Wed, 4 Oct 2017 14:40:27 +0100
Subject: [PATCH 3/4] Adding tests for n variables

---
 tests/analyze_assoc_n.R | 139 ++++++++++++++++++++++++++++++++
 tests/assoc_sim_n.R     | 172 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 311 insertions(+)
 create mode 100644 tests/analyze_assoc_n.R
 create mode 100644 tests/assoc_sim_n.R

diff --git a/tests/analyze_assoc_n.R b/tests/analyze_assoc_n.R
new file mode 100644
index 00000000..42533ac8
--- /dev/null
+++ b/tests/analyze_assoc_n.R
@@ -0,0 +1,139 @@
+#!/usr/bin/env Rscript
+#
+# Copyright 2015 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Reads map files, report files, and RAPPOR parameters to run
+# an EM algorithm to estimate joint distribution over two or more variables
+#
+# Usage:
+# $ ./analyze_assoc.R -map1 map_1.csv -map2 map_2.csv \
+# -reports reports.csv \
+# Inputs: map1, map2, reports, params
+# see how options are parsed below for more information
+# Outputs:
+# prints a table with estimated joint probability masses
+# over candidate strings
+# Ex.
+#              ssl     nossl
+# intel        0.1     0.3
+# google       0.5     0.1
+
+library("argparse")
+library("glmnet")
+
+options(stringsAsFactors = FALSE)
+
+if(!interactive()) {
+  parser <- ArgumentParser()
+  # Flags
+  parser$add_argument("-m", "--map", metavar='N', nargs='+',
+                      help = "Hashed candidates 1..N")
+  parser$add_argument("-r", "--reports", default = "reports.csv",
+                      help = "File with raw reports as <cohort, report 1, .., report N>")
+  parser$add_argument("-p", "--params", default = "params.csv",
+                      help = "Filename for RAPPOR parameters")
+
+# opts <- parse_args(OptionParser(option_list = option_list))
+# opts <- commandArgs(trailingOnly = TRUE)
+  opts <- parser$parse_args()
+}
+
+source("analysis/R/encode.R")
+source("analysis/R/decode.R")
+source("analysis/R/simulation.R")
+source("analysis/R/read_input.R")
+source("analysis/R/association.R")
+# This function processes the maps loaded using ReadMapFile
+# Association analysis requires a map object with a map_by_cohort
+# field that has the map split into cohorts and an all_cohorts_map
+# field that has all the cohorts combined
+# Arguments:
+#   map = map object with cohorts as sparse matrix in
+#         object map$map
+#         This is the expected object from ReadMapFile
+#   params = data field with parameters
+# TODO(pseudorandom): move this functionality to ReadMapFile
+ProcessMap <- function(map, params) {
+  map$all_cohorts_map <- map$map
+  split_map <- function(i, map_struct) {
+    numbits <- params$k
+    indices <- which(as.matrix(
+      map_struct[((i - 1) * numbits + 1):(i * numbits),]) == TRUE,
+      arr.ind = TRUE)
+    map_by_cohort <- sparseMatrix(indices[, "row"], indices[, "col"],
+                                  dims = c(numbits, max(indices[, "col"])))
+    colnames(map_by_cohort) <- colnames(map_struct)
+    map_by_cohort
+  }
+  map$map_by_cohort <- lapply(1:params$m, function(i) split_map(i, map$all_cohorts_map))
+  map
+}
+
+main <- function(opts) {
+  ptm <- proc.time()
+
+  params <- ReadParameterFile(opts$params)
+  map <- lapply(opts$map, function(o)
+    ProcessMap(ReadMapFile(o, params = params),
+               params = params))
+  N <- length(opts$map)
+  # Reports must be of the format
+  # cohort no, rappor bitstring 1.. rappor bitstring N
+  reportsObj <- read.csv(opts$reports,
+                         colClasses = c("integer", rep("character", 2*N)),
+                         header = FALSE)
+
+  # Parsing reportsObj
+  # ComputeDistributionEM allows for different sets of cohorts
+  # for each variable. Here, all N sets of cohorts are identical
+  co <- as.list(reportsObj[1])[[1]]
+  cohorts <- rep(list(co), N)
+  # Parse reports from reportsObj columns 2 to N + 1
+  reports <- lapply(2:(N+1), function(x) as.list(reportsObj[x]))
+
+  # Split strings into bit arrays (as required by assoc analysis)
+  reports <- lapply(1:N, function(i) {
+    # apply the following function to each of reports[[1]] .. reports[[N]]
+    lapply(reports[[i]][[1]], function(x) {
+      # function splits strings and converts them to numeric values
+      as.numeric(strsplit(x, split = "")[[1]])
+    })
+  })
+
+  joint_dist <- ComputeDistributionEM(reports, cohorts, map,
+                                      ignore_other = TRUE,
+                                      params, marginals = NULL,
+                                      estimate_var = TRUE)
+  # TODO(pseudorandom): Export the results to a file for further analysis
+  print("JOINT_DIST$FIT")
+  print(joint_dist$fit)
+  print("JOINT_DIST$SUM")
+  x <- joint_dist$fit
+  y <- cbind(x, rowSums(x))
+  z <- rbind(y, colSums(y))
+  print(rowSums(joint_dist$fit))
+  print(colSums(joint_dist$fit))
+  print(z[length(z)])
+  print("JOINT_DIST$SD")
+  print(joint_dist$sd)
+  print("JOINT_DIST$VAR-COV")
+  print(joint_dist$var_cov)
+  print("PROC.TIME")
+  print(proc.time() - ptm)
+}
+
+if(!interactive()) {
+  main(opts)
+}
\ No newline at end of file
diff --git a/tests/assoc_sim_n.R b/tests/assoc_sim_n.R
new file mode 100644
index 00000000..0c619544
--- /dev/null
+++ b/tests/assoc_sim_n.R
@@ -0,0 +1,172 @@
+#!/usr/bin/env Rscript
+#
+# Copyright 2015 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Simulates inputs on which association analysis can be run.
+# assoc_sim_n.R extends assoc_sim.R to support N variables.
+#
+# Usage:
+# $ ./assoc_sim_n.R -n 1000
+# Inputs: uvals, params, reports, map, num, unif
+# see how options are parsed below for more information
+# Outputs:
+# reports.csv file containing reports
+# map_{1, 2, ...}.csv file(s) containing maps of variables
+
+library("optparse")
+
+options(stringsAsFactors = FALSE)
+
+if(!interactive()) {
+  option_list <- list(
+    make_option(c("--uvals", "-v"), default = "uvals.csv",
+                help = "Filename for list of values over which
+                        distributions are simulated. Each line of the file
+                        is a comma-separated list of strings for one
+                        variable."),
+    make_option(c("--params", "-p"), default = "params.csv",
+                help = "Filename for RAPPOR parameters"),
+    make_option(c("--reports", "-r"), default = "reports.csv",
+                help = "Filename for reports"),
+    make_option(c("--map", "-m"), default = "map",
+                help = "Filename *prefix* for map(s)"),
+    make_option(c("--num", "-n"), default = 1e05,
+                help = "Number of reports"),
+    make_option(c("--unif", "-u"), default = FALSE,
+                help = "Run simulation with uniform distribution")
+  )
+  opts <- parse_args(OptionParser(option_list = option_list))
+}
+
+source("analysis/R/encode.R")
+source("analysis/R/decode.R")
+source("analysis/R/simulation.R")
+source("analysis/R/read_input.R")
+source("analysis/R/association.R")
+
+# Read unique values of reports from a csv file
+# Inputs: filename. The file is expected to contain one row of
+#         comma-separated strings per variable, e.g.:
+#         "google.com", "apple.com", ...
+#         "ssl", "nossl", ...
+#         "uk", "usa", "china", ...
+# Returns: a list containing strings
+GetUniqueValsFromFile <- function(filename) {
+  contents <- read.csv(filename, header = FALSE)
+  # Removes superfluous "" entries if the lists of unique values
+  # differ in length
+  strip_empty <- function(vec) {
+    vec[!vec %in% c("")]
+  }
+  lapply(1:nrow(contents), function(i) strip_empty(as.vector(t(contents[i,]))))
+}
+
+# Simulate correlated reports and write into reportsfile
+# Inputs: N = number of reports
+#         uvals = list containing a list of unique values
+#         params = list with RAPPOR parameters
+#         unif = whether to replace poisson with uniform
+#         mapfile = file to write maps into (with .csv suffixes)
+#         reportsfile = file to write reports into (with .csv suffix)
+SimulateReports <- function(N, uvals, params, unif,
+                            mapfile, reportsfile) {
+  # Compute true distribution
+  m <- params$m
+
+  if (unif) {
+    # Draw uniformly from 1 to 10
+    v1_samples <- as.integer(runif(N, 1, 10))
+  } else {
+    # Draw from a Poisson random variable
+    v1_samples <- rpois(N, 1) + rep(1, N)
+  }
+
+  # Draw the M variables as correlated perturbations of the base samples:
+  # each value stays, moves down 1, or moves up 1 with equal probability (floored at 1).
+  M <- length(uvals)
+  Resample <- function(samples) {
+    N <- length(samples)
+    vals <- sample(c(-1, 0, 1), N, replace = TRUE)
+    unlist(lapply(1:N, function(i) if (samples[i] == 1 && vals[i] == -1) samples[i] else samples[i] + vals[i]))
+  }
+  # tmp_samples[[j]] holds the resampled (integer) values for variable j
+  tmp_samples <- lapply(rep(list(v1_samples), M), Resample)
+
+  # Function to pad strings to uval_vec if sample_vec has
+  # larger support than the number of strings in uval_vec
+  # For e.g., if samples have support {1, 2, 3, 4, ...} and uvals
+  # only have "value1", "value2", and "value3", samples now
+  # over support {"value1", "value2", "value3", "str4", ...}
+  PadStrings <- function(sample_vec, uval_vec) {
+    if (max(sample_vec) > length(uval_vec)) {
+      # Padding uvals to required length
+      len <- length(uval_vec)
+      max_of_samples <- max(sample_vec)
+      uval_vec[(len + 1):max_of_samples] <- apply(
+        as.matrix((len + 1):max_of_samples),
+        1,
+        function(i) sprintf("str%d", i))
+    }
+    uval_vec
+  }
+
+  # Pad and update uvals
+  uvals <- lapply(1:M, function(i) PadStrings(tmp_samples[[i]],
+                                              uvals[[i]]))
+
+  # Replace integers in tmp_samples with actual sample strings
+  samples <- lapply(1:M, function(i) uvals[[i]][tmp_samples[[i]]])
+
+  # Randomly assign cohorts in each dimension
+  cohorts <- sample(1:m, N, replace = TRUE)
+
+  # Create and write maps into mapfile_1.csv .. mapfile_M.csv
+  map <- lapply(uvals, function(u) CreateMap(u, params))
+  res <- lapply(1:M, function(i)
+    write.table(map[[i]]$map_pos, file = paste(mapfile, "_", i, ".csv", sep = ""),
+                sep = ",", col.names = FALSE, na = "", quote = FALSE))
+
+  # Write reports into a csv file
+  # Format:
+  #   cohort, bloom filter var1 .. varM, sampled value var1 .. varM
+
+  reports <- lapply(1:M, function(i) EncodeAll(samples[[i]], cohorts, map[[i]]$map_by_cohort, params))
+
+  # Organize cohorts and reports into format
+  write_matrix <- do.call(cbind,
+                          c(list(cohorts),
+                            lapply(reports, function(r)
+                              unlist(lapply(r, function(x)
+                                paste(x, collapse = "")))),
+                            samples))
+  write.table(write_matrix, file = reportsfile, quote = FALSE,
+              row.names = FALSE, col.names = FALSE, sep = ",")
+}
+
+main <- function(opts) {
+  ptm <- proc.time()
+
+  uvals <- GetUniqueValsFromFile(opts$uvals)
+  params <- ReadParameterFile(opts$params)
+  SimulateReports(opts$num, uvals, params, opts$unif, # inputs
+                  opts$map, opts$reports) # outputs
+
+  print("PROC.TIME")
+  print(proc.time() - ptm)
+}
+
+if(!interactive()) {
+  main(opts)
+}

From 6bc68abae7c7f6be12326f7a952f639a9bb5c00b Mon Sep 17 00:00:00 2001
From: Michel Rivas Hernandez
Date: Wed, 4 Oct 2017 14:42:31 +0100
Subject: [PATCH 4/4] Fixing usage example for n maps

---
 tests/analyze_assoc_n.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/analyze_assoc_n.R b/tests/analyze_assoc_n.R
index 42533ac8..ecfecfa1 100644
--- a/tests/analyze_assoc_n.R
+++ b/tests/analyze_assoc_n.R
@@ -18,9 +18,9 @@
 # an EM algorithm to estimate joint distribution over two or more variables
 #
 # Usage:
-# $ ./analyze_assoc.R -map1 map_1.csv -map2 map_2.csv \
+# $ ./analyze_assoc_n.R -map map_1.csv map_2.csv ... map_n.csv \
 # -reports reports.csv \
-# Inputs: map1, map2, reports, params
+# Inputs: map1, map2, ..., mapn, reports, params
 # see how options are parsed below for more information
 # Outputs:
 # prints a table with estimated joint probability masses
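
Below is a minimal standalone sketch of the map handling idea behind PATCH 1/4 and PATCH 3/4: splitting the stacked map into per-cohort matrices while keeping the candidate strings as column names. It assumes the Matrix package and made-up toy values for params$k and params$m (the real scripts read these with ReadParameterFile), and it subsets rows directly instead of rebuilding each block from which() indices as split_map does, so it only illustrates the intended result rather than reproducing the patched code.

# Sketch only: toy per-cohort split, assuming the Matrix package.
library(Matrix)

params <- list(k = 4, m = 2)  # assumed toy values: 4 Bloom filter bits, 2 cohorts

# Toy stacked map: (k * m) rows, one column per candidate string.
all_cohorts_map <- sparseMatrix(i = c(1, 3, 6, 8), j = c(1, 2, 1, 2),
                                dims = c(params$k * params$m, 2))
colnames(all_cohorts_map) <- c("google.com", "apple.com")

# Split into m blocks of k rows each; sparse subsetting keeps the column names,
# which is what PATCH 1/4 adds to split_map via colnames(map_by_cohort).
SplitByCohort <- function(stacked, k, m) {
  lapply(1:m, function(i) stacked[((i - 1) * k + 1):(i * k), , drop = FALSE])
}

map_by_cohort <- SplitByCohort(all_cohorts_map, params$k, params$m)
stopifnot(identical(colnames(map_by_cohort[[1]]), colnames(all_cohorts_map)))

The column names carried on each per-cohort block are what the first commit's "Adding column names to cohort maps" refers to, presumably so that downstream association analysis can label its estimates by candidate string.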