From 2dba1e683d1ab63898d36a915b72fdebecc197bc Mon Sep 17 00:00:00 2001
From: Michel Rivas Hernandez
Date: Wed, 27 Sep 2017 15:13:11 +0100
Subject: [PATCH 1/4] Fixing association test samples generation

Adding column names to cohort maps
---
 tests/analyze_assoc.R | 18 ++++++++++--------
 tests/assoc_sim.R     | 12 ++++++------
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/tests/analyze_assoc.R b/tests/analyze_assoc.R
index 5d78806f..d49bed60 100755
--- a/tests/analyze_assoc.R
+++ b/tests/analyze_assoc.R
@@ -49,11 +49,11 @@
   opts <- parse_args(OptionParser(option_list = option_list))
 }
 
-source("../analysis/R/encode.R")
-source("../analysis/R/decode.R")
-source("../analysis/R/simulation.R")
-source("../analysis/R/read_input.R")
-source("../analysis/R/association.R")
+source("analysis/R/encode.R")
+source("analysis/R/decode.R")
+source("analysis/R/simulation.R")
+source("analysis/R/read_input.R")
+source("analysis/R/association.R")
 
 # This function processes the maps loaded using ReadMapFile
 # Association analysis requires a map object with a map
@@ -66,16 +66,18 @@ source("../analysis/R/association.R")
 #   params = data field with parameters
 # TODO(pseudorandom): move this functionality to ReadMapFile
 ProcessMap <- function(map, params) {
-  map$rmap <- map$map
+  map$all_cohorts_map <- map$map
   split_map <- function(i, map_struct) {
     numbits <- params$k
     indices <- which(as.matrix(
       map_struct[((i - 1) * numbits + 1):(i * numbits),]) == TRUE,
       arr.ind = TRUE)
-    sparseMatrix(indices[, "row"], indices[, "col"],
+    map_by_cohort <- sparseMatrix(indices[, "row"], indices[, "col"],
                  dims = c(numbits, max(indices[, "col"])))
+    colnames(map_by_cohort) <- colnames(map_struct)
+    map_by_cohort
   }
-  map$map <- lapply(1:params$m, function(i) split_map(i, map$rmap))
+  map$map_by_cohort <- lapply(1:params$m, function(i) split_map(i, map$all_cohorts_map))
   map
 }
 
diff --git a/tests/assoc_sim.R b/tests/assoc_sim.R
index 3ff1e5df..4536ff1a 100755
--- a/tests/assoc_sim.R
+++ b/tests/assoc_sim.R
@@ -51,11 +51,11 @@
   opts <- parse_args(OptionParser(option_list = option_list))
 }
 
-source("../analysis/R/encode.R")
-source("../analysis/R/decode.R")
-source("../analysis/R/simulation.R")
-source("../analysis/R/read_input.R")
-source("../analysis/R/association.R")
+source("analysis/R/encode.R")
+source("analysis/R/decode.R")
+source("analysis/R/simulation.R")
+source("analysis/R/read_input.R")
+source("analysis/R/association.R")
 
 # Read unique values of reports from a csv file
 # Inputs: filename. The file is expected to contain two rows of strings
@@ -144,7 +144,7 @@ SimulateReports <- function(N, uvals, params, unif,
   # Format:
   #   cohort, bloom filter var1, bloom filter var2
   reports <- lapply(1:2, function(i)
-    EncodeAll(samples[[i]], cohorts, map[[i]]$map, params))
+    EncodeAll(samples[[i]], cohorts, map[[i]]$all_cohorts_map, params))
 
   # Organize cohorts and reports into format
   write_matrix <- cbind(as.matrix(cohorts), as.matrix(lapply(reports[[1]],

From 2fb6ef282dc1d8b556cc19146f81db4c24eda618 Mon Sep 17 00:00:00 2001
From: Michel Rivas Hernandez
Date: Thu, 28 Sep 2017 16:27:08 +0100
Subject: [PATCH 2/4] Fixing generation of association reports

---
 tests/assoc_sim.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/assoc_sim.R b/tests/assoc_sim.R
index 4536ff1a..09001923 100755
--- a/tests/assoc_sim.R
+++ b/tests/assoc_sim.R
@@ -144,7 +144,7 @@ SimulateReports <- function(N, uvals, params, unif,
   # Format:
   #   cohort, bloom filter var1, bloom filter var2
   reports <- lapply(1:2, function(i)
-    EncodeAll(samples[[i]], cohorts, map[[i]]$all_cohorts_map, params))
+    EncodeAll(samples[[i]], cohorts, map[[i]]$map_by_cohort, params))
 
   # Organize cohorts and reports into format
   write_matrix <- cbind(as.matrix(cohorts), as.matrix(lapply(reports[[1]],

From c760be62fac7f307de5f2df76e1b705c50cc4b8c Mon Sep 17 00:00:00 2001
From: Michel Rivas Hernandez
Date: Wed, 4 Oct 2017 14:40:27 +0100
Subject: [PATCH 3/4] Adding tests for n variables

---
 tests/analyze_assoc_n.R | 139 ++++++++++++++++++++++++++++++++
 tests/assoc_sim_n.R     | 172 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 311 insertions(+)
 create mode 100644 tests/analyze_assoc_n.R
 create mode 100644 tests/assoc_sim_n.R

diff --git a/tests/analyze_assoc_n.R b/tests/analyze_assoc_n.R
new file mode 100644
index 00000000..42533ac8
--- /dev/null
+++ b/tests/analyze_assoc_n.R
@@ -0,0 +1,139 @@
+#!/usr/bin/env Rscript
+#
+# Copyright 2015 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Reads map files, report files, and RAPPOR parameters to run
+# an EM algorithm to estimate joint distribution over two or more variables
+#
+# Usage:
+# $ ./analyze_assoc.R -map1 map_1.csv -map2 map_2.csv \
+# -reports reports.csv \
+# Inputs: map1, map2, reports, params
+# see how options are parsed below for more information
+# Outputs:
+# prints a table with estimated joint probability masses
+# over candidate strings
+# Ex.
+#              ssl     nossl
+# intel        0.1     0.3
+# google       0.5     0.1
+
+library("argparse")
+library("glmnet")
+
+options(stringsAsFactors = FALSE)
+
+if(!interactive()) {
+  parser <- ArgumentParser()
+  # Flags
+  parser$add_argument("-m", "--map", metavar='N', nargs='+',
+                      help = "Hashed candidates 1..N")
+  parser$add_argument("-r", "--reports", default = "reports.csv",
+                      help = "File with raw reports as <cohort, report 1, .., report N>")
+  parser$add_argument("-p", "--params", default = "params.csv",
+                      help = "Filename for RAPPOR parameters")
+
+# opts <- parse_args(OptionParser(option_list = option_list))
+# opts <- commandArgs(trailingOnly = TRUE)
+  opts <- parser$parse_args()
+}
+
+source("analysis/R/encode.R")
+source("analysis/R/decode.R")
+source("analysis/R/simulation.R")
+source("analysis/R/read_input.R")
+source("analysis/R/association.R")
+# This function processes the maps loaded using ReadMapFile
+# Association analysis requires a map object with a map_by_cohort
+# field that has the map split into cohorts and an all_cohorts_map
+# field that has all the cohorts combined
+# Arguments:
+#   map = map object with cohorts as sparse matrix in
+#         object map$map
+#         This is the expected object from ReadMapFile
+#   params = data field with parameters
+# TODO(pseudorandom): move this functionality to ReadMapFile
+ProcessMap <- function(map, params) {
+  map$all_cohorts_map <- map$map
+  split_map <- function(i, map_struct) {
+    numbits <- params$k
+    indices <- which(as.matrix(
+      map_struct[((i - 1) * numbits + 1):(i * numbits),]) == TRUE,
+      arr.ind = TRUE)
+    map_by_cohort <- sparseMatrix(indices[, "row"], indices[, "col"],
+                                  dims = c(numbits, max(indices[, "col"])))
+    colnames(map_by_cohort) <- colnames(map_struct)
+    map_by_cohort
+  }
+  map$map_by_cohort <- lapply(1:params$m, function(i) split_map(i, map$all_cohorts_map))
+  map
+}
+
+main <- function(opts) {
+  ptm <- proc.time()
+
+  params <- ReadParameterFile(opts$params)
+  map <- lapply(opts$map, function(o)
+    ProcessMap(ReadMapFile(o, params = params),
+               params = params))
+  N <- length(opts$map)
+  # Reports must be of the format
+  # cohort no, rappor bitstring 1.. rappor bitstring N
+  reportsObj <- read.csv(opts$reports,
+                         colClasses = c("integer", rep("character", 2*N)),
+                         header = FALSE)
+
+  # Parsing reportsObj
+  # ComputeDistributionEM allows for different sets of cohorts
+  # for each variable. Here, all N sets of cohorts are identical
+  co <- as.list(reportsObj[1])[[1]]
+  cohorts <- rep(list(co), N)
+  # Parse reports from reportsObj columns 2 to N + 1
+  reports <- lapply(2:(N+1), function(x) as.list(reportsObj[x]))
+
+  # Split strings into bit arrays (as required by assoc analysis)
+  reports <- lapply(1:N, function(i) {
+    # apply the following function to each of reports[[1]] .. reports[[N]]
+    lapply(reports[[i]][[1]], function(x) {
+      # function splits strings and converts them to numeric values
+      as.numeric(strsplit(x, split = "")[[1]])
+    })
+  })
+
+  joint_dist <- ComputeDistributionEM(reports, cohorts, map,
+                                      ignore_other = TRUE,
+                                      params, marginals = NULL,
+                                      estimate_var = TRUE)
+  # TODO(pseudorandom): Export the results to a file for further analysis
+  print("JOINT_DIST$FIT")
+  print(joint_dist$fit)
+  print("JOINT_DIST$SUM")
+  x <- joint_dist$fit
+  y <- cbind(x, rowSums(x))
+  z <- rbind(y, colSums(y))
+  print(rowSums(joint_dist$fit))
+  print(colSums(joint_dist$fit))
+  print(z[length(z)])
+  print("JOINT_DIST$SD")
+  print(joint_dist$sd)
+  print("JOINT_DIST$VAR-COV")
+  print(joint_dist$var_cov)
+  print("PROC.TIME")
+  print(proc.time() - ptm)
+}
+
+if(!interactive()) {
+  main(opts)
+}
\ No newline at end of file
diff --git a/tests/assoc_sim_n.R b/tests/assoc_sim_n.R
new file mode 100644
index 00000000..0c619544
--- /dev/null
+++ b/tests/assoc_sim_n.R
@@ -0,0 +1,172 @@
+#!/usr/bin/env Rscript
+#
+# Copyright 2015 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Simulates inputs on which association analysis can be run.
+# assoc_sim_n.R extends assoc_sim.R to support N variables.
+#
+# Usage:
+# $ ./assoc_sim_n.R -n 1000
+# Inputs: uvals, params, reports, map, num, unif
+# see how options are parsed below for more information
+# Outputs:
+# reports.csv file containing reports
+# map_{1, 2, ...}.csv file(s) containing maps of variables
+
+library("optparse")
+
+options(stringsAsFactors = FALSE)
+
+if(!interactive()) {
+  option_list <- list(
+    make_option(c("--uvals", "-v"), default = "uvals.csv",
+                help = "Filename for list of values over which
+                        distributions are simulated. Each line of the file
+                        is a comma-separated list of strings for one
+                        variable."),
+    make_option(c("--params", "-p"), default = "params.csv",
+                help = "Filename for RAPPOR parameters"),
+    make_option(c("--reports", "-r"), default = "reports.csv",
+                help = "Filename for reports"),
+    make_option(c("--map", "-m"), default = "map",
+                help = "Filename *prefix* for map(s)"),
+    make_option(c("--num", "-n"), default = 1e05,
+                help = "Number of reports"),
+    make_option(c("--unif", "-u"), default = FALSE,
+                help = "Run simulation with uniform distribution")
+  )
+  opts <- parse_args(OptionParser(option_list = option_list))
+}
+
+source("analysis/R/encode.R")
+source("analysis/R/decode.R")
+source("analysis/R/simulation.R")
+source("analysis/R/read_input.R")
+source("analysis/R/association.R")
+
+# Read unique values of reports from a csv file
+# Inputs: filename. The file is expected to contain one row of
+#         comma-separated strings per variable, e.g.:
+#         "google.com", "apple.com", ...
+#         "ssl", "nossl", ...
+#         "uk", "usa", "china", ...
+# Returns: a list containing strings
+GetUniqueValsFromFile <- function(filename) {
+  contents <- read.csv(filename, header = FALSE)
+  # Removes superfluous "" entries if the lists of unique values
+  # differ in length
+  strip_empty <- function(vec) {
+    vec[!vec %in% c("")]
+  }
+  lapply(1:nrow(contents), function(i) strip_empty(as.vector(t(contents[i,]))))
+}
+
+# Simulate correlated reports and write into reportsfile
+# Inputs: N = number of reports
+#         uvals = list containing a list of unique values
+#         params = list with RAPPOR parameters
+#         unif = whether to replace poisson with uniform
+#         mapfile = file to write maps into (with .csv suffixes)
+#         reportsfile = file to write reports into (with .csv suffix)
+SimulateReports <- function(N, uvals, params, unif,
+                            mapfile, reportsfile) {
+  # Compute true distribution
+  m <- params$m
+
+  if (unif) {
+    # Draw uniformly from 1 to 10
+    v1_samples <- as.integer(runif(N, 1, 10))
+  } else {
+    # Draw from a Poisson random variable
+    v1_samples <- rpois(N, 1) + rep(1, N)
+  }
+
+  # Draw the M variables as correlated perturbations of the base samples:
+  # each value stays, moves down 1, or moves up 1 with equal probability (floored at 1).
+  M <- length(uvals)
+  Resample <- function(samples) {
+    N <- length(samples)
+    vals <- sample(c(-1, 0, 1), N, replace = TRUE)
+    unlist(lapply(1:N, function(i) if (samples[i] == 1 && vals[i] == -1) samples[i] else samples[i] + vals[i]))
+  }
+  # tmp_samples[[j]] holds the resampled (integer) values for variable j
+  tmp_samples <- lapply(rep(list(v1_samples), M), Resample)
+
+  # Function to pad strings to uval_vec if sample_vec has
+  # larger support than the number of strings in uval_vec
+  # For e.g., if samples have support {1, 2, 3, 4, ...} and uvals
+  # only have "value1", "value2", and "value3", samples now
+  # over support {"value1", "value2", "value3", "str4", ...}
+  PadStrings <- function(sample_vec, uval_vec) {
+    if (max(sample_vec) > length(uval_vec)) {
+      # Padding uvals to required length
+      len <- length(uval_vec)
+      max_of_samples <- max(sample_vec)
+      uval_vec[(len + 1):max_of_samples] <- apply(
+        as.matrix((len + 1):max_of_samples),
+        1,
+        function(i) sprintf("str%d", i))
+    }
+    uval_vec
+  }
+
+  # Pad and update uvals
+  uvals <- lapply(1:M, function(i) PadStrings(tmp_samples[[i]],
+                                              uvals[[i]]))
+
+  # Replace integers in tmp_samples with actual sample strings
+  samples <- lapply(1:M, function(i) uvals[[i]][tmp_samples[[i]]])
+
+  # Randomly assign cohorts in each dimension
+  cohorts <- sample(1:m, N, replace = TRUE)
+
+  # Create and write maps into mapfile_1.csv .. mapfile_M.csv
+  map <- lapply(uvals, function(u) CreateMap(u, params))
+  res <- lapply(1:M, function(i)
+    write.table(map[[i]]$map_pos, file = paste(mapfile, "_", i, ".csv", sep = ""),
+                sep = ",", col.names = FALSE, na = "", quote = FALSE))
+
+  # Write reports into a csv file
+  # Format:
+  #   cohort, bloom filter var1 .. varM, sampled value var1 .. varM
+
+  reports <- lapply(1:M, function(i) EncodeAll(samples[[i]], cohorts, map[[i]]$map_by_cohort, params))
+
+  # Organize cohorts and reports into format
+  write_matrix <- do.call(cbind,
+                          c(list(cohorts),
+                            lapply(reports, function(r)
+                              unlist(lapply(r, function(x)
+                                paste(x, collapse = "")))),
+                            samples))
+  write.table(write_matrix, file = reportsfile, quote = FALSE,
+              row.names = FALSE, col.names = FALSE, sep = ",")
+}
+
+main <- function(opts) {
+  ptm <- proc.time()
+
+  uvals <- GetUniqueValsFromFile(opts$uvals)
+  params <- ReadParameterFile(opts$params)
+  SimulateReports(opts$num, uvals, params, opts$unif, # inputs
+                  opts$map, opts$reports) # outputs
+
+  print("PROC.TIME")
+  print(proc.time() - ptm)
+}
+
+if(!interactive()) {
+  main(opts)
+}

From 6bc68abae7c7f6be12326f7a952f639a9bb5c00b Mon Sep 17 00:00:00 2001
From: Michel Rivas Hernandez
Date: Wed, 4 Oct 2017 14:42:31 +0100
Subject: [PATCH 4/4] Fixing usage example for n maps

---
 tests/analyze_assoc_n.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/analyze_assoc_n.R b/tests/analyze_assoc_n.R
index 42533ac8..ecfecfa1 100644
--- a/tests/analyze_assoc_n.R
+++ b/tests/analyze_assoc_n.R
@@ -18,9 +18,9 @@
 # an EM algorithm to estimate joint distribution over two or more variables
 #
 # Usage:
-# $ ./analyze_assoc.R -map1 map_1.csv -map2 map_2.csv \
+# $ ./analyze_assoc_n.R -map map_1.csv map_2.csv ... map_n.csv \
 # -reports reports.csv \
-# Inputs: map1, map2, reports, params
+# Inputs: map1, map2, ..., mapn, reports, params
 # see how options are parsed below for more information
 # Outputs:
 # prints a table with estimated joint probability masses
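
Below is a minimal standalone sketch of the map handling idea behind PATCH 1/4 and PATCH 3/4: splitting the stacked map into per-cohort matrices while keeping the candidate strings as column names. It assumes the Matrix package and made-up toy values for params$k and params$m (the real scripts read these with ReadParameterFile), and it subsets rows directly instead of rebuilding each block from which() indices as split_map does, so it only illustrates the intended result rather than reproducing the patched code.

# Sketch only: toy per-cohort split, assuming the Matrix package.
library(Matrix)

params <- list(k = 4, m = 2)  # assumed toy values: 4 Bloom filter bits, 2 cohorts

# Toy stacked map: (k * m) rows, one column per candidate string.
all_cohorts_map <- sparseMatrix(i = c(1, 3, 6, 8), j = c(1, 2, 1, 2),
                                dims = c(params$k * params$m, 2))
colnames(all_cohorts_map) <- c("google.com", "apple.com")

# Split into m blocks of k rows each; sparse subsetting keeps the column names,
# which is what PATCH 1/4 adds to split_map via colnames(map_by_cohort).
SplitByCohort <- function(stacked, k, m) {
  lapply(1:m, function(i) stacked[((i - 1) * k + 1):(i * k), , drop = FALSE])
}

map_by_cohort <- SplitByCohort(all_cohorts_map, params$k, params$m)
stopifnot(identical(colnames(map_by_cohort[[1]]), colnames(all_cohorts_map)))

The column names carried on each per-cohort block are what the first commit's "Adding column names to cohort maps" refers to, presumably so that downstream association analysis can label its estimates by candidate string.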