Skip to content
This repository has been archived by the owner on Sep 27, 2023. It is now read-only.

Bool assoc #71

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
169 changes: 169 additions & 0 deletions analysis/R/assoc.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
#!/usr/bin/env Rscript
#
# Copyright 2015 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Reads map files, report files, and RAPPOR parameters to run
# an EM algorithm to estimate joint distribution over two variables one of
# which is boolean (or Basic Rappor)
#
# Usage:
# $ ./assoc.R --inp <JSON file>
#
# Input: JSON file with the following fields
# "maps" for map files of each var
# "reports" for a list of reports
# "counts" for 2 way marginal counts, individual marginal counts
# respectively
# "params" for params file with RAPPOR params
# "results" for a file name into which results will be written
# as comma separated values
#
# Output: A table with joint distribution to stdout and csv file with results

library("jsonlite")
library("optparse")
library("reshape2") # For "unrolling" joint results to csv file

options(stringsAsFactors = FALSE)

if(!interactive()) {
option_list <- list(
make_option(c("--inp"), default = "inp.json",
help = "JSON file with inputs for assoc.R"))
opts <- parse_args(OptionParser(option_list = option_list))
}

# source("analysis/R/encode.R")
source("analysis/R/decode.R")
# source("analysis/R/simulation.R")
source("analysis/R/read_input.R")
source("analysis/R/association.R")
# source("tests/gen_counts.R")

# Implements the EM algorithm on two variables
# NOTE: var 2 assumed to be reported via Basic RAPPOR
# TODO(pseudorandom): Remove this assumption on var 2.
EMAlgBasic <- function(inp) {
ptm <- proc.time()
params <- ReadParameterFile(inp$params)
cat("Finished loading parameters.\n")
# Ensure sufficient maps as required by number of vars
stopifnot(inp$numvars == length(inp$maps))
# Correct map from ReadMapFile() for assoc analysis
map <- lapply(inp$maps, function(o)
CorrectMapForAssoc(LoadMapFile(o, params = params),
params = params))

cat("Finished parsing map(s).\n")

# For Basic rappor, we need to setup a map file manually for var 2
# The map file has 2 components:
# - an array of simple (0, 1) map to represent Basic rappor values for each
# cohort
# - a combined map (rmap) that collapses cohorts
m1 <- lapply(1:params$m, function(z) {
m <- sparseMatrix(c(1), c(2), dims = c(1, 2))
colnames(m) <- c("FALSE", "TRUE")
m
})
m2 <- sparseMatrix(1:params$m, rep(2, params$m))
colnames(m2) <- colnames(m1[[1]])
map[[2]]$map <- m1
map[[2]]$rmap <- m2

# Reports must be of the format
# client name, cohort no, rappor bitstring 1, rappor bitstring 2, ...
# Reading to temporary data frame
tmp_reports_df <- read.csv(inp$reports,
colClasses = c("character", "integer",
rep("character", inp$numvars)),
header = TRUE)
# Ignore the first column
tmp_reports_df <- tmp_reports_df[,-1]

# TODO(pseudorandom): Do not assume var 2 is always boolean
# Fix which params$k is set to 1 based on which var is boolean
# params now is a tuple of params in the standard format as required by
# ComputeDistributionEM
params = list(params, params)
params[[2]]$k = 1

# Parsing tmp_reports_df
# ComputeDistributionEM allows for different sets of cohorts
# for each variable. Here, both sets of cohorts are identical
tmp_cohorts <- as.list(tmp_reports_df[1])$cohort
tmp_cohorts <- tmp_cohorts + 1 # 1 indexing for EM algorithm
# Expanded list of report cohorts
cohorts <- rep(list(tmp_cohorts), inp$numvars)
# Parse reports from tmp_reports_df cols 2, 3, ...
# into a reports array
reports <- lapply(1:inp$numvars, function(x) as.list(tmp_reports_df[x + 1]))

# Split ASCII strings into array of numerics (as required by assoc analysis)
reports <- lapply(1:inp$numvars, function(i) {
# apply the following function to each of reports[[1]] and reports[[2]]
# TODO(pseudorandom): Do not assume var 2 is always boolean in [[1]]
lapply(reports[[i]][[1]], function(x) {
# function splits strings and converts them to numeric values
# rev needed for endianness
rev(as.integer(strsplit(x, split = "")[[1]]))
})
})

cat("Finished parsing reports.\n")

# ignore_other = FALSE because in doing association with Basic RAPPOR, we're
# going to use the Other category to estimate no. of reports where the bit
# is unset
# marginals and estimate_var disabled
# verbose outputs timing information for efficiency analysis; set to inp$time
joint_dist <- ComputeDistributionEM(reports, cohorts, map,
ignore_other = FALSE,
params, marginals = NULL,
estimate_var = FALSE,
verbose = inp$time)
em <- joint_dist$fit
time_taken <- proc.time() - ptm
cat("Came here \n\n")
# Replace Other column name with FALSE. In Basic RAPPOR we use Other
# category to estimate no. of reports with unset bits (which are essentially
# FALSE)
colnames(em)[which(colnames(em) == "Other")] <- "FALSE"

# Unroll and write to results.csv
if(is.null(inp$results))
inp$results <- "results.csv"
write.csv(melt(em), file = inp$results, quote = FALSE,
row.names = FALSE)

print("EM Algorithm Results")
print(em[order(-rowSums(em)), order(-colSums(em))])
if(inp$time == TRUE)
print(time_taken)
}

main <- function(opts) {
inp <- fromJSON(opts$inp)
if(inp$algo == "EM")
EMAlgBasic(inp)
# Recommendation: Use EMAlg; TwoWayAlgBasic is still experimental
# and code is not completely checked in yet.
# if(inp$algo == "2Way")
# TwoWayAlgBasic(inp)
}

if(!interactive()) {
main(opts)
}
109 changes: 81 additions & 28 deletions analysis/R/association.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,15 @@
# in RAPPOR. This contains the functions relevant to estimating joint
# distributions.

library(parallel) # mclapply

# Wrapper function to print strings only if verbose flag is passed in
PrintIfVerbose <- function(string, flag = FALSE) {
if(flag == TRUE) {
print(string)
}
}

GetOtherProbs <- function(counts, map, marginal, params) {
# Computes the marginal for the "other" category.
#
Expand Down Expand Up @@ -44,7 +53,7 @@ GetOtherProbs <- function(counts, map, marginal, params) {
# Counts to remove from each cohort.
top_counts <- ceiling(marginal$proportion * N / params$m)
sum_top <- sum(top_counts)
candidate_map <- lapply(map, function(x) x[, candidate_strings])
candidate_map <- lapply(map, function(x) x[, candidate_strings, drop = FALSE])

# Counts set by known strings without noise considerations.
if (length(marginal) > 0) {
Expand All @@ -63,6 +72,10 @@ GetOtherProbs <- function(counts, map, marginal, params) {
pstar <- (1 - f / 2) * p + (f / 2) * q
top_counts_cohort <- (sum_top - top_counts_cohort) * pstar +
top_counts_cohort * qstar

# Adjustment for basic rappor
if(nrow(top_counts_cohort) == 1)
top_counts_cohort <- t(top_counts_cohort)
top_counts_cohort <- cbind(sum_top, top_counts_cohort)

# Counts set by the "other" category.
Expand All @@ -72,18 +85,20 @@ GetOtherProbs <- function(counts, map, marginal, params) {
props_other[props_other > 1] <- 1
props_other[is.nan(props_other)] <- 0
props_other[is.infinite(props_other)] <- 0
# Adjustment for basic rappor
if(is.null(nrow(props_other)))
props_other <- t(props_other)
as.list(as.data.frame(props_other))
}

GetCondProb <- function(report, candidate_strings, params, map,
prob_other = NULL) {
GetCondProb <- function(report, pstar, qstar, bit_indices, prob_other = NULL) {
# Given the observed bit array, estimate P(report | true value).
# Probabilities are estimated for all truth values.
#
# Args:
# report: a single observed RAPPOR report (binary vector of length k)
# candidate_strings: vector of strings in the dictionary (i.e. not the
# "other" category)
# bit_indices: list of integer vectors of length h, specifying which bits
# are set
# params: System parameters
# map: list of matrices encoding locations of hashes for each string
# prob_other: vector of length k, indicating how often each bit in the
Expand All @@ -93,22 +108,13 @@ GetCondProb <- function(report, candidate_strings, params, map,
# Conditional probability of report given each of the strings in
# candidate_strings

p <- params$p
q <- params$q
f <- params$f
ones <- sum(report)
zeros <- length(report) - ones

qstar <- (1 - f / 2) * q + (f / 2) * p
pstar <- (1 - f / 2) * p + (f / 2) * q
probs <- ifelse(report == 1, pstar, 1 - pstar)

# Find the bits set by the candidate strings
inds <- lapply(candidate_strings, function(x)
which(map[, x]))

# Find the likelihood of report given each candidate string
prob_obs_vals <- sapply(inds, function(x) {
prob_obs_vals <- sapply(bit_indices, function(x) {
prod(c(probs[-x], ifelse(report[x] == 1, qstar, 1 - qstar)))
})

Expand Down Expand Up @@ -147,6 +153,7 @@ UpdatePij <- function(pij, cond_prob) {
# Returns:
# Updated pijs from em algorithm (maximization)

# TODO: use mclapply, since it goes over all reports
wcp <- lapply(cond_prob, function(x) {
z <- x * pij
z <- z / sum(z)
Expand Down Expand Up @@ -213,8 +220,20 @@ EM <- function(cond_prob, starting_pij = NULL, estimate_var = FALSE,
if (nrow(pij[[1]]) > 0) {
# Run EM
for (i in 1:max_iter) {
if(i %% 100 == 0) {
cat(sprintf("EM iteration %d\n", i))
}
if (i == 1) {
ptm_iter <- proc.time()
}
pij[[i + 1]] <- UpdatePij(pij[[i]], cond_prob)
dif <- max(abs(pij[[i + 1]] - pij[[i]]))

# Timing information for debugging purposes
if (i == 1) {
PrintIfVerbose("ONE ITERATION", verbose)
PrintIfVerbose(proc.time() - ptm_iter, verbose)
}
if (dif < epsilon) {
break
}
Expand Down Expand Up @@ -285,7 +304,8 @@ ComputeDistributionEM <- function(reports, report_cohorts,
maps, ignore_other = FALSE,
params,
marginals = NULL,
estimate_var = FALSE) {
estimate_var = FALSE,
verbose = FALSE) {
# Computes the distribution of num_variables variables, where
# num_variables is chosen by the client, using the EM algorithm.
#
Expand All @@ -312,17 +332,24 @@ ComputeDistributionEM <- function(reports, report_cohorts,
# Compute the counts for each variable and then do conditionals.
joint_conditional = NULL
found_strings <- list()
cd_for_reports <- list()

for (j in (1:num_variables)) {
ptm <- proc.time()
variable_report <- reports[[j]]
variable_cohort <- report_cohorts[[j]]
map <- maps[[j]]

# Compute the probability of the "other" category
variable_counts <- NULL
if (is.null(marginals)) {
variable_counts <- ComputeCounts(variable_report, variable_cohort, params)
marginal <- Decode(variable_counts, map$rmap, params, quiet = TRUE)$fit
ptm2 <- proc.time()
variable_counts <- ComputeCounts(variable_report, variable_cohort, params[[j]])
marginal <- Decode(variable_counts, map$rmap, params[[j]],
quiet = TRUE)$fit
print(marginal)
PrintIfVerbose("TIME IN MARGINALS", verbose)
PrintIfVerbose(proc.time() - ptm2, verbose)
if (nrow(marginal) == 0) {
return (NULL)
}
Expand All @@ -332,38 +359,64 @@ ComputeDistributionEM <- function(reports, report_cohorts,
found_strings[[j]] <- marginal$string

if (ignore_other) {
prob_other <- vector(mode = "list", length = params$m)
prob_other <- vector(mode = "list", length = params[[j]]$m)
} else {
if (is.null(variable_counts)) {
variable_counts <- ComputeCounts(variable_report, variable_cohort,
params)
params[[j]])
}
prob_other <- GetOtherProbs(variable_counts, map$map, marginal,
params)
params[[j]])
found_strings[[j]] <- c(found_strings[[j]], "Other")
}

p <- params[[j]]$p
q <- params[[j]]$q
f <- params[[j]]$f

pstar <- (1 - f/2) * p + (f / 2) * q
qstar <- (1 - f/2) * q + (f / 2) * p
candidate_strings <- rownames(marginal)

bit_indices_by_cohort <- lapply(1:params[[j]]$m, function(cohort) {
map_for_cohort <- map$map[[cohort]]
# Find the bits set by the candidate strings
bit_indices <- lapply(candidate_strings, function(x) {
which(map_for_cohort[, x])})
bit_indices
})

num_cores <- 10

cat("Computing conditional probabilities\n")
# Get the joint conditional distribution
cond_report_dist <- lapply(seq(length(variable_report)), function(i) {
cond_report_dist <- mclapply(seq(length(variable_report)), function(i) {
if (i %% 100000 == 0) {
cat(sprintf("Processed %d reports ...\n", i))
}
idx <- variable_cohort[i]
bit_indices <- bit_indices_by_cohort[[idx]]
rep <- GetCondProb(variable_report[[i]],
candidate_strings = rownames(marginal),
params = params,
map$map[[idx]],
pstar, qstar, bit_indices,
prob_other[[idx]])
rep
})
}, mc.cores = num_cores)

# Update the joint conditional distribution of all variables
joint_conditional <- UpdateJointConditional(cond_report_dist,
joint_conditional)
joint_conditional)
PrintIfVerbose("TIME IN COND_REPORT_DIST", verbose)
PrintIfVerbose(proc.time()-ptm, verbose)
}

ptm <- proc.time()
# Run expectation maximization to find joint distribution
em <- EM(joint_conditional, epsilon = 10 ^ -6, verbose = FALSE,
estimate_var = estimate_var)
PrintIfVerbose("TIME IN EM", verbose)
PrintIfVerbose(proc.time() - ptm, verbose)
dimnames(em$est) <- found_strings

# Return results in a usable format
list(fit = em$est, sd = em$sd, em = em)

}
Loading