From 4f70aa7f4b594a756f691c32ca0af3632f7406ed Mon Sep 17 00:00:00 2001
From: "ymyuan@student.ubc.ca" <ymyuan@student.ubc.ca>
Date: Wed, 12 Nov 2025 17:59:26 -0800
Subject: [PATCH] update

---
 .Rhistory                          | 500 +++++++++++++++++++++++++++++
 vignettes/clamp-cat-exp-causal.Rmd | 445 +++++++++++++++++++++++++
 vignettes/clamp-linear-causal.Rmd  |   3 -
 3 files changed, 945 insertions(+), 3 deletions(-)
 create mode 100644 vignettes/clamp-cat-exp-causal.Rmd

diff --git a/.Rhistory b/.Rhistory
index 4c8e121..5aa5e55 100644
--- a/.Rhistory
+++ b/.Rhistory
@@ -1,3 +1,4 @@
+<<<<<<< Updated upstream
 load_all()
 res_cl3 <- clamp(X, y, W=Wmat,
 standardize = F,
@@ -412,6 +413,182 @@ function(j) yboot - weighted.mean(yboot, Wboot[,j]))
 wxy <- colSums(Wboot * Xboot_ * yboot_)
 wx2 <- colSums(Wboot * Xboot_^2)
 boot_betahat[B, ] <- wxy / wx2
+=======
+# par(mfrow = c(3, 4))
+# for (j in 1 : ncol(X_original)) {
+#   # plot(X_original[,j], .overlap_weighting_g(ProbPred_integrated[,j]),
+#   #      xlab = paste0("X", j), ylab = paste0("OW"))
+#   plot(X_original[,j], 1/ProbPred_integrated[,j] *.overlap_weighting_g(ProbPred_integrated[,j]),
+#        xlab = paste0("X", j), ylab = paste0("OW"))
+#   # plot(X_original[,j], .entropy_weighting_g(ProbPred_integrated[,j]),
+#   #      xlab = paste0("X", j), ylab = paste0("EW"))
+# }
+res_cl <- clamp_categorical(X=X_original, y=Y, W=W,
+maxL = 5,
+max_iter = 20,
+nboots = 100,
+seed = 123,
+verbose = T)
+plot(res_cl$level_pip)
+plot(res_cl$variable_pip)
+plot(res_cl$elbo, type = "o")
+plot(res_cl$loglik, type = "o")
+clamp_summarize_coefficients(res_cl)
+summary(res_cl)
+plot_select_results(res_cl$variable_pip, effect_ind) +
+scale_x_continuous(breaks = seq(1, 10),
+labels = paste0("X", as.character(1:10))) +
+ylab("variable-wise PIP")
+level_names <- paste0(rep(1:10, each = 2), "_", 1:2)
+plot_select_results(res_cl$level_pip, (2*effect_ind-1):(2*effect_ind)) +
+scale_x_continuous(breaks = seq(1:20),
+labels = paste0("X", level_names)) +
+theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
+ylab("level-wise PIP") +
+xlab("variable_level")
+resY <- residuals(lm(Y ~ U))
+res_su <- susieR::susie(X_original, resY, L = 5)
+plot(res_su$pip)
+susieR::summary.susie(res_su)
+plot_select_results(res_su$pip, effect_ind) +
+scale_x_continuous(breaks = seq(1, 10),
+labels = paste0("X", as.character(1:10))) +
+ylab("variable-wise PIP")
+res_su0 <- susieR::susie(X_original, Y, L = 5)
+plot(res_su0$pip)
+susieR::summary.susie(res_su0)
+plot_select_results(res_su0$pip, effect_ind) +
+scale_x_continuous(breaks = seq(1, 10),
+labels = paste0("X", as.character(1:10))) +
+ylab("variable-wise PIP")
+res_la <- glmnet::cv.glmnet(X_original, Y, family = "gaussian", alpha = 1)
+plot_select_results(abs(as.numeric(coef(res_la, s = "lambda.min"))[-1]),
+effect_idx = effect_ind) +
+ylab("|coefficients|")
+knitr::opts_chunk$set(
+collapse = TRUE,
+comment = "#>"
+)
+devtools::load_all(".")
+# library(clamp)
+library(nnet)
+library(data.table)
+data.dir <- "~/Documents/research/data/genotype/"
+## for plots
+library(tidyverse)
+myPalette2 <- c("#E69F00", "#999999", "#00757F")
+myPalette3 <- c("dodgerblue2", "green4", "#6A3D9A", "#FF7F00",
+"gold1", "skyblue2", "#FB9A99", "palegreen2", "#CAB2D6",
+"#FDBF6F", "gray70", "khaki2", "maroon", "orchid1", "deeppink1",
+"blue1", "steelblue4", "darkturquoise", "green1", "yellow4",
+"yellow3", "darkorange4", "brown")
+plot_select_results <- function(vals, effect_idx, cs_list = NULL) {
+tibble(vals = vals) %>%
+mutate(id = 1 : n()) %>%
+mutate(true_label = if_else(id %in% effect_idx, "causal", "noncausal")) %>%
+arrange(desc(true_label)) %>%
+ggplot(aes(x = id, y = vals, color = as.factor(true_label))) +
+geom_point(size = 3) +
+scale_color_manual(values = myPalette2[1:2]) +
+labs(x = "variable", color = "") +
+theme_classic()
+}
+# tab <- tibble(vals = vals) %>%
+#   mutate(id = 1 : n(), cs = -1) %>%
+#   mutate(true_label = if_else(id %in% effect_idx, "causal", "noncausal")) %>%
+#   arrange(desc(true_label))
+#
+# for (i in seq_along(cs_list)) {
+#   tab$cs[tab$id %in% cs_list[[i]]] <- i
+# }
+#
+# ggplot(data = tab) +
+#   geom_point(aes(x = id, y = vals,
+#                  color = as.factor(true_label),
+#                  shape = factor(cs)), size = 3) +
+#   scale_color_manual(values = myPalette2[1:2]) +
+#   scale_shape_manual()
+set.seed(123)
+n <- 1000
+p <- 20
+K <- 3  ## 3-level categorical variable.
+# confounder
+U <- rnorm(n, mean = 0.2)
+# potential outcomes
+eps <- rnorm(n)
+# When mean(U) = 0.5
+# ============ f1 ============
+f1func <- function(.x, .u) return(.x + .u + .x*.u)
+## E[f1(0,U)] = 0.5; E[f1(1,U)] = 2; E[f1(2,U)] = 3.5
+## Delta1 = 1.5; Delta2 = 3
+# ============ f2 ============
+f2func <- function(.x, .u) return(2*(.x - 0.8*.u))
+## (E[f2(0,U)] = -0.4; E[f2(1,U)] = 0.6; E[f2(2,U)] = 1.6)*2
+## Delta1 = 2; Delta2 = 4
+# ============ f3 ============
+f3func <- function(.x, .u) return(-.x^2 + .u)
+## E[f3(0,U)] = 0.5, E[f3(1,U)] = -0.5, E[f3(2,U)] = -3.5
+## (Delta1 = -1, Delta2 = -4)
+# treatment assignment: multinomial logistic regression (softmax)
+## .lp: matrix of linear predictors (s). It should contain (K-1) columns, and each column is an n-vector of the linear predictors of the k-th level (1 <= k < K)
+softmax <- function(.lp) {
+K <- length(.lp) + 1  ## number of levels
+exp.s <- apply(.lp, 2, exp)  ## exp(LinearPredictor), n by (K-1)
+Z <- rowSums(exp.s) + 1  ## denominator
+prob.mat <- cbind(1, exp.s) / Z
+return(prob.mat)
+}
+xi.matrix <- matrix(nrow = 2, ncol = p*(K-1))
+# 1st row: intercepts
+# 2nd row: slopes of U
+intercepts_base <- 0
+slopes <- seq(from = -10, to = -2, length.out = p*(K-1))
+slopes_eps <- rnorm(n=length(slopes), sd = 0.1)
+xi.matrix[1,] <- intercepts_base
+xi.matrix[2,] <- slopes + slopes_eps
+LinearPred <- apply(xi.matrix, 2, function(x) x[1] + x[2]*U)
+X <- matrix(NA)
+for (j in 1 : p) {
+probs <- softmax(LinearPred[, ((j-1)*(K-1)+1) : (j*(K-1)) , drop=F])
+Xj <- t(apply(probs, 1, function(pr) {rmultinom(1, 1, prob = pr)})) ## n by K
+if (sum(is.na(X))) {
+X <- Xj
+} else {
+X <- cbind(X, Xj)
+}
+}
+colnames(X) <- paste0(rep(paste0("X", 1:p, "_"), each = K), 0:(K-1))
+X <- apply(X, 2, as.double)
+## Turn it into a n by p matrix with entries 0, 1, 2
+X_original <- sweep(X, 2, as.vector(rep(1:K, times = p)), "*")
+variable_names <- sub("_.*", "", colnames(X_original))
+X_original <- sapply(unique(variable_names), function(v) {
+rowSums(X_original[, variable_names == v, drop=F])
+})
+X_original <- X_original - 1
+head(X_original)
+corrplot::corrplot(cor(X_original), method = 'number')
+############################################
+# observed outcome
+# effect_ind <- sample(1:p, size = 3)
+effect_ind <- c(4)
+# Y <-
+#   X[,(effect_ind[1] - 1) * K + 1] * f1func(0, U) +
+#   X[,(effect_ind[1] - 1) * K + 2] * f1func(1, U) +
+#   X[,(effect_ind[1] - 1) * K + 3] * f1func(2, U) +
+#   X[,(effect_ind[2] - 1) * K + 1] * f2func(0, U) +
+#   X[,(effect_ind[2] - 1) * K + 2] * f2func(1, U) +
+#   X[,(effect_ind[2] - 1) * K + 3] * f2func(2, U) + eps
+# X[,(effect_ind[3] - 1) * K + 1] * f3func(0, U) +
+# X[,(effect_ind[3] - 1) * K + 2] * f3func(1, U) +
+# X[,(effect_ind[3] - 1) * K + 3] * f3func(2, U) + eps
+Y <- X_original[,effect_ind, drop=F] %*%
+as.matrix(rep(1, times = length(effect_ind))) + U + eps
+## Robust approach 1: IPW truncation
+.clipped <- function(.val, .alpha = 0.05) {
+ifelse(.val < .alpha, .alpha,
+ifelse(.val > (1-.alpha), 1-.alpha, .val))
+>>>>>>> Stashed changes
 }
 boot_var <- colVars(boot_betahat)
 betahat_bs1 <- data.frame(Estimate = betahat, SE = sqrt(boot_var))
@@ -459,6 +636,7 @@ wxy <- colSums(Wboot * Xboot_ * yboot_)
 wx2 <- colSums(Wboot * Xboot_^2)
 boot_betahat[B, ] <- wxy / wx2
 }
+<<<<<<< Updated upstream
 boot_var <- colVars(boot_betahat)
 betahat_bs1 <- data.frame(Estimate = betahat, SE = sqrt(boot_var))
 betahat_bs1
@@ -510,3 +688,325 @@ max_iter = 1,
 maxL = 10,
 seed = 123,
 verbose = T)
+=======
+colnames(ProbPred) <- colnames(X)
+## Check the propensities in the specific levels
+ProbPred_integrated <- X * ProbPred
+variable_names <- sub("_.*", "", colnames(X))
+ProbPred_integrated <- sapply(unique(variable_names), function(var){
+rowSums(ProbPred_integrated[, variable_names == var, drop=F])
+})
+# par(mfrow = c(3, 4))  # create a 3 x 3 plotting matrix
+# for (j in 1 : ncol(X_original)) {
+#   plot(X_original[,j], ProbPred_integrated[,j],
+#        xlab = paste0("X", j), ylab = paste0("estimated propensity"))
+# }
+W <- 1 / .clipped(ProbPred)
+# W <- 1 / ProbPred * .overlap_weighting_g(ProbPred)
+# W <- 1 / ProbPred * .entropy_weighting_g(ProbPred)
+# par(mfrow = c(3, 4))
+# for (j in 1 : ncol(X_original)) {
+#   # plot(X_original[,j], .overlap_weighting_g(ProbPred_integrated[,j]),
+#   #      xlab = paste0("X", j), ylab = paste0("OW"))
+#   plot(X_original[,j], 1/ProbPred_integrated[,j] *.overlap_weighting_g(ProbPred_integrated[,j]),
+#        xlab = paste0("X", j), ylab = paste0("OW"))
+#   # plot(X_original[,j], .entropy_weighting_g(ProbPred_integrated[,j]),
+#   #      xlab = paste0("X", j), ylab = paste0("EW"))
+# }
+res_cl <- clamp_categorical(X=X_original, y=Y, W=W,
+maxL = 5,
+max_iter = 20,
+nboots = 100,
+seed = 123,
+verbose = T)
+plot(res_cl$level_pip)
+plot(res_cl$variable_pip)
+plot(res_cl$elbo, type = "o")
+plot(res_cl$loglik, type = "o")
+clamp_summarize_coefficients(res_cl)
+summary(res_cl)
+plot_select_results(res_cl$variable_pip, effect_ind) +
+scale_x_continuous(breaks = seq(1, 10),
+labels = paste0("X", as.character(1:10))) +
+ylab("variable-wise PIP")
+level_names <- paste0(rep(1:10, each = 2), "_", 1:2)
+plot_select_results(res_cl$level_pip, (2*effect_ind-1):(2*effect_ind)) +
+scale_x_continuous(breaks = seq(1:20),
+labels = paste0("X", level_names)) +
+theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
+ylab("level-wise PIP") +
+xlab("variable_level")
+resY <- residuals(lm(Y ~ U))
+res_su <- susieR::susie(X_original, resY, L = 5)
+plot(res_su$pip)
+susieR::summary.susie(res_su)
+plot_select_results(res_su$pip, effect_ind) +
+scale_x_continuous(breaks = seq(1, 10),
+labels = paste0("X", as.character(1:10))) +
+ylab("variable-wise PIP")
+res_su0 <- susieR::susie(X_original, Y, L = 5)
+plot(res_su0$pip)
+susieR::summary.susie(res_su0)
+plot_select_results(res_su0$pip, effect_ind) +
+scale_x_continuous(breaks = seq(1, 10),
+labels = paste0("X", as.character(1:10))) +
+ylab("variable-wise PIP")
+res_la <- glmnet::cv.glmnet(X_original, Y, family = "gaussian", alpha = 1)
+plot_select_results(abs(as.numeric(coef(res_la, s = "lambda.min"))[-1]),
+effect_idx = effect_ind) +
+ylab("|coefficients|")
+Y <- X_original[,effect_ind, drop=F] %*%
+as.matrix(rep(1, times = length(effect_ind))) + 5*U + eps
+set.seed(123)
+n <- 500
+p <- 20
+K <- 3  ## 3-level categorical variable.
+# confounder
+U <- rnorm(n, mean = 0.2)
+# potential outcomes
+eps <- rnorm(n)
+# When mean(U) = 0.5
+# ============ f1 ============
+f1func <- function(.x, .u) return(.x + .u + .x*.u)
+## E[f1(0,U)] = 0.5; E[f1(1,U)] = 2; E[f1(2,U)] = 3.5
+## Delta1 = 1.5; Delta2 = 3
+# ============ f2 ============
+f2func <- function(.x, .u) return(2*(.x - 0.8*.u))
+## (E[f2(0,U)] = -0.4; E[f2(1,U)] = 0.6; E[f2(2,U)] = 1.6)*2
+## Delta1 = 2; Delta2 = 4
+# ============ f3 ============
+f3func <- function(.x, .u) return(-.x^2 + .u)
+## E[f3(0,U)] = 0.5, E[f3(1,U)] = -0.5, E[f3(2,U)] = -3.5
+## (Delta1 = -1, Delta2 = -4)
+# treatment assignment: multinomial logistic regression (softmax)
+## .lp: matrix of linear predictors (s). It should contain (K-1) columns, and each column is an n-vector of the linear predictors of the k-th level (1 <= k < K)
+softmax <- function(.lp) {
+K <- length(.lp) + 1  ## number of levels
+exp.s <- apply(.lp, 2, exp)  ## exp(LinearPredictor), n by (K-1)
+Z <- rowSums(exp.s) + 1  ## denominator
+prob.mat <- cbind(1, exp.s) / Z
+return(prob.mat)
+}
+xi.matrix <- matrix(nrow = 2, ncol = p*(K-1))
+# 1st row: intercepts
+# 2nd row: slopes of U
+intercepts_base <- 0
+slopes <- seq(from = -10, to = -2, length.out = p*(K-1))
+slopes_eps <- rnorm(n=length(slopes), sd = 0.1)
+xi.matrix[1,] <- intercepts_base
+xi.matrix[2,] <- slopes + slopes_eps
+LinearPred <- apply(xi.matrix, 2, function(x) x[1] + x[2]*U)
+X <- matrix(NA)
+for (j in 1 : p) {
+probs <- softmax(LinearPred[, ((j-1)*(K-1)+1) : (j*(K-1)) , drop=F])
+Xj <- t(apply(probs, 1, function(pr) {rmultinom(1, 1, prob = pr)})) ## n by K
+if (sum(is.na(X))) {
+X <- Xj
+} else {
+X <- cbind(X, Xj)
+}
+}
+colnames(X) <- paste0(rep(paste0("X", 1:p, "_"), each = K), 0:(K-1))
+X <- apply(X, 2, as.double)
+## Turn it into a n by p matrix with entries 0, 1, 2
+X_original <- sweep(X, 2, as.vector(rep(1:K, times = p)), "*")
+variable_names <- sub("_.*", "", colnames(X_original))
+X_original <- sapply(unique(variable_names), function(v) {
+rowSums(X_original[, variable_names == v, drop=F])
+})
+X_original <- X_original - 1
+head(X_original)
+corrplot::corrplot(cor(X_original), method = 'number')
+############################################
+# observed outcome
+# effect_ind <- sample(1:p, size = 3)
+effect_ind <- c(4)
+# Y <-
+#   X[,(effect_ind[1] - 1) * K + 1] * f1func(0, U) +
+#   X[,(effect_ind[1] - 1) * K + 2] * f1func(1, U) +
+#   X[,(effect_ind[1] - 1) * K + 3] * f1func(2, U) +
+#   X[,(effect_ind[2] - 1) * K + 1] * f2func(0, U) +
+#   X[,(effect_ind[2] - 1) * K + 2] * f2func(1, U) +
+#   X[,(effect_ind[2] - 1) * K + 3] * f2func(2, U) + eps
+# X[,(effect_ind[3] - 1) * K + 1] * f3func(0, U) +
+# X[,(effect_ind[3] - 1) * K + 2] * f3func(1, U) +
+# X[,(effect_ind[3] - 1) * K + 3] * f3func(2, U) + eps
+Y <- X_original[,effect_ind, drop=F] %*%
+as.matrix(rep(1, times = length(effect_ind))) + 5*U + eps
+## Robust approach 1: IPW truncation
+.clipped <- function(.val, .alpha = 0.05) {
+ifelse(.val < .alpha, .alpha,
+ifelse(.val > (1-.alpha), 1-.alpha, .val))
+}
+## Robust approach 2: balancing weight
+### Option 1: overlap weighting
+.overlap_weighting_g <- function(.e) { .e * (1 - .e) }
+### Option 2: Entropy weighting
+.entropy_weighting_g <- function(.e) {-(.e * .loge(.e) + (1-.e) * .loge(1-.e) )}
+## estimated confounder
+# Uhat <- rsvd::rsvd(X_original, k = 3)$u
+# require(nnet)
+ProbPred <- matrix(nrow = nrow(X), ncol = ncol(X)) ## same size as X
+col_indices <- colnames(X)
+for (j in 1 : p) {
+jcols <- grepl(paste0("X", as.character(j), "_"), col_indices)
+Xj <- X[, jcols, drop=F]
+# mn.fit <- multinom(Xj ~ Uhat)
+mn.fit <- multinom(Xj ~ U)
+ProbPred[, jcols] <- predict(mn.fit, type = "probs")
+}
+colnames(ProbPred) <- colnames(X)
+## Check the propensities in the specific levels
+ProbPred_integrated <- X * ProbPred
+variable_names <- sub("_.*", "", colnames(X))
+ProbPred_integrated <- sapply(unique(variable_names), function(var){
+rowSums(ProbPred_integrated[, variable_names == var, drop=F])
+})
+# par(mfrow = c(3, 4))  # create a 3 x 3 plotting matrix
+# for (j in 1 : ncol(X_original)) {
+#   plot(X_original[,j], ProbPred_integrated[,j],
+#        xlab = paste0("X", j), ylab = paste0("estimated propensity"))
+# }
+W <- 1 / .clipped(ProbPred)
+# W <- 1 / ProbPred * .overlap_weighting_g(ProbPred)
+# W <- 1 / ProbPred * .entropy_weighting_g(ProbPred)
+# par(mfrow = c(3, 4))
+# for (j in 1 : ncol(X_original)) {
+#   # plot(X_original[,j], .overlap_weighting_g(ProbPred_integrated[,j]),
+#   #      xlab = paste0("X", j), ylab = paste0("OW"))
+#   plot(X_original[,j], 1/ProbPred_integrated[,j] *.overlap_weighting_g(ProbPred_integrated[,j]),
+#        xlab = paste0("X", j), ylab = paste0("OW"))
+#   # plot(X_original[,j], .entropy_weighting_g(ProbPred_integrated[,j]),
+#   #      xlab = paste0("X", j), ylab = paste0("EW"))
+# }
+res_cl <- clamp_categorical(X=X_original, y=Y, W=W,
+maxL = 5,
+max_iter = 20,
+nboots = 100,
+seed = 123,
+verbose = T)
+plot(res_cl$level_pip)
+plot(res_cl$variable_pip)
+plot(res_cl$elbo, type = "o")
+plot(res_cl$loglik, type = "o")
+clamp_summarize_coefficients(res_cl)
+summary(res_cl)
+plot_select_results(res_cl$variable_pip, effect_ind) +
+scale_x_continuous(breaks = seq(1, 10),
+labels = paste0("X", as.character(1:10))) +
+ylab("variable-wise PIP")
+level_names <- paste0(rep(1:10, each = 2), "_", 1:2)
+plot_select_results(res_cl$level_pip, (2*effect_ind-1):(2*effect_ind)) +
+scale_x_continuous(breaks = seq(1:20),
+labels = paste0("X", level_names)) +
+theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
+ylab("level-wise PIP") +
+xlab("variable_level")
+resY <- residuals(lm(Y ~ U))
+res_su <- susieR::susie(X_original, resY, L = 5)
+plot(res_su$pip)
+susieR::summary.susie(res_su)
+plot_select_results(res_su$pip, effect_ind) +
+scale_x_continuous(breaks = seq(1, 10),
+labels = paste0("X", as.character(1:10))) +
+ylab("variable-wise PIP")
+res_su0 <- susieR::susie(X_original, Y, L = 5)
+plot(res_su0$pip)
+susieR::summary.susie(res_su0)
+plot_select_results(res_su0$pip, effect_ind) +
+scale_x_continuous(breaks = seq(1, 10),
+labels = paste0("X", as.character(1:10))) +
+ylab("variable-wise PIP")
+res_la <- glmnet::cv.glmnet(X_original, Y, family = "gaussian", alpha = 1)
+plot_select_results(abs(as.numeric(coef(res_la, s = "lambda.min"))[-1]),
+effect_idx = effect_ind) +
+ylab("|coefficients|")
+# Function for expit
+expit <- function(x) ifelse(x > 0, 1 / (1 + exp(-x)), exp(x) / (1 + exp(x)))
+# Load libraries
+library(ggplot2)
+library(reshape2)
+set.seed(123)
+# Parameters
+n <- 500   # number of individuals
+p <- 200    # number of treatments
+# Simulate latent confounder U
+u <- rnorm(n, mean = 0, sd = 1)
+# Function for expit
+expit <- function(x) ifelse(x > 0, 1 / (1 + exp(-x)), exp(x) / (1 + exp(x)))
+# Simulate propensity scores matrix (n x p)
+E <- matrix(NA, nrow = n, ncol = p)
+for (j in 1:p) {
+eps <- rnorm(n, mean = 0, sd = 0.1)
+linear_pred <- ((j - 0.5 * p) / (p + 1)) * u + eps
+E[, j] <- expit(linear_pred)
+}
+colnames(E) <- paste0("X", 1:p)
+# Correlation matrix
+cor_mat <- cor(E)
+# Convert to long format for ggplot
+cor_df <- melt(cor_mat)
+colnames(cor_df) <- c("Var1", "Var2", "Correlation")
+# Correlation heatmap
+ggplot(cor_df, aes(x = Var1, y = Var2, fill = Correlation)) +
+geom_tile(color = "white") +
+scale_fill_gradient2(low = "blue", mid = "white", high = "red",
+midpoint = 0, limit = c(-1, 1)) +
+theme_minimal() +
+theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
+coord_fixed() +
+labs(title = "Correlation Plot of Propensity Scores",
+x = "Treatment Variables",
+y = "Treatment Variables")
+corrplot::corrplot(cor_mat)
+ggplot(cor_df, aes(x = Var1, y = Var2, fill = Correlation)) +
+geom_tile(color = "white") +
+scale_fill_gradient2(low = "blue", mid = "white", high = "red",
+midpoint = 0, limit = c(-1, 1)) +
+theme_minimal() +
+theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
+coord_fixed() +
+labs(title = "Correlation Plot of Propensity Scores",
+x = "Treatment Variables",
+y = "Treatment Variables")
+# Correlation heatmap
+ggplot(cor_df, aes(x = Var1, y = Var2, fill = Correlation)) +
+geom_tile(color = "white") +
+scale_fill_gradient2(low = "blue", mid = "white", high = "red",
+midpoint = 0, limit = c(-1, 1)) +
+theme_minimal() +
+theme(axis.text.x = element_blank(),
+axis.text.y = element_blank(),
+axis.ticks = element_blank()) +
+coord_fixed() +
+labs(title = "Correlation Plot of Propensity Scores",
+x = "Treatment Variables",
+y = "Treatment Variables")
+# Correlation heatmap
+ggplot(cor_df, aes(x = Var1, y = Var2, fill = Correlation)) +
+geom_tile(color = "white") +
+scale_fill_gradient2(low = "blue", mid = "white", high = "red",
+midpoint = 0, limit = c(-1, 1)) +
+theme_minimal() +
+theme(axis.text.x = element_blank(),
+axis.text.y = element_blank(),
+axis.ticks = element_blank()) +
+coord_fixed() +
+labs(#title = "Correlation plot of Propensity Scores",
+x = "Treatment Variables (Var 1)",
+y = "Treatment Variables (Var 2)")
+# Correlation heatmap
+ggplot(cor_df, aes(x = Var1, y = Var2, fill = Correlation)) +
+geom_tile(color = "white") +
+scale_fill_gradient2(low = "blue", mid = "white", high = "red",
+midpoint = 0, limit = c(-1, 1)) +
+theme_minimal() +
+theme(axis.text.x = element_blank(),
+axis.text.y = element_blank(),
+axis.ticks = element_blank()) +
+coord_fixed() +
+labs(#title = "Correlation plot of Propensity Scores",
+x = "Treatment Variables",
+y = "Treatment Variables")
+cor_mat[c(1, 2, 31, 65, 192), c(1, 2, 31, 65, 192)]
+>>>>>>> Stashed changes
diff --git a/vignettes/clamp-cat-exp-causal.Rmd b/vignettes/clamp-cat-exp-causal.Rmd
new file mode 100644
index 0000000..9cc5cbf
--- /dev/null
+++ b/vignettes/clamp-cat-exp-causal.Rmd
@@ -0,0 +1,445 @@
+---
+title: "Finding causal categorical variables using CLAMP"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{clamp-cat-exp-causal}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>"
+)
+```
+
+```{r setup, include=FALSE}
+devtools::load_all(".")
+# library(clamp)
+library(nnet)
+library(data.table)
+
+data.dir <- "~/Documents/research/data/genotype/"
+```
+
+```{r, message=F, warning=F}
+## for plots
+library(tidyverse)
+myPalette2 <- c("#E69F00", "#999999", "#00757F")
+myPalette3 <- c("dodgerblue2", "green4", "#6A3D9A", "#FF7F00", 
+        "gold1", "skyblue2", "#FB9A99", "palegreen2", "#CAB2D6", 
+        "#FDBF6F", "gray70", "khaki2", "maroon", "orchid1", "deeppink1", 
+        "blue1", "steelblue4", "darkturquoise", "green1", "yellow4", 
+        "yellow3", "darkorange4", "brown")
+plot_select_results <- function(vals, effect_idx, cs_list = NULL) {
+  tibble(vals = vals) %>%
+    mutate(id = 1 : n()) %>%
+    mutate(true_label = if_else(id %in% effect_idx, "causal", "noncausal")) %>%
+    arrange(desc(true_label)) %>%
+    ggplot(aes(x = id, y = vals, color = as.factor(true_label))) +
+    geom_point(size = 3) +
+    scale_color_manual(values = myPalette2[1:2]) +
+    labs(x = "variable", color = "") +
+    theme_classic()
+}
+
+# tab <- tibble(vals = vals) %>%
+#   mutate(id = 1 : n(), cs = -1) %>% 
+#   mutate(true_label = if_else(id %in% effect_idx, "causal", "noncausal")) %>%
+#   arrange(desc(true_label))
+# 
+# for (i in seq_along(cs_list)) {
+#   tab$cs[tab$id %in% cs_list[[i]]] <- i
+# }
+# 
+# ggplot(data = tab) + 
+#   geom_point(aes(x = id, y = vals, 
+#                  color = as.factor(true_label), 
+#                  shape = factor(cs)), size = 3) +
+#   scale_color_manual(values = myPalette2[1:2]) +
+#   scale_shape_manual()
+```
+
+## Toy example: One causal exposures
+
+```{r}
+set.seed(123)
+n <- 500
+p <- 20
+K <- 3  ## 3-level categorical variable.
+
+# confounder
+U <- rnorm(n, mean = 0.2)
+
+# potential outcomes
+eps <- rnorm(n)
+
+# When mean(U) = 0.5
+# ============ f1 ============
+f1func <- function(.x, .u) return(.x + .u + .x*.u)
+## E[f1(0,U)] = 0.5; E[f1(1,U)] = 2; E[f1(2,U)] = 3.5
+## Delta1 = 1.5; Delta2 = 3
+# ============ f2 ============
+f2func <- function(.x, .u) return(2*(.x - 0.8*.u))
+## (E[f2(0,U)] = -0.4; E[f2(1,U)] = 0.6; E[f2(2,U)] = 1.6)*2
+## Delta1 = 2; Delta2 = 4
+# ============ f3 ============
+f3func <- function(.x, .u) return(-.x^2 + .u)
+## E[f3(0,U)] = 0.5, E[f3(1,U)] = -0.5, E[f3(2,U)] = -3.5
+## (Delta1 = -1, Delta2 = -4)
+
+# treatment assignment: multinomial logistic regression (softmax)
+## .lp: matrix of linear predictors (s). It should contain (K-1) columns, and each column is an n-vector of the linear predictors of the k-th level (1 <= k < K)
+softmax <- function(.lp) {
+  K <- length(.lp) + 1  ## number of levels
+  
+  exp.s <- apply(.lp, 2, exp)  ## exp(LinearPredictor), n by (K-1)
+  Z <- rowSums(exp.s) + 1  ## denominator
+  
+  prob.mat <- cbind(1, exp.s) / Z
+  return(prob.mat)
+}
+
+xi.matrix <- matrix(nrow = 2, ncol = p*(K-1))
+# 1st row: intercepts
+# 2nd row: slopes of U
+intercepts_base <- 0
+slopes <- seq(from = -10, to = -2, length.out = p*(K-1))
+slopes_eps <- rnorm(n=length(slopes), sd = 0.1)
+xi.matrix[1,] <- intercepts_base
+xi.matrix[2,] <- slopes + slopes_eps
+LinearPred <- apply(xi.matrix, 2, function(x) x[1] + x[2]*U)
+X <- matrix(NA)
+for (j in 1 : p) {
+  probs <- softmax(LinearPred[, ((j-1)*(K-1)+1) : (j*(K-1)) , drop=F])
+  Xj <- t(apply(probs, 1, function(pr) {rmultinom(1, 1, prob = pr)})) ## n by K
+  
+  if (sum(is.na(X))) {
+    X <- Xj
+  } else { 
+      X <- cbind(X, Xj) 
+  }
+}
+colnames(X) <- paste0(rep(paste0("X", 1:p, "_"), each = K), 0:(K-1))
+X <- apply(X, 2, as.double)
+
+## Turn it into a n by p matrix with entries 0, 1, 2
+X_original <- sweep(X, 2, as.vector(rep(1:K, times = p)), "*")
+variable_names <- sub("_.*", "", colnames(X_original))
+X_original <- sapply(unique(variable_names), function(v) {
+  rowSums(X_original[, variable_names == v, drop=F])
+})
+X_original <- X_original - 1
+head(X_original)
+corrplot::corrplot(cor(X_original), method = 'number')
+
+
+############################################
+# observed outcome
+# effect_ind <- sample(1:p, size = 3)
+effect_ind <- c(4)
+# Y <- 
+#   X[,(effect_ind[1] - 1) * K + 1] * f1func(0, U) +
+#   X[,(effect_ind[1] - 1) * K + 2] * f1func(1, U) +
+#   X[,(effect_ind[1] - 1) * K + 3] * f1func(2, U) +
+#   X[,(effect_ind[2] - 1) * K + 1] * f2func(0, U) +
+#   X[,(effect_ind[2] - 1) * K + 2] * f2func(1, U) +
+#   X[,(effect_ind[2] - 1) * K + 3] * f2func(2, U) + eps
+  # X[,(effect_ind[3] - 1) * K + 1] * f3func(0, U) +
+  # X[,(effect_ind[3] - 1) * K + 2] * f3func(1, U) +
+  # X[,(effect_ind[3] - 1) * K + 3] * f3func(2, U) + eps
+
+Y <- X_original[,effect_ind, drop=F] %*% 
+  as.matrix(rep(1, times = length(effect_ind))) + 5*U + eps
+```
+
+### Step 0: estimate propensity scores
+
+```{r}
+## Robust approach 1: IPW truncation
+.clipped <- function(.val, .alpha = 0.05) {
+  ifelse(.val < .alpha, .alpha,
+         ifelse(.val > (1-.alpha), 1-.alpha, .val))
+}
+
+## Robust approach 2: balancing weight
+
+### Option 1: overlap weighting
+.overlap_weighting_g <- function(.e) { .e * (1 - .e) }
+
+### Option 2: Entropy weighting
+.entropy_weighting_g <- function(.e) {-(.e * .loge(.e) + (1-.e) * .loge(1-.e) )}
+```
+
+
+
+```{r}
+## estimated confounder
+# Uhat <- rsvd::rsvd(X_original, k = 3)$u
+  
+# require(nnet)
+ProbPred <- matrix(nrow = nrow(X), ncol = ncol(X)) ## same size as X
+col_indices <- colnames(X)
+for (j in 1 : p) {
+  jcols <- grepl(paste0("X", as.character(j), "_"), col_indices)
+  Xj <- X[, jcols, drop=F]
+  # mn.fit <- multinom(Xj ~ Uhat)
+  mn.fit <- multinom(Xj ~ U)
+  ProbPred[, jcols] <- predict(mn.fit, type = "probs")
+}
+colnames(ProbPred) <- colnames(X)
+```
+
+```{r}
+## Check the propensities in the specific levels
+ProbPred_integrated <- X * ProbPred
+variable_names <- sub("_.*", "", colnames(X))
+ProbPred_integrated <- sapply(unique(variable_names), function(var){
+    rowSums(ProbPred_integrated[, variable_names == var, drop=F])
+  })
+
+# par(mfrow = c(3, 4))  # create a 3 x 3 plotting matrix
+# for (j in 1 : ncol(X_original)) {
+#   plot(X_original[,j], ProbPred_integrated[,j],
+#        xlab = paste0("X", j), ylab = paste0("estimated propensity"))
+# }
+```
+
+```{r}
+W <- 1 / .clipped(ProbPred)
+# W <- 1 / ProbPred * .overlap_weighting_g(ProbPred)
+# W <- 1 / ProbPred * .entropy_weighting_g(ProbPred)
+
+# par(mfrow = c(3, 4))
+# for (j in 1 : ncol(X_original)) {
+#   # plot(X_original[,j], .overlap_weighting_g(ProbPred_integrated[,j]),
+#   #      xlab = paste0("X", j), ylab = paste0("OW"))
+#   plot(X_original[,j], 1/ProbPred_integrated[,j] *.overlap_weighting_g(ProbPred_integrated[,j]),
+#        xlab = paste0("X", j), ylab = paste0("OW"))
+#   # plot(X_original[,j], .entropy_weighting_g(ProbPred_integrated[,j]),
+#   #      xlab = paste0("X", j), ylab = paste0("EW"))
+# }
+```
+
+
+### Fit: method 1 - PIP for each variable/independent levels
+```{r}
+res_cl <- clamp_categorical(X=X_original, y=Y, W=W, 
+                            maxL = 5,
+                            max_iter = 20,
+                            nboots = 100,
+                            seed = 123, 
+                            verbose = T)
+plot(res_cl$level_pip)
+plot(res_cl$variable_pip)
+plot(res_cl$elbo, type = "o")
+plot(res_cl$loglik, type = "o")
+
+clamp_summarize_coefficients(res_cl)
+
+summary(res_cl)
+```
+
+```{r}
+plot_select_results(res_cl$variable_pip, effect_ind) +
+  scale_x_continuous(breaks = seq(1, 10), 
+                     labels = paste0("X", as.character(1:10))) + 
+  ylab("variable-wise PIP")
+```
+
+```{r}
+level_names <- paste0(rep(1:10, each = 2), "_", 1:2)
+plot_select_results(res_cl$level_pip, (2*effect_ind-1):(2*effect_ind)) + 
+  scale_x_continuous(breaks = seq(1:20), 
+                     labels = paste0("X", level_names)) +
+  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + 
+  ylab("level-wise PIP") + 
+  xlab("variable_level")
+```
+
+
+```{r}
+resY <- residuals(lm(Y ~ U))
+res_su <- susieR::susie(X_original, resY, L = 5)
+plot(res_su$pip)
+susieR::summary.susie(res_su)
+```
+
+```{r}
+plot_select_results(res_su$pip, effect_ind) +
+  scale_x_continuous(breaks = seq(1, 10), 
+                     labels = paste0("X", as.character(1:10))) + 
+  ylab("variable-wise PIP")
+
+```
+
+
+
+```{r}
+res_su0 <- susieR::susie(X_original, Y, L = 5)
+plot(res_su0$pip)
+susieR::summary.susie(res_su0)
+```
+
+```{r}
+plot_select_results(res_su0$pip, effect_ind) +
+  scale_x_continuous(breaks = seq(1, 10), 
+                     labels = paste0("X", as.character(1:10))) + 
+  ylab("variable-wise PIP")
+```
+
+
+
+```{r}
+res_la <- glmnet::cv.glmnet(X_original, Y, family = "gaussian", alpha = 1)
+plot_select_results(abs(as.numeric(coef(res_la, s = "lambda.min"))[-1]), 
+                    effect_idx = effect_ind) +
+  ylab("|coefficients|")
+```
+
+
+## Toy example 2: Genetics Dataset
+
+```{r}
+library(bigsnpr)
+`%&%` <- function(a,b) paste0(a,b)
+if.needed <- function(.files, .code) {
+  if(!all(file.exists(.files))){
+    .code
+  }
+}
+dir.create(data.dir, recursive=TRUE, showWarnings=FALSE)
+
+.bed.file <- data.dir %&% "1000G_phase3_common_norel.bed"
+if.needed(.bed.file, {
+  bigsnpr::download_1000G(data.dir)
+})
+.bk.file <- data.dir %&% "1000G_phase3_common_norel.rds"
+if.needed(.bk.file, {
+  BED <- snp_readBed(.bed.file)
+})
+dat <- snp_attach(.bk.file)
+dat.genotypes <- dat$genotypes
+rm(dat)
+```
+
+Get the population structure and treat it as the confounder. 
+
+```{r}
+pop.info.file <- data.dir %&% "1000G_phase3_common_norel.fam2"
+pop.info <- fread(pop.info.file)
+head(pop.info)
+# all(pop.info$sample.ID == dat$fam$sample.ID)
+
+# Use pop.info$`Super Population` as the confounder.
+U <- fastDummies::dummy_cols(pop.info$`Super Population`, 
+                                     remove_first_dummy = F, 
+                                     remove_selected_columns = T)
+U <- sapply(U, as.numeric)
+gamma_u <- rnorm(ncol(U), mean = 1)
+
+rm(pop.info)
+```
+
+```{r}
+nn <- nrow(dat.genotypes)
+pp <- 10
+startpoint <- 1000
+X <- dat.genotypes[1:nn, startpoint:(startpoint+pp-1)]
+colnames(X) <- paste0("X", 1:pp)
+
+# check minor allele frequencies and remove columns whose MAF <= 0.05
+MAF <- snp_MAF(dat.genotypes, ind.col = startpoint:(startpoint+pp-1))
+plot(MAF); abline(h=0.05)
+remove_snp_cols <- which(MAF <= 0.05)
+X <- X[, -remove_snp_cols, drop=F]
+dim(X)
+
+# randomly sample causal variables. 
+n_causal_vars <- 3
+causal_indices <- sort(sample(colnames(X), n_causal_vars))
+causal_effects <- rnorm(n_causal_vars)  ## Assume all additive effects
+print(data.frame(variable = causal_indices, effects = causal_effects))
+
+# generate response Y
+h2 <- 1
+esp <- rnorm(nn)
+y <- sqrt(h2) * (X[, causal_indices, drop=F] %*% as.matrix(causal_effects) 
+                 + U %*% as.matrix(gamma_u)) +
+  sqrt(1-h2) * esp
+```
+
+
+### Step 1: calculate the matrix of inverse propensity weights.
+
+
+```{r}
+# Approximate substitute confounder
+Uhat <- rsvd::rsvd(X, k = 2)$u
+
+# Expand X as one-hot version
+X_one_hot <- fastDummies::dummy_cols(apply(X, 2, as.factor), 
+                                     remove_first_dummy = F, 
+                                     remove_selected_columns = T)
+head(X_one_hot)
+X_one_hot <- apply(X_one_hot, 2, as.double)
+
+# Estimate the propensity scores
+ProbPred <- matrix(nrow = nrow(X_one_hot), ncol = ncol(X_one_hot))
+variable_names <- sub("_.*", "", colnames(X_one_hot))
+for (v in unique(variable_names)) {
+  Xj <- X_one_hot[, variable_names == v, drop=F]
+  mn.fit <- multinom(Xj ~ Uhat)
+  ProbPred[, variable_names == v] <- predict(mn.fit, type = "probs")
+}
+colnames(ProbPred) <- colnames(X_one_hot)
+
+ProbPred <- .clipped(ProbPred)
+W <- 1 / ProbPred
+```
+
+
+```{r}
+## Check the propensities in the specific levels
+ProbPred_integrated <- X_one_hot * ProbPred
+ProbPred_integrated <- sapply(unique(variable_names), function(var){
+    rowSums(ProbPred_integrated[, variable_names == var, drop=F])
+  })
+
+par(mfrow = c(3, 3))  # create a 3 x 3 plotting matrix
+for (j in 1 : ncol(X)) {
+  plot(X[,j], ProbPred_integrated[,j],
+       xlab = paste0("X", j), ylab = paste0("estimated propensity"))
+}
+```
+
+
+```{r}
+ggplot(data.frame(X = X[, 9], Uhat=Uhat), aes(x = Uhat, y = X)) +
+  geom_point() +
+  theme_bw()
+```
+
+### Step 2: fit the clamp
+
+```{r}
+X_sub <- X[, !(colnames(X) %in% c("X5", "X7", "X9"))] 
+W_sub <- W[, !sub("_.*", "", colnames(W)) %in% c("X5", "X7", "X9")]
+
+res_cl <- clamp_categorical(X=X_sub, y=y, W=W_sub, max_iter = 1,
+                            seed = 123, verbose = T)
+```
+
+```{r}
+plot(res_cl$level_pip)
+plot(res_cl$variable_pip)
+plot(res_cl$elbo, type = "o")
+
+clamp_summarize_coefficients(res_cl)
+
+summary(res_cl)
+```
+
+
+
diff --git a/vignettes/clamp-linear-causal.Rmd b/vignettes/clamp-linear-causal.Rmd
index aef807d..ee8d2d5 100644
--- a/vignettes/clamp-linear-causal.Rmd
+++ b/vignettes/clamp-linear-causal.Rmd
@@ -38,7 +38,6 @@ This is a simulated data set `X` generated by random numbers.
 ```{r synthetic data}
 set.seed(107109)
 nn <- 500
-# pp <- 1000
 pp <- 200
 h2 <- 0.3
 U <- rnorm(nn)
@@ -119,7 +118,6 @@ plot(avg_elbo, type = "l")
 ```{r}
 res_cl3 <- clamp(X, y, W=Wmat, 
                 standardize = F,
-                mle_estimator = "WLS",
                 max_iter = 10,
                 maxL = 10,
                 seed = 123, 
@@ -132,7 +130,6 @@ summarize_coefficients(res_cl3)
 ```{r}
 res_cl2 <- clamp(X, y, W=Wmat, 
                 standardize = T,
-                mle_estimator = "WLS",
                 max_iter = 20,
                 maxL = 10,
                 seed = 123,