Scalability/scalability.R

#' ---
#' title: "Regression and Other Stories: Scalability"
#' author: "Andrew Gelman, Jennifer Hill, Aki Vehtari"
#' date: "`r format(Sys.Date())`"
#' output:
#'   html_document:
#'     theme: readable
#'     toc: true
#'     toc_depth: 2
#'     toc_float: true
#'     code_download: true
#' ---

#' Demonstrate how the computation time scales with bigger data. See
#' Chapter 22 in Regression and Other Stories.
#' 
#' -------------
#'

#+ setup, include=FALSE
knitr::opts_chunk$set(message=FALSE, error=FALSE, warning=FALSE, comment=NA)

#' #### Load packages
library(arm)
library(rstanarm)
options(mc.cores = parallel::detectCores())
library(tictoc)

#' ## Linear regression n=100K, p=100
#' 
#' #### Create fake data with n=100\,000 and p=100
SEED <- 1656
set.seed(SEED)
n <- 1e5
k <- 100
x <- rnorm(n)
xn <- matrix(rnorm(n*(k-1)),nrow=n)
a <- 2
b <- 3
sigma <- 10
y <- a + b*x + sigma*rnorm(n)
fake <- data.frame(x, xn, y)

#' #### Fit using lm
tic()
fit1 <- lm(y ~ ., data=fake)
toc()
#display(fit1)

#' #### Fit using stan_glm and MCMC
#' 
#' stan_glm is fast for linear regression with n>k and small or
#' moderate k (using OLS trick)
tic()
#+ results='hide'
fit2 <- stan_glm(y ~ ., data=fake, mean_PPD=FALSE,
                 refresh=500, seed=SEED, cores=1)
#+
toc()
#print(fit2, digits=2)

#' #### Fit using stan_glm and optimization
#' 
#' Using optimization with normal approximation and Pareto smoothed
#' importance resampling provides coefficient standard errors, but
#' also diagnostic whether normal approximation at the mode is
#' appropriate.
tic()
fit3 <- stan_glm(y ~ ., data=fake, algorithm='optimizing', init=0) 
toc()
#print(fit3, digits=2)

#' ## Logistic regression n=10K, p=100
#' 
#' #### Create fake data with 10\,000 observations and p=100
SEED <- 1655
set.seed(SEED)
n <- 1e4
k <- 100
x <- rnorm(n)
xn <- matrix(rnorm(n*(k-1)),nrow=n)
a <- 2
b <- 3
sigma <- 1
y <- as.numeric(a + b*x + sigma*rnorm(n) > 0)
fake <- data.frame(x, xn, y)

#' #### Fit using glm
tic()
fit1 <- glm(y ~ ., data=fake, family=binomial())
toc()
#display(fit1)

#' #### Fit using stan_glm and MCMC
tic()
fit2 <- stan_glm(y ~ ., data=fake, family=binomial(), mean_PPD=FALSE,
                 init_r=0.1, seed=SEED)
toc()
#print(fit2, digits=2)

#' #### Fit using stan_glm and optimization
#' 
#' Using optimization with normal approximation and Pareto smoothed
#' importance resampling provides coefficient standard errors, but
#' also diagnostic whether normal approximation at the mode is
#' appropriate.
tic()
fit3 <- stan_glm(y ~ ., data=fake, family=binomial(),
                algorithm='optimizing', init=0, draws = 16000, keep_every = 4) 
toc()
#print(fit3, digits=2)

#' ## Logistic regression n=100K, p=100
#' 
#' #### Create fake data with 100\,000 observations and p=100
SEED <- 1655
set.seed(SEED)
n <- 1e5
k <- 100
x <- rnorm(n)
xn <- matrix(rnorm(n*(k-1)),nrow=n)
a <- 2
b <- 3
sigma <- 1
y <- as.numeric(a + b*x + sigma*rnorm(n) > 0)
fake <- data.frame(x, xn, y)

#' #### Fit using glm
tic()
fit1 <- glm(y ~ ., data=fake, family=binomial())
toc()
#display(fit1)

#' #### Fit using stan_glm and optimization
#' 
#' Using optimization with normal approximation and Pareto smoothed
#' importance resampling provides coefficient standard errors, but
#' also diagnostic whether normal approximation at the mode is
#' appropriate.
tic()
fit3 <- stan_glm(y ~ ., data=fake, family=binomial(),
                 algorithm='optimizing', init=0) 
toc()
summary(fit3, digits=2)