Skip to content

Commit

Permalink
cleaning and initial make overs
Browse files Browse the repository at this point in the history
  • Loading branch information
avehtari committed Feb 26, 2018
1 parent a89630d commit 05e7178
Show file tree
Hide file tree
Showing 77 changed files with 827 additions and 380 deletions.
File renamed without changes.
File renamed without changes.
Empty file removed Congress/cong3/Icon
Empty file.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
setwd("~/AndrewFiles/books/regression.and.other.stories/Examples/Earnings")

library("here")
library("foreign")
earnings <- read.dta("earnings.dta")

## create variables for age and ethnicity categories
#' Load original data
earnings <- read.dta(here("Earnings/data","earnings.dta"))

#' Create variables for age and ethnicity categories
earnings$age <- 90 - earnings$yearbn # survey was conducted in 1990
earnings$age[earnings$age < 18] <- NA
earnings$age_category <- with(earnings, ifelse(age < 35, 1, ifelse(age < 50, 2, 3)))
earnings$eth <- with(earnings, ifelse(race==2, 1, ifelse(hisp==1, 2, ifelse(race==1, 3, 4))))
earnings$male <- 2 - earnings$sex

## (For simplicity) remove cases with missing data and restrict to people with positive earnings born after 1925

#' (For simplicity) remove cases with missing data and restrict
#' to people born after 1925
ok <- with(earnings, !is.na(earn + height + sex + age) & yearbn > 2)
earnings_clean <- earnings[ok,]
write.csv(earnings_clean, "earnings.csv")
write.csv(earnings_clean, here("Earnings/data","earnings.csv"))
61 changes: 38 additions & 23 deletions Earnings/earnings_bootstrap.R
Original file line number Diff line number Diff line change
@@ -1,37 +1,52 @@
setwd("~/AndrewFiles/books/regression.and.other.stories/Examples/Earnings")
#' ---
#' title: "Regression and Other Stories: Earnings"
#' author: "Andrew Gelman, Aki Vehtari"
#' date: "`r format(Sys.Date())`"
#' ---

#' Bootstrapping to simulate the sampling distribution
#'
#' -------------
#'

#' **Load libraries**
#+ setup, message=FALSE, error=FALSE, warning=FALSE
library("here")
library("arm")

## classical regressions and graphs for earnings example

earnings_clean <- read.csv("earnings.csv")
n <- nrow(earnings_clean)

earn <- earnings_clean$earn
height <- earnings_clean$height
male <- earnings_clean$male

colnames(height_data) <- c("ID", "earn", "height", "male")
print(height_data[1:10,])

print(mean(height[male]) / mean(height[!male]))

n <- length(height)
#' **Load data**
earnings <- read.csv(here("Earnings/data","earnings.csv"))
earnings_all <- read.csv(here("Earnings/data","earnings.csv"))
earnings_all$positive <- earnings_all$earn > 0
#' only non-zero earnings
earnings <- earnings_all[earnings_all$positive, ]
n <- nrow(earnings)
earn <- earnings$earn
male <- earnings$male
print(earnings[1:10,])

#' **Median of women's earnings, divided by the median of men's earnings**
print(median(earn[male==0]) / median(earn[male==1]))

#' **A single bootstrap sample**
n <- length(earn)
boot <- sample(n, replace=TRUE)
height_boot <- height[boot]
earn_boot <- earn[boot]
male_boot <- male[boot]
ratio_boot <- mean(height_boot[male_boot]) / mean(height_boot[!male_boot])
ratio_boot <- median(earn_boot[male_boot==0]) / median(earn_boot[male_boot==1])

#' **A set of bootstrap simulations**
Boot_ratio <- function(data){
n <- nrow(data)
boot <- sample(n, replace=TRUE)
height_boot <- data$height[boot]
earn_boot <- data$earn[boot]
male_boot <- data$male[boot]
ratio_boot <- mean(height_boot[male_boot]) / mean(height_boot[!male_boot])
ratio_boot <- median(earn_boot[male_boot==0]) / median(earn_boot[male_boot==1])
return(ratio_boot)
}
n_sims <- 10000
output <- replicate(n_sims, Boot_ratio(data=earnings))

n_sims <- 100
output <- replicate(n_sims, Boot_ratio(data=earnings_clean))

#' **Summarize the results graphically and numerically**
hist(output)
print(sd(output))
104 changes: 67 additions & 37 deletions Earnings/earnings_regression.R
Original file line number Diff line number Diff line change
@@ -1,25 +1,42 @@
setwd("~/AndrewFiles/books/regression.and.other.stories/Examples/Earnings")
#' ---
#' title: "Regression and Other Stories: Earnings"
#' author: "Andrew Gelman, Aki Vehtari"
#' date: "`r format(Sys.Date())`"
#' ---

#' Predict respondents' yearly earnings using survey data from 1990.
#'
#' -------------
#'

#' **Load libraries**
#+ setup, message=FALSE, error=FALSE, warning=FALSE
library("here")
library("arm")
library("rstanarm")
options(mc.cores = parallel::detectCores())
library("ggplot2")
library("bayesplot")
theme_set(bayesplot::theme_default(base_family = "sans"))

## classical regressions and graphs for earnings example

earnings_all <- read.csv("earnings.csv")
#' **Load data**
earnings_all <- read.csv(here("Earnings/data","earnings.csv"))
earnings_all$positive <- earnings_all$earn > 0
# only non-zero earnings
earnings <- earnings_all[earnings_all$positive, ]
n <- nrow(earnings)
height_jitter_add <- runif(n, -.2, .2)

## model on original scale
#' ### Classical regression

#' **Model on original scale**
lm_0 <- lm(earn ~ height, data=earnings)
display(lm_0)

#+ heights1a, eval=FALSE, include=FALSE
# figure for the book, don't display in the report
pdf(here("Earnings/figs","heights1a.pdf"), height=8.5, width=11)
# plot linear model
pdf("heights1a.pdf", height=8.5, width=11)
par(mar=c(6,6,4,2)+.1)
plot(earnings$height + height_jitter_add, earnings$earn, xlab="height", ylab="earnings",
cex=.8, cex.lab=3, pch=20, cex.axis=3, yaxt="n", mgp=c(4,1.5,0),
Expand All @@ -29,7 +46,8 @@ abline(0,0,col="gray")
axis(2, c(0,100000,200000), c("0","100000","200000"), mgp=c(4,1.1,0),cex.axis=3)
dev.off()

# ggplot version
#+ fig1a
# plot linear model, ggplot version
gg_earnings <- ggplot(earnings, aes(x = height + height_jitter_add, y = earn)) +
geom_point(alpha = 0.75) +
geom_hline(yintercept = 0, color = "darkgray") +
Expand All @@ -39,8 +57,10 @@ gg_earnings <- ggplot(earnings, aes(x = height + height_jitter_add, y = earn)) +
gg_earnings


#+ heights1b, eval=FALSE, include=FALSE
# figure for the book, don't display in the report
# plot extrapolation
pdf("heights1b.pdf", height=8.5, width=11)
pdf(here("Earnings/figs","heights1b.pdf"), height=8.5, width=11)
par(mar=c(6,6,4,2)+.1)
plot(xlim=c(0,max(earnings$height)), ylim=c(-70000,200000), earnings$height + height_jitter_add, earnings$earn, xlab="height", ylab="earnings", cex=.8, cex.lab=3, pch=20, cex.axis=3, yaxt="n", mgp=c(4,1.5,0), col="gray10", cex.main=3, main="Extrapolation")
abline(coef(lm_0), lwd=2)
Expand All @@ -49,30 +69,34 @@ intercept <- coef(lm_0)[1]
axis(2, c(intercept,0,100000,200000), c(round(intercept,-2),"0","100000","200000"), mgp=c(4,1.1,0),cex.axis=3)
dev.off()

# ggplot version, modifying the gg_earnings object we already created
#+ fig1b
# plot extrapolation, ggplot version, modifying the gg_earnings object we already created
gg_earnings +
ylim(-70000, 200000) +
xlim(0, 80) +
labs(title = "Extrapolation")


# include male/female
#' **Include male/female**
lm_1 <- lm(earn ~ height + male, data=earnings)
display(lm_1)
coef1 <- coef(lm_1)

pdf("heights2.pdf", height=8.5, width=11)
#+ heights2, eval=FALSE, include=FALSE
# figure for the book, don't display in the report
pdf(here("Earnings/figs","heights2.pdf"), height=8.5, width=11)
par(mar=c(6,6,5,2)+.1)
plot(range(earnings$height), range(predict(lm_1)), type="n", xlab="height", ylab="predicted earnings", cex=.8, cex.lab=3, pch=20, cex.axis=3, mgp=c(4,1.5,0), yaxt="n", col="gray10",
cex.main=3, main="Fitted regression, displayed as\nseparate lines for men and women", bty="l")
axis(2, c(20000,30000), cex.axis=3)
abline(coef1[1], coef1[2], col="red", lwd=2)
text(68, coef1[1] + coef1[2]*65, "women:\ny = -11,000 + 450x", cex=3, adj=0, col="red")
text(68, coef1[1] + coef1[2]*65, "women:\ny = -11 000 + 450x", cex=3, adj=0, col="red")
abline(coef1[1]+coef1[3], coef1[2], col="blue", lwd=2)
text(68, coef1[1]+coef1[3] + coef1[2]*65, "men:\ny = -2,000 + 450x", cex=3, adj=0, col="blue")
text(68, coef1[1]+coef1[3] + coef1[2]*65, "men:\ny = -2 000 + 450x", cex=3, adj=0, col="blue")
dev.off()

# ggplot version
#+ fig2
# include male/female, ggplot version
ggplot(earnings, aes(height, earn)) +
geom_blank() +
geom_abline(
Expand All @@ -88,33 +112,36 @@ ggplot(earnings, aes(height, earn)) +
"text",
x = c(68, 68),
y = c(coef1[1] + coef1[2] * 65, coef1[1] + coef1[3] + coef1[2] * 65),
label = c("women:\ny = -11,000 + 450x", "men:\ny = -2,000 + 450x"),
label = c("women:\ny = -11 000 + 450x", "men:\ny = -2 000 + 450x"),
color = c("red", "blue"),
size = 5, hjust = 0
) +
labs(
x = "height",
y = "predicted earnings",
y = "predicted earnings",
title = "Fitted regression, displayed as\nseparate lines for men and women"
)



#' **Include interaction**
lm_2 <- lm(earn ~ height + male + height*male, data=earnings)
display(lm_2)
coef2 <- coef(lm_2)

pdf("heights3.pdf", height=8.5, width=11)
#+ heights3, eval=FALSE, include=FALSE
# figure for the book, don't display in the report
pdf(here("Earnings/figs","heights3.pdf"), height=8.5, width=11)
par(mar=c(6,6,5,2)+.1)
plot(range(earnings$height), range(predict(lm_2)), type="n", xlab="height", ylab="predicted earnings", cex=.8, cex.lab=3, pch=20, cex.axis=3, mgp=c(4,1.5,0), yaxt="n", col="gray10", cex.main=3, main="Fitted regression with interactions,\nseparate lines for men and women", bty="l")
axis(2, c(20000,30000), cex.axis=3)
abline(coef2[1], coef2[2], col="red", lwd=2)
text(62, coef2[1] + coef2[2]*80, "women:\ny = -7,000 + 180x", cex=3, adj=0, col="red")
text(62, coef2[1] + coef2[2]*80, "women:\ny = -7 000 + 180x", cex=3, adj=0, col="red")
abline(coef2[1]+coef2[3], coef2[2]+coef2[4], col="blue", lwd=2)
text(68, coef2[1]+coef2[3] + (coef2[2]+coef2[4])*66, "men:\ny = -22,000 + 740x", cex=3, adj=0, col="blue")
text(68, coef2[1]+coef2[3] + (coef2[2]+coef2[4])*66, "men:\ny = -22 000 + 740x", cex=3, adj=0, col="blue")
dev.off()

# ggplot version
#+ fig3
# include interaction, ggplot version
ggplot(earnings, aes(height, earn)) +
geom_blank() +
geom_abline(
Expand All @@ -130,7 +157,7 @@ ggplot(earnings, aes(height, earn)) +
"text",
x = c(62, 68),
y = c(coef2[1] + coef2[2] * 80, coef2[1]+coef2[3] + (coef2[2]+coef2[4])*66),
label = c("women:\ny = -7,000 + 180x", "men:\ny = -22,000 + 740x"),
label = c("women:\ny = -7 000 + 180x", "men:\ny = -22 000 + 740x"),
color = c("red", "blue"),
size = 5, hjust = 0
) +
Expand All @@ -140,34 +167,37 @@ ggplot(earnings, aes(height, earn)) +
title = "Fitted regression with interactions,\nseparate lines for men and women"
)

#' ### Classical regression on log scale

# model on log scale
#' **Models on log scale**
earnings$log_earn <- log(earnings$earn)
logmodel_1 <- lm(log_earn ~ height, data=earnings)
display(logmodel_1)

earnings$log10_earn <- log10(earnings$earn)
display(lm(log10_earn ~ height, data=earnings))

logmodel_2 <- lm(log_earn ~ height + male, data=earnings)
display(logmodel_2)

earnings$log_height <- log(earnings$height)
loglogmodel_2 <- lm(log_earn ~ log_height + male, data=earnings)
display(loglogmodel_2)

logmodel_3 <- lm(log_earn ~ height + male + height:male, data=earnings)
display(logmodel_3)

#' **Model on log scale with standardized interaction**
earnings$z_height <- with(earnings, (height - mean(height))/sd(height))
logmodel_3a <- lm(log_earn ~ z_height + male + z_height:male, data=earnings)
display(logmodel_3a)

#' ### Bayesian regression with rstanarm

#' **Bayesian model on log scale**
M_1 <- stan_glm(log_earn ~ height + male, data = earnings)
sims <- as.matrix(M_1)
n_sims <- nrow(sims)

postscript("heights.log1a.ps", horizontal=TRUE)
#+ heights.log1a, eval=FALSE, include=FALSE
# figure for the book, don't display in the report
postscript(here("Earnings/figs","heights.log1a.ps"), horizontal=TRUE)
par(mar=c(6,6,4,2)+.1)
plot(earnings$height + runif(n,-.2,.2), earnings$log_earn, xlab="height", ylab="log (earnings)", cex=.8, cex.lab=3, pch=20, cex.axis=3, yaxt="n", mgp=c(4,1.5,0), col="gray10",
cex.main=3, main="Log regression, plotted on log scale")
Expand All @@ -179,7 +209,8 @@ for (i in subset){
curve(coef(M_1)[1] + coef(M_1)[2]*x, add=TRUE)
dev.off()

# ggplot version
#+ fig.log1a
# Bayesian model on log scale, ggplot version
subset <- sample(n_sims, 10)
ggplot(earnings, aes(height, log_earn)) +
geom_jitter(height = 0, width = 0.25) +
Expand All @@ -198,21 +229,20 @@ ggplot(earnings, aes(height, log_earn)) +
title = "Log regression, plotted on log scale"
)

####


#' **Bayesian logistic regression on non-zero earnings**
fit_1a <- stan_glm(positive ~ height + male,
family = binomial(link = "logit"),
data = earnings_all)
sims_1a <- as.matrix(fit_1a)
sims_1a <- as.matrix(fit_1a

#' **Bayesian model on log scale**
fit_1b <- stan_glm(log_earn ~ height + male, data = earnings)
sims_1b <- as.matrix(fit_1b)

print(fit_1a, digits=2)
print(fit_1b, digits=2)

new <- data.frame(height = 68, male = 0)
#' **Predictions for a new person**
new <- data.frame(height = 68, male = 0, positive=1)
pred_1a <- posterior_predict(fit_1a, newdata=new)
pred_1b <- posterior_predict(fit_1a, newdata=new)
pred_1b <- posterior_predict(fit_1b, newdata=new)
pred <- ifelse(pred_1a == 1, exp(pred_1b), 0)

Binary file added Earnings/figs/heights.log1a.ps
Binary file not shown.
Binary file added Earnings/figs/heights1a.pdf
Binary file not shown.
Binary file added Earnings/figs/heights1b.pdf
Binary file not shown.
Binary file renamed Earnings/heights2.pdf → Earnings/figs/heights2.pdf
Binary file not shown.
Binary file renamed Earnings/heights3.pdf → Earnings/figs/heights3.pdf
Binary file not shown.
Binary file removed Earnings/heights.log1a.ps
Binary file not shown.
Binary file removed Earnings/heights1a.pdf
Binary file not shown.
Binary file removed Earnings/heights1b.pdf
Binary file not shown.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Loading

0 comments on commit 05e7178

Please sign in to comment.