cleaning and initial make overs

idhx · Feb 26, 2018 · 05e7178 · 05e7178
1 parent a89630d
commit 05e7178
Show file tree

Hide file tree

Showing 77 changed files with 827 additions and 380 deletions.
diff --git a/oscars.csv → AcademyAwards/oscars.csv b/oscars.csv → AcademyAwards/oscars.csv
diff --git a/oscars.txt → AcademyAwards/oscars.txt b/oscars.txt → AcademyAwards/oscars.txt
diff --git a/Congress/cong3/Icon b/Congress/cong3/Icon
diff --git a/Earnings/earnings.csv → Earnings/data/earnings.csv b/Earnings/earnings.csv → Earnings/data/earnings.csv
diff --git a/Earnings/earnings.dta → Earnings/data/earnings.dta b/Earnings/earnings.dta → Earnings/data/earnings.dta
diff --git a/Earnings/earnings.rda → Earnings/data/earnings.rda b/Earnings/earnings.rda → Earnings/data/earnings.rda
diff --git a/Earnings/earnings_setup.R → Earnings/data/earnings_setup_data.R b/Earnings/earnings_setup.R → Earnings/data/earnings_setup_data.R
@@ -1,18 +1,19 @@
-setwd("~/AndrewFiles/books/regression.and.other.stories/Examples/Earnings")
 
+library("here")
 library("foreign")
-earnings <- read.dta("earnings.dta")
 
-## create variables for age and ethnicity categories
+#' Load original data
+earnings <- read.dta(here("Earnings/data","earnings.dta"))
 
+#' Create variables for age and ethnicity categories
 earnings$age <- 90 -  earnings$yearbn # survey was conducted in 1990
 earnings$age[earnings$age < 18] <- NA
 earnings$age_category <- with(earnings, ifelse(age < 35, 1, ifelse(age < 50, 2, 3)))
 earnings$eth <- with(earnings, ifelse(race==2, 1, ifelse(hisp==1, 2, ifelse(race==1, 3, 4))))
 earnings$male <- 2 - earnings$sex
 
-## (For simplicity) remove cases with missing data and restrict to people with positive earnings born after 1925
-
+#' (For simplicity) remove cases with missing data and restrict
+#' to people born after 1925
 ok <- with(earnings, !is.na(earn + height + sex + age) & yearbn > 2)
 earnings_clean <- earnings[ok,]
-write.csv(earnings_clean, "earnings.csv")
+write.csv(earnings_clean, here("Earnings/data","earnings.csv"))
diff --git a/Earnings/earnings_bootstrap.R b/Earnings/earnings_bootstrap.R
@@ -1,37 +1,52 @@
-setwd("~/AndrewFiles/books/regression.and.other.stories/Examples/Earnings")
+#' ---
+#' title: "Regression and Other Stories: Earnings"
+#' author: "Andrew Gelman, Aki Vehtari"
+#' date: "`r format(Sys.Date())`"
+#' ---
+
+#' Bootstrapping to simulate the sampling distribution
+#' 
+#' -------------
+#' 
+
+#' **Load libraries**
+#+ setup, message=FALSE, error=FALSE, warning=FALSE
+library("here")
 library("arm")
 
-## classical regressions and graphs for earnings example
-
-earnings_clean <- read.csv("earnings.csv")
-n <- nrow(earnings_clean)
-
-earn <- earnings_clean$earn
-height <- earnings_clean$height
-male <- earnings_clean$male
-
-colnames(height_data) <- c("ID", "earn", "height", "male")
-print(height_data[1:10,])
-
-print(mean(height[male]) / mean(height[!male]))
-
-n <- length(height)
+#' **Load data**
+earnings <- read.csv(here("Earnings/data","earnings.csv"))
+earnings_all <- read.csv(here("Earnings/data","earnings.csv"))
+earnings_all$positive <- earnings_all$earn > 0
+#' only non-zero earnings
+earnings <- earnings_all[earnings_all$positive, ]
+n <- nrow(earnings)
+earn <- earnings$earn
+male <- earnings$male
+print(earnings[1:10,])
+
+#' **Median of women's earnings, divided by the median of men's earnings**
+print(median(earn[male==0]) / median(earn[male==1]))
+
+#' **A single bootstrap sample**
+n <- length(earn)
 boot <- sample(n, replace=TRUE)
-height_boot <- height[boot]
+earn_boot <- earn[boot]
 male_boot <- male[boot]
-ratio_boot <- mean(height_boot[male_boot]) / mean(height_boot[!male_boot])
+ratio_boot <- median(earn_boot[male_boot==0]) / median(earn_boot[male_boot==1])
 
+#' **A set of bootstrap simulations**
 Boot_ratio <- function(data){
   n <- nrow(data)
   boot <- sample(n, replace=TRUE)
-  height_boot <- data$height[boot]
+  earn_boot <- data$earn[boot]
   male_boot <- data$male[boot]
-  ratio_boot <- mean(height_boot[male_boot]) / mean(height_boot[!male_boot])
+  ratio_boot <- median(earn_boot[male_boot==0]) / median(earn_boot[male_boot==1])
   return(ratio_boot)
 }
+n_sims <- 10000
+output <- replicate(n_sims, Boot_ratio(data=earnings))
 
-n_sims <- 100
-output <- replicate(n_sims, Boot_ratio(data=earnings_clean))
-
+#' **Summarize the results graphically and numerically**
 hist(output)
 print(sd(output))
diff --git a/Earnings/earnings_regression.R b/Earnings/earnings_regression.R
@@ -1,25 +1,42 @@
-setwd("~/AndrewFiles/books/regression.and.other.stories/Examples/Earnings")
+#' ---
+#' title: "Regression and Other Stories: Earnings"
+#' author: "Andrew Gelman, Aki Vehtari"
+#' date: "`r format(Sys.Date())`"
+#' ---
+
+#' Predict respondents' yearly earnings using survey data from 1990.
+#' 
+#' -------------
+#' 
+
+#' **Load libraries**
+#+ setup, message=FALSE, error=FALSE, warning=FALSE
+library("here")
 library("arm")
 library("rstanarm")
 options(mc.cores = parallel::detectCores())
 library("ggplot2")
 library("bayesplot")
 theme_set(bayesplot::theme_default(base_family = "sans"))
 
-## classical regressions and graphs for earnings example
-
-earnings_all <- read.csv("earnings.csv")
+#' **Load data**
+earnings_all <- read.csv(here("Earnings/data","earnings.csv"))
 earnings_all$positive <- earnings_all$earn > 0
+# only non-zero earnings
 earnings <- earnings_all[earnings_all$positive, ]
 n <- nrow(earnings)
 height_jitter_add <- runif(n, -.2, .2)
 
-## model on original scale
+#' ### Classical regression
+
+#' **Model on original scale**
 lm_0 <- lm(earn ~ height, data=earnings)
 display(lm_0)
 
+#+ heights1a, eval=FALSE, include=FALSE
+# figure for the book, don't display in the report
+pdf(here("Earnings/figs","heights1a.pdf"), height=8.5, width=11)
 # plot linear model
-pdf("heights1a.pdf", height=8.5, width=11)
 par(mar=c(6,6,4,2)+.1)
 plot(earnings$height + height_jitter_add, earnings$earn, xlab="height", ylab="earnings",
      cex=.8, cex.lab=3, pch=20, cex.axis=3, yaxt="n", mgp=c(4,1.5,0),
@@ -29,7 +46,8 @@ abline(0,0,col="gray")
 axis(2, c(0,100000,200000), c("0","100000","200000"), mgp=c(4,1.1,0),cex.axis=3)
 dev.off()
 
-# ggplot version
+#+ fig1a
+# plot linear model, ggplot version
 gg_earnings <- ggplot(earnings, aes(x = height + height_jitter_add, y = earn)) +
   geom_point(alpha = 0.75) +
   geom_hline(yintercept = 0, color = "darkgray") +
@@ -39,8 +57,10 @@ gg_earnings <- ggplot(earnings, aes(x = height + height_jitter_add, y = earn)) +
 gg_earnings
 
 
+#+ heights1b, eval=FALSE, include=FALSE
+# figure for the book, don't display in the report
 # plot extrapolation
-pdf("heights1b.pdf", height=8.5, width=11)
+pdf(here("Earnings/figs","heights1b.pdf"), height=8.5, width=11)
 par(mar=c(6,6,4,2)+.1)
 plot(xlim=c(0,max(earnings$height)), ylim=c(-70000,200000), earnings$height + height_jitter_add, earnings$earn, xlab="height", ylab="earnings", cex=.8, cex.lab=3, pch=20, cex.axis=3, yaxt="n", mgp=c(4,1.5,0), col="gray10", cex.main=3, main="Extrapolation")
 abline(coef(lm_0), lwd=2)
@@ -49,30 +69,34 @@ intercept <- coef(lm_0)[1]
 axis(2, c(intercept,0,100000,200000), c(round(intercept,-2),"0","100000","200000"), mgp=c(4,1.1,0),cex.axis=3)
 dev.off()
 
-# ggplot version, modifying the gg_earnings object we already created
+#+ fig1b
+# plot extrapolation, ggplot version, modifying the gg_earnings object we already created
 gg_earnings +
   ylim(-70000, 200000) +
   xlim(0, 80) +
   labs(title = "Extrapolation")
 
 
-# include male/female
+#' **Include male/female**
 lm_1 <- lm(earn ~ height + male, data=earnings)
 display(lm_1)
 coef1 <- coef(lm_1)
 
-pdf("heights2.pdf", height=8.5, width=11)
+#+ heights2, eval=FALSE, include=FALSE
+# figure for the book, don't display in the report
+pdf(here("Earnings/figs","heights2.pdf"), height=8.5, width=11)
 par(mar=c(6,6,5,2)+.1)
 plot(range(earnings$height), range(predict(lm_1)), type="n", xlab="height", ylab="predicted earnings", cex=.8, cex.lab=3, pch=20, cex.axis=3, mgp=c(4,1.5,0), yaxt="n", col="gray10",
       cex.main=3, main="Fitted regression, displayed as\nseparate lines for men and women", bty="l")
 axis(2, c(20000,30000), cex.axis=3)
 abline(coef1[1], coef1[2], col="red", lwd=2)
-text(68, coef1[1] + coef1[2]*65, "women:\ny = -11,000 + 450x", cex=3, adj=0, col="red")
+text(68, coef1[1] + coef1[2]*65, "women:\ny = -11 000 + 450x", cex=3, adj=0, col="red")
 abline(coef1[1]+coef1[3], coef1[2], col="blue", lwd=2)
-text(68, coef1[1]+coef1[3] + coef1[2]*65, "men:\ny = -2,000 + 450x", cex=3, adj=0, col="blue")
+text(68, coef1[1]+coef1[3] + coef1[2]*65, "men:\ny = -2 000 + 450x", cex=3, adj=0, col="blue")
 dev.off()
 
-# ggplot version
+#+ fig2
+# include male/female, ggplot version
 ggplot(earnings, aes(height, earn)) +
   geom_blank() +
   geom_abline(
@@ -88,33 +112,36 @@ ggplot(earnings, aes(height, earn)) +
     "text",
     x = c(68, 68),
     y = c(coef1[1] + coef1[2] * 65, coef1[1] + coef1[3] + coef1[2] * 65),
-    label = c("women:\ny = -11,000 + 450x", "men:\ny = -2,000 + 450x"),
+    label = c("women:\ny = -11 000 + 450x", "men:\ny = -2 000 + 450x"),
     color = c("red", "blue"),
     size = 5, hjust = 0
   ) +
   labs(
     x = "height",
-    y = "predicted earnings",
+      y = "predicted earnings",
     title = "Fitted regression, displayed as\nseparate lines for men and women"
   )
 
 
-
+#' **Include interaction**
 lm_2 <- lm(earn ~ height + male + height*male, data=earnings)
 display(lm_2)
 coef2 <- coef(lm_2)
 
-pdf("heights3.pdf", height=8.5, width=11)
+#+ heights3, eval=FALSE, include=FALSE
+# figure for the book, don't display in the report
+pdf(here("Earnings/figs","heights3.pdf"), height=8.5, width=11)
 par(mar=c(6,6,5,2)+.1)
 plot(range(earnings$height), range(predict(lm_2)), type="n", xlab="height", ylab="predicted earnings", cex=.8, cex.lab=3, pch=20, cex.axis=3, mgp=c(4,1.5,0), yaxt="n", col="gray10", cex.main=3, main="Fitted regression with interactions,\nseparate lines for men and women", bty="l")
 axis(2, c(20000,30000), cex.axis=3)
 abline(coef2[1], coef2[2], col="red", lwd=2)
-text(62, coef2[1] + coef2[2]*80, "women:\ny = -7,000 + 180x", cex=3, adj=0, col="red")
+text(62, coef2[1] + coef2[2]*80, "women:\ny = -7 000 + 180x", cex=3, adj=0, col="red")
 abline(coef2[1]+coef2[3], coef2[2]+coef2[4], col="blue", lwd=2)
-text(68, coef2[1]+coef2[3] + (coef2[2]+coef2[4])*66, "men:\ny = -22,000 + 740x", cex=3, adj=0, col="blue")
+text(68, coef2[1]+coef2[3] + (coef2[2]+coef2[4])*66, "men:\ny = -22 000 + 740x", cex=3, adj=0, col="blue")
 dev.off()
 
-# ggplot version
+#+ fig3
+# include interaction, ggplot version
 ggplot(earnings, aes(height, earn)) +
   geom_blank() +
   geom_abline(
@@ -130,7 +157,7 @@ ggplot(earnings, aes(height, earn)) +
     "text",
     x = c(62, 68),
     y = c(coef2[1] + coef2[2] * 80, coef2[1]+coef2[3] + (coef2[2]+coef2[4])*66),
-    label = c("women:\ny = -7,000 + 180x", "men:\ny = -22,000 + 740x"),
+    label = c("women:\ny = -7 000 + 180x", "men:\ny = -22 000 + 740x"),
     color = c("red", "blue"),
     size = 5, hjust = 0
   ) +
@@ -140,34 +167,37 @@ ggplot(earnings, aes(height, earn)) +
     title = "Fitted regression with interactions,\nseparate lines for men and women"
   )
 
+#' ### Classical regression on log scale
 
-# model on log scale
+#' **Models on log scale**
 earnings$log_earn <- log(earnings$earn)
 logmodel_1 <- lm(log_earn ~ height, data=earnings)
 display(logmodel_1)
-
 earnings$log10_earn <- log10(earnings$earn)
 display(lm(log10_earn ~ height, data=earnings))
-
 logmodel_2 <- lm(log_earn ~ height + male, data=earnings)
 display(logmodel_2)
-
 earnings$log_height <- log(earnings$height)
 loglogmodel_2 <- lm(log_earn ~ log_height + male, data=earnings)
 display(loglogmodel_2)
-
 logmodel_3 <- lm(log_earn ~ height + male + height:male, data=earnings)
 display(logmodel_3)
 
+#' **Model on log scale with standardized interaction**
 earnings$z_height <- with(earnings, (height - mean(height))/sd(height))
 logmodel_3a <- lm(log_earn ~ z_height + male + z_height:male, data=earnings)
 display(logmodel_3a)
 
+#' ### Bayesian regression with rstanarm
+
+#' **Bayesian model on log scale**
 M_1 <- stan_glm(log_earn ~ height + male, data = earnings)
 sims <- as.matrix(M_1)
 n_sims <- nrow(sims)
 
-postscript("heights.log1a.ps", horizontal=TRUE)
+#+ heights.log1a, eval=FALSE, include=FALSE
+# figure for the book, don't display in the report
+postscript(here("Earnings/figs","heights.log1a.ps"), horizontal=TRUE)
 par(mar=c(6,6,4,2)+.1)
 plot(earnings$height + runif(n,-.2,.2), earnings$log_earn, xlab="height", ylab="log (earnings)", cex=.8, cex.lab=3, pch=20, cex.axis=3, yaxt="n", mgp=c(4,1.5,0), col="gray10",
       cex.main=3, main="Log regression, plotted on log scale")
@@ -179,7 +209,8 @@ for (i in subset){
 curve(coef(M_1)[1] + coef(M_1)[2]*x, add=TRUE)
 dev.off()
 
-# ggplot version
+#+ fig.log1a
+# Bayesian model on log scale, ggplot version
 subset <- sample(n_sims, 10)
 ggplot(earnings, aes(height, log_earn)) +
   geom_jitter(height = 0, width = 0.25) +
@@ -198,21 +229,20 @@ ggplot(earnings, aes(height, log_earn)) +
     title = "Log regression, plotted on log scale"
   )
 
-####
-
-
+#' **Bayesian logistic regression on non-zero earnings**
 fit_1a <- stan_glm(positive ~ height + male,
                    family = binomial(link = "logit"),
                    data = earnings_all)
-sims_1a <- as.matrix(fit_1a)
+sims_1a <- as.matrix(fit_1a)§
+
+#' **Bayesian model on log scale**
 fit_1b <- stan_glm(log_earn ~ height + male, data = earnings)
 sims_1b <- as.matrix(fit_1b)
-
 print(fit_1a, digits=2)
 print(fit_1b, digits=2)
 
-new <- data.frame(height = 68, male = 0)
+#' **Predictions for a new person**
+new <- data.frame(height = 68, male = 0, positive=1)
 pred_1a <- posterior_predict(fit_1a, newdata=new)
-pred_1b <- posterior_predict(fit_1a, newdata=new)
+pred_1b <- posterior_predict(fit_1b, newdata=new)
 pred <- ifelse(pred_1a == 1, exp(pred_1b), 0)
-
diff --git a/Earnings/figs/heights.log1a.ps b/Earnings/figs/heights.log1a.ps
diff --git a/Earnings/figs/heights1a.pdf b/Earnings/figs/heights1a.pdf
diff --git a/Earnings/figs/heights1b.pdf b/Earnings/figs/heights1b.pdf
diff --git a/Earnings/heights2.pdf → Earnings/figs/heights2.pdf b/Earnings/heights2.pdf → Earnings/figs/heights2.pdf
diff --git a/Earnings/heights3.pdf → Earnings/figs/heights3.pdf b/Earnings/heights3.pdf → Earnings/figs/heights3.pdf
diff --git a/Earnings/heights.log1a.ps b/Earnings/heights.log1a.ps
diff --git a/Earnings/heights1a.pdf b/Earnings/heights1a.pdf
diff --git a/Earnings/heights1b.pdf b/Earnings/heights1b.pdf
diff --git a/ElectionsEconomy/hibbs.dat → ElectionsEconomy/data/hibbs.dat b/ElectionsEconomy/hibbs.dat → ElectionsEconomy/data/hibbs.dat
diff --git a/ElectionsEconomy/hibbsdots.pdf → ElectionsEconomy/data/hibbsdots.pdf b/ElectionsEconomy/hibbsdots.pdf → ElectionsEconomy/data/hibbsdots.pdf
diff --git a/ElectionsEconomy/hibbsline.pdf → ElectionsEconomy/data/hibbsline.pdf b/ElectionsEconomy/hibbsline.pdf → ElectionsEconomy/data/hibbsline.pdf
diff --git a/ElectionsEconomy/hibbsline2a.pdf → ElectionsEconomy/data/hibbsline2a.pdf b/ElectionsEconomy/hibbsline2a.pdf → ElectionsEconomy/data/hibbsline2a.pdf
diff --git a/ElectionsEconomy/hibbsline2b.pdf → ElectionsEconomy/data/hibbsline2b.pdf b/ElectionsEconomy/hibbsline2b.pdf → ElectionsEconomy/data/hibbsline2b.pdf
diff --git a/ElectionsEconomy/hibbspredict.pdf → ElectionsEconomy/data/hibbspredict.pdf b/ElectionsEconomy/hibbspredict.pdf → ElectionsEconomy/data/hibbspredict.pdf
diff --git a/ElectionsEconomy/hibbspredict_bayes_1.pdf → ...ionsEconomy/data/hibbspredict_bayes_1.pdf b/ElectionsEconomy/hibbspredict_bayes_1.pdf → ...ionsEconomy/data/hibbspredict_bayes_1.pdf
diff --git a/ElectionsEconomy/hibbspredict_bayes_2a.pdf → ...onsEconomy/data/hibbspredict_bayes_2a.pdf b/ElectionsEconomy/hibbspredict_bayes_2a.pdf → ...onsEconomy/data/hibbspredict_bayes_2a.pdf
diff --git a/ElectionsEconomy/hibbspredict_bayes_2b.pdf → ...onsEconomy/data/hibbspredict_bayes_2b.pdf b/ElectionsEconomy/hibbspredict_bayes_2b.pdf → ...onsEconomy/data/hibbspredict_bayes_2b.pdf
diff --git a/ElectionsEconomy/hibbspredict_bayes_3.pdf → ...ionsEconomy/data/hibbspredict_bayes_3.pdf b/ElectionsEconomy/hibbspredict_bayes_3.pdf → ...ionsEconomy/data/hibbspredict_bayes_3.pdf
diff --git a/ElectionsEconomy/hibbsscatter.pdf → ElectionsEconomy/data/hibbsscatter.pdf b/ElectionsEconomy/hibbsscatter.pdf → ElectionsEconomy/data/hibbsscatter.pdf
diff --git a/ElectionsEconomy/hill_2a.pdf → ElectionsEconomy/data/hill_2a.pdf b/ElectionsEconomy/hill_2a.pdf → ElectionsEconomy/data/hill_2a.pdf
diff --git a/ElectionsEconomy/hill_2b.pdf → ElectionsEconomy/data/hill_2b.pdf b/ElectionsEconomy/hill_2b.pdf → ElectionsEconomy/data/hill_2b.pdf
diff --git a/ElectionsEconomy/hill_2c.pdf → ElectionsEconomy/data/hill_2c.pdf b/ElectionsEconomy/hill_2c.pdf → ElectionsEconomy/data/hill_2c.pdf
diff --git a/ElectionsEconomy/hill_3c.pdf → ElectionsEconomy/data/hill_3c.pdf b/ElectionsEconomy/hill_3c.pdf → ElectionsEconomy/data/hill_3c.pdf