diff --git a/DESCRIPTION b/DESCRIPTION index d8342f9..5cfb4fd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: dgo Title: Dynamic Estimation of Group-Level Opinion -Version: 0.2.11 -Date: 2017-10-26 +Version: 0.2.12 +Date: 2017-11-13 Description: Fit dynamic group-level IRT and MRP models from individual or aggregated item response data. This package handles common preprocessing tasks and extends functions for inspecting results, poststratification, and diff --git a/Makefile b/Makefile index 752c33f..318230f 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,7 @@ else R := R endif -all: clean docs data readme build check install +all: clean docs data readme build check install site quick: clean diff --git a/NEWS.md b/NEWS.md index 3923f0b..9431966 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,19 @@ +## dgo 0.2.12 + +* Allow modeling of unobserved groups using aggregated data. The previous + behavior was to drop rows in `aggregate_data` indicating zero trials. (They + don't represent item responses.) Preserving them has the effect that + unobserved groups, defined partially or entirely by the values of the grouping + variables in zero-trial rows in `aggregate_data`, can be included in a model. +* Fix an unexpected error when 1) `aggregate_data` is used without `item_data`, + 2) no demographic groups are specified via `group_names`, and 3) geographic + `modifier_data` is used. +* Fix the check for missing `modifier_data`. Geographic `modifier_data` must + cover all combinations of the geo and time variables in the item response data + (individual or aggregated), but because of a bug in the validation of the + geographic data, this requirement was not always enforced. In some cases a + warning would appear instead of an error. + ## dgo 0.2.11 * Add poststratification over posterior samples (closes #21). 
diff --git a/R/restrict_input_data.r b/R/restrict_input_data.r index a509b14..c871b27 100644 --- a/R/restrict_input_data.r +++ b/R/restrict_input_data.r @@ -63,7 +63,7 @@ restrict_modifier <- function(modifier_data, group_grid, ctrl) { modifier_data <- modifier_data[geo_time_grid, nomatch = 0] # confirm that modifier data covers all modeled geo and time - missing_geo_time <- modifier_data[!geo_time_grid] + missing_geo_time <- geo_time_grid[!modifier_data] if (nrow(missing_geo_time)) { stop("Not all pairs of time periods and geographic areas are in ", "modifier_data. ", nrow(missing_geo_time), " missing.") @@ -122,11 +122,6 @@ restrict_aggregates <- function(aggregate_data, ctrl) { stop("no rows in aggregate data remaining after subsetting to items ", "in `aggregate_item_names`") - aggregate_data <- aggregate_data[get("n_grp") > 0] - if (!nrow(aggregate_data)) - stop("no rows in aggregate data remaining after dropping unobserved ", - "group-item combinations") - extra_colnames <- setdiff(names(aggregate_data), c(ctrl@geo_name, ctrl@time_name, ctrl@group_names, "item", "s_grp", "n_grp")) if (length(extra_colnames)) { diff --git a/R/shape_hierarchical.r b/R/shape_hierarchical.r index eb2b084..0cdb629 100644 --- a/R/shape_hierarchical.r +++ b/R/shape_hierarchical.r @@ -10,8 +10,10 @@ shape_hierarchical_data <- function(modifier_data, modifier_names, group_grid_t, hierarchical <- data.table::copy(modifier_data) hierarchical <- drop_extra_cols(hierarchical, modifier_names, ctrl) data.table::setkeyv(hierarchical, c(ctrl@geo_name, ctrl@time_name)) - unmodeled <- zero_unmodeled(hierarchical, modifier_names, group_grid_t, ctrl) - hierarchical <- rbind(hierarchical, unmodeled) + if (length(ctrl@group_names)) { + unmodeled <- zero_unmodeled(hierarchical, modifier_names, group_grid_t, ctrl) + hierarchical <- rbind(hierarchical, unmodeled) + } zz <- create_zz(hierarchical, modifier_names, ctrl) return(zz) } @@ -40,7 +42,8 @@ zero_unmodeled <- function(hierarchical, 
modifier_names, group_grid_t, ctrl) { paste0(x, unique(group_grid_t[[x]]))[-1] })) unmodeled_frame <- expand.grid(c(list(unmodeled_param_levels, - ctrl@time_filter), rep(list(0L), length(modifier_names)))) + ctrl@time_filter), rep(list(0L), length(modifier_names))), + stringsAsFactors = FALSE) unmodeled_frame <- setNames(unmodeled_frame, c(ctrl@geo_name, ctrl@time_name, modifier_names)) data.table::setDT(unmodeled_frame, key = c(ctrl@geo_name, ctrl@time_name)) diff --git a/README.Rmd b/README.Rmd index 88dc94c..2d084a2 100644 --- a/README.Rmd +++ b/README.Rmd @@ -1,4 +1,5 @@ --- +title: 'dgo: Dynamic Estimation of Group-Level Opinion' output: github_document --- [![Build Status](https://travis-ci.org/jamesdunham/dgo.svg?branch=master)](https://travis-ci.org/jamesdunham/dgo) @@ -7,29 +8,29 @@ output: github_document # Introduction -dgo is an R package for the dynamic estimation of group-level opinion. The -package can be used to estimate subpopulation groups' average latent -conservatism (or other latent trait) from individuals' responses to dichotomous -questions using a Bayesian group-level IRT approach developed by [Caughey and -Warshaw -2015](http://pan.oxfordjournals.org/content/early/2015/02/04/pan.mpu021.full.pdf+html) -that models latent traits at the level of demographic and/or geographic groups -rather than individuals. This approach uses a hierarchical model to borrow -strength cross-sectionally and dynamic linear models to do so across time. The -group-level estimates can be weighted to generate estimates for geographic -units, such as states. - -dgo can also be used to estimate smoothed estimates of subpopulation groups' -average responses on individual survey questions using a dynamic multi-level -regression and poststratification (MRP) model ([Park, Gelman, and Bafumi +dgo is an R package for the dynamic estimation of group-level public opinion. +You can use the package to estimate latent trait means in subpopulations from +survey data. 
For example, dgo can estimate the average policy liberalism in each +American state over time among Democrats, Independents, and Republicans, given +their answers to survey questions about policy proposals. + +dgo accomplishes this using a Bayesian group-level IRT approach developed by +[Caughey and Warshaw +2015](http://pan.oxfordjournals.org/content/early/2015/02/04/pan.mpu021.full.pdf+html). +It models latent traits at the level of demographic and geographic groups rather +than individuals. It uses a hierarchical model to borrow strength +cross-sectionally and dynamic linear models to do so across time. + +The package can also be used to estimate smoothed estimates of subpopulations' +average responses to single survey items, using a dynamic multi-level regression +and poststratification (MRP) model ([Park, Gelman, and Bafumi 2004](http://stat.columbia.edu/~gelman/research/published/StateOpinionsNationalPolls.050712.dkp.pdf)). -For instance, it could be used to estimate public opinion in each state on +For instance, you can use dgo to estimate public opinion in each state on same-sex marriage or the Affordable Care Act. This model opens up new areas of research on historical public opinion in the -United States at the subnational level. It also enables scholars of comparative -politics to estimate dynamic models of public opinion opinion at the country or -subnational level. +United States at the subnational level. It also allows scholars of comparative +politics to estimate dynamic cross-national models of public opinion. 
```{r, knitr-options, echo = FALSE} # rmarkdown::render("README.Rmd") @@ -67,7 +68,7 @@ If you don't have already have RStan, follow its Load the package and set RStan's recommended options for a local, multicore machine with excess RAM: -```{r, result = 'hide'} +```{r, result = 'hide', message = FALSE} library(dgo) rstan_options(auto_write = TRUE) options(mc.cores = parallel::detectCores()) diff --git a/README.md b/README.md index aee9baf..29b5041 100644 --- a/README.md +++ b/README.md @@ -1,48 +1,73 @@ - -[![Build Status](https://travis-ci.org/jamesdunham/dgo.svg?branch=master)](https://travis-ci.org/jamesdunham/dgo) [![Build status](https://ci.appveyor.com/api/projects/status/1ta36kmoqen98k87?svg=true)](https://ci.appveyor.com/project/jamesdunham/dgo) [![codecov](https://codecov.io/gh/jamesdunham/dgo/branch/master/graph/badge.svg)](https://codecov.io/gh/jamesdunham/dgo) - -Introduction -============ - -dgo is an R package for the dynamic estimation of group-level opinion. The package can be used to estimate subpopulation groups' average latent conservatism (or other latent trait) from individuals' responses to dichotomous questions using a Bayesian group-level IRT approach developed by [Caughey and Warshaw 2015](http://pan.oxfordjournals.org/content/early/2015/02/04/pan.mpu021.full.pdf+html) that models latent traits at the level of demographic and/or geographic groups rather than individuals. This approach uses a hierarchical model to borrow strength cross-sectionally and dynamic linear models to do so across time. The group-level estimates can be weighted to generate estimates for geographic units, such as states. - -dgo can also be used to estimate smoothed estimates of subpopulation groups' average responses on individual survey questions using a dynamic multi-level regression and poststratification (MRP) model ([Park, Gelman, and Bafumi 2004](http://stat.columbia.edu/~gelman/research/published/StateOpinionsNationalPolls.050712.dkp.pdf)). 
For instance, it could be used to estimate public opinion in each state on same-sex marriage or the Affordable Care Act. - -This model opens up new areas of research on historical public opinion in the United States at the subnational level. It also enables scholars of comparative politics to estimate dynamic models of public opinion opinion at the country or subnational level. - -Installation -============ - -dgo can be installed from [CRAN](https://CRAN.R-project.org/package=dgo): +dgo: Dynamic Estimation of Group-Level Opinion +================ + +[![Build +Status](https://travis-ci.org/jamesdunham/dgo.svg?branch=master)](https://travis-ci.org/jamesdunham/dgo) +[![Build +status](https://ci.appveyor.com/api/projects/status/1ta36kmoqen98k87?svg=true)](https://ci.appveyor.com/project/jamesdunham/dgo) +[![codecov](https://codecov.io/gh/jamesdunham/dgo/branch/master/graph/badge.svg)](https://codecov.io/gh/jamesdunham/dgo) + +# Introduction + +dgo is an R package for the dynamic estimation of group-level public +opinion. You can use the package to estimate latent trait means in +subpopulations from survey data. For example, dgo can estimate the +average policy liberalism in each American state over time among +Democrats, Independents, and Republicans, given their answers to survey +questions about policy proposals. + +dgo accomplishes this using a Bayesian group-level IRT approach +developed by [Caughey and Warshaw +2015](http://pan.oxfordjournals.org/content/early/2015/02/04/pan.mpu021.full.pdf+html). +It models latent traits at the level of demographic and geographic +groups rather than individuals. It uses a hierarchical model to borrow +strength cross-sectionally and dynamic linear models to do so across +time. 
+ +The package can also be used to estimate smoothed estimates of +subpopulations’ average responses to single survey items, using a +dynamic multi-level regression and poststratification (MRP) model +([Park, Gelman, and Bafumi +2004](http://stat.columbia.edu/~gelman/research/published/StateOpinionsNationalPolls.050712.dkp.pdf)). +For instance, you can use dgo to estimate public opinion in each state +on same-sex marriage or the Affordable Care Act. + +This model opens up new areas of research on historical public opinion +in the United States at the subnational level. It also allows scholars +of comparative politics to estimate dynamic cross-national models of +public opinion. + +# Installation + +dgo can be installed from +[CRAN](https://CRAN.R-project.org/package=dgo): ``` r install.packages("dgo") ``` -Or get the latest version from [GitHub](https://github.com/jamesdunham/dgo) using [devtools](https://github.com/hadley/devtools/): +Or get the latest version from +[GitHub](https://github.com/jamesdunham/dgo) using +[devtools](https://github.com/hadley/devtools/): ``` r if (!require(devtools, quietly = TRUE)) install.packages("devtools") devtools::install_github("jamesdunham/dgo") ``` -dgo requires a working installation of [RStan](http://mc-stan.org/interfaces/rstan.html). If you don't have already have RStan, follow its "[Getting Started](https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started)" guide. +dgo requires a working installation of +[RStan](http://mc-stan.org/interfaces/rstan.html). If you don’t +already have RStan, follow its “[Getting +Started](https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started)” +guide. 
-Usage -===== +# Usage -Load the package and set RStan's recommended options for a local, multicore machine with excess RAM: +Load the package and set RStan’s recommended options for a local, +multicore machine with excess RAM: ``` r library(dgo) -#> Loading required package: dgodata -#> Loading required package: rstan -#> Loading required package: ggplot2 -#> Loading required package: StanHeaders -#> rstan (Version 2.16.2, packaged: 2017-07-03 09:24:58 UTC, GitRev: 2e1f913d3ca3) -#> For execution on a local, multicore CPU with excess RAM we recommend calling -#> rstan_options(auto_write = TRUE) -#> options(mc.cores = parallel::detectCores()) rstan_options(auto_write = TRUE) options(mc.cores = parallel::detectCores()) ``` @@ -50,24 +75,44 @@ options(mc.cores = parallel::detectCores()) The minimal workflow from raw data to estimation is: 1. shape input data using the `shape()` function; and -2. pass the result to the `dgirt()` function to estimate a latent trait (e.g., conservatism) or `dgmrp()` function to estimate opinion on a single survey question. - -Troubleshooting -=============== - -Please [report issues](https://github.com/jamesdunham/dgo/issues) that you encounter. - -- OS X only: RStan creates temporary files during estimation in a location given by `tempdir()`, typically an arbitrary location in `/var/folders`. If a model runs for days, these files can be cleaned up while still needed, which induces an error. A good solution is to set a safer path for temporary files, using an environment variable checked at session startup. For help setting environment variables, see the Stack Overflow question [here](https://stackoverflow.com/questions/17107206/change-temporary-directory). Confirm the new path before starting your model run by restarting R and checking the output from `tempdir()`. 
- -- Models fitted before October 2016 (specifically < [\#8e6a2cf](https://github.com/jamesdunham/dgo/commit/8e6a2cfbe00b2cd4a908b3067241e06124d143cd)) using dgirt are not fully compatible with dgo. Their contents can be extracted without using dgo, however, with the `$` indexing operator. For example: `as.data.frame(dgirtfit_object$stan.cmb)`. - -- Calling `dgirt()` or `dgmrp()` can generate [warnings](http://mc-stan.org/misc/warnings#compiler-warnings) during model compilation. These are safe to ignore, or can be suppressed by following the linked instructions. - -Contributing and citing -======================= - -dgo is under development and we welcome [suggestions](https://github.com/jamesdunham/dgo/issues). +2. pass the result to the `dgirt()` function to estimate a latent trait + (e.g., conservatism) or `dgmrp()` function to estimate opinion on a + single survey question. + +# Troubleshooting + +Please [report issues](https://github.com/jamesdunham/dgo/issues) that +you encounter. + + - OS X only: RStan creates temporary files during estimation in a + location given by `tempdir()`, typically an arbitrary location in + `/var/folders`. If a model runs for days, these files can be cleaned + up while still needed, which induces an error. A good solution is to + set a safer path for temporary files, using an environment variable + checked at session startup. For help setting environment variables, + see the Stack Overflow question + [here](https://stackoverflow.com/questions/17107206/change-temporary-directory). + Confirm the new path before starting your model run by restarting R + and checking the output from `tempdir()`. + + - Models fitted before October 2016 (specifically \< + [\#8e6a2cf](https://github.com/jamesdunham/dgo/commit/8e6a2cfbe00b2cd4a908b3067241e06124d143cd)) + using dgirt are not fully compatible with dgo. Their contents can be + extracted without using dgo, however, with the `$` indexing + operator. 
For example: `as.data.frame(dgirtfit_object$stan.cmb)`. + + - Calling `dgirt()` or `dgmrp()` can generate + [warnings](http://mc-stan.org/misc/warnings#compiler-warnings) + during model compilation. These are safe to ignore, or can be + suppressed by following the linked instructions. + +# Contributing and citing + +dgo is under development and we welcome +[suggestions](https://github.com/jamesdunham/dgo/issues). The package citation is: -Dunham, James, Devin Caughey, and Christopher Warshaw. 2017. dgo: Dynamic Estimation of Group-level Opinion. R package. . +Dunham, James, Devin Caughey, and Christopher Warshaw. 2017. dgo: +Dynamic Estimation of Group-level Opinion. R package. +. diff --git a/data/toy_dgirt_in.rda b/data/toy_dgirt_in.rda index a70e44e..b045321 100644 Binary files a/data/toy_dgirt_in.rda and b/data/toy_dgirt_in.rda differ diff --git a/data/toy_dgirtfit.rda b/data/toy_dgirtfit.rda index f310ddc..a2555ea 100644 Binary files a/data/toy_dgirtfit.rda and b/data/toy_dgirtfit.rda differ diff --git a/docs/articles/abortion_attitudes.html b/docs/articles/abortion_attitudes.html index 2ead0c7..dfadc1d 100644 --- a/docs/articles/abortion_attitudes.html +++ b/docs/articles/abortion_attitudes.html @@ -67,18 +67,16 @@

Abortion Attitudes

This vignette demonstrates estimation of public attitudes toward abortion from responses to a single survey item, using the dynamic multi-level regression and post-stratification (MRP) model implemented in dgmrp().

-
-

-Prepare input data

+

Prepare input data

shape() prepares input data for use with the modeling functions dgirt() and dgmrp(). Here we use the included opinion dataset.

-
dgirt_in_abortion <- shape(opinion, item_names = "abortion", time_name = "year",
-  geo_name = "state", group_names = "race3", geo_filter = c("CA", "GA", "LA",
-    "MA"), id_vars = "source")
-#> Applying restrictions, pass 1...
-#>  Dropped 5 rows for missingness in covariates
-#>  Dropped 633 rows for lacking item responses
-#> Applying restrictions, pass 2...
-#>  No changes
+
dgirt_in_abortion <- shape(opinion, item_names = "abortion", time_name = "year",
+
geo_name = "state", group_names = "race3", geo_filter = c("CA", "GA", "LA",
+
"MA"), id_vars = "source")
+
#> Applying restrictions, pass 1...
+
#> Dropped 5 rows for missingness in covariates
+
#> Dropped 633 rows for lacking item responses
+
#> Applying restrictions, pass 2...
+
#> No changes

In this call to shape() we specified:

  • the survey item response variable (abortion);
  • @@ -87,154 +85,159 @@

Notice that we named only one of these variables defining respondent groups using the group_names argument. The geo_name argument always takes the variable giving respondents’ local geographic area; it will be modeled differently.

Using the argument geo_filter, we subset the input data to the given values of the geo_name variable. And with the id_vars argument, we named an identifier that we’d like to keep in the processed data. (Other unused variables will be dropped.)

-
-
-

-Inspect the result

+

Inspect the result

summary() gives a high-level description of the result.

-
summary(dgirt_in_abortion)
-#> Items:
-#> [1] "abortion"
-#> Respondents:
-#>    23,007 in `item_data`
-#> Grouping variables:
-#> [1] "year"  "state" "race3"
-#> Time periods:
-#> [1] 2006 2007 2008 2009 2010
-#> Local geographic areas:
-#> [1] "CA" "GA" "LA" "MA"
-#> Hierarchical parameters:
-#> [1] "GA"         "LA"         "MA"         "race3other" "race3white"
-#> Modifiers of hierarchical parameters:
-#> NULL
-#> Constants:
-#>  Q  T  P  N  G  H  D 
-#>  1  5  5 60 12  1  1
+
+summary(dgirt_in_abortion)
+
#> Items:
+
#> [1] "abortion"
+
#> Respondents:
+
#> 23,007 in `item_data`
+
#> Grouping variables:
+
#> [1] "year" "state" "race3"
+
#> Time periods:
+
#> [1] 2006 2007 2008 2009 2010
+
#> Local geographic areas:
+
#> [1] "CA" "GA" "LA" "MA"
+
#> Hierarchical parameters:
+
#> [1] "GA" "LA" "MA" "race3other" "race3white"
+
#> Modifiers of hierarchical parameters:
+
#> NULL
+
#> Constants:
+
#> Q T P N G H D
+
#> 1 5 5 60 12 1 1

get_n() and get_item_n() give response counts.

-
get_n(dgirt_in_abortion, by = "state")
-#>    state     n
-#> 1:    CA 14248
-#> 2:    GA  4547
-#> 3:    LA  1658
-#> 4:    MA  2554
-get_item_n(dgirt_in_abortion, by = "year")
-#>    year abortion
-#> 1: 2006     5275
-#> 2: 2007     1690
-#> 3: 2008     4697
-#> 4: 2009     2141
-#> 5: 2010     9204
-
-
-

-Fit a model

+
+get_n(dgirt_in_abortion, by = "state")
+
#> state n
+
#> 1: CA 14248
+
#> 2: GA 4547
+
#> 3: LA 1658
+
#> 4: MA 2554
+
+get_item_n(dgirt_in_abortion, by = "year")
+
#> year abortion
+
#> 1: 2006 5275
+
#> 2: 2007 1690
+
#> 3: 2008 4697
+
#> 4: 2009 2141
+
#> 5: 2010 9204
+

Fit a model

dgmrp() fits a dynamic multi-level regression and post-stratification (MRP) model to data processed by shape(). Here, we’ll use it to estimate public attitudes toward abortion over time, for the groups defined by state and race3. (Specifically, by their Cartesian product.)

Under the hood, dgmrp() uses RStan for MCMC sampling, and arguments can be passed to RStan’s stan() via the ... argument of dgmrp(). This is almost always desirable. Here, we specify the number of sampler iterations, chains, and cores.

-
dgmrp_out_abortion <- dgmrp(dgirt_in_abortion, iter = 1500, chains = 4, cores =
-  4, seed = 42)
-

The model results are held in a dgmrp_fit object. Methods from RStan like extract() are available if needed because dgmrp_fit is a subclass of stanfit. But dgo provides its own methods for typical post-estimation tasks.

+
dgmrp_out_abortion <- dgmrp(dgirt_in_abortion, iter = 1500, chains = 4, cores =
-
-

-Work with results

+
4, seed = 42)
+

The model results are held in a dgmrp_fit object. Methods from RStan like extract() are available if needed because dgmrp_fit is a subclass of stanfit. But dgo provides its own methods for typical post-estimation tasks.

+

Work with results

For a high-level summary of the result, use summary().

-
summary(dgmrp_out_abortion)
-#> dgirt samples from 4 chains of 1500 iterations, 750 warmup, thinned every 1 
-#>   Drawn Mon May 29 23:27:34 2017 
-#>   Package version 0.2.10 
-#>   Model version 2017_01_04_singleissue 
-#>   117 parameters; 60 theta_bars (year state race3)
-#>   5 periods 2006 to 2010 
-#> 
-#> n_eff
-#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-#>   95.68  242.50  451.87  685.85  927.30 3000.00
-#> 
-#> Rhat
-#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-#>  0.9993  1.0028  1.0068  1.0081  1.0126  1.0406
-#> 
-#> Elapsed time
-#>    chain warmup sample total
-#> 1:     1    15S    16S   31S
-#> 2:     2    15S    11S   26S
-#> 3:     3    15S    19S   34S
-#> 4:     4    16S    10S   26S
+
+summary(dgmrp_out_abortion)
+
#> dgirt samples from 4 chains of 1500 iterations, 750 warmup, thinned every 1
+
#> Drawn Mon May 29 23:27:34 2017
+
#> Package version 0.2.10
+
#> Model version 2017_01_04_singleissue
+
#> 117 parameters; 60 theta_bars (year state race3)
+
#> 5 periods 2006 to 2010
+
#>
+
#> n_eff
+
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
+
#> 95.68 242.50 451.87 685.85 927.30 3000.00
+
#>
+
#> Rhat
+
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
+
#> 0.9993 1.0028 1.0068 1.0081 1.0126 1.0406
+
#>
+
#> Elapsed time
+
#> chain warmup sample total
+
#> 1: 1 15S 16S 31S
+
#> 2: 2 15S 11S 26S
+
#> 3: 3 15S 19S 34S
+
#> 4: 4 16S 10S 26S

To apply scalar functions to posterior samples, use summarize(). The default output gives summary statistics for the model’s theta_bar parameters, which represent group means. These are indexed by time (year) and group, where groups are again defined by local geographic area (state) and any other respondent characteristics (race3).

-
head(summarize(dgmrp_out_abortion))
-#>        param state race3 year      mean         sd    median     q_025
-#> 1: theta_bar    CA black 2006 0.7739283 0.02098019 0.7749083 0.7307567
-#> 2: theta_bar    CA black 2007 0.7980027 0.02771553 0.7979378 0.7439328
-#> 3: theta_bar    CA black 2008 0.7232980 0.02362116 0.7231930 0.6786121
-#> 4: theta_bar    CA black 2009 0.6863666 0.02128237 0.6863458 0.6463628
-#> 5: theta_bar    CA black 2010 0.7407779 0.01682742 0.7414667 0.7058706
-#> 6: theta_bar    CA other 2006 0.7347199 0.02322850 0.7354365 0.6872140
-#>        q_975
-#> 1: 0.8144084
-#> 2: 0.8517811
-#> 3: 0.7693334
-#> 4: 0.7279652
-#> 5: 0.7717651
-#> 6: 0.7790182
+
+head(summarize(dgmrp_out_abortion))
+
#> param state race3 year mean sd median q_025
+
#> 1: theta_bar CA black 2006 0.7739283 0.02098019 0.7749083 0.7307567
+
#> 2: theta_bar CA black 2007 0.7980027 0.02771553 0.7979378 0.7439328
+
#> 3: theta_bar CA black 2008 0.7232980 0.02362116 0.7231930 0.6786121
+
#> 4: theta_bar CA black 2009 0.6863666 0.02128237 0.6863458 0.6463628
+
#> 5: theta_bar CA black 2010 0.7407779 0.01682742 0.7414667 0.7058706
+
#> 6: theta_bar CA other 2006 0.7347199 0.02322850 0.7354365 0.6872140
+
#> q_975
+
#> 1: 0.8144084
+
#> 2: 0.8517811
+
#> 3: 0.7693334
+
#> 4: 0.7279652
+
#> 5: 0.7717651
+
#> 6: 0.7790182

Alternatively, summarize() can apply arbitrary functions to posterior samples for whatever parameter is given by its pars argument.

-
summarize(dgmrp_out_abortion, pars = "xi", funs = "var")
-#>    param year        var
-#> 1:    xi 2006 0.01814362
-#> 2:    xi 2007 0.05026942
-#> 3:    xi 2008 0.05606188
-#> 4:    xi 2009 0.04857038
-#> 5:    xi 2010 0.04149793
+
+summarize(dgmrp_out_abortion, pars = "xi", funs = "var")
+
#> param year var
+
#> 1: xi 2006 0.01814362
+
#> 2: xi 2007 0.05026942
+
#> 3: xi 2008 0.05606188
+
#> 4: xi 2009 0.04857038
+
#> 5: xi 2010 0.04149793

To access posterior samples in tabular form use as.data.frame(). By default, this method returns post-warmup samples for the theta_bar parameters, but like other methods takes a pars argument.

-
head(as.data.frame(dgmrp_out_abortion))
-#>        param state race3 year iteration     value
-#> 1: theta_bar    CA black 2006         1 0.7661626
-#> 2: theta_bar    CA black 2006         2 0.7690362
-#> 3: theta_bar    CA black 2006         3 0.7656257
-#> 4: theta_bar    CA black 2006         4 0.7935372
-#> 5: theta_bar    CA black 2006         5 0.7544080
-#> 6: theta_bar    CA black 2006         6 0.7819740
+
+head(as.data.frame(dgmrp_out_abortion))
+
#> param state race3 year iteration value
+
#> 1: theta_bar CA black 2006 1 0.7661626
+
#> 2: theta_bar CA black 2006 2 0.7690362
+
#> 3: theta_bar CA black 2006 3 0.7656257
+
#> 4: theta_bar CA black 2006 4 0.7935372
+
#> 5: theta_bar CA black 2006 5 0.7544080
+
#> 6: theta_bar CA black 2006 6 0.7819740

To poststratify the results use poststratify(). Here, we use the group population proportions bundled as annual_state_race_targets to reweight and aggregate estimates to strata defined by state-years.

-
poststratify(dgmrp_out_abortion, annual_state_race_targets, strata_names =
-  c("state", "year"), aggregated_names = "race3")
-#>     state year     value
-#>  1:    CA 2006 0.7187353
-#>  2:    CA 2007 0.7469064
-#>  3:    CA 2008 0.6562966
-#>  4:    CA 2009 0.6272075
-#>  5:    CA 2010 0.6754691
-#>  6:    GA 2006 0.6339750
-#>  7:    GA 2007 0.6225482
-#>  8:    GA 2008 0.5232615
-#>  9:    GA 2009 0.5095145
-#> 10:    GA 2010 0.5705449
-#> 11:    LA 2006 0.5266416
-#> 12:    LA 2007 0.4769044
-#> 13:    LA 2008 0.4142786
-#> 14:    LA 2009 0.3985367
-#> 15:    LA 2010 0.4229707
-#> 16:    MA 2006 0.7629194
-#> 17:    MA 2007 0.8099707
-#> 18:    MA 2008 0.7058450
-#> 19:    MA 2009 0.6624888
-#> 20:    MA 2010 0.7078342
+
+poststratify(dgmrp_out_abortion, annual_state_race_targets, strata_names = +
+
c("state", "year"), aggregated_names = "race3")
+
#> state year value
+
#> 1: CA 2006 0.7187353
+
#> 2: CA 2007 0.7469064
+
#> 3: CA 2008 0.6562966
+
#> 4: CA 2009 0.6272075
+
#> 5: CA 2010 0.6754691
+
#> 6: GA 2006 0.6339750
+
#> 7: GA 2007 0.6225482
+
#> 8: GA 2008 0.5232615
+
#> 9: GA 2009 0.5095145
+
#> 10: GA 2010 0.5705449
+
#> 11: LA 2006 0.5266416
+
#> 12: LA 2007 0.4769044
+
#> 13: LA 2008 0.4142786
+
#> 14: LA 2009 0.3985367
+
#> 15: LA 2010 0.4229707
+
#> 16: MA 2006 0.7629194
+
#> 17: MA 2007 0.8099707
+
#> 18: MA 2008 0.7058450
+
#> 19: MA 2009 0.6624888
+
#> 20: MA 2010 0.7078342

To plot the results use dgirt_plot(). This method plots summaries of posterior samples by time period. By default, it shows a 95% credible interval around posterior medians for the theta_bar parameters, for each local geographic area. Here we omit the CIs.

-
dgirt_plot(dgmrp_out_abortion, y_min = NULL, y_max = NULL)
+
+dgirt_plot(dgmrp_out_abortion, y_min = NULL, y_max = NULL)

dgirt_plot() can also plot the data.frame output from poststratify(), given arguments that identify the relevant variables. Below, we aggregate over the demographic grouping variable race3, resulting in a data.frame of estimates by state-year.

-
ps <- poststratify(dgmrp_out_abortion, annual_state_race_targets, strata_names =
-  c("state", "year"), aggregated_names = "race3")
-head(ps)
-#>    state year     value
-#> 1:    CA 2006 0.7187353
-#> 2:    CA 2007 0.7469064
-#> 3:    CA 2008 0.6562966
-#> 4:    CA 2009 0.6272075
-#> 5:    CA 2010 0.6754691
-#> 6:    GA 2006 0.6339750
-dgirt_plot(ps, group_names = NULL, time_name = "year", geo_name = "state")
+
ps <- poststratify(dgmrp_out_abortion, annual_state_race_targets, strata_names = +
+
c("state", "year"), aggregated_names = "race3")
+
+head(ps)
+
#> state year value
+
#> 1: CA 2006 0.7187353
+
#> 2: CA 2007 0.7469064
+
#> 3: CA 2008 0.6562966
+
#> 4: CA 2009 0.6272075
+
#> 5: CA 2010 0.6754691
+
#> 6: GA 2006 0.6339750
+
+dgirt_plot(ps, group_names = NULL, time_name = "year", geo_name = "state")

In the call to dgirt_plot(), we passed the names of the state and year variables. The group_names argument was then NULL, because there were no grouping variables left after we aggregated over race3.

-
+
diff --git a/docs/articles/index.html b/docs/articles/index.html index 4a9086e..97491fd 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -82,7 +82,7 @@
diff --git a/docs/articles/policy_liberalism.html b/docs/articles/policy_liberalism.html index d89bf73..071d716 100644 --- a/docs/articles/policy_liberalism.html +++ b/docs/articles/policy_liberalism.html @@ -67,19 +67,17 @@

Policy Liberalism

This vignette demonstrates estimation of latent policy liberalism from individuals’ responses to five survey items, using the Bayesian group-level IRT model implemented in dgirt().

-
-

-Prepare input data

+

Prepare input data

shape() prepares input data for use with the modeling functions dgirt() and dgmrp(). Here we use the included opinion dataset.

-
dgirt_in_liberalism <- shape(opinion, item_names = c("abortion",
-    "affirmative_action","stemcell_research" , "gaymarriage_amendment",
-    "partialbirth_abortion") , time_name = "year", geo_name = "state",
-  group_names = "race3", geo_filter = c("CA", "GA", "LA", "MA"))
-#> Applying restrictions, pass 1...
-#>  Dropped 5 rows for missingness in covariates
-#>  Dropped 8 rows for lacking item responses
-#> Applying restrictions, pass 2...
-#>  No changes
+
dgirt_in_liberalism <- shape(opinion, item_names = c("abortion",
+
"affirmative_action","stemcell_research" , "gaymarriage_amendment",
+
"partialbirth_abortion") , time_name = "year", geo_name = "state",
+
group_names = "race3", geo_filter = c("CA", "GA", "LA", "MA"))
+
#> Applying restrictions, pass 1...
+
#> Dropped 5 rows for missingness in covariates
+
#> Dropped 8 rows for lacking item responses
+
#> Applying restrictions, pass 2...
+
#> No changes

In this call to shape() we specified:

  • the survey item response variables as item_names;
  • @@ -89,161 +87,164 @@

    Notice that we named only one of these variables defining respondent groups using the group_names argument. The geo_name argument always takes the variable giving respondents’ local geographic area; it will be modeled differently.

    Using the argument geo_filter, we subset the input data to the given values of the geo_name variable. And with the id_vars argument, we named an identifier that we’d like to keep in the processed data. (Other unused variables will be dropped.)

    Important: the dgirt() model assumes consistent coding of the polarity of item responses for identification. This is already true for the opinion data. Typically it requires manual recoding.

    -

-
-

-Inspect the result

+

Inspect the result

summary() gives a high-level description of the result.

-
summary(dgirt_in_liberalism)
-#> Items:
-#> [1] "abortion"              "affirmative_action"    "gaymarriage_amendment"
-#> [4] "partialbirth_abortion" "stemcell_research"    
-#> Respondents:
-#>    23,632 in `item_data`
-#> Grouping variables:
-#> [1] "year"  "state" "race3"
-#> Time periods:
-#> [1] 2006 2007 2008 2009 2010
-#> Local geographic areas:
-#> [1] "CA" "GA" "LA" "MA"
-#> Hierarchical parameters:
-#> [1] "GA"         "LA"         "MA"         "race3other" "race3white"
-#> Modifiers of hierarchical parameters:
-#> NULL
-#> Constants:
-#>   Q   T   P   N   G   H   D 
-#>   5   5   5 300  12   1   1
+
+summary(dgirt_in_liberalism)
+
#> Items:
+
#> [1] "abortion" "affirmative_action" "gaymarriage_amendment"
+
#> [4] "partialbirth_abortion" "stemcell_research"
+
#> Respondents:
+
#> 23,632 in `item_data`
+
#> Grouping variables:
+
#> [1] "year" "state" "race3"
+
#> Time periods:
+
#> [1] 2006 2007 2008 2009 2010
+
#> Local geographic areas:
+
#> [1] "CA" "GA" "LA" "MA"
+
#> Hierarchical parameters:
+
#> [1] "GA" "LA" "MA" "race3other" "race3white"
+
#> Modifiers of hierarchical parameters:
+
#> NULL
+
#> Constants:
+
#> Q T P N G H D
+
#> 5 5 5 300 12 1 1

get_n() and get_item_n() give response counts.

-
get_n(dgirt_in_liberalism, by = "state")
-#>    state     n
-#> 1:    CA 14655
-#> 2:    GA  4667
-#> 3:    LA  1693
-#> 4:    MA  2617
-get_item_n(dgirt_in_liberalism, by = "year")
-#>    year abortion affirmative_action stemcell_research
-#> 1: 2006     5275               4750              2483
-#> 2: 2007     1690               1557              1705
-#> 3: 2008     4697               4704              4002
-#> 4: 2009     2141               2147                 0
-#> 5: 2010     9204               9241              9146
-#>    gaymarriage_amendment partialbirth_abortion
-#> 1:                  2642                  5064
-#> 2:                  1163                  1684
-#> 3:                  4265                     0
-#> 4:                     0                     0
-#> 5:                  9226                     0
-
-
-

-Fit a model

+
+get_n(dgirt_in_liberalism, by = "state")
+
#> state n
+
#> 1: CA 14655
+
#> 2: GA 4667
+
#> 3: LA 1693
+
#> 4: MA 2617
+
+get_item_n(dgirt_in_liberalism, by = "year")
+
#> year abortion affirmative_action stemcell_research
+
#> 1: 2006 5275 4750 2483
+
#> 2: 2007 1690 1557 1705
+
#> 3: 2008 4697 4704 4002
+
#> 4: 2009 2141 2147 0
+
#> 5: 2010 9204 9241 9146
+
#> gaymarriage_amendment partialbirth_abortion
+
#> 1: 2642 5064
+
#> 2: 1163 1684
+
#> 3: 4265 0
+
#> 4: 0 0
+
#> 5: 9226 0
+

Fit a model

dgirt() estimates a latent variable based on responses to multiple survey questions. Here, we’ll use it to estimate latent policy liberalism over time, for the groups defined by state and race3. (Specifically, by their Cartesian product.)

Under the hood, dgirt() uses RStan for MCMC sampling, and arguments can be passed to RStan’s stan() via the ... argument of dgirt(). This is almost always desirable. Here, we specify the number of sampler iterations, chains, and cores.

-
dgirt_out_liberalism <- dgirt(dgirt_in_liberalism, iter = 3000, chains = 4,
-  cores = 4, seed = 42)
+
dgirt_out_liberalism <- dgirt(dgirt_in_liberalism, iter = 3000, chains = 4,
+
cores = 4, seed = 42)

The model results are held in a dgirt_fit object. Methods from RStan like extract() are available if needed because dgirt_fit is a subclass of stanfit. But dgo provides its own methods for typical post-estimation tasks.

-
-
-

-Work with results

+

Work with results

For a high-level summary of the result, use summary().

-
summary(dgirt_out_liberalism)
-#> dgirt samples from 4 chains of 3000 iterations, 1500 warmup, thinned every 1 
-#>   Drawn Mon May 29 23:34:20 2017 
-#>   Package version 0.2.10 
-#>   Model version 2017_01_04 
-#>   137 parameters; 60 theta_bars (year state race3)
-#>   5 periods 2006 to 2010 
-#> 
-#> n_eff
-#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-#>   65.07  388.19  585.84 1083.11 1245.79 6000.00
-#> 
-#> Rhat
-#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-#>  0.9997  1.0020  1.0059  1.0084  1.0119  1.0553
-#> 
-#> Elapsed time
-#>    chain warmup sample  total
-#> 1:     1  2M 4S 1M 52S 3M 56S
-#> 2:     2 1M 54S  3M 9S 4M 63S
-#> 3:     3 2M 19S  3M 7S 5M 26S
-#> 4:     4 2M 18S 3M 14S 5M 32S
+
+summary(dgirt_out_liberalism)
+
#> dgirt samples from 4 chains of 3000 iterations, 1500 warmup, thinned every 1
+
#> Drawn Mon May 29 23:34:20 2017
+
#> Package version 0.2.10
+
#> Model version 2017_01_04
+
#> 137 parameters; 60 theta_bars (year state race3)
+
#> 5 periods 2006 to 2010
+
#>
+
#> n_eff
+
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
+
#> 65.07 388.19 585.84 1083.11 1245.79 6000.00
+
#>
+
#> Rhat
+
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
+
#> 0.9997 1.0020 1.0059 1.0084 1.0119 1.0553
+
#>
+
#> Elapsed time
+
#> chain warmup sample total
+
#> 1: 1 2M 4S 1M 52S 3M 56S
+
#> 2: 2 1M 54S 3M 9S 4M 63S
+
#> 3: 3 2M 19S 3M 7S 5M 26S
+
#> 4: 4 2M 18S 3M 14S 5M 32S

To apply scalar functions to posterior samples, use summarize(). The default output gives summary statistics for the model’s theta_bar parameters, which represent group means. These are indexed by time (year) and group, where groups are again defined by local geographic area (state) and any other respondent characteristics (race3).

-
head(summarize(dgirt_out_liberalism))
-#>        param state race3 year      mean         sd    median      q_025
-#> 1: theta_bar    CA black 2006 0.5247367 0.06827003 0.5194593 0.40423566
-#> 2: theta_bar    CA black 2007 0.5944296 0.08813494 0.5853362 0.44776239
-#> 3: theta_bar    CA black 2008 0.5823158 0.10139495 0.5667726 0.43013247
-#> 4: theta_bar    CA black 2009 0.5132426 0.08060875 0.5021405 0.38781331
-#> 5: theta_bar    CA black 2010 0.4977903 0.05987256 0.4922105 0.39645056
-#> 6: theta_bar    CA other 2006 0.1593904 0.05938098 0.1664194 0.02339847
-#>        q_975
-#> 1: 0.6754660
-#> 2: 0.7932721
-#> 3: 0.8190219
-#> 4: 0.7062299
-#> 5: 0.6332728
-#> 6: 0.2583196
+
+head(summarize(dgirt_out_liberalism))
+
#> param state race3 year mean sd median q_025
+
#> 1: theta_bar CA black 2006 0.5247367 0.06827003 0.5194593 0.40423566
+
#> 2: theta_bar CA black 2007 0.5944296 0.08813494 0.5853362 0.44776239
+
#> 3: theta_bar CA black 2008 0.5823158 0.10139495 0.5667726 0.43013247
+
#> 4: theta_bar CA black 2009 0.5132426 0.08060875 0.5021405 0.38781331
+
#> 5: theta_bar CA black 2010 0.4977903 0.05987256 0.4922105 0.39645056
+
#> 6: theta_bar CA other 2006 0.1593904 0.05938098 0.1664194 0.02339847
+
#> q_975
+
#> 1: 0.6754660
+
#> 2: 0.7932721
+
#> 3: 0.8190219
+
#> 4: 0.7062299
+
#> 5: 0.6332728
+
#> 6: 0.2583196

Alternatively, summarize() can apply arbitrary functions to posterior samples for whatever parameter is given by its pars argument.

-
summarize(dgirt_out_liberalism, pars = "xi", funs = "var")
-#>    param year         var
-#> 1:    xi 2006 0.013076032
-#> 2:    xi 2007 0.008053516
-#> 3:    xi 2008 0.006789127
-#> 4:    xi 2009 0.006144990
-#> 5:    xi 2010 0.005945176
+
+summarize(dgirt_out_liberalism, pars = "xi", funs = "var")
+
#> param year var
+
#> 1: xi 2006 0.013076032
+
#> 2: xi 2007 0.008053516
+
#> 3: xi 2008 0.006789127
+
#> 4: xi 2009 0.006144990
+
#> 5: xi 2010 0.005945176

To access posterior samples in tabular form use as.data.frame(). By default, this method returns post-warmup samples for the theta_bar parameters, but like other methods takes a pars argument.

-
head(as.data.frame(dgirt_out_liberalism))
-#>        param state race3 year iteration     value
-#> 1: theta_bar    CA black 2006         1 0.6107959
-#> 2: theta_bar    CA black 2006         2 0.4745799
-#> 3: theta_bar    CA black 2006         3 0.4980549
-#> 4: theta_bar    CA black 2006         4 0.4898826
-#> 5: theta_bar    CA black 2006         5 0.4939210
-#> 6: theta_bar    CA black 2006         6 0.4746524
+
+head(as.data.frame(dgirt_out_liberalism))
+
#> param state race3 year iteration value
+
#> 1: theta_bar CA black 2006 1 0.6107959
+
#> 2: theta_bar CA black 2006 2 0.4745799
+
#> 3: theta_bar CA black 2006 3 0.4980549
+
#> 4: theta_bar CA black 2006 4 0.4898826
+
#> 5: theta_bar CA black 2006 5 0.4939210
+
#> 6: theta_bar CA black 2006 6 0.4746524

To poststratify the results use poststratify(). Here, we use the group population proportions bundled as annual_state_race_targets to reweight and aggregate estimates to strata defined by state-years.

-
poststratify(dgirt_out_liberalism, annual_state_race_targets, strata_names =
-  c("state", "year"), aggregated_names = "race3")
-#>     state year        value
-#>  1:    CA 2006  0.143321712
-#>  2:    CA 2007  0.188969603
-#>  3:    CA 2008  0.112907172
-#>  4:    CA 2009  0.058219329
-#>  5:    CA 2010  0.092557709
-#>  6:    GA 2006  0.103355439
-#>  7:    GA 2007  0.084458691
-#>  8:    GA 2008 -0.011351441
-#>  9:    GA 2009 -0.015584764
-#> 10:    GA 2010  0.010578655
-#> 11:    LA 2006  0.021248643
-#> 12:    LA 2007 -0.003170117
-#> 13:    LA 2008 -0.095756506
-#> 14:    LA 2009 -0.124279123
-#> 15:    LA 2010 -0.088613763
-#> 16:    MA 2006  0.147235550
-#> 17:    MA 2007  0.269984992
-#> 18:    MA 2008  0.159194876
-#> 19:    MA 2009  0.082495757
-#> 20:    MA 2010  0.122864118
+
+poststratify(dgirt_out_liberalism, annual_state_race_targets, strata_names = +
+
c("state", "year"), aggregated_names = "race3")
+
#> state year value
+
#> 1: CA 2006 0.143321712
+
#> 2: CA 2007 0.188969603
+
#> 3: CA 2008 0.112907172
+
#> 4: CA 2009 0.058219329
+
#> 5: CA 2010 0.092557709
+
#> 6: GA 2006 0.103355439
+
#> 7: GA 2007 0.084458691
+
#> 8: GA 2008 -0.011351441
+
#> 9: GA 2009 -0.015584764
+
#> 10: GA 2010 0.010578655
+
#> 11: LA 2006 0.021248643
+
#> 12: LA 2007 -0.003170117
+
#> 13: LA 2008 -0.095756506
+
#> 14: LA 2009 -0.124279123
+
#> 15: LA 2010 -0.088613763
+
#> 16: MA 2006 0.147235550
+
#> 17: MA 2007 0.269984992
+
#> 18: MA 2008 0.159194876
+
#> 19: MA 2009 0.082495757
+
#> 20: MA 2010 0.122864118

To plot the results use dgirt_plot(). This method plots summaries of posterior samples by time period. By default, it shows a 95% credible interval around posterior medians for the theta_bar parameters, for each local geographic area. Here we omit the CIs.

-
dgirt_plot(dgirt_out_liberalism, y_min = NULL, y_max = NULL)
+
+dgirt_plot(dgirt_out_liberalism, y_min = NULL, y_max = NULL)

dgirt_plot() can also plot the data.frame output from poststratify(), given arguments that identify the relevant variables. Below, we aggregate over the demographic grouping variable race3, resulting in a data.frame of estimates by state-year.

-
ps <- poststratify(dgirt_out_liberalism, annual_state_race_targets, strata_names
-  = c("state", "year"), aggregated_names = "race3")
-head(ps)
-#>    state year      value
-#> 1:    CA 2006 0.14332171
-#> 2:    CA 2007 0.18896960
-#> 3:    CA 2008 0.11290717
-#> 4:    CA 2009 0.05821933
-#> 5:    CA 2010 0.09255771
-#> 6:    GA 2006 0.10335544
-dgirt_plot(ps, group_names = NULL, time_name = "year", geo_name = "state")
+
ps <- poststratify(dgirt_out_liberalism, annual_state_race_targets, strata_names
+
= c("state", "year"), aggregated_names = "race3")
+
+head(ps)
+
#> state year value
+
#> 1: CA 2006 0.14332171
+
#> 2: CA 2007 0.18896960
+
#> 3: CA 2008 0.11290717
+
#> 4: CA 2009 0.05821933
+
#> 5: CA 2010 0.09255771
+
#> 6: GA 2006 0.10335544
+
+dgirt_plot(ps, group_names = NULL, time_name = "year", geo_name = "state")

In the call to dgirt_plot(), we passed the names of the state and year variables. The group_names argument was then NULL, because there were no grouping variables left after we aggregated over race3.

-
+
diff --git a/docs/index.html b/docs/index.html index 8a22ca2..faceb3f 100644 --- a/docs/index.html +++ b/docs/index.html @@ -64,61 +64,43 @@
-
- -

dgo is an R package for the dynamic estimation of group-level opinion. The package can be used to estimate subpopulation groups’ average latent conservatism (or other latent trait) from individuals’ responses to dichotomous questions using a Bayesian group-level IRT approach developed by Caughey and Warshaw 2015 that models latent traits at the level of demographic and/or geographic groups rather than individuals. This approach uses a hierarchical model to borrow strength cross-sectionally and dynamic linear models to do so across time. The group-level estimates can be weighted to generate estimates for geographic units, such as states.

-

dgo can also be used to estimate smoothed estimates of subpopulation groups’ average responses on individual survey questions using a dynamic multi-level regression and poststratification (MRP) model (Park, Gelman, and Bafumi 2004). For instance, it could be used to estimate public opinion in each state on same-sex marriage or the Affordable Care Act.

-

This model opens up new areas of research on historical public opinion in the United States at the subnational level. It also enables scholars of comparative politics to estimate dynamic models of public opinion opinion at the country or subnational level.

-
-
-

-Installation

+
+

dgo is an R package for the dynamic estimation of group-level public opinion. You can use the package to estimate latent trait means in subpopulations from survey data. For example, dgo can estimate the average policy liberalism in each American state over time among Democrats, Independents, and Republicans, given their answers to survey questions about policy proposals.

+

dgo accomplishes this using a Bayesian group-level IRT approach developed by Caughey and Warshaw 2015. It models latent traits at the level of demographic and geographic groups rather than individuals. It uses a hierarchical model to borrow strength cross-sectionally and dynamic linear models to do so across time.

+

The package can also be used to produce smoothed estimates of subpopulations’ average responses to single survey items, using a dynamic multi-level regression and poststratification (MRP) model (Park, Gelman, and Bafumi 2004). For instance, you can use dgo to estimate public opinion in each state on same-sex marriage or the Affordable Care Act.

+

This model opens up new areas of research on historical public opinion in the United States at the subnational level. It also allows scholars of comparative politics to estimate dynamic cross-national models of public opinion.

+

Installation

dgo can be installed from CRAN:

-
install.packages("dgo")
+
+install.packages("dgo")

Or get the latest version from GitHub using devtools:

-
if (!require(devtools, quietly = TRUE)) install.packages("devtools")
-devtools::install_github("jamesdunham/dgo")
+
+if (!require(devtools, quietly = TRUE)) install.packages("devtools")
+
devtools::install_github("jamesdunham/dgo")

dgo requires a working installation of RStan. If you don’t already have RStan, follow its “Getting Started” guide.

-
-
-

-Usage

+

Usage

Load the package and set RStan’s recommended options for a local, multicore machine with excess RAM:

-
library(dgo)
-#> Loading required package: dgodata
-#> Loading required package: rstan
-#> Loading required package: ggplot2
-#> Loading required package: StanHeaders
-#> rstan (Version 2.16.2, packaged: 2017-07-03 09:24:58 UTC, GitRev: 2e1f913d3ca3)
-#> For execution on a local, multicore CPU with excess RAM we recommend calling
-#> rstan_options(auto_write = TRUE)
-#> options(mc.cores = parallel::detectCores())
-rstan_options(auto_write = TRUE)
-options(mc.cores = parallel::detectCores())
+
+library(dgo)
+
+rstan_options(auto_write = TRUE)
+
+options(mc.cores = parallel::detectCores())

The minimal workflow from raw data to estimation is:

-
    +
    1. shape input data using the shape() function; and
    2. pass the result to the dgirt() function to estimate a latent trait (e.g., conservatism) or dgmrp() function to estimate opinion on a single survey question.
    3. -
    -
-
-

-Troubleshooting

+

Troubleshooting

Please report issues that you encounter.

  • OS X only: RStan creates temporary files during estimation in a location given by tempdir(), typically an arbitrary location in /var/folders. If a model runs for days, these files can be cleaned up while still needed, which induces an error. A good solution is to set a safer path for temporary files, using an environment variable checked at session startup. For help setting environment variables, see the Stack Overflow question here. Confirm the new path before starting your model run by restarting R and checking the output from tempdir().

  • Models fitted before October 2016 (specifically < #8e6a2cf) using dgirt are not fully compatible with dgo. Their contents can be extracted without using dgo, however, with the $ indexing operator. For example: as.data.frame(dgirtfit_object$stan.cmb).

  • Calling dgirt() or dgmrp() can generate warnings during model compilation. These are safe to ignore, or can be suppressed by following the linked instructions.

  • -
-
-
-

-Contributing and citing

+

Contributing and citing

dgo is under development and we welcome suggestions.

The package citation is:

Dunham, James, Devin Caughey, and Christopher Warshaw. 2017. dgo: Dynamic Estimation of Group-level Opinion. R package. https://jdunham.io/dgo/.

-
+
diff --git a/docs/news/index.html b/docs/news/index.html index db896c6..c85e30b 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -89,92 +89,6 @@

Change log All releases

-
-

-dgo 0.2.11

-
    -
  • Add poststratification over posterior samples (closes #21).
  • -
  • -shape() now accepts aggregated item response data unaccompanied by individual-level item response data. The item_data and item_names arguments are no longer required.
  • -
  • Add a max_raked_weight argument to shape() for trimming raked weights. Note that trimming occurs before raked weights are rescaled to have mean 1, and the rescaled weights can be larger than max_raked_weight.
  • -
  • Remove the unused function expand_rownames().
  • -
  • Bugfixes.
  • -
-
-
-

-dgo 0.2.10

-
    -
  • Remove Rcpp dependency by rewriting dichotomize() in R.
  • -
  • Avoid estimating models (using RStan) during tests, with the goal of rendering moot variation in build environments. This addresses a test failure during CRAN’s r-release-osx-x86_64 build.
  • -
-
-
-

-dgo 0.2.9

-
    -
  • Switch from compiling Stan models at install time to compiling them at runtime, avoiding an Rcpp module issue.
  • -
  • Add model argument to dgirt() and dgmrp() taking for reuse a previously compiled Stan model, as found in the @stanmodel slot of a dgirt_fit- or dgmrp_fit-class object.
  • -
  • The version argument to dgirt() and dgmrp() can be used to specify arbitrary .stan files on the disk in addition to those included with the package.
  • -
  • Argument by to get_n() and get_item_n() methods properly accepts a vector of variable names when combined with aggregate arguments.
  • -
-
-
-

-dgo 0.2.8

-
    -
  • Improve Stan models for shorter run times
  • -
  • Add dgmrp() for fitting single-issue MRP models with hierarchical covariates
  • -
  • Add class dgmrp_fit for models fitted with dgmrp(), inheriting from a new virtual class dgo_fit -
  • -
  • -dgirt() now returns a dgirt_fit-class object that also inherits from dgo_fit class
  • -
  • Bugfixes
  • -
-
-
-

-dgo 0.2.7

-
    -
  • Package renamed dgo: Dynamic Estimation of Group-level Opinion
  • -
  • Tweaks to pass CRAN checks: clean up examples and docs
  • -
  • Use roxygen2 for classes, methods, and NAMESPACE -
  • -
  • Fix checks on P, S related to group_names change in 0.2.5
  • -
  • Fix Rcpp module issue from 0.2.6 (Error in .doLoadActions(where, attach))
  • -
  • Export expand_rownames() -
  • -
-
-
-

-dgo 0.2.6

-
    -
  • Fix error in dgirt_plot -
  • -
  • Fix path in tools/make_cpp.R -
  • -
-
-
-

-dgo 0.2.5

-
    -
  • -group_names is no longer required. If omitted, the geographic variable given by geo_name will define groups.
  • -
  • -aggregate_item_names is no longer required. It defaults to the observed values of the item column in aggregate_data.
  • -
  • -raking argument to shape() replaces strata_names. It takes a formula or list of formulas and allows more complicated preweighting.
  • -
  • -id_vars argument to shape() specifies variables to be kept in item_data.
  • -
  • -aggregate_data may include geographic areas, demographics, or time periods that don’t appear in item_data.
  • -
  • Fix: use a smaller epsilon than the default in survey::rake() for convergence with non-frequency weights.
  • -
  • New dgirtfit methods rhats() and plot_rhats() for model checking.
  • -
  • New dgirtfit method get_time_elapsed gives model run times. These also appear in summary output.
  • -
-
@@ -182,13 +96,6 @@

Contents

diff --git a/docs/reference/dgirt_fit-class.html b/docs/reference/dgirt_fit-class.html index 83d73a2..82d4e00 100644 --- a/docs/reference/dgirt_fit-class.html +++ b/docs/reference/dgirt_fit-class.html @@ -114,48 +114,48 @@

Examp
data(toy_dgirtfit) # summarize the fitted results summary(toy_dgirtfit, pars = 'xi')
#> dgirt samples from 4 chains of 400 iterations, 200 warmup, thinned every 1 -#> Drawn Sat Oct 28 09:40:04 2017 -#> Package version 0.2.11 +#> Drawn Mon Nov 13 18:15:19 2017 +#> Package version 0.2.12 #> Model version 2017_01_04 #> 43 parameters; 12 theta_bars (year state race3) #> 2 periods 2009 to 2010 #> #> n_eff
#> Min. 1st Qu. Median Mean 3rd Qu. Max. -#> 47.4 122.5 152.3 241.1 323.1 667.5
#> +#> 17.37 156.60 255.35 302.78 364.78 800.00
#> #> Rhat
#> Min. 1st Qu. Median Mean 3rd Qu. Max. -#> 0.9967 1.0051 1.0161 1.0140 1.0214 1.0394
#> +#> 0.9958 1.0058 1.0128 1.0161 1.0208 1.1404
#> #> Elapsed time
#> chain warmup sample total -#> 1: 1 4S 3S 7S -#> 2: 2 4S 4S 8S +#> 1: 1 3S 3S 6S +#> 2: 2 4S 3S 7S #> 3: 3 4S 3S 7S -#> 4: 4 4S 3S 7S
+#> 4: 4 4S 4S 8S
# get posterior means with a convenience function -get_posterior_mean(toy_dgirtfit, pars = 'theta_bar')
#> param state race3 year mean -#> 1: theta_bar SC black 2009 2.3366278 -#> 2: theta_bar SC black 2010 1.2377753 -#> 3: theta_bar SC other 2009 -1.3572482 -#> 4: theta_bar SC other 2010 0.0488835 -#> 5: theta_bar SC white 2009 -1.6299612 -#> 6: theta_bar SC white 2010 -1.3977524 -#> 7: theta_bar VA black 2009 2.0786160 -#> 8: theta_bar VA black 2010 1.4686529 -#> 9: theta_bar VA other 2009 -1.0419862 -#> 10: theta_bar VA other 2010 0.2789239 -#> 11: theta_bar VA white 2009 -0.9452116 -#> 12: theta_bar VA white 2010 -0.8000586
+get_posterior_mean(toy_dgirtfit, pars = 'theta_bar')
#> param state race3 year mean +#> 1: theta_bar SC black 2009 2.35662560 +#> 2: theta_bar SC black 2010 1.22256665 +#> 3: theta_bar SC other 2009 -1.36379358 +#> 4: theta_bar SC other 2010 0.04227331 +#> 5: theta_bar SC white 2009 -1.63196374 +#> 6: theta_bar SC white 2010 -1.47489402 +#> 7: theta_bar VA black 2009 1.90274077 +#> 8: theta_bar VA black 2010 1.52339960 +#> 9: theta_bar VA other 2009 -0.95920551 +#> 10: theta_bar VA other 2010 0.28518022 +#> 11: theta_bar VA white 2009 -0.92697350 +#> 12: theta_bar VA white 2010 -0.83419892
# generally apply functions to posterior samples after warmup; n.b. # `as.array` is iterations x chains x parameters so `MARGIN = 3` applies # `FUN` over iterations and chains -apply(as.array(toy_dgirtfit, pars = 'xi'), 3, mean)
#> xi[1] xi[2] -#> 1.632168 0.104962
+apply(as.array(toy_dgirtfit, pars = 'xi'), 3, mean)
#> xi[1] xi[2] +#> 1.35546402 0.04828843
# access the posterior samples -head(as.data.frame(toy_dgirtfit, pars = 'theta_bar'))
#> param state race3 year iteration value -#> 1: theta_bar SC black 2009 1 3.116552 -#> 2: theta_bar SC black 2009 2 4.538157 -#> 3: theta_bar SC black 2009 3 1.369630 -#> 4: theta_bar SC black 2009 4 2.999939 -#> 5: theta_bar SC black 2009 5 1.752752 -#> 6: theta_bar SC black 2009 6 1.553516
+head(as.data.frame(toy_dgirtfit, pars = 'theta_bar'))
#> param state race3 year iteration value +#> 1: theta_bar SC black 2009 1 0.7183680 +#> 2: theta_bar SC black 2009 2 0.7609363 +#> 3: theta_bar SC black 2009 3 0.7942637 +#> 4: theta_bar SC black 2009 4 1.4685053 +#> 5: theta_bar SC black 2009 5 1.1297978 +#> 6: theta_bar SC black 2009 6 0.6581956