Skip to content

Commit 2cbd0de

Browse files
author
Nicholas Clark
committed
bug fix to deal with NAs for binomial
1 parent 1ebdff1 commit 2cbd0de

27 files changed

+91
-55
lines changed

R/get_mvgam_priors.R

+1-1
Original file line numberDiff line numberDiff line change
@@ -422,7 +422,7 @@ get_mvgam_priors = function(formula,
422422
# in the model
423423
ss_gam <- try(mvgam_setup(formula = formula,
424424
family = family_to_mgcvfam(family),
425-
data = data_train,
425+
dat = data_train,
426426
drop.unused.levels = FALSE,
427427
maxit = 5),
428428
silent = TRUE)

R/gp.R

+14-2
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,19 @@ make_gp_additions = function(gp_details, data,
8080
term_k <- mgcv_model$smooth[[min(which(smooth_bys %in% by[x] &
8181
smooth_terms == gp_covariates[x]))]]$df
8282

83+
# Check that response terms use the cbind() syntax
84+
resp_terms <- rlang::f_lhs(formula(mgcv_model))
85+
if(length(resp_terms) == 1){
86+
response <- resp_terms
87+
} else {
88+
if(any(grepl('cbind', resp_terms))){
89+
resp_terms <- resp_terms[-grepl('cbind', resp_terms)]
90+
response <- resp_terms[1]
91+
}
92+
}
93+
8394
prep_gp_covariate(data = data,
84-
response = rlang::f_lhs(formula(mgcv_model)),
95+
response = response,
8596
covariate = gp_covariates[x],
8697
by = by[x],
8798
level = level[x],
@@ -517,7 +528,8 @@ prep_gp_covariate = function(data,
517528
k = 20){
518529

519530
# Get default gp param priors from a call to brms::get_prior()
520-
def_gp_prior <- suppressWarnings(brms::get_prior(formula(paste0(response, ' ~ gp(', covariate,
531+
def_gp_prior <- suppressWarnings(brms::get_prior(formula(paste0(response,
532+
' ~ gp(', covariate,
521533
ifelse(is.na(by), ', ',
522534
paste0(', by = ', by, ', ')),
523535
'k = ', k,

R/interpret_mvgam.R

+5-3
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ interpret_mvgam = function(formula, N, family){
2020
if(is.function(family))
2121
family <- family()
2222

23-
if(family$family == 'binomial'){
23+
if(family$family %in% c('binomial', 'beta_binomial')){
2424
# Check that response terms use the cbind() syntax
2525
resp_terms <- as.character(terms(formula(formula))[[2]])
2626
if(length(resp_terms) == 1){
@@ -49,7 +49,8 @@ interpret_mvgam = function(formula, N, family){
4949

5050
# Preserve offset if included
5151
if(!is.null(attr(terms(formula(formula)), 'offset'))){
52-
newformula <- as.formula(paste(terms.formula(formula)[[2]], '~',
52+
newformula <- as.formula(paste(dimnames(attr(terms(formula), 'factors'))[[1]][1],
53+
'~',
5354
grep('offset', rownames(attr(terms.formula(formula), 'factors')),
5455
value = TRUE),
5556
'+',
@@ -60,7 +61,8 @@ interpret_mvgam = function(formula, N, family){
6061
ifelse(int == 0, ' - 1', '')))
6162

6263
} else {
63-
newformula <- as.formula(paste(terms.formula(formula)[[2]], '~',
64+
newformula <- as.formula(paste(dimnames(attr(terms(formula), 'factors'))[[1]][1],
65+
'~',
6466
paste(paste(newfacs, collapse = '+'),
6567
'+',
6668
paste(refacs, collapse = '+'),

R/mvgam.R

+18-6
Original file line numberDiff line numberDiff line change
@@ -756,8 +756,18 @@ mvgam = function(formula,
756756
validate_family_resrictions(response = orig_y, family = family)
757757
}
758758

759-
# Replace any NAs in the response for initial setup
760-
data_train$y <- replace_nas(data_train$y)
759+
# Fill in missing observations in data_train so the size of the dataset is correct when
760+
# building the initial JAGS model
761+
resp_terms <- as.character(terms(formula(formula))[[2]])
762+
if(length(resp_terms) == 1){
763+
out_name <- as.character(terms(formula(formula))[[2]])
764+
} else {
765+
if(any(grepl('cbind', resp_terms))){
766+
resp_terms <- resp_terms[-grepl('cbind', resp_terms)]
767+
out_name <- resp_terms[1]
768+
}
769+
}
770+
data_train[[out_name]] <- replace_nas(data_train[[out_name]])
761771

762772
# Compute default priors
763773
def_priors <- adapt_brms_priors(c(make_default_scales(orig_y,
@@ -783,7 +793,7 @@ mvgam = function(formula,
783793
ss_gam <- try(mvgam_setup(formula = formula,
784794
knots = knots,
785795
family = family_to_mgcvfam(family),
786-
data = data_train),
796+
dat = data_train),
787797
silent = TRUE)
788798
if(inherits(ss_gam, 'try-error')){
789799
if(grepl('missing values', ss_gam[1])){
@@ -827,7 +837,8 @@ mvgam = function(formula,
827837
}
828838
if(length(ss_gam$smooth) == 0) ss_jagam$jags.ini$lambda <- NULL
829839

830-
# Fill y with NAs if this is a simulation from the priors
840+
# Fill y with NAs if this is a simulation from the priors;
841+
# otherwise replace with the original supplied values
831842
data_train <- check_priorsim(prior_simulation,
832843
data_train, orig_y,
833844
formula)
@@ -1338,7 +1349,8 @@ mvgam = function(formula,
13381349
n_lv = n_lv,
13391350
jags_data = ss_jagam$jags.data,
13401351
family = ifelse(family_char %in% c('binomial',
1341-
'bernoulli'),
1352+
'bernoulli',
1353+
'beta_binomial'),
13421354
'poisson',
13431355
family_char),
13441356
upper_bounds = upper_bounds)
@@ -1585,7 +1597,7 @@ mvgam = function(formula,
15851597
}
15861598

15871599
# Updates for Binomial and Bernoulli families
1588-
if(family_char %in% c('binomial', 'bernoulli')){
1600+
if(family_char %in% c('binomial', 'bernoulli', 'beta_binomial')){
15891601
bin_additions <- add_binomial(formula,
15901602
vectorised$model_file,
15911603
vectorised$model_data,

R/mvgam_setup.R

+3-3
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,15 @@
55
mvgam_setup <- function(formula,
66
knots,
77
family = gaussian(),
8-
data = list(),
8+
dat = list(),
99
na.action,
1010
drop.unused.levels = FALSE,
1111
maxit = 5) {
1212

1313
if(missing(knots)){
1414
# Initialise the GAM for a few iterations to ensure it all works without error
1515
suppressWarnings(mgcv::gam(formula(formula),
16-
data = data,
16+
data = dat,
1717
method = 'GCV.Cp',
1818
family = family,
1919
control = list(maxit = maxit),
@@ -22,7 +22,7 @@ mvgam_setup <- function(formula,
2222
select = TRUE))
2323
} else {
2424
suppressWarnings(mgcv::gam(formula(formula),
25-
data = data,
25+
data = dat,
2626
method = 'GCV.Cp',
2727
family = family,
2828
knots = knots,

R/validations.R

+1-1
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ validate_family_resrictions = function(response, family){
133133

134134
# negatives not allowed for several families
135135
if(family$family %in% c('poisson', 'negative binomial',
136-
'tweedie', 'binomial')){
136+
'tweedie', 'binomial', 'beta_binomial')){
137137
if(any(response < 0)){
138138
stop(paste0('Values < 0 not allowed for count family responses'),
139139
call. = FALSE)

README.Rmd

+2-3
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,8 @@ The goal of `mvgam` is to fit Bayesian Dynamic Generalized Additive Models to ti
2929
A series of [vignettes cover data formatting, forecasting and several extended case studies of DGAMs](https://nicholasjclark.github.io/mvgam/){target="_blank"}. A number of other examples have also been compiled:
3030

3131
* [Ecological Forecasting with Dynamic Generalized Additive Models](https://www.youtube.com/watch?v=0zZopLlomsQ){target="_blank"}
32-
* [mvgam case study 1: model comparison and data assimilation](https://rpubs.com/NickClark47/mvgam){target="_blank"}
33-
* [mvgam case study 2: multivariate models](https://rpubs.com/NickClark47/mvgam2){target="_blank"}
34-
* [mvgam case study 3: distributed lag models](https://rpubs.com/NickClark47/mvgam3){target="_blank"}
32+
* [Distributed lags (and hierarchical distributed lags) using `mgcv` and `mvgam`](https://ecogambler.netlify.app/blog/distributed-lags-mgcv/){target="_blank"}
33+
* [Ecological Forecasting with Dynamic GAMs; a tutorial and detailed case study](https://www.youtube.com/watch?v=RwllLjgPUmM){target="_blank"}
3534

3635
## Installation
3736
Install from `GitHub` using:

README.md

+26-26
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,12 @@ been compiled:
2929
- <a href="https://www.youtube.com/watch?v=0zZopLlomsQ"
3030
target="_blank">Ecological Forecasting with Dynamic Generalized Additive
3131
Models</a>
32-
- <a href="https://rpubs.com/NickClark47/mvgam" target="_blank">mvgam case
33-
study 1: model comparison and data assimilation</a>
34-
- <a href="https://rpubs.com/NickClark47/mvgam2" target="_blank">mvgam
35-
case study 2: multivariate models</a>
36-
- <a href="https://rpubs.com/NickClark47/mvgam3" target="_blank">mvgam
37-
case study 3: distributed lag models</a>
32+
- <a href="https://ecogambler.netlify.app/blog/distributed-lags-mgcv/"
33+
target="_blank">Distributed lags (and hierarchical distributed lags)
34+
using <code>mgcv</code> and <code>mvgam</code></a>
35+
- <a href="https://www.youtube.com/watch?v=RwllLjgPUmM"
36+
target="_blank">Ecological Forecasting with Dynamic GAMs; a tutorial and
37+
detailed case study</a>
3838

3939
## Installation
4040

@@ -319,29 +319,29 @@ summary(lynx_mvgam)
319319
#>
320320
#>
321321
#> GAM coefficient (beta) estimates:
322-
#> 2.5% 50% 97.5% Rhat n_eff
323-
#> (Intercept) 6.10 6.6000 7.0000 1.00 771
324-
#> s(season).1 -0.59 0.0480 0.7200 1.00 973
325-
#> s(season).2 -0.25 0.8000 1.8000 1.01 389
326-
#> s(season).3 -0.12 1.2000 2.5000 1.01 321
327-
#> s(season).4 -0.55 0.4100 1.3000 1.00 892
328-
#> s(season).5 -1.20 -0.1000 0.9300 1.00 479
329-
#> s(season).6 -1.00 -0.0089 1.1000 1.00 577
330-
#> s(season).7 -0.72 0.3400 1.4000 1.01 659
331-
#> s(season).8 -0.95 0.2200 1.8000 1.01 328
332-
#> s(season).9 -1.10 -0.3100 0.6700 1.00 398
333-
#> s(season).10 -1.30 -0.6700 -0.0054 1.01 595
322+
#> 2.5% 50% 97.5% Rhat n_eff
323+
#> (Intercept) 6.0000 6.600 7.0000 1.00 653
324+
#> s(season).1 -0.6200 0.034 0.7100 1.00 788
325+
#> s(season).2 -0.2300 0.840 1.8000 1.01 351
326+
#> s(season).3 0.0081 1.300 2.5000 1.01 289
327+
#> s(season).4 -0.4700 0.440 1.4000 1.00 739
328+
#> s(season).5 -1.2000 -0.150 0.9200 1.01 485
329+
#> s(season).6 -1.2000 -0.027 1.1000 1.00 612
330+
#> s(season).7 -0.8000 0.380 1.5000 1.01 545
331+
#> s(season).8 -1.0000 0.250 1.8000 1.01 282
332+
#> s(season).9 -1.1000 -0.260 0.7100 1.01 369
333+
#> s(season).10 -1.4000 -0.680 0.0082 1.01 459
334334
#>
335335
#> Approximate significance of GAM observation smooths:
336336
#> edf Chi.sq p-value
337-
#> s(season) 4.44 17999 0.27
337+
#> s(season) 4.11 20324 0.22
338338
#>
339339
#> Latent trend AR parameter estimates:
340340
#> 2.5% 50% 97.5% Rhat n_eff
341-
#> ar1[1] 0.73 1.10 1.400 1.00 609
342-
#> ar2[1] -0.82 -0.41 0.043 1.00 1562
343-
#> ar3[1] -0.46 -0.12 0.300 1.01 510
344-
#> sigma[1] 0.40 0.50 0.630 1.00 1051
341+
#> ar1[1] 0.72 1.10 1.400 1.01 668
342+
#> ar2[1] -0.83 -0.41 0.059 1.00 1648
343+
#> ar3[1] -0.47 -0.11 0.330 1.01 466
344+
#> sigma[1] 0.40 0.50 0.640 1.00 1103
345345
#>
346346
#> Stan MCMC diagnostics:
347347
#> n_eff / iter looks reasonable for all parameters
@@ -350,7 +350,7 @@ summary(lynx_mvgam)
350350
#> 0 of 2000 iterations saturated the maximum tree depth of 12 (0%)
351351
#> E-FMI indicated no pathological behavior
352352
#>
353-
#> Samples were drawn using NUTS(diag_e) at Tue Mar 12 1:50:01 PM 2024.
353+
#> Samples were drawn using NUTS(diag_e) at Tue Mar 19 3:04:49 PM 2024.
354354
#> For each parameter, n_eff is a crude measure of effective sample size,
355355
#> and Rhat is the potential scale reduction factor on split MCMC chains
356356
#> (at convergence, Rhat = 1)
@@ -472,7 +472,7 @@ plot(lynx_mvgam, type = 'forecast', newdata = lynx_test)
472472
<img src="man/figures/README-unnamed-chunk-21-1.png" width="60%" style="display: block; margin: auto;" />
473473

474474
#> Out of sample CRPS:
475-
#> [1] 2852.181
475+
#> [1] 2903.294
476476

477477
And the estimated latent trend component, again using the more flexible
478478
`plot_mvgam_...()` option to show first derivatives of the estimated
@@ -628,7 +628,7 @@ summary(mod, include_betas = FALSE)
628628
#> 0 of 2000 iterations saturated the maximum tree depth of 12 (0%)
629629
#> E-FMI indicated no pathological behavior
630630
#>
631-
#> Samples were drawn using NUTS(diag_e) at Tue Mar 12 1:51:32 PM 2024.
631+
#> Samples were drawn using NUTS(diag_e) at Tue Mar 19 3:06:31 PM 2024.
632632
#> For each parameter, n_eff is a crude measure of effective sample size,
633633
#> and Rhat is the potential scale reduction factor on split MCMC chains
634634
#> (at convergence, Rhat = 1)

index.Rmd

+3-1
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,11 @@ vembedr::embed_url("https://www.youtube.com/watch?v=0zZopLlomsQ")
3232
* `lognormal()` for non-negative real-valued data
3333
* `Gamma()` for non-negative real-valued data
3434
* `betar()` for proportional data on `(0,1)`
35+
* `bernoulli()` for binary data
3536
* `poisson()` for count data
3637
* `nb()` for overdispersed count data
37-
* `nmix()` for count data with imperfect detection
38+
* `binomial()` for count data with known number of trials
39+
* `nmix()` for count data with imperfect detection (unknown number of trials)
3840
* `tweedie()` for overdispersed count data
3941

4042
Note that only `poisson()`, `nb()`, and `tweedie()` are available if using `JAGS`. All families, apart from `tweedie()`, are supported if using `Stan`. See `??mvgam_families` for more information. Below is a simple example for simulating and modelling proportional data with `Beta` observations over a set of seasonal series with independent Gaussian Process dynamic trends:

index.md

+4-1
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,12 @@ package can handle data for the following families:
7272
- `lognormal()` for non-negative real-valued data
7373
- `Gamma()` for non-negative real-valued data
7474
- `betar()` for proportional data on `(0,1)`
75+
- `bernoulli()` for binary data
7576
- `poisson()` for count data
7677
- `nb()` for overdispersed count data
77-
- `nmix()` for count data with imperfect detection
78+
- `binomial()` for count data with known number of trials
79+
- `nmix()` for count data with imperfect detection (unknown number of
80+
trials)
7881
- `tweedie()` for overdispersed count data
7982

8083
Note that only `poisson()`, `nb()`, and `tweedie()` are available if
-94 Bytes
Loading
559 Bytes
Loading
-28 Bytes
Loading
30 Bytes
Loading
-302 Bytes
Loading
6 Bytes
Loading
9 Bytes
Loading
308 Bytes
Loading
-254 Bytes
Loading
-279 Bytes
Loading
5 Bytes
Loading
-609 Bytes
Loading

src/mvgam.dll

0 Bytes
Binary file not shown.

tests/testthat/Rplots.pdf

-11 Bytes
Binary file not shown.

tests/testthat/test-binomial.R

+11-3
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ dat <- rbind(data.frame(y = rbinom(n = 50, size = trials, prob = detprob1),
1818
dplyr::mutate(series = as.factor(series)) %>%
1919
dplyr::arrange(time, series)
2020

21+
# Throw in some NAs
22+
dat$y[c(1,5,9)] <- NA
23+
2124
# Training and testing splits
2225
dat_train <- dat %>%
2326
dplyr::filter(time <= 40)
@@ -32,7 +35,8 @@ test_that("cbind() syntax required for binomial()", {
3235
'Binomial family requires cbind() syntax in the formula left-hand side',
3336
fixed = TRUE)
3437

35-
mod <- mvgam(cbind(y, ntrials) ~ series + s(x, by = series),
38+
mod <- mvgam(cbind(y, ntrials) ~ s(series, bs = 're') +
39+
gp(x, by = series, c = 5/4, k = 5),
3640
family = binomial(),
3741
data = dat_train,
3842
run_model = FALSE)
@@ -78,6 +82,9 @@ dat <- rbind(data.frame(y = rbinom(n = 50, size = 1, prob = detprob1),
7882
dplyr::mutate(series = as.factor(series)) %>%
7983
dplyr::arrange(time, series)
8084

85+
# Throw in some NAs
86+
dat$y[c(1,5,9)] <- NA
87+
8188
# Training and testing splits
8289
dat_train <- dat %>%
8390
dplyr::filter(time <= 40)
@@ -92,7 +99,8 @@ test_that("bernoulli() behaves appropriately", {
9299
'y values must be 0 <= y <= 1',
93100
fixed = TRUE)
94101

95-
mod <- mvgam(y ~ series + s(x, by = series),
102+
mod <- mvgam(y ~ s(series, bs = 're') +
103+
gp(x, by = series, c = 5/4, k = 5),
96104
family = bernoulli(),
97105
data = dat_train,
98106
run_model = FALSE)
@@ -102,7 +110,7 @@ test_that("bernoulli() behaves appropriately", {
102110

103111
# Also with a trend_formula
104112
mod <- mvgam(y ~ series,
105-
trend_formula = ~ s(x, by = trend),
113+
trend_formula = ~ gp(x, by = trend, c = 5/4),
106114
trend_model = AR(),
107115
family = bernoulli(),
108116
data = dat_train,

text.txt

-4
This file was deleted.

vignettes/mvgam_overview.Rmd

+3-1
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,13 @@ Here $\alpha$ are the unknown intercepts, the $\boldsymbol{s}$'s are unknown smo
4444
|LogNormal (identity link) | `lognormal()` | Positive real values in $[0, \infty)$ | $\sigma$ |
4545
|Gamma (log link) | `Gamma()` | Positive real values in $[0, \infty)$ | $\alpha$ |
4646
|Beta (logit link) | `betar()` | Real values (proportional) in $[0,1]$ | $\phi$ |
47+
|Bernoulli (logit link) | `bernoulli()` | Binary data in ${0,1}$ | - |
4748
|Poisson (log link) | `poisson()` | Non-negative integers in $(0,1,2,...)$ | - |
4849
|Negative Binomial2 (log link)| `nb()` | Non-negative integers in $(0,1,2,...)$ | $\phi$ |
50+
|Binomial (logit link) | `binomial()` | Non-negative integers in $(0,1,2,...)$ | - |
4951
|Poisson Binomial N-mixture (log link)| `nmix()` | Non-negative integers in $(0,1,2,...)$ | - |
5052

51-
For all supported observation families, any extra parameters that need to be estimated (i.e. the $\sigma$ in a Gaussian model or the $\phi$ in a Negative Binomial model) are estimated independently for each series. Note that default link functions cannot currently be changed in `mvgam`.
53+
For all supported observation families, any extra parameters that need to be estimated (i.e. the $\sigma$ in a Gaussian model or the $\phi$ in a Negative Binomial model) are by default estimated independently for each series. However, users can opt to force all series to share extra observation parameters using `share_obs_params = TRUE` in `mvgam()`. Note that default link functions cannot currently be changed.
5254

5355
## Supported temporal dynamic processes
5456
The dynamic processes can take a wide variety of forms, some of which can be multivariate to allow the different time series to interact or be correlated. When using the `mvgam()` function, the user chooses between different process models with the `trend_model` argument. Available process models are described in detail below.

0 commit comments

Comments
 (0)