diff --git a/R/get_mvgam_priors.R b/R/get_mvgam_priors.R index e16e8631..7929f135 100644 --- a/R/get_mvgam_priors.R +++ b/R/get_mvgam_priors.R @@ -4,6 +4,8 @@ #'changed for a given `mvgam` model, as well listing their default distributions #' #'@inheritParams mvgam +#'@param factor_formula Can be supplied instead `trend_formula` to match syntax from +#'[jsdgam] #'@details Users can supply a model formula, prior to fitting the model, so that default priors can be inspected and #'altered. To make alterations, change the contents of the `prior` column and supplying this #'\code{data.frame} to the `mvgam` function using the argument `priors`. If using `Stan` as the backend, @@ -151,6 +153,7 @@ #'@export get_mvgam_priors = function(formula, trend_formula, + factor_formula, data, data_train, family = 'poisson', @@ -172,6 +175,14 @@ get_mvgam_priors = function(formula, } orig_data <- data_train + # Set trend_formula + if(!missing(factor_formula)){ + if(!missing(trend_formula)){ + warning('Both "trend_formula" and "factor_formula" supplied\nUsing "factor_formula" as default') + } + trend_formula <- factor_formula + } + # Validate the trend arguments if(drift) message('The "drift" argument is deprecated; use fixed effects of "time" instead') diff --git a/R/jsdgam.R b/R/jsdgam.R index b3e8ee06..ccb2d924 100644 --- a/R/jsdgam.R +++ b/R/jsdgam.R @@ -39,20 +39,28 @@ #' of trials #' \item`beta_binomial()` as for `binomial()` but allows for overdispersion} #'Default is `poisson()`. See \code{\link{mvgam_families}} for more details -#'@param ... Other arguments to pass to [mvgam()] +#'@param ... Other arguments to pass to [mvgam] #'@author Nicholas J Clark #'@details Joint Species Distribution Models allow for responses of multiple species to be #'learned hierarchically, whereby responses to environmental variables in `formula` can be partially #'pooled and any latent, unmodelled residual associations can also be learned. In \pkg{mvgam}, both of #'these effects can be modelled with the full power of latent factor Hierarchical GAMs, providing unmatched -#'flexibility to model full communities of species +#'flexibility to model full communities of species. When calling [jsdgam], an initial State-Space model using +#'`trend = 'None'` is set up and then modified to include the latent factors and their linear predictors. +#'Consequently, you can inspect priors for these models using [get_mvgam_priors] by supplying the relevant +#'`formula`, `factor_formula`, `data` and `family` arguments and using `trend = 'None'` #'@seealso [mvgam] -#'@return A \code{list} object of classes \code{jsdgam} and \code{mvgam} containing model output, +#'@references Nicholas J Clark & Konstans Wells (2020). Dynamic generalised additive models (DGAMs) for forecasting discrete ecological time series. +#'Methods in Ecology and Evolution. 14:3, 771-784. +#'David I Warton, F Guillaume Blanchet, Robert B O’Hara, Otso Ovaskainen, Sara Taskinen, Steven C +#'Walker & Francis KC Hui (2015). So many variables: joint modeling in community ecology. +#'Trends in Ecology & Evolution 30:12, 766-779. +#'@return A \code{list} object of class \code{mvgam} containing model output, #'the text representation of the model file, #'the mgcv model output (for easily generating simulations at #'unsampled covariate values), Dunn-Smyth residuals for each series and key information needed #'for other functions in the package. See \code{\link{mvgam-class}} for details. -#'Use `methods(class = "mvgam")` and `methods(class = "jsdgam")` for an overview on available methods +#'Use `methods(class = "mvgam")` for an overview on available methods #'@examples #'\dontrun{ #' # Simulate latent count data for 500 spatial locations and 10 species @@ -151,10 +159,28 @@ #' viridis::scale_color_viridis() + #' theme_classic() #' +#' # Inspect default priors for the underlying model +#' priors <- get_mvgam_priors(formula = count ~ +#' # Environmental model includes species-level intercepts +#' # and random slopes for a linear effect of temperature +#' species + +#' s(species, bs = 're', by = temperature), +#' +#' # Each factor estimates a different nonlinear spatial process, using +#' # 'by = trend' as in other mvgam State-Space models +#' factor_formula = ~ te(lat, lon, k = 5, by = trend) - 1, +#' +#' # The data +#' data = dat, +#' +#' # Poisson observations +#' family = poisson()) +#' priors +#' #' # Fit a JSDM that estimates a hierarchical temperature responses #' # and that uses three latent spatial factors #' mod <- jsdgam(formula = count ~ -#' # Environmental model includes species-level interecepts +#' # Environmental model includes species-level intercepts #' # and random slopes for a linear effect of temperature #' species + #' s(species, bs = 're', by = temperature), @@ -164,6 +190,9 @@ #' factor_formula = ~ te(lat, lon, k = 5, by = trend) - 1, #' n_lv = 3, #' +#' # Change default priors for fixed effect betas to standard normal +#' priors = prior(std_normal(), class = b), +#' #' # The data and the grouping variables #' data = dat, #' unit = site, diff --git a/R/mvgam.R b/R/mvgam.R index 08a266ac..6a9ceb3a 100644 --- a/R/mvgam.R +++ b/R/mvgam.R @@ -349,7 +349,7 @@ #'@references Nicholas J Clark & Konstans Wells (2020). Dynamic generalised additive models (DGAMs) for forecasting discrete ecological time series. #'Methods in Ecology and Evolution. 14:3, 771-784. #'@seealso \code{\link[mgcv]{jagam}}, \code{\link[mgcv]{gam}}, \code{\link[mgcv]{gam.models}}, -#'\code{\link{get_mvgam_priors}} +#'\code{\link{get_mvgam_priors}}, \code{\link{jsdgam}} #'@return A \code{list} object of class \code{mvgam} containing model output, the text representation of the model file, #'the mgcv model output (for easily generating simulations at #'unsampled covariate values), Dunn-Smyth residuals for each series and key information needed diff --git a/R/mvgam_setup.R b/R/mvgam_setup.R index d634a97a..474195cf 100644 --- a/R/mvgam_setup.R +++ b/R/mvgam_setup.R @@ -11,38 +11,14 @@ mvgam_setup <- function(formula, maxit = 5) { if(missing(knots)){ - # Initialise the GAM for a few iterations to ensure it all works without error - # suppressWarnings(mgcv::gam(formula(formula), - # data = dat, - # family = family, - # optimizer = c('outer', 'nlm'), - # control = list(maxit = 1, - # epsilon = 1e20, - # rank.tol = 1, - # mgcv.tol = 1e20, - # mgcv.half = 1, - # nlm = list(iterlim = 1)), - # drop.unused.levels = FALSE, - # na.action = na.fail, - # select = TRUE)) init_gam(formula(formula), data = dat, family = family) } else { - # suppressWarnings(mgcv::gam(formula(formula), - # data = dat, - # family = family, - # knots = knots, - # optimizer = c('outer', 'nlm'), - # control = list(maxit = 1, - # epsilon = 1e20, - # rank.tol = 1, - # mgcv.tol = 1e20, - # mgcv.half = 1, - # nlm = list(iterlim = 1)), - # drop.unused.levels = FALSE, - # na.action = na.fail, - # select = TRUE)) + if(!is.list(knots)){ + stop('all "knot" arguments must be supplied as lists', + call. = FALSE) + } init_gam(formula(formula), data = dat, family = family, diff --git a/man/get_mvgam_priors.Rd b/man/get_mvgam_priors.Rd index 97b00dde..800e1dde 100644 --- a/man/get_mvgam_priors.Rd +++ b/man/get_mvgam_priors.Rd @@ -7,6 +7,7 @@ get_mvgam_priors( formula, trend_formula, + factor_formula, data, data_train, family = "poisson", @@ -42,6 +43,9 @@ latent abundance. Be aware that it can be very challenging to simultaneously est for both the observation mode (captured by \code{formula}) and the process model (captured by \code{trend_formula}). Users are recommended to drop one of these using the \code{- 1} convention in the formula right hand side.} +\item{factor_formula}{Can be supplied instead \code{trend_formula} to match syntax from +\link{jsdgam}} + \item{data}{A \code{dataframe} or \code{list} containing the model response variable and covariates required by the GAM \code{formula} and optional \code{trend_formula}. Most models should include columns: \itemize{ diff --git a/man/jsdgam.Rd b/man/jsdgam.Rd index fe726e2e..1e378328 100644 --- a/man/jsdgam.Rd +++ b/man/jsdgam.Rd @@ -193,15 +193,15 @@ other stochastic elements that are not currently available in \code{mvgam}. Default is \code{FALSE} to reduce the size of the returned object, unless \code{run_model == FALSE}} -\item{...}{Other arguments to pass to \code{\link[=mvgam]{mvgam()}}} +\item{...}{Other arguments to pass to \link{mvgam}} } \value{ -A \code{list} object of classes \code{jsdgam} and \code{mvgam} containing model output, +A \code{list} object of class \code{mvgam} containing model output, the text representation of the model file, the mgcv model output (for easily generating simulations at unsampled covariate values), Dunn-Smyth residuals for each series and key information needed for other functions in the package. See \code{\link{mvgam-class}} for details. -Use \code{methods(class = "mvgam")} and \code{methods(class = "jsdgam")} for an overview on available methods +Use \code{methods(class = "mvgam")} for an overview on available methods } \description{ This function sets up a Joint Species Distribution Model whereby the residual associations among @@ -216,7 +216,10 @@ Joint Species Distribution Models allow for responses of multiple species to be learned hierarchically, whereby responses to environmental variables in \code{formula} can be partially pooled and any latent, unmodelled residual associations can also be learned. In \pkg{mvgam}, both of these effects can be modelled with the full power of latent factor Hierarchical GAMs, providing unmatched -flexibility to model full communities of species +flexibility to model full communities of species. When calling \link{jsdgam}, an initial State-Space model using +\code{trend = 'None'} is set up and then modified to include the latent factors and their linear predictors. +Consequently, you can inspect priors for these models using \link{get_mvgam_priors} by supplying the relevant +\code{formula}, \code{factor_formula}, \code{data} and \code{family} arguments and using \code{trend = 'None'} } \examples{ \dontrun{ @@ -316,10 +319,27 @@ ggplot(dat, aes(x = lat, y = lon, col = log(count + 1))) + viridis::scale_color_viridis() + theme_classic() +# Inspect default priors for the underlying model +priors <- get_mvgam_priors(formula = count ~ + # Environmental model includes species-level intercepts + # and random slopes for a linear effect of temperature + species + + s(species, bs = 're', by = temperature), + + # Each factor estimates a different nonlinear spatial process, using + # 'by = trend' as in other mvgam State-Space models + factor_formula = ~ te(lat, lon, k = 5, by = trend) - 1, + + # The data + data = dat, + + # Poisson observations + family = poisson()) + # Fit a JSDM that estimates a hierarchical temperature responses # and that uses three latent spatial factors mod <- jsdgam(formula = count ~ - # Environmental model includes species-level interecepts + # Environmental model includes species-level intercepts # and random slopes for a linear effect of temperature species + s(species, bs = 're', by = temperature), @@ -372,6 +392,13 @@ pp_check(mod, type = "ecdf_overlay_grouped", loo(mod) } } +\references{ +Nicholas J Clark & Konstans Wells (2020). Dynamic generalised additive models (DGAMs) for forecasting discrete ecological time series. +Methods in Ecology and Evolution. 14:3, 771-784. +David I Warton, F Guillaume Blanchet, Robert B O’Hara, Otso Ovaskainen, Sara Taskinen, Steven C +Walker & Francis KC Hui (2015). So many variables: joint modeling in community ecology. +Trends in Ecology & Evolution 30:12, 766-779. +} \seealso{ \link{mvgam} } diff --git a/man/mvgam.Rd b/man/mvgam.Rd index ef5e3a07..f2769e68 100644 --- a/man/mvgam.Rd +++ b/man/mvgam.Rd @@ -707,7 +707,7 @@ Methods in Ecology and Evolution. 14:3, 771-784. } \seealso{ \code{\link[mgcv]{jagam}}, \code{\link[mgcv]{gam}}, \code{\link[mgcv]{gam.models}}, -\code{\link{get_mvgam_priors}} +\code{\link{get_mvgam_priors}}, \code{\link{jsdgam}} } \author{ Nicholas J Clark diff --git a/man/mvgam_marginaleffects.Rd b/man/mvgam_marginaleffects.Rd index 5564cfdb..16e6648e 100644 --- a/man/mvgam_marginaleffects.Rd +++ b/man/mvgam_marginaleffects.Rd @@ -86,6 +86,8 @@ arguments.} \item \code{newdata = datagrid(cyl = c(4, 6))}: \code{cyl} variable equal to 4 and 6 and other regressors fixed at their means or modes. \item See the Examples section and the \code{\link[marginaleffects:datagrid]{datagrid()}} documentation. } +\item \code{\link[=subset]{subset()}} call with a single argument to select a subset of the dataset used to fit the model, ex: \code{newdata = subset(treatment == 1)} +\item \code{\link[dplyr:filter]{dplyr::filter()}} call with a single argument to select a subset of the dataset used to fit the model, ex: \code{newdata = filter(treatment == 1)} \item string: \itemize{ \item "mean": Marginal Effects at the Mean. Slopes when each predictor is held at its mean or mode. diff --git a/src/RcppExports.o b/src/RcppExports.o index a3f83073..03b01119 100644 Binary files a/src/RcppExports.o and b/src/RcppExports.o differ diff --git a/src/trend_funs.o b/src/trend_funs.o index 5efebda4..1c0d5a8f 100644 Binary files a/src/trend_funs.o and b/src/trend_funs.o differ diff --git a/tests/testthat/test-jsdgam.R b/tests/testthat/test-jsdgam.R index 7c89fb87..d22068b8 100644 --- a/tests/testthat/test-jsdgam.R +++ b/tests/testthat/test-jsdgam.R @@ -1,5 +1,6 @@ context("jsdgam") +# Reconstruct the spider data from mvabund spiderdat <- structure(list(abundance = c(25L, 0L, 15L, 2L, 1L, 0L, 2L, 0L, 1L, 3L, 15L, 16L, 3L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 7L, 17L, 11L, 9L, 3L, 29L, 15L, 10L, 2L, 20L, 6L, 20L, 6L, 7L, 11L, 1L, @@ -48,8 +49,10 @@ spiderdat <- structure(list(abundance = c(25L, 0L, 15L, 2L, 1L, 0L, 2L, 0L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L), levels = c("Alopacce", - "Alopcune", "Alopfabr", "Arctlute", "Arctperi", "Auloalbi", "Pardlugu", - "Pardmont", "Pardnigr", "Pardpull", "Trocterr", "Zoraspin"), class = "factor"), + "Alopcune", "Alopfabr", "Arctlute", + "Arctperi", "Auloalbi", "Pardlugu", + "Pardmont", "Pardnigr", "Pardpull", + "Trocterr", "Zoraspin"), class = "factor"), site = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, @@ -339,3 +342,158 @@ test_that("response variable must be specified", { family = nb()), 'Not sure how to deal with this response variable specification') }) + +test_that("unit must exist in data", { + expect_error(jsdgam(formula = abundance ~ + # Environmental model includes species-level interecepts + # and random slopes for a linear effect of reflection + s(taxon, bs = 're') + + s(taxon, bs = 're', by = reflection), + # Each factor estimates a different, possibly nonlinear effect of soil.dry + factor_formula = ~ s(soil.dry, k = 5, by = trend) - 1, + data = spiderdat, + unit = banana, + subgr = taxon, + n_lv = 3, + family = nb()), + 'Variable "banana" not found in data') +}) + + +test_that("subgr must exist in data", { + expect_error(jsdgam(formula = abundance ~ + # Environmental model includes species-level interecepts + # and random slopes for a linear effect of reflection + s(taxon, bs = 're') + + s(taxon, bs = 're', by = reflection), + # Each factor estimates a different, possibly nonlinear effect of soil.dry + factor_formula = ~ s(soil.dry, k = 5, by = trend) - 1, + data = spiderdat, + unit = site, + subgr = banana, + n_lv = 3, + family = nb()), + 'Variable "banana" not found in data') +}) + +test_that("subgr must be a factor in data", { + spiderdat$taxon <- as.numeric(spiderdat$taxon) + expect_error(jsdgam(formula = abundance ~ + # Environmental model includes species-level interecepts + # and random slopes for a linear effect of reflection + s(taxon, bs = 're') + + s(taxon, bs = 're', by = reflection), + # Each factor estimates a different, possibly nonlinear effect of soil.dry + factor_formula = ~ s(soil.dry, k = 5, by = trend) - 1, + data = spiderdat, + unit = site, + subgr = taxon, + n_lv = 3, + family = nb()), + 'Variable "taxon" must be a factor type') +}) + +test_that("unit must be a numeric / integer in data", { + spiderdat$site <- as.factor(spiderdat$site) + expect_error(jsdgam(formula = abundance ~ + # Environmental model includes species-level interecepts + # and random slopes for a linear effect of reflection + s(taxon, bs = 're') + + s(taxon, bs = 're', by = reflection), + # Each factor estimates a different, possibly nonlinear effect of soil.dry + factor_formula = ~ s(soil.dry, k = 5, by = trend) - 1, + data = spiderdat, + unit = site, + subgr = taxon, + n_lv = 3, + family = nb()), + 'Variable "site" must be either numeric or integer type') +}) + +test_that("knots must be a list", { + expect_error(jsdgam(formula = abundance ~ + # Environmental model includes species-level interecepts + # and random slopes for a linear effect of reflection + s(taxon, bs = 're') + + s(taxon, bs = 're', by = reflection), + # Each factor estimates a different, possibly nonlinear effect of soil.dry + factor_formula = ~ s(soil.dry, k = 5, by = trend) - 1, + # supplying knots as a vector should fail + factor_knots = seq(min(spiderdat$soil.dry), + max(spiderdat$soil.dry), + length.out = 4), + data = spiderdat, + unit = site, + subgr = taxon, + n_lv = 3, + family = nb(), + run_model = FALSE), + 'all "knot" arguments must be supplied as lists') + +}) + +test_that("errors about knot lengths should be propagated from mgcv", { + expect_error(jsdgam(formula = abundance ~ + # Environmental model includes species-level interecepts + # and random slopes for a linear effect of reflection + s(taxon, bs = 're') + + s(taxon, bs = 're', by = reflection), + # Each factor estimates a different, possibly nonlinear effect of soil.dry + factor_formula = ~ s(soil.dry, k = 5, by = trend, bs = 'cr') - 1, + # knot length should be 5 for this CR basis + factor_knots = list(soil.dry = seq(min(spiderdat$soil.dry), + max(spiderdat$soil.dry), + length.out = 4)), + data = spiderdat, + unit = site, + subgr = taxon, + n_lv = 3, + family = nb(), + run_model = FALSE), + 'number of supplied knots != k for a cr smooth') + +}) + +test_that("get_mvgam_priors accepts factor_formula", { + expect_no_error(get_mvgam_priors(formula = abundance ~ + # Environmental model includes species-level intercepts + # and random slopes for a linear effect of reflection + s(taxon, bs = 're') + + s(taxon, bs = 're', by = reflection), + # Each factor estimates a different, possibly nonlinear effect of soil.dry + factor_formula = ~ s(soil.dry, k = 5, by = trend, bs = 'cr') - 1, + data = spiderdat, + trend_model = 'None')) +}) + +# Skip the next test as it should actually initiate the model, and may take a few seconds +skip_on_cran() +test_that("jsdgam should initiate correctly", { + mod <- jsdgam(formula = abundance ~ + # Environmental model includes species-level interecepts + # and random slopes for a linear effect of reflection + s(taxon, bs = 're') + + s(taxon, bs = 're', by = reflection), + # Each factor estimates a different, possibly nonlinear effect of soil.dry + factor_formula = ~ s(soil.dry, k = 5, bs = 'cr', by = trend) - 1, + # supplying knots should also work if k matches length(knots) + factor_knots = list(soil.dry = seq(min(spiderdat$soil.dry), + max(spiderdat$soil.dry), + length.out = 5)), + data = spiderdat, + unit = site, + subgr = taxon, + n_lv = 3, + family = nb(), + run_model = FALSE) + expect_true(inherits(mod, 'mvgam_prefit')) + expect_true(any(grepl('// raw latent factors (with linear predictors)', + mod$model_file, fixed = TRUE))) + expect_true(any(grepl('matrix[n_series, n_lv] lv_coefs = rep_matrix(0, n_series, n_lv);', + mod$model_file, fixed = TRUE))) + expect_true(attr(mod$model_data, 'trend_model') == 'None') + expect_true(inherits(attr(mod$model_data, 'prepped_trend_model'), + 'mvgam_trend')) + expect_true(attr(mod$model_data, 'prepped_trend_model')$subgr == 'taxon') + expect_true(attr(mod$model_data, 'prepped_trend_model')$trend_model == 'ZMVN') +})