diff --git a/.gitignore b/.gitignore
index b242512a..09be2d9e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,11 +3,8 @@
 .RData
 .Ruserdata
 .Rprofile
-doc
 Meta
 .Rproj.user
-/doc/
 /Meta/
 desktop.ini
 ^cran-comments\.md$
-^CRAN-SUBMISSION$
diff --git a/doc/SS_model.svg b/doc/SS_model.svg
new file mode 100644
index 00000000..f6441719
--- /dev/null
+++ b/doc/SS_model.svg
@@ -0,0 +1,255 @@
+[SVG markup omitted: state-space model diagram showing a latent process X_1, X_2, X_3, ..., X_T (Process model) that generates the observations Y_1, Y_3, ..., Y_T (Observation model)]
diff --git a/doc/data_in_mvgam.R b/doc/data_in_mvgam.R
new file mode 100644
index 00000000..29efbeba
--- /dev/null
+++ b/doc/data_in_mvgam.R
@@ -0,0 +1,248 @@
+## ----echo = FALSE--------------------------------------------------------------
+NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true")
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>",
+  purl = NOT_CRAN,
+  eval = NOT_CRAN
+)
+
+## ----setup, include=FALSE-------------------------------------------------------
+knitr::opts_chunk$set(
+  echo = TRUE,
+  dpi = 150,
+  fig.asp = 0.8,
+  fig.width = 6,
+  out.width = "60%",
+  fig.align = "center")
+library(mvgam)
+library(ggplot2)
+theme_set(theme_bw(base_size = 12, base_family = 'serif'))
+
+## ------------------------------------------------------------------------------
+simdat <- sim_mvgam(n_series = 4, T = 24, prop_missing = 0.2)
+head(simdat$data_train, 16)
+
+## ------------------------------------------------------------------------------
+class(simdat$data_train$series)
+levels(simdat$data_train$series)
+
+## ------------------------------------------------------------------------------
+all(levels(simdat$data_train$series) %in% unique(simdat$data_train$series))
+
+## ------------------------------------------------------------------------------
+summary(glm(y ~ series + time,
+            data = simdat$data_train,
+            family = poisson()))
+
+## ------------------------------------------------------------------------------
+summary(gam(y ~ series + s(time, by = series),
+            data = simdat$data_train,
+            family = poisson()))
+
+## ------------------------------------------------------------------------------
+gauss_dat <- data.frame(outcome = rnorm(10),
+                        series = factor('series1',
+                                        levels = 'series1'),
+                        time = 1:10)
+gauss_dat
+
+## ------------------------------------------------------------------------------
+gam(outcome ~ time,
+    family = betar(),
+    data = gauss_dat)
+
+## ----error=TRUE------------------------------------------------------------------
+mvgam(outcome ~ time,
+      family = betar(),
+      data = gauss_dat)
+
+## ------------------------------------------------------------------------------
+# A function to check that all timepoints within a sequence are present
+all_times_avail = function(time, min_time, max_time){
+  identical(as.numeric(sort(time)),
+            as.numeric(seq.int(from = min_time, to = max_time)))
+}
+
+# Get min and max times from the data
+min_time <- min(simdat$data_train$time)
+max_time <- max(simdat$data_train$time)
+
+# Check that all times are recorded for each series
+data.frame(series = simdat$data_train$series,
+           time = simdat$data_train$time) %>%
+  dplyr::group_by(series) %>%
+  dplyr::summarise(all_there = all_times_avail(time,
+                                               min_time,
+                                               max_time)) -> checked_times
+if(any(checked_times$all_there == FALSE)){
+  warning("One or more series is missing observations for one or more timepoints")
+} else {
+  cat('All series have observations at all timepoints :)')
+}
+
+## 
----------------------------------------------------------------------------- +bad_times <- data.frame(time = seq(1, 16, by = 2), + series = factor('series_1'), + outcome = rnorm(8)) +bad_times + +## ----error = TRUE------------------------------------------------------------- +get_mvgam_priors(outcome ~ 1, + data = bad_times, + family = gaussian()) + +## ----------------------------------------------------------------------------- +bad_times %>% + dplyr::right_join(expand.grid(time = seq(min(bad_times$time), + max(bad_times$time)), + series = factor(unique(bad_times$series), + levels = levels(bad_times$series)))) %>% + dplyr::arrange(time) -> good_times +good_times + +## ----error = TRUE------------------------------------------------------------- +get_mvgam_priors(outcome ~ 1, + data = good_times, + family = gaussian()) + +## ----------------------------------------------------------------------------- +bad_levels <- data.frame(time = 1:8, + series = factor('series_1', + levels = c('series_1', + 'series_2')), + outcome = rnorm(8)) + +levels(bad_levels$series) + +## ----error = TRUE------------------------------------------------------------- +get_mvgam_priors(outcome ~ 1, + data = bad_levels, + family = gaussian()) + +## ----------------------------------------------------------------------------- +setdiff(levels(bad_levels$series), unique(bad_levels$series)) + +## ----------------------------------------------------------------------------- +bad_levels %>% + dplyr::mutate(series = droplevels(series)) -> good_levels +levels(good_levels$series) + +## ----error = TRUE------------------------------------------------------------- +get_mvgam_priors(outcome ~ 1, + data = good_levels, + family = gaussian()) + +## ----------------------------------------------------------------------------- +miss_dat <- data.frame(outcome = rnorm(10), + cov = c(NA, rnorm(9)), + series = factor('series1', + levels = 'series1'), + time = 1:10) +miss_dat + +## ----error = TRUE------------------------------------------------------------- +get_mvgam_priors(outcome ~ cov, + data = miss_dat, + family = gaussian()) + +## ----------------------------------------------------------------------------- +miss_dat <- list(outcome = rnorm(10), + series = factor('series1', + levels = 'series1'), + time = 1:10) +miss_dat$cov <- matrix(rnorm(50), ncol = 5, nrow = 10) +miss_dat$cov[2,3] <- NA + +## ----error=TRUE--------------------------------------------------------------- +get_mvgam_priors(outcome ~ cov, + data = miss_dat, + family = gaussian()) + +## ----fig.alt = "Plotting time series features for GAM models in mvgam"-------- +plot_mvgam_series(data = simdat$data_train, + y = 'y', + series = 'all') + +## ----fig.alt = "Plotting time series features for GAM models in mvgam"-------- +plot_mvgam_series(data = simdat$data_train, + y = 'y', + series = 1) + +## ----fig.alt = "Plotting time series features for GAM models in mvgam"-------- +plot_mvgam_series(data = simdat$data_train, + newdata = simdat$data_test, + y = 'y', + series = 1) + +## ----------------------------------------------------------------------------- +data("all_neon_tick_data") +str(dplyr::ungroup(all_neon_tick_data)) + +## ----------------------------------------------------------------------------- +plotIDs <- c('SCBI_013','SCBI_002', + 'SERC_001','SERC_005', + 'SERC_006','SERC_012', + 'BLAN_012','BLAN_005') + +## ----------------------------------------------------------------------------- +model_dat <- all_neon_tick_data %>% + dplyr::ungroup() %>% + 
dplyr::mutate(target = ixodes_scapularis) %>%
+  dplyr::filter(plotID %in% plotIDs) %>%
+  dplyr::select(Year, epiWeek, plotID, target) %>%
+  dplyr::mutate(epiWeek = as.numeric(epiWeek))
+
+## ------------------------------------------------------------------------------
+model_dat %>%
+  # Create all possible combos of plotID, Year and epiWeek;
+  # missing outcomes will be filled in as NA
+  dplyr::full_join(expand.grid(plotID = unique(model_dat$plotID),
+                               Year = unique(model_dat$Year),
+                               epiWeek = seq(1, 52))) %>%
+
+  # left_join back to original data so plotID and siteID will
+  # match up, in case you need the siteID for anything else later on
+  dplyr::left_join(all_neon_tick_data %>%
+                     dplyr::select(siteID, plotID) %>%
+                     dplyr::distinct()) -> model_dat
+
+## ------------------------------------------------------------------------------
+model_dat %>%
+  dplyr::mutate(series = plotID,
+                y = target) %>%
+  dplyr::mutate(siteID = factor(siteID),
+                series = factor(series)) %>%
+  dplyr::select(-target, -plotID) %>%
+  dplyr::arrange(Year, epiWeek, series) -> model_dat
+
+## ------------------------------------------------------------------------------
+model_dat %>%
+  dplyr::ungroup() %>%
+  dplyr::group_by(series) %>%
+  dplyr::arrange(Year, epiWeek) %>%
+  dplyr::mutate(time = seq(1, dplyr::n())) %>%
+  dplyr::ungroup() -> model_dat
+
+## ------------------------------------------------------------------------------
+levels(model_dat$series)
+
+## ----error=TRUE------------------------------------------------------------------
+get_mvgam_priors(y ~ 1,
+                 data = model_dat,
+                 family = poisson())
+
+## ------------------------------------------------------------------------------
+testmod <- mvgam(y ~ s(epiWeek, by = series, bs = 'cc') +
+                   s(series, bs = 're'),
+                 trend_model = 'AR1',
+                 data = model_dat,
+                 backend = 'cmdstanr',
+                 run_model = FALSE)
+
+## ------------------------------------------------------------------------------
+str(testmod$model_data)
+
+## ------------------------------------------------------------------------------
+code(testmod)
+
diff --git a/doc/data_in_mvgam.Rmd b/doc/data_in_mvgam.Rmd
new file mode 100644
index 00000000..b8240edd
--- /dev/null
+++ b/doc/data_in_mvgam.Rmd
@@ -0,0 +1,354 @@
---
title: "Formatting data for use in mvgam"
author: "Nicholas J Clark"
date: "`r Sys.Date()`"
output:
  rmarkdown::html_vignette:
    toc: yes
vignette: >
  %\VignetteIndexEntry{Formatting data for use in mvgam}
  %\VignetteEngine{knitr::rmarkdown}
  \usepackage[utf8]{inputenc}
---

```{r, echo = FALSE}
NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true")
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  purl = NOT_CRAN,
  eval = NOT_CRAN
)
```

```{r setup, include=FALSE}
knitr::opts_chunk$set(
  echo = TRUE,
  dpi = 150,
  fig.asp = 0.8,
  fig.width = 6,
  out.width = "60%",
  fig.align = "center")
library(mvgam)
library(ggplot2)
theme_set(theme_bw(base_size = 12, base_family = 'serif'))
```

This vignette gives an example of how to take raw data and format it for use in `mvgam`. This is not an exhaustive example, as data can be recorded and stored in a variety of ways that require different approaches to wrangle them into the format needed for `mvgam`. For full details on the basic `mvgam` functionality, please see [the introductory vignette](https://nicholasjclark.github.io/mvgam/articles/mvgam_overview.html).

## Required *long* data format
Manipulating the data into a 'long' format is necessary for modelling in `mvgam`.
By 'long' format, we mean that each `series x time` observation needs to have its own entry in the `dataframe` or `list` object that we wish to use as data for modelling. A simple example can be viewed by simulating data using the `sim_mvgam` function. See `?sim_mvgam` for more details.
```{r}
simdat <- sim_mvgam(n_series = 4, T = 24, prop_missing = 0.2)
head(simdat$data_train, 16)
```

### `series` as a `factor` variable
Notice how we have four different time series in these simulated data, and we have identified the series-level indicator as a `factor` variable.
```{r}
class(simdat$data_train$series)
levels(simdat$data_train$series)
```

It is important that the number of levels matches the number of unique series in the data to ensure indexing across series works properly in the underlying modelling functions. Several of the main workhorse functions in the package (including `mvgam()` and `get_mvgam_priors()`) will give an error if this is not the case, but it may be worth checking anyway:
```{r}
all(levels(simdat$data_train$series) %in% unique(simdat$data_train$series))
```

Note that you can technically supply data that does not have a `series` indicator, and the package will assume that you are only using a single time series. But again, it is better to have this included so there is no confusion.

### A single outcome variable
You may also have noticed that we do not spread the `numeric / integer`-classed outcome variable into different columns. Rather, there is only a single column for the outcome variable, labelled `y` in these simulated data (though the outcome does not have to be labelled `y`). This is another important requirement in `mvgam`, but it shouldn't be too unfamiliar to `R` users who frequently use modelling packages such as `lme4`, `mgcv`, `brms` or the many other regression modelling packages out there. The advantage of this format is that it is now very easy to specify effects that vary among time series:
```{r}
summary(glm(y ~ series + time,
            data = simdat$data_train,
            family = poisson()))
```

```{r}
summary(gam(y ~ series + s(time, by = series),
            data = simdat$data_train,
            family = poisson()))
```

Depending on the observation families you plan to use when building models, there may be some restrictions that need to be satisfied within the outcome variable. For example, a Beta regression can only handle proportional data, so values `>= 1` or `<= 0` are not allowed. Likewise, a Poisson regression can only handle non-negative integers. Most regression functions in `R` will assume the user knows all of this and so will not issue any warnings or errors if you choose the wrong distribution, but often this ends up leading to some unhelpful error from an optimizer that is difficult to interpret and diagnose. `mvgam` will attempt to provide some errors if you do something that is simply not allowed.
For example, we can simulate data from a zero-centred Gaussian distribution (ensuring that some of our values will be `< 1`) and attempt a Beta regression in `mvgam` using the `betar` family:
```{r}
gauss_dat <- data.frame(outcome = rnorm(10),
                        series = factor('series1',
                                        levels = 'series1'),
                        time = 1:10)
gauss_dat
```

A call to `gam` using the `mgcv` package leads to a model that actually fits (though it does give an unhelpful warning message):
```{r}
gam(outcome ~ time,
    family = betar(),
    data = gauss_dat)
```

But the same call to `mvgam` gives us something more useful:
```{r error=TRUE}
mvgam(outcome ~ time,
      family = betar(),
      data = gauss_dat)
```

Please see `?mvgam_families` for more information on the types of responses that the package can handle and their restrictions.

### A `time` variable
The other requirement for most models that can be fit in `mvgam` is a `numeric / integer`-classed variable labelled `time`. This ensures the modelling software knows how to arrange the time series when building models. This setup still allows us to formulate multivariate time series models. If you plan to use any of the autoregressive dynamic trend functions available in `mvgam` (see `?mvgam_trends` for details of available dynamic processes), you will need to ensure your time series are entered with a fixed sampling interval (i.e. the time between timesteps 1 and 2 should be the same as the time between timesteps 2 and 3, etc...). But note that you can have missing observations for some (or all) series. `mvgam` will check this for you, but again it is useful to ensure you have no missing timepoint x series combinations in your data. You can generally do this with a simple `dplyr` call:
```{r}
# A function to check that all timepoints within a sequence are present
all_times_avail = function(time, min_time, max_time){
  identical(as.numeric(sort(time)),
            as.numeric(seq.int(from = min_time, to = max_time)))
}

# Get min and max times from the data
min_time <- min(simdat$data_train$time)
max_time <- max(simdat$data_train$time)

# Check that all times are recorded for each series
data.frame(series = simdat$data_train$series,
           time = simdat$data_train$time) %>%
    dplyr::group_by(series) %>%
    dplyr::summarise(all_there = all_times_avail(time,
                                                 min_time,
                                                 max_time)) -> checked_times
if(any(checked_times$all_there == FALSE)){
  warning("One or more series is missing observations for one or more timepoints")
} else {
  cat('All series have observations at all timepoints :)')
}
```

Note that models which use dynamic components will assume that smaller values of `time` are *older* (i.e. `time = 1` came *before* `time = 2`, etc...)

### Irregular sampling intervals?
Most `mvgam` trend models expect `time` to be measured in discrete, evenly-spaced intervals (i.e. one measurement per week, or one per year, for example; though missing values are allowed). But please note that irregularly sampled time intervals are allowed, in which case the `CAR()` trend model (continuous time autoregressive) is appropriate. You can see an example of this kind of model in the **Examples** section in `?CAR`. You can also use `trend_model = 'None'` (the default in `mvgam()`) and instead use a Gaussian Process to model temporal variation for irregularly-sampled time series. See `?brms::gp` for details.
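
As a minimal sketch of the first option (not evaluated here; the data values are invented for illustration, and we assume `CAR()` with its default continuous-time AR(1) setup), the `time` variable simply records the actual sampling times, gaps and all:
```{r eval=FALSE}
# Hypothetical series sampled at uneven intervals
irreg_dat <- data.frame(outcome = rnorm(8),
                        time = c(1, 2, 4, 5, 8, 9, 11, 15),
                        series = factor('series_1'))

# A continuous-time AR(1) trend can accommodate the uneven spacing
mod <- mvgam(outcome ~ 1,
             trend_model = CAR(),
             family = gaussian(),
             data = irreg_dat,
             run_model = FALSE)
```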

## Checking data with `get_mvgam_priors`
The `get_mvgam_priors` function is designed to return information about the parameters in a model whose prior distributions can be modified by the user. But in doing so, it will perform a series of checks to ensure the data are formatted properly. It can therefore be very useful to new users for ensuring there isn't anything strange going on in the data setup. For example, we can replicate the steps taken above (to check factor levels and timepoint x series combinations) with a single call to `get_mvgam_priors`. Here we first simulate some data in which some of the timepoints in the `time` variable are not included in the data:
```{r}
bad_times <- data.frame(time = seq(1, 16, by = 2),
                        series = factor('series_1'),
                        outcome = rnorm(8))
bad_times
```

Next we call `get_mvgam_priors` by simply specifying an intercept-only model, which is enough to trigger all the checks:
```{r error = TRUE}
get_mvgam_priors(outcome ~ 1,
                 data = bad_times,
                 family = gaussian())
```

This error is useful as it tells us where the problem is. There are many ways to fill in missing timepoints, so the correct way will have to be left up to the user. But if you don't have any covariates, it should be pretty easy using `expand.grid`:
```{r}
bad_times %>%
  dplyr::right_join(expand.grid(time = seq(min(bad_times$time),
                                           max(bad_times$time)),
                                series = factor(unique(bad_times$series),
                                                levels = levels(bad_times$series)))) %>%
  dplyr::arrange(time) -> good_times
good_times
```

Now the call to `get_mvgam_priors`, using our filled-in data, should work:
```{r error = TRUE}
get_mvgam_priors(outcome ~ 1,
                 data = good_times,
                 family = gaussian())
```

This function should also pick up on misaligned factor levels for the `series` variable. We can check this by again simulating, this time adding an additional factor level that is not included in the data:
```{r}
bad_levels <- data.frame(time = 1:8,
                        series = factor('series_1',
                                        levels = c('series_1',
                                                   'series_2')),
                        outcome = rnorm(8))

levels(bad_levels$series)
```

Another call to `get_mvgam_priors` brings up a useful error:
```{r error = TRUE}
get_mvgam_priors(outcome ~ 1,
                 data = bad_levels,
                 family = gaussian())
```

Following the message's advice tells us there is a level for `series_2` in the `series` variable, but there are no observations for this series in the data:
```{r}
setdiff(levels(bad_levels$series), unique(bad_levels$series))
```

Re-assigning the levels fixes the issue:
```{r}
bad_levels %>%
  dplyr::mutate(series = droplevels(series)) -> good_levels
levels(good_levels$series)
```

```{r error = TRUE}
get_mvgam_priors(outcome ~ 1,
                 data = good_levels,
                 family = gaussian())
```

### Covariates with no `NA`s
Covariates can be used in models just as you would when using `mgcv` (see `?formula.gam` for details of the formula syntax). But although the outcome variable can have `NA`s, covariates cannot. Most regression software will silently drop any rows in the model matrix that have `NA`s, which is not helpful when debugging.
Both the `mvgam` and `get_mvgam_priors` functions will run some simple checks for you, and hopefully will return useful errors if they find any missing values:
```{r}
miss_dat <- data.frame(outcome = rnorm(10),
                       cov = c(NA, rnorm(9)),
                       series = factor('series1',
                                       levels = 'series1'),
                       time = 1:10)
miss_dat
```

```{r error = TRUE}
get_mvgam_priors(outcome ~ cov,
                 data = miss_dat,
                 family = gaussian())
```

Just like with the `mgcv` package, `mvgam` can also accept data as a `list` object. This is useful if you want to set up [linear functional predictors](https://rdrr.io/cran/mgcv/man/linear.functional.terms.html) or even distributed lag predictors. The checks run by `mvgam` should still work on these data. Here we change the `cov` predictor to be a `matrix`:
```{r}
miss_dat <- list(outcome = rnorm(10),
                 series = factor('series1',
                                 levels = 'series1'),
                 time = 1:10)
miss_dat$cov <- matrix(rnorm(50), ncol = 5, nrow = 10)
miss_dat$cov[2,3] <- NA
```

A call to `get_mvgam_priors` returns the same error:
```{r error=TRUE}
get_mvgam_priors(outcome ~ cov,
                 data = miss_dat,
                 family = gaussian())
```

## Plotting with `plot_mvgam_series`
Plotting the data is a useful way to ensure everything looks ok, once you've gone through the above checks on factor levels and timepoint x series combinations. The `plot_mvgam_series` function will take supplied data and plot either a series of line plots (if you choose `series = 'all'`) or a set of plots to describe the distribution for a single time series. For example, to plot all of the time series in our data, and highlight a single series in each plot, we can use:
```{r, fig.alt = "Plotting time series features for GAM models in mvgam"}
plot_mvgam_series(data = simdat$data_train,
                  y = 'y',
                  series = 'all')
```

Or we can look more closely at the distribution for the first time series:
```{r, fig.alt = "Plotting time series features for GAM models in mvgam"}
plot_mvgam_series(data = simdat$data_train,
                  y = 'y',
                  series = 1)
```

If you have split your data into training and testing folds (i.e. for forecast evaluation), you can include the test data in your plots:
```{r, fig.alt = "Plotting time series features for GAM models in mvgam"}
plot_mvgam_series(data = simdat$data_train,
                  newdata = simdat$data_test,
                  y = 'y',
                  series = 1)
```

## Example with NEON tick data
To give one example of how data can be reformatted for `mvgam` modelling, we will use observations from the National Ecological Observatory Network (NEON) tick drag cloth samples. *Ixodes scapularis* is a widespread tick species capable of transmitting a diversity of parasites to animals and humans, many of which are zoonotic. Due to the medical and ecological importance of this tick species, a common goal is to understand factors that influence their abundances. The NEON field team carries out standardised [long-term monitoring of tick abundances as well as other important indicators of ecological change](https://www.neonscience.org/data-collection/ticks){target="_blank"}. Nymphal abundance of *I. scapularis* is routinely recorded across NEON plots using a field sampling method called drag cloth sampling, which is a common method for sampling ticks in the landscape. Field researchers sample ticks by dragging a large cloth behind themselves through terrain that is suspected of harboring ticks, usually working in a grid-like pattern. The sites have been sampled since 2014, resulting in a rich dataset of nymph abundance time series.
These tick time series show strong seasonality and incorporate many of the challenging features associated with ecological data, including overdispersion, high proportions of missingness and irregular sampling in time, making them useful for exploring the utility of dynamic GAMs.

We begin by loading NEON tick data for the years 2014 - 2021, which were downloaded from NEON and prepared as described in [Clark & Wells 2022](https://besjournals.onlinelibrary.wiley.com/doi/full/10.1111/2041-210X.13974){target="_blank"}. You can read a bit about the data using the call `?all_neon_tick_data`.
```{r}
data("all_neon_tick_data")
str(dplyr::ungroup(all_neon_tick_data))
```

For this exercise, we will use the `epiWeek` variable as an index of seasonality, and we will only work with observations from a few sampling plots (labelled in the `plotID` column):
```{r}
plotIDs <- c('SCBI_013','SCBI_002',
             'SERC_001','SERC_005',
             'SERC_006','SERC_012',
             'BLAN_012','BLAN_005')
```

Now we can select the target species we want (*I. scapularis*), filter to the correct plot IDs and convert the `epiWeek` variable from `character` to `numeric`:
```{r}
model_dat <- all_neon_tick_data %>%
  dplyr::ungroup() %>%
  dplyr::mutate(target = ixodes_scapularis) %>%
  dplyr::filter(plotID %in% plotIDs) %>%
  dplyr::select(Year, epiWeek, plotID, target) %>%
  dplyr::mutate(epiWeek = as.numeric(epiWeek))
```

Now is the tricky part: we need to fill in missing observations with `NA`s. The tick data are sparse in that field observers do not go out and sample in each possible `epiWeek`. So there are many weeks in which observations are not included in the data. But we can use `expand.grid` again to take care of this:
```{r}
model_dat %>%
  # Create all possible combos of plotID, Year and epiWeek;
  # missing outcomes will be filled in as NA
  dplyr::full_join(expand.grid(plotID = unique(model_dat$plotID),
                               Year = unique(model_dat$Year),
                               epiWeek = seq(1, 52))) %>%

  # left_join back to original data so plotID and siteID will
  # match up, in case you need the siteID for anything else later on
  dplyr::left_join(all_neon_tick_data %>%
                     dplyr::select(siteID, plotID) %>%
                     dplyr::distinct()) -> model_dat
```

Create the `series` variable needed for `mvgam` modelling:
```{r}
model_dat %>%
  dplyr::mutate(series = plotID,
                y = target) %>%
  dplyr::mutate(siteID = factor(siteID),
                series = factor(series)) %>%
  dplyr::select(-target, -plotID) %>%
  dplyr::arrange(Year, epiWeek, series) -> model_dat
```

Now create the `time` variable, which needs to track `Year` and `epiWeek` for each unique series. The `n` function from `dplyr` is often useful when generating a `time` index for grouped dataframes:
```{r}
model_dat %>%
  dplyr::ungroup() %>%
  dplyr::group_by(series) %>%
  dplyr::arrange(Year, epiWeek) %>%
  dplyr::mutate(time = seq(1, dplyr::n())) %>%
  dplyr::ungroup() -> model_dat
```

Check factor levels for the `series`:
```{r}
levels(model_dat$series)
```

This looks good, as does a more rigorous check using `get_mvgam_priors`:
```{r error=TRUE}
get_mvgam_priors(y ~ 1,
                 data = model_dat,
                 family = poisson())
```

We can also set up a model in `mvgam` but use `run_model = FALSE` to further ensure all of the necessary steps for creating the modelling code and objects will run.
It is recommended that you use the `cmdstanr` backend if possible, as its auto-formatting options are very useful for checking the package-generated `Stan` code for inefficiencies that can be fixed to improve sampling performance:
```{r}
testmod <- mvgam(y ~ s(epiWeek, by = series, bs = 'cc') +
                   s(series, bs = 're'),
                 trend_model = 'AR1',
                 data = model_dat,
                 backend = 'cmdstanr',
                 run_model = FALSE)
```

This call runs without issue, and the resulting object now contains the model code and data objects that are needed to initiate sampling:
```{r}
str(testmod$model_data)
```

```{r}
code(testmod)
```

## Interested in contributing?
I'm actively seeking PhD students and other researchers to work in the areas of ecological forecasting, multivariate model evaluation and development of `mvgam`. Please reach out if you are interested (n.clark'at'uq.edu.au)
diff --git a/doc/data_in_mvgam.html b/doc/data_in_mvgam.html
new file mode 100644
index 00000000..369d4844
--- /dev/null
+++ b/doc/data_in_mvgam.html
@@ -0,0 +1,1148 @@
+[HTML head omitted: standard rmarkdown/pandoc boilerplate with the document title "Formatting data for use in mvgam"]

Formatting data for use in mvgam

Nicholas J Clark

2024-04-16

This vignette gives an example of how to take raw data and format it for use in mvgam. This is not an exhaustive example, as data can be recorded and stored in a variety of ways that require different approaches to wrangle them into the format needed for mvgam. For full details on the basic mvgam functionality, please see the introductory vignette.

Required long data format

Manipulating the data into a ‘long’ format is necessary for modelling in mvgam. By ‘long’ format, we mean that each series x time observation needs to have its own entry in the dataframe or list object that we wish to use as data for modelling. A simple example can be viewed by simulating data using the sim_mvgam function. See ?sim_mvgam for more details.

simdat <- sim_mvgam(n_series = 4, T = 24, prop_missing = 0.2)
+head(simdat$data_train, 16)
+#>     y season year   series time
+#> 1  NA      1    1 series_1    1
+#> 2   0      1    1 series_2    1
+#> 3   0      1    1 series_3    1
+#> 4   0      1    1 series_4    1
+#> 5   0      2    1 series_1    2
+#> 6   1      2    1 series_2    2
+#> 7   1      2    1 series_3    2
+#> 8   1      2    1 series_4    2
+#> 9   0      3    1 series_1    3
+#> 10 NA      3    1 series_2    3
+#> 11  0      3    1 series_3    3
+#> 12 NA      3    1 series_4    3
+#> 13  1      4    1 series_1    4
+#> 14  0      4    1 series_2    4
+#> 15  0      4    1 series_3    4
+#> 16  2      4    1 series_4    4
series as a factor variable

Notice how we have four different time series in these simulated data, and we have identified the series-level indicator as a factor variable.

+
class(simdat$data_train$series)
+#> [1] "factor"
+levels(simdat$data_train$series)
+#> [1] "series_1" "series_2" "series_3" "series_4"

It is important that the number of levels matches the number of unique series in the data to ensure indexing across series works properly in the underlying modelling functions. Several of the main workhorse functions in the package (including mvgam() and get_mvgam_priors()) will give an error if this is not the case, but it may be worth checking anyway:

all(levels(simdat$data_train$series) %in% unique(simdat$data_train$series))
+#> [1] TRUE

Note that you can technically supply data that does not have a series indicator, and the package will assume that you are only using a single time series. But again, it is better to have this included so there is no confusion.
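
For instance, if your data held a single series with no indicator, adding one explicitly takes a single line (a small sketch; the object name is invented):

one_series <- data.frame(y = rpois(10, lambda = 2), time = 1:10)
one_series$series <- factor('series_1')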

A single outcome variable

You may also have noticed that we do not spread the numeric / integer-classed outcome variable into different columns. Rather, there is only a single column for the outcome variable, labelled y in these simulated data (though the outcome does not have to be labelled y). This is another important requirement in mvgam, but it shouldn’t be too unfamiliar to R users who frequently use modelling packages such as lme4, mgcv, brms or the many other regression modelling packages out there. The advantage of this format is that it is now very easy to specify effects that vary among time series:

+
summary(glm(y ~ series + time,
+            data = simdat$data_train,
+            family = poisson()))
+#> 
+#> Call:
+#> glm(formula = y ~ series + time, family = poisson(), data = simdat$data_train)
+#> 
+#> Coefficients:
+#>                Estimate Std. Error z value Pr(>|z|)  
+#> (Intercept)    -0.05275    0.38870  -0.136   0.8920  
+#> seriesseries_2 -0.80716    0.45417  -1.777   0.0755 .
+#> seriesseries_3 -1.21614    0.51290  -2.371   0.0177 *
+#> seriesseries_4  0.55084    0.31854   1.729   0.0838 .
+#> time            0.01725    0.02701   0.639   0.5229  
+#> ---
+#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+#> 
+#> (Dispersion parameter for poisson family taken to be 1)
+#> 
+#>     Null deviance: 120.029  on 56  degrees of freedom
+#> Residual deviance:  96.641  on 52  degrees of freedom
+#>   (15 observations deleted due to missingness)
+#> AIC: 166.83
+#> 
+#> Number of Fisher Scoring iterations: 6
+
summary(gam(y ~ series + s(time, by = series),
+            data = simdat$data_train,
+            family = poisson()))
+#> 
+#> Family: poisson 
+#> Link function: log 
+#> 
+#> Formula:
+#> y ~ series + s(time, by = series)
+#> 
+#> Parametric coefficients:
+#>                Estimate Std. Error z value Pr(>|z|)
+#> (Intercept)      -4.293      5.500  -0.781    0.435
+#> seriesseries_2    3.001      5.533   0.542    0.588
+#> seriesseries_3    3.193      5.518   0.579    0.563
+#> seriesseries_4    4.795      5.505   0.871    0.384
+#> 
+#> Approximate significance of smooth terms:
+#>                          edf Ref.df Chi.sq p-value  
+#> s(time):seriesseries_1 7.737  8.181  6.541  0.5585  
+#> s(time):seriesseries_2 3.444  4.213  4.739  0.3415  
+#> s(time):seriesseries_3 1.000  1.000  0.006  0.9365  
+#> s(time):seriesseries_4 3.958  4.832 11.636  0.0363 *
+#> ---
+#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+#> 
+#> R-sq.(adj) =  0.605   Deviance explained = 66.2%
+#> UBRE = 0.4193  Scale est. = 1         n = 57
+

Depending on the observation families you plan to use when building models, there may be some restrictions that need to be satisfied within the outcome variable. For example, a Beta regression can only handle proportional data, so values >= 1 or <= 0 are not allowed. Likewise, a Poisson regression can only handle non-negative integers. Most regression functions in R will assume the user knows all of this and so will not issue any warnings or errors if you choose the wrong distribution, but often this ends up leading to some unhelpful error from an optimizer that is difficult to interpret and diagnose. mvgam will attempt to provide some errors if you do something that is simply not allowed. For example, we can simulate data from a zero-centred Gaussian distribution (ensuring that some of our values will be < 1) and attempt a Beta regression in mvgam using the betar family:

+
gauss_dat <- data.frame(outcome = rnorm(10),
+                        series = factor('series1',
+                                        levels = 'series1'),
+                        time = 1:10)
+gauss_dat
+#>        outcome  series time
+#> 1  -1.51807964 series1    1
+#> 2  -0.12895041 series1    2
+#> 3   0.91902592 series1    3
+#> 4  -0.78329254 series1    4
+#> 5   0.28469724 series1    5
+#> 6   0.07481887 series1    6
+#> 7   0.03770728 series1    7
+#> 8  -0.37485636 series1    8
+#> 9   0.23694172 series1    9
+#> 10 -0.53988302 series1   10
+

A call to gam using the mgcv package leads to a model that actually fits (though it does give an unhelpful warning message):

+
gam(outcome ~ time,
+    family = betar(),
+    data = gauss_dat)
+#> Warning in family$saturated.ll(y, prior.weights, theta): saturated likelihood
+#> may be inaccurate
+#> 
+#> Family: Beta regression(0.44) 
+#> Link function: logit 
+#> 
+#> Formula:
+#> outcome ~ time
+#> Total model degrees of freedom 2 
+#> 
+#> REML score: -127.2706
+

But the same call to mvgam gives us something more useful:

+
mvgam(outcome ~ time,
+      family = betar(),
+      data = gauss_dat)
+#> Error: Values <= 0 not allowed for beta responses
+
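
You can also run this kind of bounds check yourself before settling on a family; a minimal sketch for the Beta case:

# A beta response must lie strictly between 0 and 1
all(gauss_dat$outcome > 0 & gauss_dat$outcome < 1)
#> [1] FALSE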

Please see ?mvgam_families for more information on the types of responses that the package can handle and their restrictions.

A time variable

The other requirement for most models that can be fit in mvgam is a numeric / integer-classed variable labelled time. This ensures the modelling software knows how to arrange the time series when building models. This setup still allows us to formulate multivariate time series models. If you plan to use any of the autoregressive dynamic trend functions available in mvgam (see ?mvgam_trends for details of available dynamic processes), you will need to ensure your time series are entered with a fixed sampling interval (i.e. the time between timesteps 1 and 2 should be the same as the time between timesteps 2 and 3, etc…). But note that you can have missing observations for some (or all) series. mvgam will check this for you, but again it is useful to ensure you have no missing timepoint x series combinations in your data. You can generally do this with a simple dplyr call:

+
# A function to check that all timepoints within a sequence are present
+all_times_avail = function(time, min_time, max_time){
+    identical(as.numeric(sort(time)),
+              as.numeric(seq.int(from = min_time, to = max_time)))
+}
+
+# Get min and max times from the data
+min_time <- min(simdat$data_train$time)
+max_time <- max(simdat$data_train$time)
+
+# Check that all times are recorded for each series
+data.frame(series = simdat$data_train$series,
+           time = simdat$data_train$time) %>%
+    dplyr::group_by(series) %>%
+    dplyr::summarise(all_there = all_times_avail(time,
+                                                 min_time,
+                                                 max_time)) -> checked_times
+if(any(checked_times$all_there == FALSE)){
+  warning("One or more series is missing observations for one or more timepoints")
+} else {
+  cat('All series have observations at all timepoints :)')
+}
+#> All series have observations at all timepoints :)
+
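
A more compact version of the same check (a sketch, and it assumes no series has duplicated timepoints) simply counts rows per series:

simdat$data_train %>%
  dplyr::count(series) %>%
  dplyr::summarise(all_there = all(n == max_time - min_time + 1))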

Note that models which use dynamic components will assume that smaller values of time are older (i.e. time = 1 came before time = 2, etc…)

Irregular sampling intervals?

Most mvgam trend models expect time to be measured in discrete, evenly-spaced intervals (i.e. one measurement per week, or one per year, for example; though missing values are allowed). But please note that irregularly sampled time intervals are allowed, in which case the CAR() trend model (continuous time autoregressive) is appropriate. You can see an example of this kind of model in the Examples section in ?CAR. You can also use trend_model = 'None' (the default in mvgam()) and instead use a Gaussian Process to model temporal variation for irregularly-sampled time series. See ?brms::gp for details.
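
As a sketch of the second option (assuming the brms-style gp() syntax that mvgam formulas accept), temporal variation can be modelled with an approximate Gaussian Process while the trend is left off:

mod <- mvgam(y ~ gp(time, k = 20),
             trend_model = 'None',
             data = simdat$data_train,
             run_model = FALSE)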

Checking data with get_mvgam_priors

The get_mvgam_priors function is designed to return information about the parameters in a model whose prior distributions can be modified by the user. But in doing so, it will perform a series of checks to ensure the data are formatted properly. It can therefore be very useful to new users for ensuring there isn’t anything strange going on in the data setup. For example, we can replicate the steps taken above (to check factor levels and timepoint x series combinations) with a single call to get_mvgam_priors. Here we first simulate some data in which some of the timepoints in the time variable are not included in the data:

+
bad_times <- data.frame(time = seq(1, 16, by = 2),
+                        series = factor('series_1'),
+                        outcome = rnorm(8))
+bad_times
+#>   time   series    outcome
+#> 1    1 series_1  1.4681068
+#> 2    3 series_1  0.1796627
+#> 3    5 series_1 -0.4204020
+#> 4    7 series_1 -1.0729359
+#> 5    9 series_1 -0.1738239
+#> 6   11 series_1 -0.5463268
+#> 7   13 series_1  0.8275198
+#> 8   15 series_1  2.2085085
+

Next we call get_mvgam_priors by simply specifying an intercept-only model, which is enough to trigger all the checks:

+
get_mvgam_priors(outcome ~ 1,
+                 data = bad_times,
+                 family = gaussian())
+#> Error: One or more series in data is missing observations for one or more timepoints
+

This error is useful as it tells us where the problem is. There are many ways to fill in missing timepoints, so the correct way will have to be left up to the user. But if you don’t have any covariates, it should be pretty easy using expand.grid:

+
bad_times %>%
+  dplyr::right_join(expand.grid(time = seq(min(bad_times$time),
+                                           max(bad_times$time)),
+                                series = factor(unique(bad_times$series),
+                                                levels = levels(bad_times$series)))) %>%
+  dplyr::arrange(time) -> good_times
+#> Joining with `by = join_by(time, series)`
+good_times
+#>    time   series    outcome
+#> 1     1 series_1  1.4681068
+#> 2     2 series_1         NA
+#> 3     3 series_1  0.1796627
+#> 4     4 series_1         NA
+#> 5     5 series_1 -0.4204020
+#> 6     6 series_1         NA
+#> 7     7 series_1 -1.0729359
+#> 8     8 series_1         NA
+#> 9     9 series_1 -0.1738239
+#> 10   10 series_1         NA
+#> 11   11 series_1 -0.5463268
+#> 12   12 series_1         NA
+#> 13   13 series_1  0.8275198
+#> 14   14 series_1         NA
+#> 15   15 series_1  2.2085085
+
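
An equivalent shortcut (a sketch, assuming you are happy to add a tidyr dependency) lets complete() and full_seq() handle the expansion in one step:

bad_times %>%
  tidyr::complete(time = tidyr::full_seq(time, period = 1),
                  series) %>%
  dplyr::arrange(time) -> good_times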

Now the call to get_mvgam_priors, using our filled-in data, should work:

+
get_mvgam_priors(outcome ~ 1,
+                 data = good_times,
+                 family = gaussian())
+#>                             param_name param_length           param_info
+#> 1                          (Intercept)            1          (Intercept)
+#> 2 vector<lower=0>[n_series] sigma_obs;            1 observation error sd
+#>                                 prior                   example_change
+#> 1 (Intercept) ~ student_t(3, 0, 2.5);      (Intercept) ~ normal(0, 1);
+#> 2   sigma_obs ~ student_t(3, 0, 2.5); sigma_obs ~ normal(-0.22, 0.33);
+#>   new_lowerbound new_upperbound
+#> 1             NA             NA
+#> 2             NA             NA
+

This function should also pick up on misaligned factor levels for the series variable. We can check this by again simulating, this time adding an additional factor level that is not included in the data:

+
bad_levels <- data.frame(time = 1:8,
+                        series = factor('series_1',
+                                        levels = c('series_1',
+                                                   'series_2')),
+                        outcome = rnorm(8))
+
+levels(bad_levels$series)
+#> [1] "series_1" "series_2"
+

Another call to get_mvgam_priors brings up a useful error:

+
get_mvgam_priors(outcome ~ 1,
+                 data = bad_levels,
+                 family = gaussian())
+#> Error: Mismatch between factor levels of "series" and unique values of "series"
+#> Use
+#>   `setdiff(levels(data$series), unique(data$series))` 
+#> and
+#>   `intersect(levels(data$series), unique(data$series))`
+#> for guidance
+

Following the message’s advice tells us there is a level for series_2 in the series variable, but there are no observations for this series in the data:

+
setdiff(levels(bad_levels$series), unique(bad_levels$series))
+#> [1] "series_2"
+
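
The companion intersect() call suggested by the error message shows which levels do have observations:

intersect(levels(bad_levels$series), unique(bad_levels$series))
#> [1] "series_1"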

Re-assigning the levels fixes the issue:

+
bad_levels %>%
+  dplyr::mutate(series = droplevels(series)) -> good_levels
+levels(good_levels$series)
+#> [1] "series_1"
+
get_mvgam_priors(outcome ~ 1,
+                 data = good_levels,
+                 family = gaussian())
+#>                             param_name param_length           param_info
+#> 1                          (Intercept)            1          (Intercept)
+#> 2 vector<lower=0>[n_series] sigma_obs;            1 observation error sd
+#>                                  prior                  example_change
+#> 1 (Intercept) ~ student_t(3, -1, 2.5);     (Intercept) ~ normal(0, 1);
+#> 2    sigma_obs ~ student_t(3, 0, 2.5); sigma_obs ~ normal(0.98, 0.91);
+#>   new_lowerbound new_upperbound
+#> 1             NA             NA
+#> 2             NA             NA

Covariates with no NAs


Covariates can be used in models just as you would when using mgcv (see ?formula.gam for details of the formula syntax). But although the outcome variable can have NAs, covariates cannot. Most regression software will silently drop any rows in the model matrix that have NAs, which is not helpful when debugging. Both the mvgam and get_mvgam_priors functions will run some simple checks for you, and hopefully will return useful errors if they find any missing values:

+
miss_dat <- data.frame(outcome = rnorm(10),
+                       cov = c(NA, rnorm(9)),
+                       series = factor('series1',
+                                       levels = 'series1'),
+                       time = 1:10)
+miss_dat
+#>        outcome        cov  series time
+#> 1   0.77436859         NA series1    1
+#> 2   0.33222199 -0.2653819 series1    2
+#> 3   0.50385503  0.6658354 series1    3
+#> 4  -0.99577591  0.3541730 series1    4
+#> 5  -1.09812817 -2.3125954 series1    5
+#> 6  -0.49687774 -1.0778578 series1    6
+#> 7  -1.26666072 -0.1973507 series1    7
+#> 8  -0.11638041 -3.0585179 series1    8
+#> 9   0.08890432  1.7964928 series1    9
+#> 10 -0.64375459  0.7894733 series1   10
+
get_mvgam_priors(outcome ~ cov,
+                 data = miss_dat,
+                 family = gaussian())
+#> Error: Missing values found in data predictors:
+#>  Error in na.fail.default(structure(list(outcome = c(0.774368589907313, : missing values in object
+

Just like with the mgcv package, mvgam can also accept data as a list object. This is useful if you want to set up linear functional predictors or even distributed lag predictors. The checks run by mvgam should still work on these data. Here we change the cov predictor to be a matrix:

+
miss_dat <- list(outcome = rnorm(10),
+                 series = factor('series1',
+                                 levels = 'series1'),
+                 time = 1:10)
+miss_dat$cov <- matrix(rnorm(50), ncol = 5, nrow = 10)
+miss_dat$cov[2,3] <- NA
+

A call to get_mvgam_priors returns the same error:

+
get_mvgam_priors(outcome ~ cov,
+                 data = miss_dat,
+                 family = gaussian())
+#> Error: Missing values found in data predictors:
+#>  Error in na.fail.default(structure(list(outcome = c(-0.708736388395862, : missing values in object
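
To give a flavour of the distributed lag predictors mentioned above, one common pattern is to store lagged copies of a covariate in the columns of a matrix (a sketch only; the lagard() helper is written out by hand here and all names are invented):

# Columns hold lags 0, 1, 2 and 3 of a covariate
lagard <- function(x, n_lag = 4){
  n <- length(x)
  X <- matrix(NA, n, n_lag)
  for(i in 1:n_lag) X[i:n, i] <- x[i:n - i + 1]
  X
}
raw_cov <- rnorm(33)
lag_dat <- list(outcome = rnorm(30),
                series = factor('series1', levels = 'series1'),
                time = 1:30,
                # drop the first three rows of the lag matrix, which hold NAs
                cov = lagard(raw_cov)[4:33, ])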
+
Plotting with plot_mvgam_series

Plotting the data is a useful way to ensure everything looks ok, once you’ve gone through the above checks on factor levels and timepoint x series combinations. The plot_mvgam_series function will take supplied data and plot either a series of line plots (if you choose series = 'all') or a set of plots to describe the distribution for a single time series. For example, to plot all of the time series in our data, and highlight a single series in each plot, we can use:

+
plot_mvgam_series(data = simdat$data_train, 
+                  y = 'y', 
+                  series = 'all')
+

Plotting time series features for GAM models in mvgam

+

Or we can look more closely at the distribution for the first time series:

+
plot_mvgam_series(data = simdat$data_train, 
+                  y = 'y', 
+                  series = 1)
+

Plotting time series features for GAM models in mvgam

+

If you have split your data into training and testing folds (i.e. for forecast evaluation), you can include the test data in your plots:

+
plot_mvgam_series(data = simdat$data_train,
+                  newdata = simdat$data_test,
+                  y = 'y', 
+                  series = 1)
+

Plotting time series features for GAM models in mvgam

Example with NEON tick data

To give one example of how data can be reformatted for mvgam modelling, we will use observations from the National Ecological Observatory Network (NEON) tick drag cloth samples. Ixodes scapularis is a widespread tick species capable of transmitting a diversity of parasites to animals and humans, many of which are zoonotic. Due to the medical and ecological importance of this tick species, a common goal is to understand factors that influence their abundances. The NEON field team carries out standardised long-term monitoring of tick abundances as well as other important indicators of ecological change. Nymphal abundance of I. scapularis is routinely recorded across NEON plots using a field sampling method called drag cloth sampling, which is a common method for sampling ticks in the landscape. Field researchers sample ticks by dragging a large cloth behind themselves through terrain that is suspected of harboring ticks, usually working in a grid-like pattern. The sites have been sampled since 2014, resulting in a rich dataset of nymph abundance time series. These tick time series show strong seasonality and incorporate many of the challenging features associated with ecological data, including overdispersion, high proportions of missingness and irregular sampling in time, making them useful for exploring the utility of dynamic GAMs.

+

We begin by loading NEON tick data for the years 2014 - 2021, which were downloaded from NEON and prepared as described in Clark & Wells 2022. You can read a bit about the data using the call ?all_neon_tick_data.

+
data("all_neon_tick_data")
+str(dplyr::ungroup(all_neon_tick_data))
+#> tibble [3,505 × 24] (S3: tbl_df/tbl/data.frame)
+#>  $ Year                : num [1:3505] 2015 2015 2015 2015 2015 ...
+#>  $ epiWeek             : chr [1:3505] "37" "38" "39" "40" ...
+#>  $ yearWeek            : chr [1:3505] "201537" "201538" "201539" "201540" ...
+#>  $ plotID              : chr [1:3505] "BLAN_005" "BLAN_005" "BLAN_005" "BLAN_005" ...
+#>  $ siteID              : chr [1:3505] "BLAN" "BLAN" "BLAN" "BLAN" ...
+#>  $ nlcdClass           : chr [1:3505] "deciduousForest" "deciduousForest" "deciduousForest" "deciduousForest" ...
+#>  $ decimalLatitude     : num [1:3505] 39.1 39.1 39.1 39.1 39.1 ...
+#>  $ decimalLongitude    : num [1:3505] -78 -78 -78 -78 -78 ...
+#>  $ elevation           : num [1:3505] 168 168 168 168 168 ...
+#>  $ totalSampledArea    : num [1:3505] 162 NA NA NA 162 NA NA NA NA 164 ...
+#>  $ amblyomma_americanum: num [1:3505] NA NA NA NA NA NA NA NA NA NA ...
+#>  $ ixodes_scapularis   : num [1:3505] 2 NA NA NA 0 NA NA NA NA 0 ...
+#>  $ time                : Date[1:3505], format: "2015-09-13" "2015-09-20" ...
+#>  $ RHMin_precent       : num [1:3505] NA NA NA NA NA NA NA NA NA NA ...
+#>  $ RHMin_variance      : num [1:3505] NA NA NA NA NA NA NA NA NA NA ...
+#>  $ RHMax_precent       : num [1:3505] NA NA NA NA NA NA NA NA NA NA ...
+#>  $ RHMax_variance      : num [1:3505] NA NA NA NA NA NA NA NA NA NA ...
+#>  $ airTempMin_degC     : num [1:3505] NA NA NA NA NA NA NA NA NA NA ...
+#>  $ airTempMin_variance : num [1:3505] NA NA NA NA NA NA NA NA NA NA ...
+#>  $ airTempMax_degC     : num [1:3505] NA NA NA NA NA NA NA NA NA NA ...
+#>  $ airTempMax_variance : num [1:3505] NA NA NA NA NA NA NA NA NA NA ...
+#>  $ soi                 : num [1:3505] -18.4 -17.9 -23.5 -28.4 -25.9 ...
+#>  $ cum_sdd             : num [1:3505] 173 173 173 173 173 ...
+#>  $ cum_gdd             : num [1:3505] 1129 1129 1129 1129 1129 ...
+

For this exercise, we will use the epiWeek variable as an index of seasonality, and we will only work with observations from a few sampling plots (labelled in the plotID column):

+
plotIDs <- c('SCBI_013','SCBI_002',
+             'SERC_001','SERC_005',
+             'SERC_006','SERC_012',
+             'BLAN_012','BLAN_005')
+

Now we can select the target species we want (I. scapularis), filter to the correct plot IDs and convert the epiWeek variable from character to numeric:

+
model_dat <- all_neon_tick_data %>%
+  dplyr::ungroup() %>%
+  dplyr::mutate(target = ixodes_scapularis) %>%
+  dplyr::filter(plotID %in% plotIDs) %>%
+  dplyr::select(Year, epiWeek, plotID, target) %>%
+  dplyr::mutate(epiWeek = as.numeric(epiWeek))
+

Now is the tricky part: we need to fill in missing observations with NAs. The tick data are sparse in that field observers do not go out and sample in each possible epiWeek. So there are many weeks in which observations are not included in the data. But we can use expand.grid again to take care of this:

+
model_dat %>%
+  # Create all possible combos of plotID, Year and epiWeek; 
+  # missing outcomes will be filled in as NA
+  dplyr::full_join(expand.grid(plotID = unique(model_dat$plotID),
+                               Year = unique(model_dat$Year),
+                               epiWeek = seq(1, 52))) %>%
+  
+  # left_join back to original data so plotID and siteID will
+  # match up, in case you need the siteID for anything else later on
+  dplyr::left_join(all_neon_tick_data %>%
+                     dplyr::select(siteID, plotID) %>%
+                     dplyr::distinct()) -> model_dat
+#> Joining with `by = join_by(Year, epiWeek, plotID)`
+#> Joining with `by = join_by(plotID)`
+
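
A quick way to eyeball the result of the joins (a sketch) is to count rows per plot; a large discrepancy between plots would suggest the join keys did not line up:

model_dat %>%
  dplyr::count(plotID)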

Create the series variable needed for mvgam modelling:

+
model_dat %>%
+  dplyr::mutate(series = plotID,
+                y = target) %>%
+  dplyr::mutate(siteID = factor(siteID),
+                series = factor(series)) %>%
+  dplyr::select(-target, -plotID) %>%
+  dplyr::arrange(Year, epiWeek, series) -> model_dat 
+

Now create the time variable, which needs to track Year and epiWeek for each unique series. The n function from dplyr is often useful when generating a time index for grouped dataframes:

+
model_dat %>%
+  dplyr::ungroup() %>%
+  dplyr::group_by(series) %>%
+  dplyr::arrange(Year, epiWeek) %>%
+  dplyr::mutate(time = seq(1, dplyr::n())) %>%
+  dplyr::ungroup() -> model_dat
+
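
Because time was built with seq(1, dplyr::n()), it is consecutive by construction; had it been computed from calendar fields instead, a per-series gap check in the spirit of all_times_avail() above (a sketch) would be worthwhile:

model_dat %>%
  dplyr::group_by(series) %>%
  dplyr::summarise(no_gaps = all(sort(time) == seq_along(time)))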

Check factor levels for the series:

+
levels(model_dat$series)
+#> [1] "BLAN_005" "BLAN_012" "SCBI_002" "SCBI_013" "SERC_001" "SERC_005" "SERC_006"
+#> [8] "SERC_012"
+

This looks good, as does a more rigorous check using get_mvgam_priors:

+
get_mvgam_priors(y ~ 1,
+                 data = model_dat,
+                 family = poisson())
+#>    param_name param_length  param_info                                  prior
+#> 1 (Intercept)            1 (Intercept) (Intercept) ~ student_t(3, -2.3, 2.5);
+#>                example_change new_lowerbound new_upperbound
+#> 1 (Intercept) ~ normal(0, 1);             NA             NA
+
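
The returned summary can then be used to modify priors when fitting; a sketch, assuming the brms-style prior interface that mvgam's priors argument accepts:

testprior <- mvgam(y ~ 1,
                   data = model_dat,
                   family = poisson(),
                   priors = brms::prior(normal(0, 2), class = Intercept),
                   run_model = FALSE)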

We can also set up a model in mvgam but use run_model = FALSE to further ensure all of the necessary steps for creating the modelling code and objects will run. It is recommended that you use the cmdstanr backend if possible, as its auto-formatting options are very useful for checking the package-generated Stan code for inefficiencies that can be fixed to improve sampling performance:

+
testmod <- mvgam(y ~ s(epiWeek, by = series, bs = 'cc') +
+                   s(series, bs = 're'),
+                 trend_model = 'AR1',
+                 data = model_dat,
+                 backend = 'cmdstanr',
+                 run_model = FALSE)
+

This call runs without issue, and the resulting object now contains the model code and data objects that are needed to initiate sampling:

+
str(testmod$model_data)
+#> List of 25
+#>  $ y           : num [1:416, 1:8] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
+#>  $ n           : int 416
+#>  $ X           : num [1:3328, 1:73] 1 1 1 1 1 1 1 1 1 1 ...
+#>   ..- attr(*, "dimnames")=List of 2
+#>   .. ..$ : chr [1:3328] "1" "2" "3" "4" ...
+#>   .. ..$ : chr [1:73] "X.Intercept." "V2" "V3" "V4" ...
+#>  $ S1          : num [1:8, 1:8] 1.037 -0.416 0.419 0.117 0.188 ...
+#>  $ zero        : num [1:73] 0 0 0 0 0 0 0 0 0 0 ...
+#>  $ S2          : num [1:8, 1:8] 1.037 -0.416 0.419 0.117 0.188 ...
+#>  $ S3          : num [1:8, 1:8] 1.037 -0.416 0.419 0.117 0.188 ...
+#>  $ S4          : num [1:8, 1:8] 1.037 -0.416 0.419 0.117 0.188 ...
+#>  $ S5          : num [1:8, 1:8] 1.037 -0.416 0.419 0.117 0.188 ...
+#>  $ S6          : num [1:8, 1:8] 1.037 -0.416 0.419 0.117 0.188 ...
+#>  $ S7          : num [1:8, 1:8] 1.037 -0.416 0.419 0.117 0.188 ...
+#>  $ S8          : num [1:8, 1:8] 1.037 -0.416 0.419 0.117 0.188 ...
+#>  $ p_coefs     : Named num 0.806
+#>   ..- attr(*, "names")= chr "(Intercept)"
+#>  $ p_taus      : num 301
+#>  $ ytimes      : int [1:416, 1:8] 1 9 17 25 33 41 49 57 65 73 ...
+#>  $ n_series    : int 8
+#>  $ sp          : Named num [1:9] 4.68 59.57 1.11 2.73 6.5 ...
+#>   ..- attr(*, "names")= chr [1:9] "s(epiWeek):seriesBLAN_005" "s(epiWeek):seriesBLAN_012" "s(epiWeek):seriesSCBI_002" "s(epiWeek):seriesSCBI_013" ...
+#>  $ y_observed  : num [1:416, 1:8] 0 0 0 0 0 0 0 0 0 0 ...
+#>  $ total_obs   : int 3328
+#>  $ num_basis   : int 73
+#>  $ n_sp        : num 9
+#>  $ n_nonmissing: int 400
+#>  $ obs_ind     : int [1:400] 89 93 98 101 115 118 121 124 127 130 ...
+#>  $ flat_ys     : num [1:400] 2 0 0 0 0 0 0 25 36 14 ...
+#>  $ flat_xs     : num [1:400, 1:73] 1 1 1 1 1 1 1 1 1 1 ...
+#>   ..- attr(*, "dimnames")=List of 2
+#>   .. ..$ : chr [1:400] "705" "737" "777" "801" ...
+#>   .. ..$ : chr [1:73] "X.Intercept." "V2" "V3" "V4" ...
+#>  - attr(*, "trend_model")= chr "AR1"
+
code(testmod)
+#> // Stan model code generated by package mvgam
+#> data {
+#>   int<lower=0> total_obs; // total number of observations
+#>   int<lower=0> n; // number of timepoints per series
+#>   int<lower=0> n_sp; // number of smoothing parameters
+#>   int<lower=0> n_series; // number of series
+#>   int<lower=0> num_basis; // total number of basis coefficients
+#>   vector[num_basis] zero; // prior locations for basis coefficients
+#>   matrix[total_obs, num_basis] X; // mgcv GAM design matrix
+#>   array[n, n_series] int<lower=0> ytimes; // time-ordered matrix (which col in X belongs to each [time, series] observation?)
+#>   matrix[8, 8] S1; // mgcv smooth penalty matrix S1
+#>   matrix[8, 8] S2; // mgcv smooth penalty matrix S2
+#>   matrix[8, 8] S3; // mgcv smooth penalty matrix S3
+#>   matrix[8, 8] S4; // mgcv smooth penalty matrix S4
+#>   matrix[8, 8] S5; // mgcv smooth penalty matrix S5
+#>   matrix[8, 8] S6; // mgcv smooth penalty matrix S6
+#>   matrix[8, 8] S7; // mgcv smooth penalty matrix S7
+#>   matrix[8, 8] S8; // mgcv smooth penalty matrix S8
+#>   int<lower=0> n_nonmissing; // number of nonmissing observations
+#>   array[n_nonmissing] int<lower=0> flat_ys; // flattened nonmissing observations
+#>   matrix[n_nonmissing, num_basis] flat_xs; // X values for nonmissing observations
+#>   array[n_nonmissing] int<lower=0> obs_ind; // indices of nonmissing observations
+#> }
+#> parameters {
+#>   // raw basis coefficients
+#>   vector[num_basis] b_raw;
+#>   
+#>   // random effect variances
+#>   vector<lower=0>[1] sigma_raw;
+#>   
+#>   // random effect means
+#>   vector[1] mu_raw;
+#>   
+#>   // latent trend AR1 terms
+#>   vector<lower=-1.5, upper=1.5>[n_series] ar1;
+#>   
+#>   // latent trend variance parameters
+#>   vector<lower=0>[n_series] sigma;
+#>   
+#>   // latent trends
+#>   matrix[n, n_series] trend;
+#>   
+#>   // smoothing parameters
+#>   vector<lower=0>[n_sp] lambda;
+#> }
+#> transformed parameters {
+#>   // basis coefficients
+#>   vector[num_basis] b;
+#>   b[1 : 65] = b_raw[1 : 65];
+#>   b[66 : 73] = mu_raw[1] + b_raw[66 : 73] * sigma_raw[1];
+#> }
+#> model {
+#>   // prior for random effect population variances
+#>   sigma_raw ~ student_t(3, 0, 2.5);
+#>   
+#>   // prior for random effect population means
+#>   mu_raw ~ std_normal();
+#>   
+#>   // prior for (Intercept)...
+#>   b_raw[1] ~ student_t(3, -2.3, 2.5);
+#>   
+#>   // prior for s(epiWeek):seriesBLAN_005...
+#>   b_raw[2 : 9] ~ multi_normal_prec(zero[2 : 9], S1[1 : 8, 1 : 8] * lambda[1]);
+#>   
+#>   // prior for s(epiWeek):seriesBLAN_012...
+#>   b_raw[10 : 17] ~ multi_normal_prec(zero[10 : 17],
+#>                                      S2[1 : 8, 1 : 8] * lambda[2]);
+#>   
+#>   // prior for s(epiWeek):seriesSCBI_002...
+#>   b_raw[18 : 25] ~ multi_normal_prec(zero[18 : 25],
+#>                                      S3[1 : 8, 1 : 8] * lambda[3]);
+#>   
+#>   // prior for s(epiWeek):seriesSCBI_013...
+#>   b_raw[26 : 33] ~ multi_normal_prec(zero[26 : 33],
+#>                                      S4[1 : 8, 1 : 8] * lambda[4]);
+#>   
+#>   // prior for s(epiWeek):seriesSERC_001...
+#>   b_raw[34 : 41] ~ multi_normal_prec(zero[34 : 41],
+#>                                      S5[1 : 8, 1 : 8] * lambda[5]);
+#>   
+#>   // prior for s(epiWeek):seriesSERC_005...
+#>   b_raw[42 : 49] ~ multi_normal_prec(zero[42 : 49],
+#>                                      S6[1 : 8, 1 : 8] * lambda[6]);
+#>   
+#>   // prior for s(epiWeek):seriesSERC_006...
+#>   b_raw[50 : 57] ~ multi_normal_prec(zero[50 : 57],
+#>                                      S7[1 : 8, 1 : 8] * lambda[7]);
+#>   
+#>   // prior for s(epiWeek):seriesSERC_012...
+#>   b_raw[58 : 65] ~ multi_normal_prec(zero[58 : 65],
+#>                                      S8[1 : 8, 1 : 8] * lambda[8]);
+#>   
+#>   // prior (non-centred) for s(series)...
+#>   b_raw[66 : 73] ~ std_normal();
+#>   
+#>   // priors for AR parameters
+#>   ar1 ~ std_normal();
+#>   
+#>   // priors for smoothing parameters
+#>   lambda ~ normal(5, 30);
+#>   
+#>   // priors for latent trend variance parameters
+#>   sigma ~ student_t(3, 0, 2.5);
+#>   
+#>   // trend estimates
+#>   trend[1, 1 : n_series] ~ normal(0, sigma);
+#>   for (s in 1 : n_series) {
+#>     trend[2 : n, s] ~ normal(ar1[s] * trend[1 : (n - 1), s], sigma[s]);
+#>   }
+#>   {
+#>     // likelihood functions
+#>     vector[n_nonmissing] flat_trends;
+#>     flat_trends = to_vector(trend)[obs_ind];
+#>     flat_ys ~ poisson_log_glm(append_col(flat_xs, flat_trends), 0.0,
+#>                               append_row(b, 1.0));
+#>   }
+#> }
+#> generated quantities {
+#>   vector[total_obs] eta;
+#>   matrix[n, n_series] mus;
+#>   vector[n_sp] rho;
+#>   vector[n_series] tau;
+#>   array[n, n_series] int ypred;
+#>   rho = log(lambda);
+#>   for (s in 1 : n_series) {
+#>     tau[s] = pow(sigma[s], -2.0);
+#>   }
+#>   
+#>   // posterior predictions
+#>   eta = X * b;
+#>   for (s in 1 : n_series) {
+#>     mus[1 : n, s] = eta[ytimes[1 : n, s]] + trend[1 : n, s];
+#>     ypred[1 : n, s] = poisson_log_rng(mus[1 : n, s]);
+#>   }
+#> }
+
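
Since the object bundles both the Stan program and the data, sampling could in principle be initiated by hand. A hedged sketch with cmdstanr (assuming code(testmod) returns the generated program as text that write_stan_file() can accept; CmdStan ignores any extra entries in the data list):

+
library(cmdstanr)
+# write the package-generated program to a temporary .stan file
+stan_file <- write_stan_file(code(testmod))
+mod <- cmdstan_model(stan_file)
+# the prepared data objects are passed straight through
+fit <- mod$sample(data = testmod$model_data,
+                  chains = 4, parallel_chains = 4)
+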
+
+

Interested in contributing?

+

I’m actively seeking PhD students and other researchers to work in the areas of ecological forecasting, multivariate model evaluation and development of mvgam. Please reach out if you are interested (n.clark’at’uq.edu.au).

+
+ + + + + + + + + + + diff --git a/doc/forecast_evaluation.R b/doc/forecast_evaluation.R new file mode 100644 index 00000000..28942b03 --- /dev/null +++ b/doc/forecast_evaluation.R @@ -0,0 +1,221 @@ +## ----echo = FALSE------------------------------------------------------------- +NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true") +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + purl = NOT_CRAN, + eval = NOT_CRAN +) + +## ----setup, include=FALSE----------------------------------------------------- +knitr::opts_chunk$set( + echo = TRUE, + dpi = 150, + fig.asp = 0.8, + fig.width = 6, + out.width = "60%", + fig.align = "center") +library(mvgam) +library(ggplot2) +theme_set(theme_bw(base_size = 12, base_family = 'serif')) + +## ----------------------------------------------------------------------------- +set.seed(2345) +simdat <- sim_mvgam(T = 100, + n_series = 3, + trend_model = 'GP', + prop_trend = 0.75, + family = poisson(), + prop_missing = 0.10) + +## ----------------------------------------------------------------------------- +str(simdat) + +## ----fig.alt = "Simulating data for dynamic GAM models in mvgam"-------------- +plot(simdat$global_seasonality[1:12], + type = 'l', lwd = 2, + ylab = 'Relative effect', + xlab = 'Season', + bty = 'l') + +## ----fig.alt = "Plotting time series features for GAM models in mvgam"-------- +plot_mvgam_series(data = simdat$data_train, + series = 'all') + +## ----fig.alt = "Plotting time series features for GAM models in mvgam"-------- +plot_mvgam_series(data = simdat$data_train, + newdata = simdat$data_test, + series = 1) +plot_mvgam_series(data = simdat$data_train, + newdata = simdat$data_test, + series = 2) +plot_mvgam_series(data = simdat$data_train, + newdata = simdat$data_test, + series = 3) + +## ----include=FALSE------------------------------------------------------------ +mod1 <- mvgam(y ~ s(season, bs = 'cc', k = 8) + + s(time, by = series, bs = 'cr', k = 20), + knots = list(season = c(0.5, 12.5)), + trend_model = 'None', + data = simdat$data_train) + +## ----eval=FALSE--------------------------------------------------------------- +# mod1 <- mvgam(y ~ s(season, bs = 'cc', k = 8) + +# s(time, by = series, bs = 'cr', k = 20), +# knots = list(season = c(0.5, 12.5)), +# trend_model = 'None', +# data = simdat$data_train) + +## ----------------------------------------------------------------------------- +summary(mod1, include_betas = FALSE) + +## ----fig.alt = "Plotting GAM smooth functions using mvgam"-------------------- +plot(mod1, type = 'smooths') + +## ----include=FALSE------------------------------------------------------------ +mod2 <- mvgam(y ~ s(season, bs = 'cc', k = 8) + + gp(time, by = series, c = 5/4, k = 20, + scale = FALSE), + knots = list(season = c(0.5, 12.5)), + trend_model = 'None', + data = simdat$data_train, + adapt_delta = 0.98) + +## ----eval=FALSE--------------------------------------------------------------- +# mod2 <- mvgam(y ~ s(season, bs = 'cc', k = 8) + +# gp(time, by = series, c = 5/4, k = 20, +# scale = FALSE), +# knots = list(season = c(0.5, 12.5)), +# trend_model = 'None', +# data = simdat$data_train) + +## ----------------------------------------------------------------------------- +summary(mod2, include_betas = FALSE) + +## ----fig.alt = "Summarising latent Gaussian Process parameters in mvgam"------ +mcmc_plot(mod2, variable = c('alpha_gp'), regex = TRUE, type = 'areas') + +## ----fig.alt = "Summarising latent Gaussian Process parameters in mvgam"------ +mcmc_plot(mod2, variable = 
c('rho_gp'), regex = TRUE, type = 'areas') + +## ----fig.alt = "Plotting Gaussian Process effects in mvgam"------------------- +plot(mod2, type = 'smooths') + +## ----fig.alt = "Summarising latent Gaussian Process parameters in mvgam and marginaleffects"---- +require('ggplot2') +plot_predictions(mod2, + condition = c('time', 'series', 'series'), + type = 'link') + + theme(legend.position = 'none') + +## ----------------------------------------------------------------------------- +fc_mod1 <- forecast(mod1, newdata = simdat$data_test) +fc_mod2 <- forecast(mod2, newdata = simdat$data_test) + +## ----------------------------------------------------------------------------- +str(fc_mod1) + +## ----------------------------------------------------------------------------- +plot(fc_mod1, series = 1) +plot(fc_mod2, series = 1) + +plot(fc_mod1, series = 2) +plot(fc_mod2, series = 2) + +plot(fc_mod1, series = 3) +plot(fc_mod2, series = 3) + +## ----include=FALSE------------------------------------------------------------ +mod2 <- mvgam(y ~ s(season, bs = 'cc', k = 8) + + gp(time, by = series, c = 5/4, k = 20, + scale = FALSE), + knots = list(season = c(0.5, 12.5)), + trend_model = 'None', + data = simdat$data_train, + newdata = simdat$data_test, + adapt_delta = 0.98) + +## ----eval=FALSE--------------------------------------------------------------- +# mod2 <- mvgam(y ~ s(season, bs = 'cc', k = 8) + +# gp(time, by = series, c = 5/4, k = 20, +# scale = FALSE), +# knots = list(season = c(0.5, 12.5)), +# trend_model = 'None', +# data = simdat$data_train, +# newdata = simdat$data_test) + +## ----------------------------------------------------------------------------- +fc_mod2 <- forecast(mod2) + +## ----warning=FALSE, fig.alt = "Plotting posterior forecast distributions using mvgam and R"---- +plot(fc_mod2, series = 1) + +## ----warning=FALSE------------------------------------------------------------ +crps_mod1 <- score(fc_mod1, score = 'crps') +str(crps_mod1) +crps_mod1$series_1 + +## ----warning=FALSE------------------------------------------------------------ +crps_mod1 <- score(fc_mod1, score = 'crps', interval_width = 0.6) +crps_mod1$series_1 + +## ----------------------------------------------------------------------------- +link_mod1 <- forecast(mod1, newdata = simdat$data_test, type = 'link') +score(link_mod1, score = 'elpd')$series_1 + +## ----------------------------------------------------------------------------- +energy_mod2 <- score(fc_mod2, score = 'energy') +str(energy_mod2) + +## ----------------------------------------------------------------------------- +energy_mod2$all_series + +## ----------------------------------------------------------------------------- +crps_mod1 <- score(fc_mod1, score = 'crps') +crps_mod2 <- score(fc_mod2, score = 'crps') + +diff_scores <- crps_mod2$series_1$score - + crps_mod1$series_1$score +plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred', + ylim = c(-1*max(abs(diff_scores), na.rm = TRUE), + max(abs(diff_scores), na.rm = TRUE)), + bty = 'l', + xlab = 'Forecast horizon', + ylab = expression(CRPS[GP]~-~CRPS[spline])) +abline(h = 0, lty = 'dashed', lwd = 2) +gp_better <- length(which(diff_scores < 0)) +title(main = paste0('GP better in ', gp_better, ' of 25 evaluations', + '\nMean difference = ', + round(mean(diff_scores, na.rm = TRUE), 2))) + + +diff_scores <- crps_mod2$series_2$score - + crps_mod1$series_2$score +plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred', + ylim = c(-1*max(abs(diff_scores), na.rm = TRUE), + max(abs(diff_scores), 
na.rm = TRUE)), + bty = 'l', + xlab = 'Forecast horizon', + ylab = expression(CRPS[GP]~-~CRPS[spline])) +abline(h = 0, lty = 'dashed', lwd = 2) +gp_better <- length(which(diff_scores < 0)) +title(main = paste0('GP better in ', gp_better, ' of 25 evaluations', + '\nMean difference = ', + round(mean(diff_scores, na.rm = TRUE), 2))) + +diff_scores <- crps_mod2$series_3$score - + crps_mod1$series_3$score +plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred', + ylim = c(-1*max(abs(diff_scores), na.rm = TRUE), + max(abs(diff_scores), na.rm = TRUE)), + bty = 'l', + xlab = 'Forecast horizon', + ylab = expression(CRPS[GP]~-~CRPS[spline])) +abline(h = 0, lty = 'dashed', lwd = 2) +gp_better <- length(which(diff_scores < 0)) +title(main = paste0('GP better in ', gp_better, ' of 25 evaluations', + '\nMean difference = ', + round(mean(diff_scores, na.rm = TRUE), 2))) + + diff --git a/doc/forecast_evaluation.Rmd b/doc/forecast_evaluation.Rmd new file mode 100644 index 00000000..36ea6227 --- /dev/null +++ b/doc/forecast_evaluation.Rmd @@ -0,0 +1,314 @@ +--- +title: "Forecasting and forecast evaluation in mvgam" +author: "Nicholas J Clark" +date: "`r Sys.Date()`" +output: + rmarkdown::html_vignette: + toc: yes +vignette: > + %\VignetteIndexEntry{Forecasting and forecast evaluation in mvgam} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- +```{r, echo = FALSE} +NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true") +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + purl = NOT_CRAN, + eval = NOT_CRAN +) +``` + +```{r setup, include=FALSE} +knitr::opts_chunk$set( + echo = TRUE, + dpi = 150, + fig.asp = 0.8, + fig.width = 6, + out.width = "60%", + fig.align = "center") +library(mvgam) +library(ggplot2) +theme_set(theme_bw(base_size = 12, base_family = 'serif')) +``` + +The purpose of this vignette is to show how the `mvgam` package can be used to produce probabilistic forecasts and to evaluate those forecasts using a variety of proper scoring rules. + +## Simulating discrete time series +We begin by simulating some data to show how forecasts are computed and evaluated in `mvgam`. The `sim_mvgam()` function can be used to simulate series that come from a variety of response distributions as well as seasonal patterns and/or dynamic temporal patterns. Here we simulate a collection of three time count-valued series. These series all share the same seasonal pattern but have different temporal dynamics. By setting `trend_model = 'GP'` and `prop_trend = 0.75`, we are generating time series that have smooth underlying temporal trends (evolving as Gaussian Processes with squared exponential kernel) and moderate seasonal patterns. The observations are Poisson-distributed and we allow 10% of observations to be missing. 
+```{r} +set.seed(2345) +simdat <- sim_mvgam(T = 100, + n_series = 3, + trend_model = 'GP', + prop_trend = 0.75, + family = poisson(), + prop_missing = 0.10) +``` + +The returned object is a `list` containing training and testing data (`sim_mvgam()` automatically splits the data into these folds for us) together with some other information about the data generating process that was used to simulate the data +```{r} +str(simdat) +``` + +Each series in this case has a shared seasonal pattern, which we can visualise: +```{r, fig.alt = "Simulating data for dynamic GAM models in mvgam"} +plot(simdat$global_seasonality[1:12], + type = 'l', lwd = 2, + ylab = 'Relative effect', + xlab = 'Season', + bty = 'l') +``` + +The resulting time series are similar to what we might encounter when dealing with count-valued data that can take small counts: +```{r, fig.alt = "Plotting time series features for GAM models in mvgam"} +plot_mvgam_series(data = simdat$data_train, + series = 'all') +``` + +For each individual series, we can plot the training and testing data, as well as some more specific features of the observed data: +```{r, fig.alt = "Plotting time series features for GAM models in mvgam"} +plot_mvgam_series(data = simdat$data_train, + newdata = simdat$data_test, + series = 1) +plot_mvgam_series(data = simdat$data_train, + newdata = simdat$data_test, + series = 2) +plot_mvgam_series(data = simdat$data_train, + newdata = simdat$data_test, + series = 3) +``` + +### Modelling dynamics with splines +The first model we will fit uses a shared cyclic spline to capture the repeated seasonality, as well as series-specific splines of time to capture the long-term dynamics. We allow the temporal splines to be fairly complex so they can capture as much of the temporal variation as possible: +```{r include=FALSE} +mod1 <- mvgam(y ~ s(season, bs = 'cc', k = 8) + + s(time, by = series, bs = 'cr', k = 20), + knots = list(season = c(0.5, 12.5)), + trend_model = 'None', + data = simdat$data_train) +``` + +```{r eval=FALSE} +mod1 <- mvgam(y ~ s(season, bs = 'cc', k = 8) + + s(time, by = series, bs = 'cr', k = 20), + knots = list(season = c(0.5, 12.5)), + trend_model = 'None', + data = simdat$data_train) +``` + +The model fits without issue: +```{r} +summary(mod1, include_betas = FALSE) +``` + +And we can plot the partial effects of the splines to see that they are estimated to be highly nonlinear +```{r, fig.alt = "Plotting GAM smooth functions using mvgam"} +plot(mod1, type = 'smooths') +``` + +### Modelling dynamics with GPs +Before showing how to produce and evaluate forecasts, we will fit a second model to these data so the two models can be compared. This model is equivalent to the above, except we now use Gaussian Processes to model series-specific dynamics. This makes use of the `gp()` function from `brms`, which can fit Hilbert space approximate GPs. See `?brms::gp` for more details. 
+```{r include=FALSE} +mod2 <- mvgam(y ~ s(season, bs = 'cc', k = 8) + + gp(time, by = series, c = 5/4, k = 20, + scale = FALSE), + knots = list(season = c(0.5, 12.5)), + trend_model = 'None', + data = simdat$data_train, + adapt_delta = 0.98) +``` + +```{r eval=FALSE} +mod2 <- mvgam(y ~ s(season, bs = 'cc', k = 8) + + gp(time, by = series, c = 5/4, k = 20, + scale = FALSE), + knots = list(season = c(0.5, 12.5)), + trend_model = 'None', + data = simdat$data_train) +``` + +The summary for this model now contains information on the GP parameters for each time series: +```{r} +summary(mod2, include_betas = FALSE) +``` + +We can plot the posteriors for these parameters, and for any other parameter for that matter, using `bayesplot` routines. First the marginal deviation ($\alpha$) parameters: +```{r, fig.alt = "Summarising latent Gaussian Process parameters in mvgam"} +mcmc_plot(mod2, variable = c('alpha_gp'), regex = TRUE, type = 'areas') +``` + +And now the length scale ($\rho$) parameters: +```{r, fig.alt = "Summarising latent Gaussian Process parameters in mvgam"} +mcmc_plot(mod2, variable = c('rho_gp'), regex = TRUE, type = 'areas') +``` + +We can also plot the nonlinear effects as before: +```{r, fig.alt = "Plotting Gaussian Process effects in mvgam"} +plot(mod2, type = 'smooths') +``` +These can also be plotted using `marginaleffects` utilities: +```{r, fig.alt = "Summarising latent Gaussian Process parameters in mvgam and marginaleffects"} +require('ggplot2') +plot_predictions(mod2, + condition = c('time', 'series', 'series'), + type = 'link') + + theme(legend.position = 'none') +``` + +The estimates for the temporal trends are fairly similar for the two models, but below we will see if they produce similar forecasts + +## Forecasting with the `forecast()` function +Probabilistic forecasts can be computed in two main ways in `mvgam`. The first is to take a model that was fit only to training data (as we did above in the two example models) and produce temporal predictions from the posterior predictive distribution by feeding `newdata` to the `forecast()` function. It is crucial that any `newdata` fed to the `forecast()` function follows on sequentially from the data that was used to fit the model (this is not internally checked by the package because it might be a headache to do so when data are not supplied in a specific time-order). When calling the `forecast()` function, you have the option to generate different kinds of predictions (i.e. predicting on the link scale, response scale or to produce expectations; see `?forecast.mvgam` for details). We will use the default and produce forecasts on the response scale, which is the most common way to evaluate forecast distributions +```{r} +fc_mod1 <- forecast(mod1, newdata = simdat$data_test) +fc_mod2 <- forecast(mod2, newdata = simdat$data_test) +``` + +The objects we have created are of class `mvgam_forecast`, which contain information on hindcast distributions, forecast distributions and true observations for each series in the data: +```{r} +str(fc_mod1) +``` + +We can plot the forecasts for each series from each model using the `S3 plot` method for objects of this class: +```{r} +plot(fc_mod1, series = 1) +plot(fc_mod2, series = 1) + +plot(fc_mod1, series = 2) +plot(fc_mod2, series = 2) + +plot(fc_mod1, series = 3) +plot(fc_mod2, series = 3) +``` + +Clearly the two models do not produce equivalent forecasts. We will come back to scoring these forecasts in a moment. 
+ +## Forecasting with `newdata` in `mvgam()` +The second way we can produce forecasts in `mvgam` is to feed the testing data directly to the `mvgam()` function as `newdata`. This will include the testing data as missing observations so that they are automatically predicted from the posterior predictive distribution using the `generated quantities` block in `Stan`. As an example, we can refit `mod2` but include the testing data for automatic forecasts: +```{r include=FALSE} +mod2 <- mvgam(y ~ s(season, bs = 'cc', k = 8) + + gp(time, by = series, c = 5/4, k = 20, + scale = FALSE), + knots = list(season = c(0.5, 12.5)), + trend_model = 'None', + data = simdat$data_train, + newdata = simdat$data_test, + adapt_delta = 0.98) +``` + +```{r eval=FALSE} +mod2 <- mvgam(y ~ s(season, bs = 'cc', k = 8) + + gp(time, by = series, c = 5/4, k = 20, + scale = FALSE), + knots = list(season = c(0.5, 12.5)), + trend_model = 'None', + data = simdat$data_train, + newdata = simdat$data_test) +``` + +Because the model already contains a forecast distribution, we do not need to feed `newdata` to the `forecast()` function: +```{r} +fc_mod2 <- forecast(mod2) +``` + +The forecasts will be nearly identical to those calculated previously: +```{r warning=FALSE, fig.alt = "Plotting posterior forecast distributions using mvgam and R"} +plot(fc_mod2, series = 1) +``` + +## Scoring forecast distributions +A primary purpose of the `mvgam_forecast` class is to readily allow forecast evaluations for each series in the data, using a variety of possible scoring functions. See `?mvgam::score.mvgam_forecast` to view the types of scores that are available. A useful scoring metric is the [Continuous Rank Probability Score (CRPS)](https://www.annualreviews.org/content/journals/10.1146/annurev-statistics-062713-085831){target="_blank"}. A CRPS value is similar to what we might get if we calculated a weighted absolute error using the full forecast distribution. +```{r warning=FALSE} +crps_mod1 <- score(fc_mod1, score = 'crps') +str(crps_mod1) +crps_mod1$series_1 +``` + +The returned list contains a `data.frame` for each series in the data that shows the CRPS score for each evaluation in the testing data, along with some other useful information about the fit of the forecast distribution. In particular, we are given a logical value (1s and 0s) telling us whether the true value was within a pre-specified credible interval (i.e. the coverage of the forecast distribution). The default interval width is 0.9, so we would hope that the values in the `in_interval` column take a 1 approximately 90% of the time. This value can be changed if you wish to compute different coverages, say using a 60% interval: +```{r warning=FALSE} +crps_mod1 <- score(fc_mod1, score = 'crps', interval_width = 0.6) +crps_mod1$series_1 +``` + +We can also compare forecasts against out of sample observations using the [Expected Log Predictive Density (ELPD; also known as the log score)](https://link.springer.com/article/10.1007/s11222-016-9696-4){target="_blank"}. The ELPD is a strictly proper scoring rule that can be applied to any distributional forecast, but to compute it we need predictions on the link scale rather than on the outcome scale. 
This is where it is advantageous to change the type of prediction we can get using the `forecast()` function: +```{r} +link_mod1 <- forecast(mod1, newdata = simdat$data_test, type = 'link') +score(link_mod1, score = 'elpd')$series_1 +``` + +Finally, when we have multiple time series it may also make sense to use a multivariate proper scoring rule. `mvgam` offers two such options: the Energy score and the Variogram score. The first penalizes forecast distributions that are less well calibrated against the truth, while the second penalizes forecasts that do not capture the observed true correlation structure. Which score to use depends on your goals, but both are very easy to compute: +```{r} +energy_mod2 <- score(fc_mod2, score = 'energy') +str(energy_mod2) +``` + +The returned object still provides information on interval coverage for each individual series, but there is only a single score per horizon now (which is provided in the `all_series` slot): +```{r} +energy_mod2$all_series +``` + +You can use your score(s) of choice to compare different models. For example, we can compute and plot the difference in CRPS scores for each series in data. Here, a negative value means the Gaussian Process model (`mod2`) is better, while a positive value means the spline model (`mod1`) is better. +```{r} +crps_mod1 <- score(fc_mod1, score = 'crps') +crps_mod2 <- score(fc_mod2, score = 'crps') + +diff_scores <- crps_mod2$series_1$score - + crps_mod1$series_1$score +plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred', + ylim = c(-1*max(abs(diff_scores), na.rm = TRUE), + max(abs(diff_scores), na.rm = TRUE)), + bty = 'l', + xlab = 'Forecast horizon', + ylab = expression(CRPS[GP]~-~CRPS[spline])) +abline(h = 0, lty = 'dashed', lwd = 2) +gp_better <- length(which(diff_scores < 0)) +title(main = paste0('GP better in ', gp_better, ' of 25 evaluations', + '\nMean difference = ', + round(mean(diff_scores, na.rm = TRUE), 2))) + + +diff_scores <- crps_mod2$series_2$score - + crps_mod1$series_2$score +plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred', + ylim = c(-1*max(abs(diff_scores), na.rm = TRUE), + max(abs(diff_scores), na.rm = TRUE)), + bty = 'l', + xlab = 'Forecast horizon', + ylab = expression(CRPS[GP]~-~CRPS[spline])) +abline(h = 0, lty = 'dashed', lwd = 2) +gp_better <- length(which(diff_scores < 0)) +title(main = paste0('GP better in ', gp_better, ' of 25 evaluations', + '\nMean difference = ', + round(mean(diff_scores, na.rm = TRUE), 2))) + +diff_scores <- crps_mod2$series_3$score - + crps_mod1$series_3$score +plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred', + ylim = c(-1*max(abs(diff_scores), na.rm = TRUE), + max(abs(diff_scores), na.rm = TRUE)), + bty = 'l', + xlab = 'Forecast horizon', + ylab = expression(CRPS[GP]~-~CRPS[spline])) +abline(h = 0, lty = 'dashed', lwd = 2) +gp_better <- length(which(diff_scores < 0)) +title(main = paste0('GP better in ', gp_better, ' of 25 evaluations', + '\nMean difference = ', + round(mean(diff_scores, na.rm = TRUE), 2))) + +``` + +The GP model consistently gives better forecasts, and the difference between scores grows quickly as the forecast horizon increases. This is not unexpected given the way that splines linearly extrapolate outside the range of training data + +## Further reading +The following papers and resources offer useful material about Bayesian forecasting and proper scoring rules: + +Hyndman, Rob J., and George Athanasopoulos. [Forecasting: principles and practice](https://otexts.com/fpp3/distaccuracy.html). *OTexts*, 2018. 
+ +Gneiting, Tilmann, and Adrian E. Raftery. [Strictly proper scoring rules, prediction, and estimation](https://www.tandfonline.com/doi/abs/10.1198/016214506000001437) *Journal of the American statistical Association* 102.477 (2007) 359-378. + +Simonis, Juniper L., Ethan P. White, and SK Morgan Ernest. [Evaluating probabilistic ecological forecasts](https://esajournals.onlinelibrary.wiley.com/doi/full/10.1002/ecy.3431) *Ecology* 102.8 (2021) e03431. + +## Interested in contributing? +I'm actively seeking PhD students and other researchers to work in the areas of ecological forecasting, multivariate model evaluation and development of `mvgam`. Please reach out if you are interested (n.clark'at'uq.edu.au) diff --git a/doc/forecast_evaluation.html b/doc/forecast_evaluation.html new file mode 100644 index 00000000..570bab86 --- /dev/null +++ b/doc/forecast_evaluation.html @@ -0,0 +1,1020 @@ + + + + + + + + + + + + + + + + +Forecasting and forecast evaluation in mvgam + + + + + + + + + + + + + + + + + + + + + + + + + + +

Forecasting and forecast evaluation in mvgam

+

Nicholas J Clark

+

2024-04-18

+ + +
+ +
+ +

The purpose of this vignette is to show how the mvgam package can be used to produce probabilistic forecasts and to evaluate those forecasts using a variety of proper scoring rules.

+
+

Simulating discrete time series

+

We begin by simulating some data to show how forecasts are computed and evaluated in mvgam. The sim_mvgam() function can be used to simulate series that come from a variety of response distributions as well as seasonal patterns and/or dynamic temporal patterns. Here we simulate a collection of three count-valued time series. These series all share the same seasonal pattern but have different temporal dynamics. By setting trend_model = 'GP' and prop_trend = 0.75, we are generating time series that have smooth underlying temporal trends (evolving as Gaussian Processes with squared exponential kernels) and moderate seasonal patterns. The observations are Poisson-distributed and we allow 10% of observations to be missing.

+
set.seed(2345)
+simdat <- sim_mvgam(T = 100, 
+                    n_series = 3, 
+                    trend_model = 'GP',
+                    prop_trend = 0.75,
+                    family = poisson(),
+                    prop_missing = 0.10)
+

The returned object is a list containing training and testing data (sim_mvgam() automatically splits the data into these folds for us), together with some other information about the data generating process that was used to simulate the data:

+
str(simdat)
+#> List of 6
+#>  $ data_train        :'data.frame':  225 obs. of  5 variables:
+#>   ..$ y     : int [1:225] 0 1 3 0 0 0 1 0 3 1 ...
+#>   ..$ season: int [1:225] 1 1 1 2 2 2 3 3 3 4 ...
+#>   ..$ year  : int [1:225] 1 1 1 1 1 1 1 1 1 1 ...
+#>   ..$ series: Factor w/ 3 levels "series_1","series_2",..: 1 2 3 1 2 3 1 2 3 1 ...
+#>   ..$ time  : int [1:225] 1 1 1 2 2 2 3 3 3 4 ...
+#>  $ data_test         :'data.frame':  75 obs. of  5 variables:
+#>   ..$ y     : int [1:75] 0 1 1 0 0 0 2 2 0 NA ...
+#>   ..$ season: int [1:75] 4 4 4 5 5 5 6 6 6 7 ...
+#>   ..$ year  : int [1:75] 7 7 7 7 7 7 7 7 7 7 ...
+#>   ..$ series: Factor w/ 3 levels "series_1","series_2",..: 1 2 3 1 2 3 1 2 3 1 ...
+#>   ..$ time  : int [1:75] 76 76 76 77 77 77 78 78 78 79 ...
+#>  $ true_corrs        : num [1:3, 1:3] 1 0.465 -0.577 0.465 1 ...
+#>  $ true_trends       : num [1:100, 1:3] -1.45 -1.54 -1.61 -1.67 -1.73 ...
+#>  $ global_seasonality: num [1:100] 0.0559 0.6249 1.3746 1.6805 0.5246 ...
+#>  $ trend_params      :List of 2
+#>   ..$ alpha: num [1:3] 0.767 0.988 0.897
+#>   ..$ rho  : num [1:3] 6.02 6.94 5.04
+
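
A quick base R check confirms that the testing fold follows on sequentially from the training fold:

+
# The testing fold should begin at the timepoint immediately after the
+# last training observation (76 following 75 in this simulation)
+max(simdat$data_train$time)
+min(simdat$data_test$time)
+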

Each series in this case has a shared seasonal pattern, which we can visualise:

+
plot(simdat$global_seasonality[1:12], 
+     type = 'l', lwd = 2,
+     ylab = 'Relative effect',
+     xlab = 'Season',
+     bty = 'l')
+

Simulating data for dynamic GAM models in mvgam

+

The resulting time series are similar to what we might encounter when dealing with count-valued data that can take small counts:

+
plot_mvgam_series(data = simdat$data_train, 
+                  series = 'all')
+

Plotting time series features for GAM models in mvgam

+

For each individual series, we can plot the training and testing data, as well as some more specific features of the observed data:

+
plot_mvgam_series(data = simdat$data_train, 
+                  newdata = simdat$data_test,
+                  series = 1)
+

Plotting time series features for GAM models in mvgam

+
plot_mvgam_series(data = simdat$data_train, 
+                  newdata = simdat$data_test,
+                  series = 2)
+

Plotting time series features for GAM models in mvgam

+
plot_mvgam_series(data = simdat$data_train, 
+                  newdata = simdat$data_test,
+                  series = 3)
+

Plotting time series features for GAM models in mvgam

+
+

Modelling dynamics with splines

+

The first model we will fit uses a shared cyclic spline to capture the repeated seasonality, as well as series-specific splines of time to capture the long-term dynamics. We allow the temporal splines to be fairly complex so they can capture as much of the temporal variation as possible:

+
mod1 <- mvgam(y ~ s(season, bs = 'cc', k = 8) + 
+                s(time, by = series, bs = 'cr', k = 20),
+              knots = list(season = c(0.5, 12.5)),
+              trend_model = 'None',
+              data = simdat$data_train)
+

The model fits without issue:

+
summary(mod1, include_betas = FALSE)
+#> GAM formula:
+#> y ~ s(season, bs = "cc", k = 8) + s(time, by = series, bs = "cr", 
+#>     k = 20)
+#> <environment: 0x0000029cbf2b3570>
+#> 
+#> Family:
+#> poisson
+#> 
+#> Link function:
+#> log
+#> 
+#> Trend model:
+#> None
+#> 
+#> N series:
+#> 3 
+#> 
+#> N timepoints:
+#> 75 
+#> 
+#> Status:
+#> Fitted using Stan 
+#> 4 chains, each with iter = 1000; warmup = 500; thin = 1 
+#> Total post-warmup draws = 2000
+#> 
+#> 
+#> GAM coefficient (beta) estimates:
+#>              2.5%   50%  97.5% Rhat n_eff
+#> (Intercept) -0.41 -0.21 -0.039    1   813
+#> 
+#> Approximate significance of GAM smooths:
+#>                         edf Ref.df Chi.sq p-value    
+#> s(season)              3.77      6   21.8   0.004 ** 
+#> s(time):seriesseries_1 6.50     19   15.3   0.848    
+#> s(time):seriesseries_2 9.49     19  226.0  <2e-16 ***
+#> s(time):seriesseries_3 5.93     19   18.3   0.867    
+#> ---
+#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+#> 
+#> Stan MCMC diagnostics:
+#> n_eff / iter looks reasonable for all parameters
+#> Rhat looks reasonable for all parameters
+#> 0 of 2000 iterations ended with a divergence (0%)
+#> 0 of 2000 iterations saturated the maximum tree depth of 12 (0%)
+#> E-FMI indicated no pathological behavior
+#> 
+#> Samples were drawn using NUTS(diag_e) at Thu Apr 18 8:31:33 PM 2024.
+#> For each parameter, n_eff is a crude measure of effective sample size,
+#> and Rhat is the potential scale reduction factor on split MCMC chains
+#> (at convergence, Rhat = 1)
+

And we can plot the partial effects of the splines to see that they are estimated to be highly nonlinear:

+
plot(mod1, type = 'smooths')
+

Plotting GAM smooth functions using mvgam

+
+
+

Modelling dynamics with GPs

+

Before showing how to produce and evaluate forecasts, we will fit a second model to these data so the two models can be compared. This model is equivalent to the above, except we now use Gaussian Processes to model series-specific dynamics. This makes use of the gp() function from brms, which can fit Hilbert space approximate GPs. See ?brms::gp for more details.

+
mod2 <- mvgam(y ~ s(season, bs = 'cc', k = 8) + 
+                gp(time, by = series, c = 5/4, k = 20,
+                   scale = FALSE),
+              knots = list(season = c(0.5, 12.5)),
+              trend_model = 'None',
+              data = simdat$data_train)
+

The summary for this model now contains information on the GP parameters for each time series:

+
summary(mod2, include_betas = FALSE)
+#> GAM formula:
+#> y ~ s(season, bs = "cc", k = 8) + gp(time, by = series, c = 5/4, 
+#>     k = 20, scale = FALSE)
+#> <environment: 0x0000029cbf2b3570>
+#> 
+#> Family:
+#> poisson
+#> 
+#> Link function:
+#> log
+#> 
+#> Trend model:
+#> None
+#> 
+#> N series:
+#> 3 
+#> 
+#> N timepoints:
+#> 75 
+#> 
+#> Status:
+#> Fitted using Stan 
+#> 4 chains, each with iter = 1000; warmup = 500; thin = 1 
+#> Total post-warmup draws = 2000
+#> 
+#> 
+#> GAM coefficient (beta) estimates:
+#>             2.5%   50% 97.5% Rhat n_eff
+#> (Intercept) -1.1 -0.52  0.31    1   768
+#> 
+#> GAM gp term marginal deviation (alpha) and length scale (rho) estimates:
+#>                               2.5%  50% 97.5% Rhat n_eff
+#> alpha_gp(time):seriesseries_1 0.21  0.8   2.1 1.01   763
+#> alpha_gp(time):seriesseries_2 0.74  1.4   2.9 1.00  1028
+#> alpha_gp(time):seriesseries_3 0.50  1.1   2.8 1.00  1026
+#> rho_gp(time):seriesseries_1   1.20  5.1  23.0 1.00   681
+#> rho_gp(time):seriesseries_2   2.20 10.0  17.0 1.00   644
+#> rho_gp(time):seriesseries_3   1.50  8.8  23.0 1.00   819
+#> 
+#> Approximate significance of GAM smooths:
+#>            edf Ref.df Chi.sq p-value    
+#> s(season) 4.12      6   25.9 0.00053 ***
+#> ---
+#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+#> 
+#> Stan MCMC diagnostics:
+#> n_eff / iter looks reasonable for all parameters
+#> Rhat looks reasonable for all parameters
+#> 4 of 2000 iterations ended with a divergence (0.2%)
+#>  *Try running with larger adapt_delta to remove the divergences
+#> 0 of 2000 iterations saturated the maximum tree depth of 12 (0%)
+#> E-FMI indicated no pathological behavior
+#> 
+#> Samples were drawn using NUTS(diag_e) at Thu Apr 18 8:33:03 PM 2024.
+#> For each parameter, n_eff is a crude measure of effective sample size,
+#> and Rhat is the potential scale reduction factor on split MCMC chains
+#> (at convergence, Rhat = 1)
+

We can plot the posteriors for these parameters, and for any other parameter for that matter, using bayesplot routines. First the marginal deviation (α) parameters:

+
mcmc_plot(mod2, variable = c('alpha_gp'), regex = TRUE, type = 'areas')
+

Summarising latent Gaussian Process parameters in mvgam

+

And now the length scale (ρ) parameters:

+
mcmc_plot(mod2, variable = c('rho_gp'), regex = TRUE, type = 'areas')
+

Summarising latent Gaussian Process parameters in mvgam

+

We can also plot the nonlinear effects as before:

+
plot(mod2, type = 'smooths')
+

Plotting Gaussian Process effects in mvgam

These can also be plotted using marginaleffects utilities:

+
require('ggplot2')
+plot_predictions(mod2, 
+                 condition = c('time', 'series', 'series'),
+                 type = 'link') +
+  theme(legend.position = 'none')
+

Summarising latent Gaussian Process parameters in mvgam and marginaleffects

+

The estimates for the temporal trends are fairly similar for the two models, but below we will see if they produce similar forecasts.

+
+
+
+

Forecasting with the forecast() function

+

Probabilistic forecasts can be computed in two main ways in mvgam. The first is to take a model that was fit only to training data (as we did above in the two example models) and produce temporal predictions from the posterior predictive distribution by feeding newdata to the forecast() function. It is crucial that any newdata fed to the forecast() function follows on sequentially from the data that were used to fit the model (this is not checked internally by the package because it might be a headache to do so when data are not supplied in a specific time order). When calling the forecast() function, you have the option to generate different kinds of predictions (i.e. predicting on the link scale, the response scale, or producing expectations; see ?forecast.mvgam for details). We will use the default and produce forecasts on the response scale, which is the most common way to evaluate forecast distributions:

+
fc_mod1 <- forecast(mod1, newdata = simdat$data_test)
+fc_mod2 <- forecast(mod2, newdata = simdat$data_test)
+
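
The alternative prediction types mentioned above are requested through the same call. A hedged sketch ('link' is used later in this vignette; 'expected' is assumed to match the options documented in ?forecast.mvgam):

+
# predictions on the log link scale
+fc_link <- forecast(mod1, newdata = simdat$data_test, type = 'link')
+# expectations (means of the response distribution); this type name is
+# assumed to follow the options in ?forecast.mvgam
+fc_expected <- forecast(mod1, newdata = simdat$data_test, type = 'expected')
+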

The objects we have created are of class mvgam_forecast, which contain information on hindcast distributions, forecast distributions and true observations for each series in the data:

+
str(fc_mod1)
+#> List of 16
+#>  $ call              :Class 'formula'  language y ~ s(season, bs = "cc", k = 8) + s(time, by = series, bs = "cr", k = 20)
+#>   .. ..- attr(*, ".Environment")=<environment: 0x0000029cbf2b3570> 
+#>  $ trend_call        : NULL
+#>  $ family            : chr "poisson"
+#>  $ family_pars       : NULL
+#>  $ trend_model       : chr "None"
+#>  $ drift             : logi FALSE
+#>  $ use_lv            : logi FALSE
+#>  $ fit_engine        : chr "stan"
+#>  $ type              : chr "response"
+#>  $ series_names      : Factor w/ 3 levels "series_1","series_2",..: 1 2 3
+#>  $ train_observations:List of 3
+#>   ..$ series_1: int [1:75] 0 0 1 1 0 0 0 0 0 0 ...
+#>   ..$ series_2: int [1:75] 1 0 0 1 1 0 1 0 1 2 ...
+#>   ..$ series_3: int [1:75] 3 0 3 NA 2 1 1 1 1 3 ...
+#>  $ train_times       : int [1:75] 1 2 3 4 5 6 7 8 9 10 ...
+#>  $ test_observations :List of 3
+#>   ..$ series_1: int [1:25] 0 0 2 NA 0 2 2 1 1 1 ...
+#>   ..$ series_2: int [1:25] 1 0 2 1 1 3 0 1 0 NA ...
+#>   ..$ series_3: int [1:25] 1 0 0 1 0 0 1 0 1 0 ...
+#>  $ test_times        : int [1:25] 76 77 78 79 80 81 82 83 84 85 ...
+#>  $ hindcasts         :List of 3
+#>   ..$ series_1: num [1:2000, 1:75] 1 1 0 0 0 1 1 1 0 0 ...
+#>   .. ..- attr(*, "dimnames")=List of 2
+#>   .. .. ..$ : NULL
+#>   .. .. ..$ : chr [1:75] "ypred[1,1]" "ypred[2,1]" "ypred[3,1]" "ypred[4,1]" ...
+#>   ..$ series_2: num [1:2000, 1:75] 0 0 0 0 0 0 0 1 0 0 ...
+#>   .. ..- attr(*, "dimnames")=List of 2
+#>   .. .. ..$ : NULL
+#>   .. .. ..$ : chr [1:75] "ypred[1,2]" "ypred[2,2]" "ypred[3,2]" "ypred[4,2]" ...
+#>   ..$ series_3: num [1:2000, 1:75] 3 0 2 1 0 1 2 1 5 1 ...
+#>   .. ..- attr(*, "dimnames")=List of 2
+#>   .. .. ..$ : NULL
+#>   .. .. ..$ : chr [1:75] "ypred[1,3]" "ypred[2,3]" "ypred[3,3]" "ypred[4,3]" ...
+#>  $ forecasts         :List of 3
+#>   ..$ series_1: num [1:2000, 1:25] 1 3 2 1 0 0 1 1 0 0 ...
+#>   ..$ series_2: num [1:2000, 1:25] 6 0 0 0 0 2 0 0 0 0 ...
+#>   ..$ series_3: num [1:2000, 1:25] 0 1 1 3 3 1 3 2 4 2 ...
+#>  - attr(*, "class")= chr "mvgam_forecast"
+
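
Because hindcasts and forecasts are stored as matrices of posterior draws (rows) by timepoints (columns), posterior summaries can also be computed by hand with base R:

+
# posterior quantiles of the series_1 forecast at each horizon,
+# computed across the 2000 retained draws
+fc_draws <- fc_mod1$forecasts$series_1
+apply(fc_draws, 2, quantile, probs = c(0.1, 0.5, 0.9))
+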

We can plot the forecasts for each series from each model using the S3 plot method for objects of this class:

+
plot(fc_mod1, series = 1)
+

+
#> Out of sample CRPS:
+#> [1] 14.62964
+plot(fc_mod2, series = 1)
+

+
#> Out of sample DRPS:
+#> [1] 10.92516
+
+plot(fc_mod1, series = 2)
+

+
#> Out of sample CRPS:
+#> [1] 84201962708
+plot(fc_mod2, series = 2)
+

+
#> Out of sample DRPS:
+#> [1] 14.31152
+
+plot(fc_mod1, series = 3)
+

+
#> Out of sample CRPS:
+#> [1] 32.44136
+plot(fc_mod2, series = 3)
+

+
#> Out of sample DRPS:
+#> [1] 15.44332
+

Clearly the two models do not produce equivalent forecasts. We will come back to scoring these forecasts in a moment.

+
+
+

Forecasting with newdata in mvgam()

+

The second way we can produce forecasts in mvgam is to feed the testing data directly to the mvgam() function as newdata. This will include the testing data as missing observations so that they are automatically predicted from the posterior predictive distribution using the generated quantities block in Stan. As an example, we can refit mod2 but include the testing data for automatic forecasts:

+
mod2 <- mvgam(y ~ s(season, bs = 'cc', k = 8) + 
+                gp(time, by = series, c = 5/4, k = 20,
+                   scale = FALSE),
+              knots = list(season = c(0.5, 12.5)),
+              trend_model = 'None',
+              data = simdat$data_train,
+              newdata = simdat$data_test)
+

Because the model already contains a forecast distribution, we do not need to feed newdata to the forecast() function:

+
fc_mod2 <- forecast(mod2)
+

The forecasts will be nearly identical to those calculated previously:

+
plot(fc_mod2, series = 1)
+

Plotting posterior forecast distributions using mvgam and R

+
#> Out of sample DRPS:
+#> [1] 10.85762
+
+
+

Scoring forecast distributions

+

A primary purpose of the mvgam_forecast class is to readily allow forecast evaluations for each series in the data, using a variety of possible scoring functions. See ?mvgam::score.mvgam_forecast to view the types of scores that are available. A useful scoring metric is the Continuous Ranked Probability Score (CRPS). A CRPS value is similar to what we might get if we calculated a weighted absolute error using the full forecast distribution.

+
crps_mod1 <- score(fc_mod1, score = 'crps')
+str(crps_mod1)
+#> List of 4
+#>  $ series_1  :'data.frame':  25 obs. of  5 variables:
+#>   ..$ score         : num [1:25] 0.1938 0.1366 1.355 NA 0.0348 ...
+#>   ..$ in_interval   : num [1:25] 1 1 1 NA 1 1 1 1 1 1 ...
+#>   ..$ interval_width: num [1:25] 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 ...
+#>   ..$ eval_horizon  : int [1:25] 1 2 3 4 5 6 7 8 9 10 ...
+#>   ..$ score_type    : chr [1:25] "crps" "crps" "crps" "crps" ...
+#>  $ series_2  :'data.frame':  25 obs. of  5 variables:
+#>   ..$ score         : num [1:25] 0.379 0.306 0.941 0.5 0.573 ...
+#>   ..$ in_interval   : num [1:25] 1 1 1 1 1 1 1 1 1 NA ...
+#>   ..$ interval_width: num [1:25] 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 ...
+#>   ..$ eval_horizon  : int [1:25] 1 2 3 4 5 6 7 8 9 10 ...
+#>   ..$ score_type    : chr [1:25] "crps" "crps" "crps" "crps" ...
+#>  $ series_3  :'data.frame':  25 obs. of  5 variables:
+#>   ..$ score         : num [1:25] 0.32 0.556 0.379 0.362 0.219 ...
+#>   ..$ in_interval   : num [1:25] 1 1 1 1 1 1 1 1 1 1 ...
+#>   ..$ interval_width: num [1:25] 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 ...
+#>   ..$ eval_horizon  : int [1:25] 1 2 3 4 5 6 7 8 9 10 ...
+#>   ..$ score_type    : chr [1:25] "crps" "crps" "crps" "crps" ...
+#>  $ all_series:'data.frame':  25 obs. of  3 variables:
+#>   ..$ score       : num [1:25] 0.892 0.999 2.675 NA 0.827 ...
+#>   ..$ eval_horizon: int [1:25] 1 2 3 4 5 6 7 8 9 10 ...
+#>   ..$ score_type  : chr [1:25] "sum_crps" "sum_crps" "sum_crps" "sum_crps" ...
+crps_mod1$series_1
+#>         score in_interval interval_width eval_horizon score_type
+#> 1  0.19375525           1            0.9            1       crps
+#> 2  0.13663925           1            0.9            2       crps
+#> 3  1.35502175           1            0.9            3       crps
+#> 4          NA          NA            0.9            4       crps
+#> 5  0.03482775           1            0.9            5       crps
+#> 6  1.55416700           1            0.9            6       crps
+#> 7  1.51028900           1            0.9            7       crps
+#> 8  0.62121225           1            0.9            8       crps
+#> 9  0.62630125           1            0.9            9       crps
+#> 10 0.59853100           1            0.9           10       crps
+#> 11 1.30998625           1            0.9           11       crps
+#> 12 2.04829775           1            0.9           12       crps
+#> 13 0.61251800           1            0.9           13       crps
+#> 14 0.14052300           1            0.9           14       crps
+#> 15 0.65110800           1            0.9           15       crps
+#> 16 0.07973125           1            0.9           16       crps
+#> 17 0.07675600           1            0.9           17       crps
+#> 18 0.09382375           1            0.9           18       crps
+#> 19 0.12356725           1            0.9           19       crps
+#> 20         NA          NA            0.9           20       crps
+#> 21 0.20173600           1            0.9           21       crps
+#> 22 0.84066825           1            0.9           22       crps
+#> 23         NA          NA            0.9           23       crps
+#> 24 1.06489225           1            0.9           24       crps
+#> 25 0.75528825           1            0.9           25       crps
+

The returned list contains a data.frame for each series in the data that shows the CRPS score for each evaluation in the testing data, along with some other useful information about the fit of the forecast distribution. In particular, we are given a logical value (1s and 0s) telling us whether the true value was within a pre-specified credible interval (i.e. the coverage of the forecast distribution). The default interval width is 0.9, so we would hope that the values in the in_interval column take a 1 approximately 90% of the time. This value can be changed if you wish to compute different coverages, say using a 60% interval:

+
crps_mod1 <- score(fc_mod1, score = 'crps', interval_width = 0.6)
+crps_mod1$series_1
+#>         score in_interval interval_width eval_horizon score_type
+#> 1  0.19375525           1            0.6            1       crps
+#> 2  0.13663925           1            0.6            2       crps
+#> 3  1.35502175           0            0.6            3       crps
+#> 4          NA          NA            0.6            4       crps
+#> 5  0.03482775           1            0.6            5       crps
+#> 6  1.55416700           0            0.6            6       crps
+#> 7  1.51028900           0            0.6            7       crps
+#> 8  0.62121225           1            0.6            8       crps
+#> 9  0.62630125           1            0.6            9       crps
+#> 10 0.59853100           1            0.6           10       crps
+#> 11 1.30998625           0            0.6           11       crps
+#> 12 2.04829775           0            0.6           12       crps
+#> 13 0.61251800           1            0.6           13       crps
+#> 14 0.14052300           1            0.6           14       crps
+#> 15 0.65110800           1            0.6           15       crps
+#> 16 0.07973125           1            0.6           16       crps
+#> 17 0.07675600           1            0.6           17       crps
+#> 18 0.09382375           1            0.6           18       crps
+#> 19 0.12356725           1            0.6           19       crps
+#> 20         NA          NA            0.6           20       crps
+#> 21 0.20173600           1            0.6           21       crps
+#> 22 0.84066825           1            0.6           22       crps
+#> 23         NA          NA            0.6           23       crps
+#> 24 1.06489225           1            0.6           24       crps
+#> 25 0.75528825           1            0.6           25       crps
+
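
Empirical coverage can then be summarised directly from the returned data.frame; a quick base R sketch (NA entries arise from missing observations and are dropped):

+
# proportion of scored timepoints falling inside the 60% interval
+mean(crps_mod1$series_1$in_interval, na.rm = TRUE)
+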

We can also compare forecasts against out of sample observations using the Expected Log Predictive Density (ELPD; also known as the log score). The ELPD is a strictly proper scoring rule that can be applied to any distributional forecast, but to compute it we need predictions on the link scale rather than on the outcome scale. This is where it is advantageous to change the type of prediction we can get using the forecast() function:

+
link_mod1 <- forecast(mod1, newdata = simdat$data_test, type = 'link')
+score(link_mod1, score = 'elpd')$series_1
+#>         score eval_horizon score_type
+#> 1  -0.5304414            1       elpd
+#> 2  -0.4298955            2       elpd
+#> 3  -2.9617583            3       elpd
+#> 4          NA            4       elpd
+#> 5  -0.2007644            5       elpd
+#> 6  -3.3781408            6       elpd
+#> 7  -3.2729088            7       elpd
+#> 8  -2.0363750            8       elpd
+#> 9  -2.0670612            9       elpd
+#> 10 -2.0844818           10       elpd
+#> 11 -3.0576463           11       elpd
+#> 12 -3.6291058           12       elpd
+#> 13 -2.1692669           13       elpd
+#> 14 -0.2960899           14       elpd
+#> 15 -2.3738851           15       elpd
+#> 16 -0.2160804           16       elpd
+#> 17 -0.2036782           17       elpd
+#> 18 -0.2115539           18       elpd
+#> 19 -0.2235072           19       elpd
+#> 20         NA           20       elpd
+#> 21 -0.2413680           21       elpd
+#> 22 -2.6791984           22       elpd
+#> 23         NA           23       elpd
+#> 24 -2.6851981           24       elpd
+#> 25 -0.2836901           25       elpd
+
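
A single number per series can be handy for quick model comparisons; for example, summing the non-missing horizon scores (a small base R sketch; higher, i.e. less negative, ELPD values indicate better forecasts):

+
# total out-of-sample ELPD for series_1 under the spline model
+sum(score(link_mod1, score = 'elpd')$series_1$score, na.rm = TRUE)
+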

Finally, when we have multiple time series it may also make sense to use a multivariate proper scoring rule. mvgam offers two such options: the Energy score and the Variogram score. The first penalizes forecast distributions that are less well calibrated against the truth, while the second penalizes forecasts that do not capture the observed true correlation structure. Which score to use depends on your goals, but both are very easy to compute:

+
energy_mod2 <- score(fc_mod2, score = 'energy')
+str(energy_mod2)
+#> List of 4
+#>  $ series_1  :'data.frame':  25 obs. of  3 variables:
+#>   ..$ in_interval   : num [1:25] 1 1 1 NA 1 1 1 1 1 1 ...
+#>   ..$ interval_width: num [1:25] 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 ...
+#>   ..$ eval_horizon  : int [1:25] 1 2 3 4 5 6 7 8 9 10 ...
+#>  $ series_2  :'data.frame':  25 obs. of  3 variables:
+#>   ..$ in_interval   : num [1:25] 1 1 1 1 1 1 1 1 1 NA ...
+#>   ..$ interval_width: num [1:25] 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 ...
+#>   ..$ eval_horizon  : int [1:25] 1 2 3 4 5 6 7 8 9 10 ...
+#>  $ series_3  :'data.frame':  25 obs. of  3 variables:
+#>   ..$ in_interval   : num [1:25] 1 1 1 1 1 1 1 1 1 1 ...
+#>   ..$ interval_width: num [1:25] 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 ...
+#>   ..$ eval_horizon  : int [1:25] 1 2 3 4 5 6 7 8 9 10 ...
+#>  $ all_series:'data.frame':  25 obs. of  3 variables:
+#>   ..$ score       : num [1:25] 0.773 1.147 1.226 NA 0.458 ...
+#>   ..$ eval_horizon: int [1:25] 1 2 3 4 5 6 7 8 9 10 ...
+#>   ..$ score_type  : chr [1:25] "energy" "energy" "energy" "energy" ...
+

The returned object still provides information on interval coverage for each individual series, but there is now only a single score per horizon (which is provided in the all_series slot):

+
energy_mod2$all_series
+#>        score eval_horizon score_type
+#> 1  0.7728517            1     energy
+#> 2  1.1469836            2     energy
+#> 3  1.2258781            3     energy
+#> 4         NA            4     energy
+#> 5  0.4577536            5     energy
+#> 6  1.8094487            6     energy
+#> 7  1.4887317            7     energy
+#> 8  0.7651593            8     energy
+#> 9  1.1180634            9     energy
+#> 10        NA           10     energy
+#> 11 1.5008324           11     energy
+#> 12 3.2142460           12     energy
+#> 13 1.6129732           13     energy
+#> 14 1.2704438           14     energy
+#> 15 1.1335958           15     energy
+#> 16 1.8717420           16     energy
+#> 17        NA           17     energy
+#> 18 0.7953392           18     energy
+#> 19 0.9919119           19     energy
+#> 20        NA           20     energy
+#> 21 1.2461964           21     energy
+#> 22 1.5170615           22     energy
+#> 23        NA           23     energy
+#> 24 2.3824552           24     energy
+#> 25 1.5314557           25     energy
+
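
The Variogram score mentioned above should be available through the same interface; a hedged sketch, assuming 'variogram' is the matching score name listed in ?mvgam::score.mvgam_forecast:

+
variogram_mod2 <- score(fc_mod2, score = 'variogram')
+variogram_mod2$all_series
+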

You can use your score(s) of choice to compare different models. For example, we can compute and plot the difference in CRPS scores for each series in the data. Here, a negative value means the Gaussian Process model (mod2) is better, while a positive value means the spline model (mod1) is better.

+
crps_mod1 <- score(fc_mod1, score = 'crps')
+crps_mod2 <- score(fc_mod2, score = 'crps')
+
+diff_scores <- crps_mod2$series_1$score -
+  crps_mod1$series_1$score
+plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred', 
+     ylim = c(-1*max(abs(diff_scores), na.rm = TRUE),
+              max(abs(diff_scores), na.rm = TRUE)),
+     bty = 'l',
+     xlab = 'Forecast horizon',
+     ylab = expression(CRPS[GP]~-~CRPS[spline]))
+abline(h = 0, lty = 'dashed', lwd = 2)
+gp_better <- length(which(diff_scores < 0))
+title(main = paste0('GP better in ', gp_better, ' of 25 evaluations',
+                    '\nMean difference = ', 
+                    round(mean(diff_scores, na.rm = TRUE), 2)))
+

+

+
+diff_scores <- crps_mod2$series_2$score -
+  crps_mod1$series_2$score
+plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred', 
+     ylim = c(-1*max(abs(diff_scores), na.rm = TRUE),
+              max(abs(diff_scores), na.rm = TRUE)),
+     bty = 'l',
+     xlab = 'Forecast horizon',
+     ylab = expression(CRPS[GP]~-~CRPS[spline]))
+abline(h = 0, lty = 'dashed', lwd = 2)
+gp_better <- length(which(diff_scores < 0))
+title(main = paste0('GP better in ', gp_better, ' of 25 evaluations',
+                    '\nMean difference = ', 
+                    round(mean(diff_scores, na.rm = TRUE), 2)))
+

+

+diff_scores <- crps_mod2$series_3$score -
+  crps_mod1$series_3$score
+plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred', 
+     ylim = c(-1*max(abs(diff_scores), na.rm = TRUE),
+              max(abs(diff_scores), na.rm = TRUE)),
+     bty = 'l',
+     xlab = 'Forecast horizon',
+     ylab = expression(CRPS[GP]~-~CRPS[spline]))
+abline(h = 0, lty = 'dashed', lwd = 2)
+gp_better <- length(which(diff_scores < 0))
+title(main = paste0('GP better in ', gp_better, ' of 25 evaluations',
+                    '\nMean difference = ', 
+                    round(mean(diff_scores, na.rm = TRUE), 2)))
+

+

The GP model consistently gives better forecasts, and the difference between scores grows quickly as the forecast horizon increases. This is not unexpected given the way that splines linearly extrapolate outside the range of the training data.
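Since the same plotting code is repeated verbatim for each series above, it can be wrapped in a small helper function. A minimal sketch (the plot_crps_diff name is hypothetical; it assumes the crps_mod1 and crps_mod2 objects created above):

```r
# Hypothetical helper wrapping the repeated per-series plotting code above
plot_crps_diff <- function(series_name) {
  diff_scores <- crps_mod2[[series_name]]$score -
    crps_mod1[[series_name]]$score
  plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred',
       ylim = c(-1, 1) * max(abs(diff_scores), na.rm = TRUE),
       bty = 'l',
       xlab = 'Forecast horizon',
       ylab = expression(CRPS[GP]~-~CRPS[spline]))
  abline(h = 0, lty = 'dashed', lwd = 2)
  gp_better <- sum(diff_scores < 0, na.rm = TRUE)
  title(main = paste0('GP better in ', gp_better, ' of 25 evaluations',
                      '\nMean difference = ',
                      round(mean(diff_scores, na.rm = TRUE), 2)))
}

# e.g. reproduce the first panel above with:
# plot_crps_diff('series_1')
```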

+
+
+

Further reading

+

The following papers and resources offer useful material about Bayesian forecasting and proper scoring rules:

+

Hyndman, Rob J., and George Athanasopoulos. Forecasting: Principles and Practice. OTexts, 2018.

+

Gneiting, Tilmann, and Adrian E. Raftery. Strictly proper scoring rules, prediction, and estimation. Journal of the American Statistical Association 102.477 (2007): 359-378.

+

Simonis, Juniper L., Ethan P. White, and S. K. Morgan Ernest. Evaluating probabilistic ecological forecasts. Ecology 102.8 (2021): e03431.

+
+
+

Interested in contributing?

+

I’m actively seeking PhD students and other researchers to work in the areas of ecological forecasting, multivariate model evaluation and development of mvgam. Please reach out if you are interested (n.clark’at’uq.edu.au).

+
+ + + + + + + + + + + diff --git a/doc/mvgam_overview.R b/doc/mvgam_overview.R new file mode 100644 index 00000000..c8b40d2d --- /dev/null +++ b/doc/mvgam_overview.R @@ -0,0 +1,311 @@ +## ----echo = FALSE------------------------------------------------------------- +NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true") +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + purl = NOT_CRAN, + eval = NOT_CRAN +) + +## ----setup, include=FALSE----------------------------------------------------- +knitr::opts_chunk$set( + echo = TRUE, + dpi = 150, + fig.asp = 0.8, + fig.width = 6, + out.width = "60%", + fig.align = "center") +library(mvgam) +library(ggplot2) +theme_set(theme_bw(base_size = 12, base_family = 'serif')) + +## ----Access time series data-------------------------------------------------- +data("portal_data") + +## ----Inspect data format and structure---------------------------------------- +head(portal_data) + +## ----------------------------------------------------------------------------- +dplyr::glimpse(portal_data) + +## ----------------------------------------------------------------------------- +data <- sim_mvgam(n_series = 4, T = 24) +head(data$data_train, 12) + +## ----Wrangle data for modelling----------------------------------------------- +portal_data %>% + + # mvgam requires a 'time' variable be present in the data to index + # the temporal observations. This is especially important when tracking + # multiple time series. In the Portal data, the 'moon' variable indexes the + # lunar monthly timestep of the trapping sessions + dplyr::mutate(time = moon - (min(moon)) + 1) %>% + + # We can also provide a more informative name for the outcome variable, which + # is counts of the 'PP' species (Chaetodipus penicillatus) across all control + # plots + dplyr::mutate(count = PP) %>% + + # The other requirement for mvgam is a 'series' variable, which needs to be a + # factor variable to index which time series each row in the data belongs to. 
+ # Again, this is more useful when you have multiple time series in the data + dplyr::mutate(series = as.factor('PP')) %>% + + # Select the variables of interest to keep in the model_data + dplyr::select(series, year, time, count, mintemp, ndvi) -> model_data + +## ----------------------------------------------------------------------------- +head(model_data) + +## ----------------------------------------------------------------------------- +dplyr::glimpse(model_data) + +## ----Summarise variables------------------------------------------------------ +summary(model_data) + +## ----------------------------------------------------------------------------- +image(is.na(t(model_data %>% + dplyr::arrange(dplyr::desc(time)))), axes = F, + col = c('grey80', 'darkred')) +axis(3, at = seq(0,1, len = NCOL(model_data)), labels = colnames(model_data)) + +## ----------------------------------------------------------------------------- +plot_mvgam_series(data = model_data, series = 1, y = 'count') + +## ----------------------------------------------------------------------------- +model_data %>% + + # Create a 'year_fac' factor version of 'year' + dplyr::mutate(year_fac = factor(year)) -> model_data + +## ----------------------------------------------------------------------------- +dplyr::glimpse(model_data) +levels(model_data$year_fac) + +## ----model1, include=FALSE, results='hide'------------------------------------ +model1 <- mvgam(count ~ s(year_fac, bs = 're') - 1, + family = poisson(), + data = model_data, + parallel = FALSE) + +## ----eval=FALSE--------------------------------------------------------------- +# model1 <- mvgam(count ~ s(year_fac, bs = 're') - 1, +# family = poisson(), +# data = model_data) + +## ----------------------------------------------------------------------------- +get_mvgam_priors(count ~ s(year_fac, bs = 're') - 1, + family = poisson(), + data = model_data) + +## ----------------------------------------------------------------------------- +summary(model1) + +## ----Extract coefficient posteriors------------------------------------------- +beta_post <- as.data.frame(model1, variable = 'betas') +dplyr::glimpse(beta_post) + +## ----------------------------------------------------------------------------- +code(model1) + +## ----Plot random effect estimates--------------------------------------------- +plot(model1, type = 're') + +## ----------------------------------------------------------------------------- +mcmc_plot(object = model1, + variable = 'betas', + type = 'areas') + +## ----------------------------------------------------------------------------- +pp_check(object = model1) +pp_check(model1, type = "rootogram") + +## ----Plot posterior hindcasts------------------------------------------------- +plot(model1, type = 'forecast') + +## ----Extract posterior hindcast----------------------------------------------- +hc <- hindcast(model1) +str(hc) + +## ----Extract hindcasts on the linear predictor scale-------------------------- +hc <- hindcast(model1, type = 'link') +range(hc$hindcasts$PP) + +## ----Plot hindcasts on the linear predictor scale----------------------------- +plot(hc) + +## ----Plot posterior residuals------------------------------------------------- +plot(model1, type = 'residuals') + +## ----------------------------------------------------------------------------- +model_data %>% + dplyr::filter(time <= 160) -> data_train +model_data %>% + dplyr::filter(time > 160) -> data_test + +## ----include=FALSE, message=FALSE, 
warning=FALSE------------------------------ +model1b <- mvgam(count ~ s(year_fac, bs = 're') - 1, + family = poisson(), + data = data_train, + newdata = data_test, + parallel = FALSE) + +## ----eval=FALSE--------------------------------------------------------------- +# model1b <- mvgam(count ~ s(year_fac, bs = 're') - 1, +# family = poisson(), +# data = data_train, +# newdata = data_test) + +## ----------------------------------------------------------------------------- +plot(model1b, type = 're') + +## ----------------------------------------------------------------------------- +plot(model1b, type = 'forecast') + +## ----Plotting predictions against test data----------------------------------- +plot(model1b, type = 'forecast', newdata = data_test) + +## ----Extract posterior forecasts---------------------------------------------- +fc <- forecast(model1b) +str(fc) + +## ----model2, include=FALSE, message=FALSE, warning=FALSE---------------------- +model2 <- mvgam(count ~ s(year_fac, bs = 're') + + ndvi - 1, + family = poisson(), + data = data_train, + newdata = data_test, + parallel = FALSE) + +## ----eval=FALSE--------------------------------------------------------------- +# model2 <- mvgam(count ~ s(year_fac, bs = 're') + +# ndvi - 1, +# family = poisson(), +# data = data_train, +# newdata = data_test) + +## ----class.output="scroll-300"------------------------------------------------ +summary(model2) + +## ----Posterior quantiles of model coefficients-------------------------------- +coef(model2) + +## ----Plot NDVI effect--------------------------------------------------------- +plot(model2, type = 'pterms') + +## ----------------------------------------------------------------------------- +beta_post <- as.data.frame(model2, variable = 'betas') +dplyr::glimpse(beta_post) + +## ----Histogram of NDVI effects------------------------------------------------ +hist(beta_post$ndvi, + xlim = c(-1 * max(abs(beta_post$ndvi)), + max(abs(beta_post$ndvi))), + col = 'darkred', + border = 'white', + xlab = expression(beta[NDVI]), + ylab = '', + yaxt = 'n', + main = '', + lwd = 2) +abline(v = 0, lwd = 2.5) + +## ----warning=FALSE------------------------------------------------------------ +plot_predictions(model2, + condition = "ndvi", + # include the observed count values + # as points, and show rugs for the observed + # ndvi and count values on the axes + points = 0.5, rug = TRUE) + +## ----warning=FALSE------------------------------------------------------------ +plot(conditional_effects(model2), ask = FALSE) + +## ----model3, include=FALSE, message=FALSE, warning=FALSE---------------------- +model3 <- mvgam(count ~ s(time, bs = 'bs', k = 15) + + ndvi, + family = poisson(), + data = data_train, + newdata = data_test, + parallel = FALSE) + +## ----eval=FALSE--------------------------------------------------------------- +# model3 <- mvgam(count ~ s(time, bs = 'bs', k = 15) + +# ndvi, +# family = poisson(), +# data = data_train, +# newdata = data_test) + +## ----------------------------------------------------------------------------- +summary(model3) + +## ----------------------------------------------------------------------------- +plot(model3, type = 'smooths') + +## ----------------------------------------------------------------------------- +plot(model3, type = 'smooths', realisations = TRUE, + n_realisations = 30) + +## ----Plot smooth term derivatives, warning = FALSE, fig.asp = 1--------------- +plot(model3, type = 'smooths', derivatives = TRUE) + +## 
----warning=FALSE------------------------------------------------------------ +plot(conditional_effects(model3), ask = FALSE) + +## ----warning=FALSE------------------------------------------------------------ +plot(conditional_effects(model3, type = 'link'), ask = FALSE) + +## ----class.output="scroll-300"------------------------------------------------ +code(model3) + +## ----------------------------------------------------------------------------- +plot(model3, type = 'forecast', newdata = data_test) + +## ----Plot extrapolated temporal functions using newdata----------------------- +plot_mvgam_smooth(model3, smooth = 's(time)', + # feed newdata to the plot function to generate + # predictions of the temporal smooth to the end of the + # testing period + newdata = data.frame(time = 1:max(data_test$time), + ndvi = 0)) +abline(v = max(data_train$time), lty = 'dashed', lwd = 2) + +## ----model4, include=FALSE---------------------------------------------------- +model4 <- mvgam(count ~ s(ndvi, k = 6), + family = poisson(), + data = data_train, + newdata = data_test, + trend_model = 'AR1', + parallel = FALSE) + +## ----eval=FALSE--------------------------------------------------------------- +# model4 <- mvgam(count ~ s(ndvi, k = 6), +# family = poisson(), +# data = data_train, +# newdata = data_test, +# trend_model = 'AR1') + +## ----Summarise the mvgam autocorrelated error model, class.output="scroll-300"---- +summary(model4) + +## ----warning=FALSE, message=FALSE--------------------------------------------- +plot_predictions(model4, + condition = "ndvi", + points = 0.5, rug = TRUE) + +## ----------------------------------------------------------------------------- +plot(model4, type = 'forecast', newdata = data_test) + +## ----------------------------------------------------------------------------- +plot(model4, type = 'trend', newdata = data_test) + +## ----------------------------------------------------------------------------- +loo_compare(model3, model4) + +## ----------------------------------------------------------------------------- +fc_mod3 <- forecast(model3) +fc_mod4 <- forecast(model4) +score_mod3 <- score(fc_mod3, score = 'drps') +score_mod4 <- score(fc_mod4, score = 'drps') +sum(score_mod4$PP$score, na.rm = TRUE) - sum(score_mod3$PP$score, na.rm = TRUE) + diff --git a/doc/mvgam_overview.Rmd b/doc/mvgam_overview.Rmd new file mode 100644 index 00000000..508c63d3 --- /dev/null +++ b/doc/mvgam_overview.Rmd @@ -0,0 +1,630 @@ +--- +title: "Overview of the mvgam package" +author: "Nicholas J Clark" +date: "`r Sys.Date()`" +output: + rmarkdown::html_vignette: + toc: yes +vignette: > + %\VignetteIndexEntry{Overview of the mvgam package} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- +```{r, echo = FALSE} +NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true") +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + purl = NOT_CRAN, + eval = NOT_CRAN +) +``` + +```{r setup, include=FALSE} +knitr::opts_chunk$set( + echo = TRUE, + dpi = 150, + fig.asp = 0.8, + fig.width = 6, + out.width = "60%", + fig.align = "center") +library(mvgam) +library(ggplot2) +theme_set(theme_bw(base_size = 12, base_family = 'serif')) +``` + +The purpose of this vignette is to give a general overview of the `mvgam` package and its primary functions. + +## Dynamic GAMs +`mvgam` is designed to propagate unobserved temporal processes to capture latent dynamics in the observed time series. 
This works in a state-space format, with the temporal *trend* evolving independently of the observation process. An introduction to the package and some worked examples are also shown in this seminar: [Ecological Forecasting with Dynamic Generalized Additive Models](https://www.youtube.com/watch?v=0zZopLlomsQ){target="_blank"}. Briefly, assume $\tilde{\boldsymbol{y}}_{i,t}$ is the conditional expectation of response variable $\boldsymbol{i}$ at time $\boldsymbol{t}$. Assuming $\boldsymbol{y_i}$ is drawn from an exponential family distribution with an invertible link function, the linear predictor for a multivariate Dynamic GAM can be written as: + +$$for~i~in~1:N_{series}~...$$ +$$for~t~in~1:N_{timepoints}~...$$ + +$$g^{-1}(\tilde{\boldsymbol{y}}_{i,t})=\alpha_{i}+\sum\limits_{j=1}^J\boldsymbol{s}_{i,j,t}\boldsymbol{x}_{j,t}+\boldsymbol{z}_{i,t}\,,$$ +Here $\alpha$ are the unknown intercepts, the $\boldsymbol{s}$'s are unknown smooth functions of covariates ($\boldsymbol{x}$'s), which can potentially vary among the response series, and $\boldsymbol{z}$ are dynamic latent processes. Each smooth function $\boldsymbol{s_j}$ is composed of basis expansions whose coefficients, which must be estimated, control the functional relationship between $\boldsymbol{x}_{j}$ and $g^{-1}(\tilde{\boldsymbol{y}})$. The size of the basis expansion limits the smooth’s potential complexity. A larger set of basis functions allows greater flexibility. For more information on GAMs and how they can smooth through data, see [this blogpost on how to interpret nonlinear effects from Generalized Additive Models](https://ecogambler.netlify.app/blog/interpreting-gams/){target="_blank"}. + +Several advantages of GAMs are that they can model a diversity of response families, including discrete distributions (i.e. Poisson, Negative Binomial, Gamma) that accommodate common ecological features such as zero-inflation or overdispersion, and that they can be formulated to include hierarchical smoothing for multivariate responses. `mvgam` supports a number of different observation families, which are summarized below: + +## Supported observation families + +|Distribution | Function | Support | Extra parameter(s) | +|:----------------:|:---------------:| :------------------------------------------------:|:--------------------:| +|Gaussian (identity link) | `gaussian()` | Real values in $(-\infty, \infty)$ | $\sigma$ | +|Student's T (identity link) | `student_t()` | Heavy-tailed real values in $(-\infty, \infty)$ | $\sigma$, $\nu$ | +|LogNormal (identity link) | `lognormal()` | Positive real values in $[0, \infty)$ | $\sigma$ | +|Gamma (log link) | `Gamma()` | Positive real values in $[0, \infty)$ | $\alpha$ | +|Beta (logit link) | `betar()` | Real values (proportional) in $[0,1]$ | $\phi$ | +|Bernoulli (logit link) | `bernoulli()` | Binary data in ${0,1}$ | - | +|Poisson (log link) | `poisson()` | Non-negative integers in $(0,1,2,...)$ | - | +|Negative Binomial2 (log link)| `nb()` | Non-negative integers in $(0,1,2,...)$ | $\phi$ | +|Binomial (logit link) | `binomial()` | Non-negative integers in $(0,1,2,...)$ | - | +|Beta-Binomial (logit link) | `beta_binomial()` | Non-negative integers in $(0,1,2,...)$ | $\phi$ | +|Poisson Binomial N-mixture (log link)| `nmix()` | Non-negative integers in $(0,1,2,...)$ | - | + +For all supported observation families, any extra parameters that need to be estimated (i.e. the $\sigma$ in a Gaussian model or the $\phi$ in a Negative Binomial model) are by default estimated independently for each series.
However, users can opt to force all series to share extra observation parameters using `share_obs_params = TRUE` in `mvgam()`. Note that default link functions cannot currently be changed. + +## Supported temporal dynamic processes +The dynamic processes can take a wide variety of forms, some of which can be multivariate to allow the different time series to interact or be correlated. When using the `mvgam()` function, the user chooses between different process models with the `trend_model` argument. Available process models are described in detail below. + +### Independent Random Walks +Use `trend_model = 'RW'` or `trend_model = RW()` to set up a model where each series in `data` has independent latent temporal dynamics of the form: + + +\begin{align*} +z_{i,t} & \sim \text{Normal}(z_{i,t-1}, \sigma_i) \end{align*} + +Process error parameters $\sigma$ are modeled independently for each series. If a moving average process is required, use `trend_model = RW(ma = TRUE)` to set up the following: + +\begin{align*} +z_{i,t} & = z_{i,t-1} + \theta_i * error_{i,t-1} + error_{i,t} \\ +error_{i,t} & \sim \text{Normal}(0, \sigma_i) \end{align*} + +Moving average coefficients $\theta$ are independently estimated for each series and will be forced to be stationary by default $(abs(\theta)<1)$. Only moving averages of order $q=1$ are currently allowed. + +### Multivariate Random Walks +If more than one series is included in `data` $(N_{series} > 1)$, a multivariate Random Walk can be set up using `trend_model = RW(cor = TRUE)`, resulting in the following: + +\begin{align*} +z_{t} & \sim \text{MVNormal}(z_{t-1}, \Sigma) \end{align*} + +Where the latent process estimate $z_t$ now takes the form of a vector. The covariance matrix $\Sigma$ will capture contemporaneously correlated process errors. It is parameterised using a Cholesky factorization, which requires priors on the series-level variances $\sigma$ and on the strength of correlations using `Stan`'s `lkj_corr_cholesky` distribution. + +Moving average terms can also be included for multivariate random walks, in which case the moving average coefficients $\theta$ will be parameterised as an $N_{series} * N_{series}$ matrix + +### Autoregressive processes +Autoregressive models up to $p=3$, in which the autoregressive coefficients are estimated independently for each series, can be used by specifying `trend_model = 'AR1'`, `trend_model = 'AR2'`, `trend_model = 'AR3'`, or `trend_model = AR(p = 1, 2, or 3)`. For example, a univariate AR(1) model takes the form: + +\begin{align*} +z_{i,t} & \sim \text{Normal}(ar1_i * z_{i,t-1}, \sigma_i) \end{align*} + + +All options are the same as for Random Walks, but additional options will be available for placing priors on the autoregressive coefficients. By default, these coefficients will not be forced into stationarity, but users can impose this restriction by changing the upper and lower bounds on their priors. See `?get_mvgam_priors` for more details. + +### Vector Autoregressive processes +A Vector Autoregression of order $p=1$ can be specified if $N_{series} > 1$ using `trend_model = 'VAR1'` or `trend_model = VAR()`. A VAR(1) model takes the form: + +\begin{align*} +z_{t} & \sim \text{Normal}(A * z_{t-1}, \Sigma) \end{align*} + +Where $A$ is an $N_{series} * N_{series}$ matrix of autoregressive coefficients in which the diagonals capture lagged self-dependence (i.e. the effect of a process at time $t$ on its own estimate at time $t+1$), while off-diagonals capture lagged cross-dependence (i.e. 
the effect of a process at time $t$ on the process for another series at time $t+1$). By default, the covariance matrix $\Sigma$ will assume no process error covariance by fixing the off-diagonals to $0$. To allow for correlated errors, use `trend_model = 'VAR1cor'` or `trend_model = VAR(cor = TRUE)`. A moving average of order $q=1$ can also be included using `trend_model = VAR(ma = TRUE, cor = TRUE)`. + +Note that for all VAR models, stationarity of the process is enforced with a structured prior distribution that is described in detail in [Heaps 2022](https://www.tandfonline.com/doi/full/10.1080/10618600.2022.2079648). + +Heaps, Sarah E. "[Enforcing stationarity through the prior in vector autoregressions.](https://www.tandfonline.com/doi/full/10.1080/10618600.2022.2079648)" *Journal of Computational and Graphical Statistics* 32.1 (2023): 74-83. + +### Gaussian Processes +The final option for modelling temporal dynamics is to use a Gaussian Process with a squared exponential kernel. These are set up independently for each series (there is currently no multivariate GP option), using `trend_model = 'GP'`. The dynamics for each latent process are modelled as: + +\begin{align*} +z & \sim \text{MVNormal}(0, \Sigma_{error}) \\ +\Sigma_{error}[t_i, t_j] & = \alpha^2 * exp(-0.5 * ((|t_i - t_j| / \rho))^2) \end{align*} + +The latent dynamic process evolves from a complex, high-dimensional Multivariate Normal distribution which depends on $\rho$ (often called the length scale parameter) to control how quickly the correlations between the model's errors decay as a function of time. For these models, covariance decays exponentially fast with the squared distance (in time) between the observations. The functions also depend on a parameter $\alpha$, which controls the marginal variability of the temporal function at all points; in other words it controls how much the GP term contributes to the linear predictor. `mvgam` capitalizes on some advances that allow GPs to be approximated using Hilbert space basis functions, which [considerably speed up computation at little cost to accuracy or prediction performance](https://link.springer.com/article/10.1007/s11222-022-10167-2){target="_blank"}. + +### Piecewise logistic and linear trends +Modeling growth for many types of time series is often similar to modeling population growth in natural ecosystems, where the series exhibits nonlinear growth that saturates at some particular carrying capacity. The logistic trend model available in {`mvgam`} allows for a time-varying capacity $C(t)$ as well as a non-constant growth rate. Changes in the base growth rate $k$ are incorporated by explicitly defining changepoints throughout the training period where the growth rate is allowed to vary. The changepoint vector $a$ is represented as a vector of `1`s and `0`s, and the rate of growth at time $t$ is represented as $k+a(t)^T\delta$. Potential changepoints are selected uniformly across the training period, and the number of changepoints, as well as the flexibility of the potential rate changes at these changepoints, can be controlled using `trend_model = PW()`. The full piecewise logistic growth model is then: + +\begin{align*} +z_t & = \frac{C_t}{1 + \exp(-(k+a(t)^T\delta)(t-(m+a(t)^T\gamma)))} \end{align*} + +For time series that do not appear to exhibit saturating growth, a piece-wise constant rate of growth can often provide a useful trend model.
The piecewise linear trend is defined as: + +\begin{align*} +z_t & = (k+a(t)^T\delta)t + (m+a(t)^T\gamma) \end{align*} + +In both trend models, $m$ is an offset parameter that controls the trend intercept. Because of this parameter, it is not recommended that you include an intercept in your observation formula because this will not be identifiable. You can read about the full description of piecewise linear and logistic trends [in this paper by Taylor and Letham](https://www.tandfonline.com/doi/abs/10.1080/00031305.2017.1380080){target="_blank"}. + +Sean J. Taylor and Benjamin Letham. "[Forecasting at scale.](https://www.tandfonline.com/doi/full/10.1080/00031305.2017.1380080)" *The American Statistician* 72.1 (2018): 37-45. + +### Continuous time AR(1) processes +Most trend models in the `mvgam()` function expect time to be measured in regularly-spaced, discrete intervals (i.e. one measurement per week, or one per year for example). But some time series are taken at irregular intervals and we'd like to model autoregressive properties of these. The `trend_model = CAR()` option can be useful for setting up these models, which currently only support autoregressive processes of order `1`. The evolution of the latent dynamic process follows the form: + +\begin{align*} +z_{i,t} & \sim \text{Normal}(ar1_i^{distance} * z_{i,t-1}, \sigma_i) \end{align*} + +Where $distance$ is a vector of non-negative measurements of the time differences between successive observations. See the **Examples** section in `?CAR` for an illustration of how to set these models up. + +## Regression formulae +`mvgam` supports an observation model regression formula, built off the `mgcv` package, as well as an optional process model regression formula. The formulae supplied to `mvgam()` are exactly like those supplied to `glm()` except that smooth terms, `s()`, +`te()`, `ti()` and `t2()`, time-varying effects using `dynamic()`, monotonically increasing (using `s(x, bs = 'moi')`) or decreasing splines (using `s(x, bs = 'mod')`; see `?smooth.construct.moi.smooth.spec` for details), as well as Gaussian Process functions using `gp()`, can be added to the right hand side (and `.` is not supported in `mvgam` formulae). See `?mvgam_formulae` for more guidance. + +For setting up State-Space models, the optional process model formula can be used (see [the State-Space model vignette](https://nicholasjclark.github.io/mvgam/articles/trend_formulas.html) and [the shared latent states vignette](https://nicholasjclark.github.io/mvgam/articles/trend_formulas.html) for guidance on using trend formulae). + +## Example time series data +The 'portal_data' object contains time series of rodent captures from the Portal Project, [a long-term monitoring study based near the town of Portal, Arizona](https://portal.weecology.org/){target="_blank"}. Researchers have been operating a standardized set of baited traps within 24 experimental plots at this site since the 1970's. Sampling follows the lunar monthly cycle, with observations occurring on average about 28 days apart. However, missing observations do occur due to difficulties accessing the site (weather events, COVID disruptions etc...). You can read about the full sampling protocol [in this preprint by Ernest et al on the Biorxiv](https://www.biorxiv.org/content/10.1101/332783v3.full){target="_blank"}. +```{r Access time series data} +data("portal_data") +``` + +As the data come pre-loaded with the `mvgam` package, you can read a little about it in the help page using `?portal_data`.
Before working with data, it is important to inspect how the data are structured, first using `head`: +```{r Inspect data format and structure} +head(portal_data) +``` + +But the `glimpse` function in `dplyr` is also useful for understanding how variables are structured +```{r} +dplyr::glimpse(portal_data) +``` + +We will focus analyses on the time series of captures for one specific rodent species, the Desert Pocket Mouse *Chaetodipus penicillatus*. This species is interesting in that it goes into a kind of "hibernation" during the colder months, leading to very low captures during the winter period + +## Manipulating data for modelling + +Manipulating the data into a 'long' format is necessary for modelling in `mvgam`. By 'long' format, we mean that each `series x time` observation needs to have its own entry in the `dataframe` or `list` object that we wish to use as data for modelling. A simple example can be viewed by simulating data using the `sim_mvgam` function. See `?sim_mvgam` for more details +```{r} +data <- sim_mvgam(n_series = 4, T = 24) +head(data$data_train, 12) +``` + +Notice how we have four different time series in these simulated data, but we do not spread the outcome values into different columns. Rather, there is only a single column for the outcome variable, labelled `y` in these simulated data. We also must supply a variable labelled `time` to ensure the modelling software knows how to arrange the time series when building models. This setup still allows us to formulate multivariate time series models, as you can see in the [State-Space vignette](https://nicholasjclark.github.io/mvgam/articles/trend_formulas.html). Below are the steps needed to shape our `portal_data` object into the correct form. First, we create a `time` variable, select the column representing counts of our target species (`PP`), and select appropriate variables that we can use as predictors +```{r Wrangle data for modelling} +portal_data %>% + + # mvgam requires a 'time' variable be present in the data to index + # the temporal observations. This is especially important when tracking + # multiple time series. In the Portal data, the 'moon' variable indexes the + # lunar monthly timestep of the trapping sessions + dplyr::mutate(time = moon - (min(moon)) + 1) %>% + + # We can also provide a more informative name for the outcome variable, which + # is counts of the 'PP' species (Chaetodipus penicillatus) across all control + # plots + dplyr::mutate(count = PP) %>% + + # The other requirement for mvgam is a 'series' variable, which needs to be a + # factor variable to index which time series each row in the data belongs to. 
+ # Again, this is more useful when you have multiple time series in the data + dplyr::mutate(series = as.factor('PP')) %>% + + # Select the variables of interest to keep in the model_data + dplyr::select(series, year, time, count, mintemp, ndvi) -> model_data +``` + +The data now contain six variables: + `series`, a factor indexing which time series each observation belongs to + `year`, the year of sampling + `time`, the indicator of which time step each observation belongs to + `count`, the response variable representing the number of captures of the species `PP` in each sampling observation + `mintemp`, the monthly average minimum temperature at each time step + `ndvi`, the monthly average Normalized Difference Vegetation Index at each time step + +Now check the data structure again +```{r} +head(model_data) +``` + +```{r} +dplyr::glimpse(model_data) +``` + +You can also summarize multiple variables, which is helpful to search for data ranges and identify missing values +```{r Summarise variables} +summary(model_data) +``` + +We have some `NA`s in our response variable `count`. Let's visualize the data as a heatmap to get a sense of where these are distributed (`NA`s are shown as red bars in the below plot) +```{r} +image(is.na(t(model_data %>% + dplyr::arrange(dplyr::desc(time)))), axes = F, + col = c('grey80', 'darkred')) +axis(3, at = seq(0,1, len = NCOL(model_data)), labels = colnames(model_data)) +``` + +These observations will generally be thrown out by most modelling packages in \R. But as you will see when we work through the tutorials, `mvgam` keeps these in the data so that predictions can be automatically returned for the full dataset. The time series and some of its descriptive features can be plotted using `plot_mvgam_series()`: +```{r} +plot_mvgam_series(data = model_data, series = 1, y = 'count') +``` + +## GLMs with temporal random effects +Our first task will be to fit a Generalized Linear Model (GLM) that can adequately capture the features of our `count` observations (integer data, lower bound at zero, missing values) while also attempting to model temporal variation. We are almost ready to fit our first model, which will be a GLM with Poisson observations, a log link function and random (hierarchical) intercepts for `year`. This will allow us to capture our prior belief that, although each year is unique, having been sampled from the same population of effects, all years are connected and thus might contain valuable information about one another. This will be done by capitalizing on the partial pooling properties of hierarchical models. Hierarchical (also known as random) effects offer many advantages when modelling data with grouping structures (i.e. multiple species, locations, years etc...). The ability to incorporate these in time series models is a huge advantage over traditional models such as ARIMA or Exponential Smoothing. But before we fit the model, we will need to convert `year` to a factor so that we can use a random effect basis in `mvgam`. See `?smooth.terms` and +`?smooth.construct.re.smooth.spec` for details about the `re` basis construction that is used by both `mvgam` and `mgcv` +```{r} +model_data %>% + + # Create a 'year_fac' factor version of 'year' + dplyr::mutate(year_fac = factor(year)) -> model_data +``` + +Preview the dataset to ensure year is now a factor with a unique factor level for each year in the data +```{r} +dplyr::glimpse(model_data) +levels(model_data$year_fac) +``` + +We are now ready for our first `mvgam` model. 
The syntax will be familiar to users who have previously built models with `mgcv`. But for a refresher, see `?formula.gam` and the examples in `?gam`. Random effects can be specified using the `s` wrapper with the `re` basis. Note that we can also suppress the primary intercept using the usual `R` formula syntax `- 1`. `mvgam` has a number of possible observation families that can be used, see `?mvgam_families` for more information. We will use `Stan` as the fitting engine, which deploys Hamiltonian Monte Carlo (HMC) for full Bayesian inference. By default, 4 HMC chains will be run using a warmup of 500 iterations and collecting 500 posterior samples from each chain. The package will also aim to use the `Cmdstan` backend when possible, so it is recommended that users have an up-to-date installation of `Cmdstan` and the associated `cmdstanr` interface on their machines (note that you can set the backend yourself using the `backend` argument: see `?mvgam` for details). Interested users should consult the [`Stan` user's guide](https://mc-stan.org/docs/stan-users-guide/index.html){target="_blank"} for more information about the software and the enormous variety of models that can be tackled with HMC. +```{r model1, include=FALSE, results='hide'} +model1 <- mvgam(count ~ s(year_fac, bs = 're') - 1, + family = poisson(), + data = model_data, + parallel = FALSE) +``` + +```{r eval=FALSE} +model1 <- mvgam(count ~ s(year_fac, bs = 're') - 1, + family = poisson(), + data = model_data) +``` + +The model can be described mathematically for each timepoint $t$ as follows: +\begin{align*} +\boldsymbol{count}_t & \sim \text{Poisson}(\lambda_t) \\ +log(\lambda_t) & = \beta_{year[year_t]} \\ +\beta_{year} & \sim \text{Normal}(\mu_{year}, \sigma_{year}) \end{align*} + +Where the $\beta_{year}$ effects are drawn from a *population* distribution that is parameterized by a common mean $(\mu_{year})$ and variance $(\sigma_{year})$. Priors on most of the model parameters can be interrogated and changed using similar functionality to the options available in `brms`. For example, the default priors on $(\mu_{year})$ and $(\sigma_{year})$ can be viewed using the following code: +```{r} +get_mvgam_priors(count ~ s(year_fac, bs = 're') - 1, + family = poisson(), + data = model_data) +``` + +See examples in `?get_mvgam_priors` to find out different ways that priors can be altered. +Once the model has finished, the first step is to inspect the `summary` to ensure no major diagnostic warnings have been produced and to quickly summarise posterior distributions for key parameters +```{r} +summary(model1) +``` + +The diagnostic messages at the bottom of the summary show that the HMC sampler did not encounter any problems or difficult posterior spaces. This is a good sign. Posterior distributions for model parameters can be extracted in any way that an object of class `brmsfit` can (see `?mvgam::mvgam_draws` for details). For example, we can extract the coefficients related to the GAM linear predictor (i.e. the $\beta$'s) into a `data.frame` using: +```{r Extract coefficient posteriors} +beta_post <- as.data.frame(model1, variable = 'betas') +dplyr::glimpse(beta_post) +``` + +With any model fitted in `mvgam`, the underlying `Stan` code can be viewed using the `code` function: +```{r} +code(model1) +``` + +### Plotting effects and residuals + +Now for interrogating the model. We can get some sense of the variation in yearly intercepts from the summary above, but it is easier to understand them using targeted plots. 
Plot posterior distributions of the temporal random effects using `plot.mvgam` with `type = 're'`. See `?plot.mvgam` for more details about the types of plots that can be produced from fitted `mvgam` objects +```{r Plot random effect estimates} +plot(model1, type = 're') +``` + +### `bayesplot` support +We can also capitalize on most of the useful MCMC plotting functions from the `bayesplot` package to visualize posterior distributions and diagnostics (see `?mvgam::mcmc_plot.mvgam` for details): +```{r} +mcmc_plot(object = model1, + variable = 'betas', + type = 'areas') +``` + +We can also use the wide range of posterior checking functions available in `bayesplot` (see `?mvgam::ppc_check.mvgam` for details): +```{r} +pp_check(object = model1) +pp_check(model1, type = "rootogram") +``` + +There is clearly some variation in these yearly intercept estimates. But how do these translate into time-varying predictions? To understand this, we can plot posterior hindcasts from this model for the training period using `plot.mvgam` with `type = 'forecast'` +```{r Plot posterior hindcasts} +plot(model1, type = 'forecast') +``` + +If you wish to extract these hindcasts for other downstream analyses, the `hindcast` function can be used. This will return a list object of class `mvgam_forecast`. In the `hindcasts` slot, a matrix of posterior retrodictions will be returned for each series in the data (only one series in our example): +```{r Extract posterior hindcast} +hc <- hindcast(model1) +str(hc) +``` + +You can also extract these hindcasts on the linear predictor scale, which in this case is the log scale (our Poisson GLM used a log link function). Sometimes this can be useful for asking more targeted questions about drivers of variation: +```{r Extract hindcasts on the linear predictor scale} +hc <- hindcast(model1, type = 'link') +range(hc$hindcasts$PP) +``` + +Objects of class `mvgam_forecast` have an associated plot function as well: +```{r Plot hindcasts on the linear predictor scale} +plot(hc) +``` + +This plot can look a bit confusing as it seems like there is linear interpolation from the end of one year to the start of the next. But this is just due to the way the lines are automatically connected in base \R plots + +In any regression analysis, a key question is whether the residuals show any patterns that can be indicative of un-modelled sources of variation. For GLMs, we can use a modified residual called the [Dunn-Smyth, or randomized quantile, residual](https://www.jstor.org/stable/1390802){target="_blank"}. Inspect Dunn-Smyth residuals from the model using `plot.mvgam` with `type = 'residuals'` +```{r Plot posterior residuals} +plot(model1, type = 'residuals') +``` + +## Automatic forecasting for new data +These temporal random effects do not have a sense of "time". Because of this, each yearly random intercept is not restricted in some way to be similar to the previous yearly intercept. This drawback becomes evident when we predict for a new year. To do this, we can repeat the exercise above but this time will split the data into training and testing sets before re-running the model. We can then supply the test set as `newdata`. 
For splitting, we will make use of the `filter` function from `dplyr` +```{r} +model_data %>% + dplyr::filter(time <= 160) -> data_train +model_data %>% + dplyr::filter(time > 160) -> data_test +``` + +```{r include=FALSE, message=FALSE, warning=FALSE} +model1b <- mvgam(count ~ s(year_fac, bs = 're') - 1, + family = poisson(), + data = data_train, + newdata = data_test, + parallel = FALSE) +``` + +```{r eval=FALSE} +model1b <- mvgam(count ~ s(year_fac, bs = 're') - 1, + family = poisson(), + data = data_train, + newdata = data_test) +``` + +Repeating the plots above gives insight into how the model's hierarchical prior formulation provides all the structure needed to sample values for un-modelled years +```{r} +plot(model1b, type = 're') +``` + + +```{r} +plot(model1b, type = 'forecast') +``` + +We can also view the test data in the forecast plot to see that the predictions do not capture the temporal variation in the test set +```{r Plotting predictions against test data} +plot(model1b, type = 'forecast', newdata = data_test) +``` + +As with the `hindcast` function, we can use the `forecast` function to automatically extract the posterior distributions for these predictions. This also returns an object of class `mvgam_forecast`, but now it will contain both the hindcasts and forecasts for each series in the data: +```{r Extract posterior forecasts} +fc <- forecast(model1b) +str(fc) +``` + +## Adding predictors as "fixed" effects +Any users familiar with GLMs will know that we nearly always wish to include predictor variables that may explain some of the variation in our observations. Predictors are easily incorporated into GLMs / GAMs. Here, we will update the model from above by including a parametric (fixed) effect of `ndvi` as a linear predictor: +```{r model2, include=FALSE, message=FALSE, warning=FALSE} +model2 <- mvgam(count ~ s(year_fac, bs = 're') + + ndvi - 1, + family = poisson(), + data = data_train, + newdata = data_test, + parallel = FALSE) +``` + +```{r eval=FALSE} +model2 <- mvgam(count ~ s(year_fac, bs = 're') + + ndvi - 1, + family = poisson(), + data = data_train, + newdata = data_test) +``` + +The model can be described mathematically as follows: +\begin{align*} +\boldsymbol{count}_t & \sim \text{Poisson}(\lambda_t) \\ +log(\lambda_t) & = \beta_{year[year_t]} + \beta_{ndvi} * \boldsymbol{ndvi}_t \\ +\beta_{year} & \sim \text{Normal}(\mu_{year}, \sigma_{year}) \\ +\beta_{ndvi} & \sim \text{Normal}(0, 1) \end{align*} + +Where the $\beta_{year}$ effects are the same as before but we now have another predictor $(\beta_{ndvi})$ that applies to the `ndvi` value at each timepoint $t$. Inspect the summary of this model + +```{r, class.output="scroll-300"} +summary(model2) +``` + +Rather than printing the summary each time, we can also quickly look at the posterior empirical quantiles for the fixed effect of `ndvi` (and other linear predictor coefficients) using `coef`: +```{r Posterior quantiles of model coefficients} +coef(model2) +``` + +Look at the estimated effect of `ndvi` using `plot.mvgam` with `type = 'pterms'` +```{r Plot NDVI effect} +plot(model2, type = 'pterms') +``` + +This plot indicates a positive linear effect of `ndvi` on `log(counts)`. But it may be easier to visualise using a histogram, especially for parametric (linear) effects. 
This can be done by first extracting the posterior coefficients as we did in the first example: +```{r} +beta_post <- as.data.frame(model2, variable = 'betas') +dplyr::glimpse(beta_post) +``` + +The posterior distribution for the effect of `ndvi` is stored in the `ndvi` column. A quick histogram confirms our inference that `log(counts)` responds positively to increases in `ndvi`: +```{r Histogram of NDVI effects} +hist(beta_post$ndvi, + xlim = c(-1 * max(abs(beta_post$ndvi)), + max(abs(beta_post$ndvi))), + col = 'darkred', + border = 'white', + xlab = expression(beta[NDVI]), + ylab = '', + yaxt = 'n', + main = '', + lwd = 2) +abline(v = 0, lwd = 2.5) +``` + +### `marginaleffects` support +Given our model used a nonlinear link function (log link in this example), it can still be difficult to fully understand what relationship our model is estimating between a predictor and the response. Fortunately, the `marginaleffects` package makes this relatively straightforward. Objects of class `mvgam` can be used with `marginaleffects` to inspect contrasts, scenario-based predictions, conditional and marginal effects, all on the outcome scale. Here we will use the `plot_predictions` function from `marginaleffects` to inspect the conditional effect of `ndvi` (use `?plot_predictions` for guidance on how to modify these plots): +```{r warning=FALSE} +plot_predictions(model2, + condition = "ndvi", + # include the observed count values + # as points, and show rugs for the observed + # ndvi and count values on the axes + points = 0.5, rug = TRUE) +``` + +Now it is easier to get a sense of the nonlinear but positive relationship estimated between `ndvi` and `count`. Like `brms`, `mvgam` has the simple `conditional_effects` function to make quick and informative plots for main effects. This will likely be your go-to function for quickly understanding patterns from fitted `mvgam` models: +```{r warning=FALSE} +plot(conditional_effects(model2), ask = FALSE) +``` + +## Adding predictors as smooths + +Smooth functions, using penalized splines, are a major feature of `mvgam`. Nonlinear splines are commonly viewed as variations of random effects in which the coefficients that control the shape of the spline are drawn from a joint, penalized distribution. This strategy is very often used in ecological time series analysis to capture smooth temporal variation in the processes we seek to study. When we construct smoothing splines, the workhorse package `mgcv` will calculate a set of basis functions that will collectively control the shape and complexity of the resulting spline. It is often helpful to visualize these basis functions to get a better sense of how splines work. We'll create a set of 6 basis functions to represent possible variation in the effect of `time` on our outcome. In addition to constructing the basis functions, `mgcv` also creates a penalty matrix $S$, which contains **known** coefficients that work to constrain the wiggliness of the resulting smooth function. When fitting a GAM to data, we must estimate the smoothing parameters ($\lambda$) that will penalize these matrices, resulting in constrained basis coefficients and smoother functions that are less likely to overfit the data. This is the key to fitting GAMs in a Bayesian framework, as we can jointly estimate the $\lambda$'s using informative priors to prevent overfitting and expand the complexity of models we can tackle. To see this in practice, we can now fit a model that replaces the yearly random effects with a smooth function of `time`.
We will need a reasonably complex function (large `k`) to try and accommodate the temporal variation in our observations. Following some [useful advice by Gavin Simpson](https://fromthebottomoftheheap.net/2020/06/03/extrapolating-with-gams/){target="_blank"}, we will use a b-spline basis for the temporal smooth. Because we no longer have intercepts for each year, we also retain the primary intercept term in this model (there is no `-1` in the formula now): +```{r model3, include=FALSE, message=FALSE, warning=FALSE} +model3 <- mvgam(count ~ s(time, bs = 'bs', k = 15) + + ndvi, + family = poisson(), + data = data_train, + newdata = data_test, + parallel = FALSE) +``` + +```{r eval=FALSE} +model3 <- mvgam(count ~ s(time, bs = 'bs', k = 15) + + ndvi, + family = poisson(), + data = data_train, + newdata = data_test) +``` + +The model can be described mathematically as follows: +\begin{align*} +\boldsymbol{count}_t & \sim \text{Poisson}(\lambda_t) \\ +log(\lambda_t) & = f(\boldsymbol{time})_t + \beta_{ndvi} * \boldsymbol{ndvi}_t \\ +f(\boldsymbol{time}) & = \sum_{k=1}^{K}b * \beta_{smooth} \\ +\beta_{smooth} & \sim \text{MVNormal}(0, (\Omega * \lambda)^{-1}) \\ +\beta_{ndvi} & \sim \text{Normal}(0, 1) \end{align*} + + +Where the smooth function $f_{time}$ is built by summing across a set of weighted basis functions. The basis functions $(b)$ are constructed using a B-spline basis in `mgcv`. The weights $(\beta_{smooth})$ are drawn from a penalized multivariate normal distribution where the precision matrix $(\Omega)$ is multiplied by a smoothing penalty $(\lambda)$. If $\lambda$ becomes large, this acts to *squeeze* the covariances among the weights $(\beta_{smooth})$, leading to a less wiggly spline. Note that sometimes there are multiple smoothing penalties that contribute to the covariance matrix, but I am only showing one here for simplicity. View the summary as before: +```{r} +summary(model3) +``` + +The summary above now contains posterior estimates for the smoothing parameters as well as the basis coefficients for the nonlinear effect of `time`. We can visualize the conditional `time` effect using the `plot` function with `type = 'smooths'`: +```{r} +plot(model3, type = 'smooths') +``` + +By default this plot shows posterior empirical quantiles, but it can also be helpful to view some realizations of the underlying function (here, each line is a different potential curve drawn from the posterior of all possible curves): +```{r} +plot(model3, type = 'smooths', realisations = TRUE, + n_realisations = 30) +``` + +### Derivatives of smooths +A useful question when modelling using GAMs is to identify where the function is changing most rapidly. To address this, we can plot estimated 1st derivatives of the spline: +```{r Plot smooth term derivatives, warning = FALSE, fig.asp = 1} +plot(model3, type = 'smooths', derivatives = TRUE) +``` + +Here, values `>0` indicate the function was increasing at that time point, while values `<0` indicate the function was declining. The most rapid declines appear to have been happening around timepoint 50 and again toward the end of the training period, for example.
+ +Use `conditional_effects` again for useful plots on the outcome scale: +```{r warning=FALSE} +plot(conditional_effects(model3), ask = FALSE) +``` + +Or on the link scale: +```{r warning=FALSE} +plot(conditional_effects(model3, type = 'link'), ask = FALSE) +``` + +Inspect the underlying `Stan` code to gain some idea of how the spline is being penalized: +```{r, class.output="scroll-300"} +code(model3) +``` + +The line below `// prior for s(time)...` shows how the spline basis coefficients are drawn from a zero-centred multivariate normal distribution. The precision matrix $S$ is penalized by two different smoothing parameters (the $\lambda$'s) to enforce smoothness and reduce overfitting + +## Latent dynamics in `mvgam` + +Forecasts from the above model are not ideal: +```{r} +plot(model3, type = 'forecast', newdata = data_test) +``` + +Why is this happening? The forecasts are driven almost entirely by variation in the temporal spline, which is extrapolating linearly *forever* beyond the edge of the training data. Any slight wiggles near the end of the training set will result in wildly different forecasts. To visualize this, we can plot the extrapolated temporal functions into the out-of-sample test set for the two models. Here are the extrapolated functions for the first model, with 15 basis functions: +```{r Plot extrapolated temporal functions using newdata} +plot_mvgam_smooth(model3, smooth = 's(time)', + # feed newdata to the plot function to generate + # predictions of the temporal smooth to the end of the + # testing period + newdata = data.frame(time = 1:max(data_test$time), + ndvi = 0)) +abline(v = max(data_train$time), lty = 'dashed', lwd = 2) +``` + +This model is not doing well. Clearly we need to somehow account for the strong temporal autocorrelation when modelling these data without using a smooth function of `time`. Now onto another prominent feature of `mvgam`: the ability to include (possibly latent) autocorrelated residuals in regression models. To do so, we use the `trend_model` argument (see `?mvgam_trends` for details of different dynamic trend models that are supported). This model will use a separate sub-model for latent residuals that evolve as an AR1 process (i.e. the error in the current time point is a function of the error in the previous time point, plus some stochastic noise). We also include a smooth function of `ndvi` in this model, rather than the parametric term that was used above, to showcase that `mvgam` can include combinations of smooths and dynamic components: +```{r model4, include=FALSE} +model4 <- mvgam(count ~ s(ndvi, k = 6), + family = poisson(), + data = data_train, + newdata = data_test, + trend_model = 'AR1', + parallel = FALSE) +``` + +```{r eval=FALSE} +model4 <- mvgam(count ~ s(ndvi, k = 6), + family = poisson(), + data = data_train, + newdata = data_test, + trend_model = 'AR1') +``` + +The model can be described mathematically as follows: +\begin{align*} +\boldsymbol{count}_t & \sim \text{Poisson}(\lambda_t) \\ +log(\lambda_t) & = f(\boldsymbol{ndvi})_t + z_t \\ +z_t & \sim \text{Normal}(ar1 * z_{t-1}, \sigma_{error}) \\ +ar1 & \sim \text{Normal}(0, 1)[-1, 1] \\ +\sigma_{error} & \sim \text{Exponential}(2) \\ +f(\boldsymbol{ndvi}) & = \sum_{k=1}^{K}b * \beta_{smooth} \\ +\beta_{smooth} & \sim \text{MVNormal}(0, (\Omega * \lambda)^{-1}) \end{align*} + +Here the term $z_t$ captures autocorrelated latent residuals, which are modelled using an AR1 process. 
You can also notice that this model is estimating autocorrelated errors for the full time period, even though some of these time points have missing observations. This is useful for getting more realistic estimates of the residual autocorrelation parameters. Summarise the model to see how it now returns posterior summaries for the latent AR1 process: +```{r Summarise the mvgam autocorrelated error model, class.output="scroll-300"} +summary(model4) +``` + +View conditional smooths for the `ndvi` effect: +```{r warning=FALSE, message=FALSE} +plot_predictions(model4, + condition = "ndvi", + points = 0.5, rug = TRUE) +``` + +View posterior hindcasts / forecasts and compare against the out of sample test data +```{r} +plot(model4, type = 'forecast', newdata = data_test) +``` + +The trend is evolving as an AR1 process, which we can also view: +```{r} +plot(model4, type = 'trend', newdata = data_test) +``` + +In-sample model performance can be interrogated using leave-one-out cross-validation utilities from the `loo` package (a higher value is preferred for this metric): +```{r} +loo_compare(model3, model4) +``` + +The higher estimated log predictive density (ELPD) value for the dynamic model suggests it provides a better fit to the in-sample data. + +Though it should be obvious that this model provides better forecasts, we can quantify forecast performance for models 3 and 4 using the `forecast` and `score` functions. Here we will compare models based on their Discrete Ranked Probability Scores (a lower value is preferred for this metric) +```{r} +fc_mod3 <- forecast(model3) +fc_mod4 <- forecast(model4) +score_mod3 <- score(fc_mod3, score = 'drps') +score_mod4 <- score(fc_mod4, score = 'drps') +sum(score_mod4$PP$score, na.rm = TRUE) - sum(score_mod3$PP$score, na.rm = TRUE) +``` + +A strongly negative value here suggests the score for the dynamic model (model 4) is much smaller than the score for the model with a smooth function of time (model 3) + +## Interested in contributing? +I'm actively seeking PhD students and other researchers to work in the areas of ecological forecasting, multivariate model evaluation and development of `mvgam`. Please reach out if you are interested (n.clark'at'uq.edu.au) diff --git a/doc/mvgam_overview.html b/doc/mvgam_overview.html new file mode 100644 index 00000000..6ebc839b --- /dev/null +++ b/doc/mvgam_overview.html @@ -0,0 +1,1890 @@ + + + + + + + + + + + + + + + + +Overview of the mvgam package + + + + + + + + + + + + + + + + + + + + + + + + + + +

Overview of the mvgam package

+

Nicholas J Clark

+

2024-04-16

+ + +
+ +
+ +

The purpose of this vignette is to give a general overview of the +mvgam package and its primary functions.

+
+

Dynamic GAMs

+

mvgam is designed to propagate unobserved temporal processes to capture latent dynamics in the observed time series. This works in a state-space format, with the temporal trend evolving independently of the observation process. An introduction to the package and some worked examples are also shown in this seminar: Ecological Forecasting with Dynamic Generalized Additive Models. Briefly, assume \(\tilde{\boldsymbol{y}}_{i,t}\) is the conditional expectation of response series \(i\) at time \(t\). Assuming \(\boldsymbol{y_i}\) is drawn from an exponential family distribution with an invertible link function, the linear predictor for a multivariate Dynamic GAM can be written as:

+

\[for~i~in~1:N_{series}~...\] \[for~t~in~1:N_{timepoints}~...\]

+

\[g^{-1}(\tilde{\boldsymbol{y}}_{i,t})=\alpha_{i}+\sum\limits_{j=1}^J\boldsymbol{s}_{i,j,t}\boldsymbol{x}_{j,t}+\boldsymbol{z}_{i,t}\,,\] +Here \(\alpha\) are the unknown +intercepts, the \(\boldsymbol{s}\)’s +are unknown smooth functions of covariates (\(\boldsymbol{x}\)’s), which can potentially +vary among the response series, and \(\boldsymbol{z}\) are dynamic latent +processes. Each smooth function \(\boldsymbol{s_j}\) is composed of basis +expansions whose coefficients, which must be estimated, control the +functional relationship between \(\boldsymbol{x}_{j}\) and \(g^{-1}(\tilde{\boldsymbol{y}})\). The size +of the basis expansion limits the smooth’s potential complexity. A +larger set of basis functions allows greater flexibility. For more +information on GAMs and how they can smooth through data, see this blogpost on how to interpret nonlinear effects from +Generalized Additive Models.

+

Several advantages of GAMs are that they can model a diversity of response families, including discrete distributions (e.g. Poisson, Negative Binomial) and skewed continuous distributions (e.g. Gamma) that accommodate common ecological features such as zero-inflation or overdispersion, and that they can be formulated to include hierarchical smoothing for multivariate responses. mvgam supports a number of different observation families, which are summarized below:

+
+
+

Supported observation families

| Distribution | Function | Support | Extra parameter(s) |
|---|---|---|---|
| Gaussian (identity link) | gaussian() | Real values in \((-\infty, \infty)\) | \(\sigma\) |
| Student's T (identity link) | student_t() | Heavy-tailed real values in \((-\infty, \infty)\) | \(\sigma\), \(\nu\) |
| LogNormal (identity link) | lognormal() | Positive real values in \((0, \infty)\) | \(\sigma\) |
| Gamma (log link) | Gamma() | Positive real values in \((0, \infty)\) | \(\alpha\) |
| Beta (logit link) | betar() | Proportional real values in \((0, 1)\) | \(\phi\) |
| Bernoulli (logit link) | bernoulli() | Binary data in \(\{0,1\}\) | - |
| Poisson (log link) | poisson() | Non-negative integers in \(\{0,1,2,...\}\) | - |
| Negative Binomial2 (log link) | nb() | Non-negative integers in \(\{0,1,2,...\}\) | \(\phi\) |
| Binomial (logit link) | binomial() | Non-negative integers in \(\{0,1,2,...\}\) | - |
| Beta-Binomial (logit link) | beta_binomial() | Non-negative integers in \(\{0,1,2,...\}\) | \(\phi\) |
| Poisson Binomial N-mixture (log link) | nmix() | Non-negative integers in \(\{0,1,2,...\}\) | - |
+

For all supported observation families, any extra parameters that +need to be estimated (i.e. the \(\sigma\) in a Gaussian model or the \(\phi\) in a Negative Binomial model) are by +default estimated independently for each series. However, users can opt +to force all series to share extra observation parameters using +share_obs_params = TRUE in mvgam(). Note that +default link functions cannot currently be changed.
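For example, a minimal sketch of this option using simulated data (the family, dimensions and smooth term here are purely illustrative):

# simulate three short Negative Binomial series, then force all series
# to share a single overdispersion parameter phi when fitting
library(mvgam)
simdat <- sim_mvgam(n_series = 3, T = 40, family = nb())
mod <- mvgam(y ~ s(season, bs = 'cc', k = 8),
             family = nb(),
             data = simdat$data_train,
             share_obs_params = TRUE)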

+
+
+

Supported temporal dynamic processes

+

The dynamic processes can take a wide variety of forms, some of which +can be multivariate to allow the different time series to interact or be +correlated. When using the mvgam() function, the user +chooses between different process models with the +trend_model argument. Available process models are +described in detail below.

+
+

Independent Random Walks

+

Use trend_model = 'RW' or +trend_model = RW() to set up a model where each series in +data has independent latent temporal dynamics of the +form:

+

\[\begin{align*} +z_{i,t} & \sim \text{Normal}(z_{i,t-1}, \sigma_i) +\end{align*}\]

+

Process error parameters \(\sigma\) +are modeled independently for each series. If a moving average process +is required, use trend_model = RW(ma = TRUE) to set up the +following:

+

\[\begin{align*} +z_{i,t} & = z_{i,t-1} + \theta_i * error_{i,t-1} + error_{i,t} \\ +error_{i,t} & \sim \text{Normal}(0, \sigma_i) +\end{align*}\]

+

Moving average coefficients \(\theta\) are independently estimated for +each series and will be forced to be stationary by default \((abs(\theta)<1)\). Only moving averages +of order \(q=1\) are currently +allowed.
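As a minimal sketch, assuming a suitable long-format data frame dat with y, time and series columns (names illustrative):

# independent random walk trends for each series
mod_rw <- mvgam(y ~ 1, trend_model = RW(), data = dat)
# the same model with first-order moving average errors
mod_rw_ma <- mvgam(y ~ 1, trend_model = RW(ma = TRUE), data = dat)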

+
+
+

Multivariate Random Walks

+

If more than one series is included in data \((N_{series} > 1)\), a multivariate +Random Walk can be set up using +trend_model = RW(cor = TRUE), resulting in the +following:

+

\[\begin{align*} +z_{t} & \sim \text{MVNormal}(z_{t-1}, \Sigma) +\end{align*}\]

+

Where the latent process estimate \(z_t\) now takes the form of a vector. The +covariance matrix \(\Sigma\) will +capture contemporaneously correlated process errors. It is parameterised +using a Cholesky factorization, which requires priors on the +series-level variances \(\sigma\) and +on the strength of correlations using Stan’s +lkj_corr_cholesky distribution.

+

Moving average terms can also be included for multivariate random walks, in which case the moving average coefficients \(\theta\) will be parameterised as an \(N_{series} * N_{series}\) matrix.
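A sketch of the correlated multivariate version, under the same illustrative dat as above (now containing multiple series):

# random walk trends with contemporaneously correlated process errors
mod_mvrw <- mvgam(y ~ 1, trend_model = RW(cor = TRUE), data = dat)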

+
+
+

Autoregressive processes

+

Autoregressive models up to \(p=3\), in which the autoregressive coefficients are estimated independently for each series, can be used by specifying trend_model = 'AR1', trend_model = 'AR2', trend_model = 'AR3', or trend_model = AR(p = 1), AR(p = 2) or AR(p = 3). For example, a univariate AR(1) model takes the form:

+

\[\begin{align*} +z_{i,t} & \sim \text{Normal}(ar1_i * z_{i,t-1}, \sigma_i) +\end{align*}\]

+

All options are the same as for Random Walks, but additional options +will be available for placing priors on the autoregressive coefficients. +By default, these coefficients will not be forced into stationarity, but +users can impose this restriction by changing the upper and lower bounds +on their priors. See ?get_mvgam_priors for more +details.
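For example (again with the illustrative dat), an AR(2) model, plus a look at the default priors that could then be modified:

mod_ar2 <- mvgam(y ~ 1, trend_model = AR(p = 2), data = dat)
# inspect the default priors on the AR coefficients before
# deciding whether to change their bounds
get_mvgam_priors(y ~ 1, trend_model = AR(p = 2), data = dat)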

+
+
+

Vector Autoregressive processes

+

A Vector Autoregression of order \(p=1\) can be specified if \(N_{series} > 1\) using +trend_model = 'VAR1' or trend_model = VAR(). A +VAR(1) model takes the form:

+

\[\begin{align*} +z_{t} & \sim \text{Normal}(A * z_{t-1}, \Sigma) +\end{align*}\]

+

Where \(A\) is an \(N_{series} * N_{series}\) matrix of +autoregressive coefficients in which the diagonals capture lagged +self-dependence (i.e. the effect of a process at time \(t\) on its own estimate at time \(t+1\)), while off-diagonals capture lagged +cross-dependence (i.e. the effect of a process at time \(t\) on the process for another series at +time \(t+1\)). By default, the +covariance matrix \(\Sigma\) will +assume no process error covariance by fixing the off-diagonals to \(0\). To allow for correlated errors, use +trend_model = 'VAR1cor' or +trend_model = VAR(cor = TRUE). A moving average of order +\(q=1\) can also be included using +trend_model = VAR(ma = TRUE, cor = TRUE).
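A sketch of a correlated VAR(1) with moving average errors (assuming the illustrative multi-series dat from above):

# vector autoregression with correlated process errors and MA(1) terms
mod_var <- mvgam(y ~ 1, trend_model = VAR(ma = TRUE, cor = TRUE), data = dat)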

+

Note that for all VAR models, stationarity of the process is enforced with a structured prior distribution that is described in detail in Heaps (2023):

+

Heaps, Sarah E. “Enforcing +stationarity through the prior in vector autoregressions.” +Journal of Computational and Graphical Statistics 32.1 (2023): +74-83.

+
+
+

Gaussian Processes

+

The final option for modelling temporal dynamics is to use a Gaussian +Process with squared exponential kernel. These are set up independently +for each series (there is currently no multivariate GP option), using +trend_model = 'GP'. The dynamics for each latent process +are modelled as:

+

\[\begin{align*} +z & \sim \text{MVNormal}(0, \Sigma_{error}) \\ +\Sigma_{error}[t_i, t_j] & = \alpha^2 * exp(-0.5 * ((|t_i - t_j| / +\rho))^2) \end{align*}\]

+

The latent dynamic process evolves from a complex, high-dimensional +Multivariate Normal distribution which depends on \(\rho\) (often called the length scale +parameter) to control how quickly the correlations between the model’s +errors decay as a function of time. For these models, covariance decays +exponentially fast with the squared distance (in time) between the +observations. The functions also depend on a parameter \(\alpha\), which controls the marginal +variability of the temporal function at all points; in other words it +controls how much the GP term contributes to the linear predictor. +mvgam capitalizes on some advances that allow GPs to be +approximated using Hilbert space basis functions, which considerably speed up computation at little cost to +accuracy or prediction performance.
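A sketch of the GP option (illustrative dat as before):

# approximate GP trends with squared exponential kernels; the model summary
# will then report posterior estimates for each series' alpha and rho
mod_gp <- mvgam(y ~ 1, trend_model = 'GP', data = dat)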

+
+ +
+

Continuous time AR(1) processes

+

Most trend models in the mvgam() function expect time to be measured in regularly-spaced, discrete intervals (e.g. one measurement per week, or one per year). But some time series are taken at irregular intervals, and we’d still like to model their autoregressive properties. Setting trend_model = CAR() can be useful for these situations; it currently only supports autoregressive processes of order 1. The evolution of the latent dynamic process follows the form:

+

\[\begin{align*} +z_{i,t} & \sim \text{Normal}(ar1_i^{distance} * z_{i,t-1}, \sigma_i) +\end{align*}\]

+

Where \(distance\) is a vector of +non-negative measurements of the time differences between successive +observations. See the Examples section in +?CAR for an illustration of how to set these models up.
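A rough sketch with irregularly sampled observations (the data here are invented purely for illustration):

irreg_dat <- data.frame(y = rnorm(8),
                        # unevenly spaced sampling times
                        time = c(1, 2, 5, 6, 10, 11, 14, 20),
                        series = factor('series_1'))
mod_car <- mvgam(y ~ 1,
                 family = gaussian(),
                 trend_model = CAR(p = 1),
                 data = irreg_dat)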

+
+
+
+

Regression formulae

+

mvgam supports an observation model regression formula, built off the mgcv package, as well as an optional process model regression formula. The formulae supplied to mvgam() are exactly like those supplied to glm() except that smooth terms, s(), te(), ti() and t2(), time-varying effects using dynamic(), monotonically increasing (using s(x, bs = 'moi')) or decreasing splines (using s(x, bs = 'mod'); see ?smooth.construct.moi.smooth.spec for details), as well as Gaussian Process functions using gp(), can be added to the right hand side (and . is not supported in mvgam formulae). See ?mvgam_formulae for more guidance.
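An illustrative formula mixing several of these term types (the particular combination of covariates is hypothetical):

count ~ series +
  s(mintemp, k = 5) +      # nonlinear smooth of a covariate
  dynamic(precipitation) + # time-varying coefficient for a covariate
  gp(ndvi)                 # Gaussian Process effect of a covariate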

+

For setting up State-Space models, the optional process model formula +can be used (see the +State-Space model vignette and the +shared latent states vignette for guidance on using trend +formulae).
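A hedged sketch of that State-Space style, in which the observation and process models receive separate formulae (again using the illustrative dat):

mod_ss <- mvgam(formula = y ~ series,                   # observation model
                trend_formula = ~ s(season, bs = 'cc'), # process model
                trend_model = AR(),
                data = dat)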

+
+
+

Example time series data

+

The ‘portal_data’ object contains time series of rodent captures from the Portal Project, a long-term monitoring study based near the town of Portal, Arizona. Researchers have been operating a standardized set of baited traps within 24 experimental plots at this site since the 1970s. Sampling follows the lunar monthly cycle, with observations occurring on average about 28 days apart. However, missing observations do occur due to difficulties accessing the site (weather events, COVID disruptions etc…). You can read about the full sampling protocol in this preprint by Ernest et al. on bioRxiv.

+
data("portal_data")
+

As the data come pre-loaded with the mvgam package, you +can read a little about it in the help page using +?portal_data. Before working with data, it is important to +inspect how the data are structured, first using head:

+
head(portal_data)
+#>   moon DM DO PP OT year month mintemp precipitation     ndvi
+#> 1  329 10  6  0  2 2004     1  -9.710          37.8 1.465889
+#> 2  330 14  8  1  0 2004     2  -5.924           8.7 1.558507
+#> 3  331  9  1  2  1 2004     3  -0.220          43.5 1.337817
+#> 4  332 NA NA NA NA 2004     4   1.931          23.9 1.658913
+#> 5  333 15  8 10  1 2004     5   6.568           0.9 1.853656
+#> 6  334 NA NA NA NA 2004     6  11.590           1.4 1.761330
+

But the glimpse function in dplyr is also +useful for understanding how variables are structured

+
dplyr::glimpse(portal_data)
+#> Rows: 199
+#> Columns: 10
+#> $ moon          <int> 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 3…
+#> $ DM            <int> 10, 14, 9, NA, 15, NA, NA, 9, 5, 8, NA, 14, 7, NA, NA, 9…
+#> $ DO            <int> 6, 8, 1, NA, 8, NA, NA, 3, 3, 4, NA, 3, 8, NA, NA, 3, NA…
+#> $ PP            <int> 0, 1, 2, NA, 10, NA, NA, 16, 18, 12, NA, 3, 2, NA, NA, 1…
+#> $ OT            <int> 2, 0, 1, NA, 1, NA, NA, 1, 0, 0, NA, 2, 1, NA, NA, 1, NA…
+#> $ year          <int> 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 20…
+#> $ month         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6,…
+#> $ mintemp       <dbl> -9.710, -5.924, -0.220, 1.931, 6.568, 11.590, 14.370, 16…
+#> $ precipitation <dbl> 37.8, 8.7, 43.5, 23.9, 0.9, 1.4, 20.3, 91.0, 60.5, 25.2,…
+#> $ ndvi          <dbl> 1.4658889, 1.5585069, 1.3378172, 1.6589129, 1.8536561, 1…
+

We will focus analyses on the time series of captures for one +specific rodent species, the Desert Pocket Mouse Chaetodipus +penicillatus. This species is interesting in that it goes into a +kind of “hibernation” during the colder months, leading to very low +captures during the winter period

+
+
+

Manipulating data for modelling

+

Manipulating the data into a ‘long’ format is necessary for modelling +in mvgam. By ‘long’ format, we mean that each +series x time observation needs to have its own entry in +the dataframe or list object that we wish to +use as data for modelling. A simple example can be viewed by simulating +data using the sim_mvgam function. See +?sim_mvgam for more details

+
data <- sim_mvgam(n_series = 4, T = 24)
+head(data$data_train, 12)
+#>    y season year   series time
+#> 1  1      1    1 series_1    1
+#> 2  0      1    1 series_2    1
+#> 3  1      1    1 series_3    1
+#> 4  1      1    1 series_4    1
+#> 5  1      2    1 series_1    2
+#> 6  1      2    1 series_2    2
+#> 7  0      2    1 series_3    2
+#> 8  1      2    1 series_4    2
+#> 9  2      3    1 series_1    3
+#> 10 0      3    1 series_2    3
+#> 11 0      3    1 series_3    3
+#> 12 1      3    1 series_4    3
+

Notice how we have four different time series in these simulated +data, but we do not spread the outcome values into different columns. +Rather, there is only a single column for the outcome variable, labelled +y in these simulated data. We also must supply a variable +labelled time to ensure the modelling software knows how to +arrange the time series when building models. This setup still allows us +to formulate multivariate time series models, as you can see in the State-Space +vignette. Below are the steps needed to shape our +portal_data object into the correct form. First, we create +a time variable, select the column representing counts of +our target species (PP), and select appropriate variables +that we can use as predictors

+
portal_data %>%
+  
+  # mvgam requires a 'time' variable be present in the data to index
+  # the temporal observations. This is especially important when tracking 
+  # multiple time series. In the Portal data, the 'moon' variable indexes the
+  # lunar monthly timestep of the trapping sessions
+  dplyr::mutate(time = moon - (min(moon)) + 1) %>%
+  
+  # We can also provide a more informative name for the outcome variable, which 
+  # is counts of the 'PP' species (Chaetodipus penicillatus) across all control
+  # plots
+  dplyr::mutate(count = PP) %>%
+  
+  # The other requirement for mvgam is a 'series' variable, which needs to be a
+  # factor variable to index which time series each row in the data belongs to.
+  # Again, this is more useful when you have multiple time series in the data
+  dplyr::mutate(series = as.factor('PP')) %>%
+  
+  # Select the variables of interest to keep in the model_data
+  dplyr::select(series, year, time, count, mintemp, ndvi) -> model_data
+

The data now contain six variables:

- series, a factor indexing which time series each observation belongs to
- year, the year of sampling
- time, the indicator of which time step each observation belongs to
- count, the response variable representing the number of captures of the species PP in each sampling observation
- mintemp, the monthly average minimum temperature at each time step
- ndvi, the monthly average Normalized Difference Vegetation Index at each time step
+ndvi, the monthly average Normalized Difference Vegetation +Index at each time step

+

Now check the data structure again

+
head(model_data)
+#>   series year time count mintemp     ndvi
+#> 1     PP 2004    1     0  -9.710 1.465889
+#> 2     PP 2004    2     1  -5.924 1.558507
+#> 3     PP 2004    3     2  -0.220 1.337817
+#> 4     PP 2004    4    NA   1.931 1.658913
+#> 5     PP 2004    5    10   6.568 1.853656
+#> 6     PP 2004    6    NA  11.590 1.761330
+
dplyr::glimpse(model_data)
+#> Rows: 199
+#> Columns: 6
+#> $ series  <fct> PP, PP, PP, PP, PP, PP, PP, PP, PP, PP, PP, PP, PP, PP, PP, PP…
+#> $ year    <int> 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 20…
+#> $ time    <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,…
+#> $ count   <int> 0, 1, 2, NA, 10, NA, NA, 16, 18, 12, NA, 3, 2, NA, NA, 13, NA,…
+#> $ mintemp <dbl> -9.710, -5.924, -0.220, 1.931, 6.568, 11.590, 14.370, 16.520, …
+#> $ ndvi    <dbl> 1.4658889, 1.5585069, 1.3378172, 1.6589129, 1.8536561, 1.76132…
+

You can also summarize multiple variables at once, which is helpful for checking data ranges and identifying missing values

+
summary(model_data)
+#>  series        year           time           count          mintemp       
+#>  PP:199   Min.   :2004   Min.   :  1.0   Min.   : 0.00   Min.   :-24.000  
+#>           1st Qu.:2008   1st Qu.: 50.5   1st Qu.: 2.50   1st Qu.: -3.884  
+#>           Median :2012   Median :100.0   Median :12.00   Median :  2.130  
+#>           Mean   :2012   Mean   :100.0   Mean   :15.14   Mean   :  3.504  
+#>           3rd Qu.:2016   3rd Qu.:149.5   3rd Qu.:24.00   3rd Qu.: 12.310  
+#>           Max.   :2020   Max.   :199.0   Max.   :65.00   Max.   : 18.140  
+#>                                          NA's   :36                       
+#>       ndvi       
+#>  Min.   :0.2817  
+#>  1st Qu.:1.0741  
+#>  Median :1.3501  
+#>  Mean   :1.4709  
+#>  3rd Qu.:1.8178  
+#>  Max.   :3.9126  
+#> 
+

We have some NAs in our response variable +count. Let’s visualize the data as a heatmap to get a sense +of where these are distributed (NAs are shown as red bars +in the below plot)

+
image(is.na(t(model_data %>%
+                dplyr::arrange(dplyr::desc(time)))), axes = F,
+      col = c('grey80', 'darkred'))
+axis(3, at = seq(0,1, len = NCOL(model_data)), labels = colnames(model_data))
+

+

These observations will generally be thrown out by most modelling packages in R. But as you will see when we work through the tutorials, mvgam keeps these in the data so that predictions can be automatically returned for the full dataset. The time series and some of its descriptive features can be plotted using plot_mvgam_series():

+
plot_mvgam_series(data = model_data, series = 1, y = 'count')
+

+
+
+

GLMs with temporal random effects

+

Our first task will be to fit a Generalized Linear Model (GLM) that can adequately capture the features of our count observations (integer data, lower bound at zero, missing values) while also attempting to model temporal variation. We are almost ready to fit our first model, which will be a GLM with Poisson observations, a log link function and random (hierarchical) intercepts for year. This will allow us to capture our prior belief that, although each year is unique, all years have been sampled from the same population of effects and are thus connected, potentially containing valuable information about one another. This will be done by capitalizing on the partial pooling properties of hierarchical models. Hierarchical (also known as random) effects offer many advantages when modelling data with grouping structures (e.g. multiple species, locations, years etc…). The ability to incorporate these in time series models is a huge advantage over traditional models such as ARIMA or Exponential Smoothing. But before we fit the model, we will need to convert year to a factor so that we can use a random effect basis in mvgam. See ?smooth.terms and ?smooth.construct.re.smooth.spec for details about the re basis construction that is used by both mvgam and mgcv.

+
model_data %>%
+  
+  # Create a 'year_fac' factor version of 'year'
+  dplyr::mutate(year_fac = factor(year)) -> model_data
+

Preview the dataset to ensure year is now a factor with a unique +factor level for each year in the data

+
dplyr::glimpse(model_data)
+#> Rows: 199
+#> Columns: 7
+#> $ series   <fct> PP, PP, PP, PP, PP, PP, PP, PP, PP, PP, PP, PP, PP, PP, PP, P…
+#> $ year     <int> 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2…
+#> $ time     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
+#> $ count    <int> 0, 1, 2, NA, 10, NA, NA, 16, 18, 12, NA, 3, 2, NA, NA, 13, NA…
+#> $ mintemp  <dbl> -9.710, -5.924, -0.220, 1.931, 6.568, 11.590, 14.370, 16.520,…
+#> $ ndvi     <dbl> 1.4658889, 1.5585069, 1.3378172, 1.6589129, 1.8536561, 1.7613…
+#> $ year_fac <fct> 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2…
+levels(model_data$year_fac)
+#>  [1] "2004" "2005" "2006" "2007" "2008" "2009" "2010" "2011" "2012" "2013"
+#> [11] "2014" "2015" "2016" "2017" "2018" "2019" "2020"
+

We are now ready for our first mvgam model. The syntax +will be familiar to users who have previously built models with +mgcv. But for a refresher, see ?formula.gam +and the examples in ?gam. Random effects can be specified +using the s wrapper with the re basis. Note +that we can also suppress the primary intercept using the usual +R formula syntax - 1. mvgam has a +number of possible observation families that can be used, see +?mvgam_families for more information. We will use +Stan as the fitting engine, which deploys Hamiltonian Monte +Carlo (HMC) for full Bayesian inference. By default, 4 HMC chains will +be run using a warmup of 500 iterations and collecting 500 posterior +samples from each chain. The package will also aim to use the +Cmdstan backend when possible, so it is recommended that +users have an up-to-date installation of Cmdstan and the +associated cmdstanr interface on their machines (note that +you can set the backend yourself using the backend +argument: see ?mvgam for details). Interested users should +consult the Stan user’s guide for more information +about the software and the enormous variety of models that can be +tackled with HMC.

+
model1 <- mvgam(count ~ s(year_fac, bs = 're') - 1,
+                family = poisson(),
+                data = model_data)
+

The model can be described mathematically for each timepoint \(t\) as follows: \[\begin{align*} +\boldsymbol{count}_t & \sim \text{Poisson}(\lambda_t) \\ +log(\lambda_t) & = \beta_{year[year_t]} \\ +\beta_{year} & \sim \text{Normal}(\mu_{year}, \sigma_{year}) +\end{align*}\]

+

Where the \(\beta_{year}\) effects +are drawn from a population distribution that is parameterized +by a common mean \((\mu_{year})\) and +variance \((\sigma_{year})\). Priors on +most of the model parameters can be interrogated and changed using +similar functionality to the options available in brms. For +example, the default priors on \((\mu_{year})\) and \((\sigma_{year})\) can be viewed using the +following code:

+
get_mvgam_priors(count ~ s(year_fac, bs = 're') - 1,
+                 family = poisson(),
+                 data = model_data)
+#>                      param_name param_length           param_info
+#> 1             vector[1] mu_raw;            1 s(year_fac) pop mean
+#> 2 vector<lower=0>[1] sigma_raw;            1   s(year_fac) pop sd
+#>                               prior                example_change
+#> 1            mu_raw ~ std_normal();  mu_raw ~ normal(0.17, 0.76);
+#> 2 sigma_raw ~ student_t(3, 0, 2.5); sigma_raw ~ exponential(0.7);
+#>   new_lowerbound new_upperbound
+#> 1             NA             NA
+#> 2             NA             NA
+

See examples in ?get_mvgam_priors to find out different +ways that priors can be altered. Once the model has finished, the first +step is to inspect the summary to ensure no major +diagnostic warnings have been produced and to quickly summarise +posterior distributions for key parameters

+
summary(model1)
+#> GAM formula:
+#> count ~ s(year_fac, bs = "re") - 1
+#> 
+#> Family:
+#> poisson
+#> 
+#> Link function:
+#> log
+#> 
+#> Trend model:
+#> None
+#> 
+#> N series:
+#> 1 
+#> 
+#> N timepoints:
+#> 199 
+#> 
+#> Status:
+#> Fitted using Stan 
+#> 4 chains, each with iter = 1000; warmup = 500; thin = 1 
+#> Total post-warmup draws = 2000
+#> 
+#> 
+#> GAM coefficient (beta) estimates:
+#>                 2.5% 50% 97.5% Rhat n_eff
+#> s(year_fac).1   1.80 2.1   2.3 1.00  2663
+#> s(year_fac).2   2.50 2.7   2.8 1.00  2468
+#> s(year_fac).3   3.00 3.1   3.2 1.00  3105
+#> s(year_fac).4   3.10 3.3   3.4 1.00  2822
+#> s(year_fac).5   1.90 2.1   2.3 1.00  3348
+#> s(year_fac).6   1.50 1.8   2.0 1.00  2859
+#> s(year_fac).7   1.80 2.0   2.3 1.00  2995
+#> s(year_fac).8   2.80 3.0   3.1 1.00  3126
+#> s(year_fac).9   3.10 3.3   3.4 1.00  2816
+#> s(year_fac).10  2.60 2.8   2.9 1.00  2289
+#> s(year_fac).11  3.00 3.1   3.2 1.00  2725
+#> s(year_fac).12  3.10 3.2   3.3 1.00  2581
+#> s(year_fac).13  2.00 2.2   2.5 1.00  2885
+#> s(year_fac).14  2.50 2.6   2.8 1.00  2749
+#> s(year_fac).15  1.90 2.2   2.4 1.00  2943
+#> s(year_fac).16  1.90 2.1   2.3 1.00  2991
+#> s(year_fac).17 -0.33 1.1   1.9 1.01   356
+#> 
+#> GAM group-level estimates:
+#>                   2.5%  50% 97.5% Rhat n_eff
+#> mean(s(year_fac)) 2.00 2.40   2.7 1.01   193
+#> sd(s(year_fac))   0.44 0.67   1.1 1.02   172
+#> 
+#> Approximate significance of GAM smooths:
+#>              edf Ref.df Chi.sq p-value    
+#> s(year_fac) 13.8     17  23477  <2e-16 ***
+#> ---
+#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+#> 
+#> Stan MCMC diagnostics:
+#> n_eff / iter looks reasonable for all parameters
+#> Rhat looks reasonable for all parameters
+#> 0 of 2000 iterations ended with a divergence (0%)
+#> 0 of 2000 iterations saturated the maximum tree depth of 12 (0%)
+#> E-FMI indicated no pathological behavior
+#> 
+#> Samples were drawn using NUTS(diag_e) at Tue Apr 16 12:59:57 PM 2024.
+#> For each parameter, n_eff is a crude measure of effective sample size,
+#> and Rhat is the potential scale reduction factor on split MCMC chains
+#> (at convergence, Rhat = 1)
+

The diagnostic messages at the bottom of the summary show that the +HMC sampler did not encounter any problems or difficult posterior +spaces. This is a good sign. Posterior distributions for model +parameters can be extracted in any way that an object of class +brmsfit can (see ?mvgam::mvgam_draws for +details). For example, we can extract the coefficients related to the +GAM linear predictor (i.e. the \(\beta\)’s) into a data.frame +using:

+
beta_post <- as.data.frame(model1, variable = 'betas')
+dplyr::glimpse(beta_post)
+#> Rows: 2,000
+#> Columns: 17
+#> $ `s(year_fac).1`  <dbl> 2.17023, 2.08413, 1.99815, 2.17572, 2.11308, 2.03050,…
+#> $ `s(year_fac).2`  <dbl> 2.70488, 2.69887, 2.65551, 2.79651, 2.76044, 2.75108,…
+#> $ `s(year_fac).3`  <dbl> 3.08617, 3.13429, 3.04575, 3.14824, 3.10917, 3.09809,…
+#> $ `s(year_fac).4`  <dbl> 3.29529, 3.21044, 3.22018, 3.26644, 3.29880, 3.25638,…
+#> $ `s(year_fac).5`  <dbl> 2.11053, 2.14516, 2.13959, 2.05244, 2.26847, 2.20820,…
+#> $ `s(year_fac).6`  <dbl> 1.80418, 1.83343, 1.75987, 1.76972, 1.64782, 1.70765,…
+#> $ `s(year_fac).7`  <dbl> 1.99033, 1.95772, 1.98093, 2.01777, 2.04849, 1.97815,…
+#> $ `s(year_fac).8`  <dbl> 3.01204, 2.91291, 3.14762, 2.83082, 2.90250, 3.04050,…
+#> $ `s(year_fac).9`  <dbl> 3.22248, 3.20205, 3.30373, 3.23181, 3.24927, 3.25232,…
+#> $ `s(year_fac).10` <dbl> 2.71922, 2.62225, 2.82574, 2.65027, 2.69077, 2.75249,…
+#> $ `s(year_fac).11` <dbl> 3.10525, 3.03951, 3.12914, 3.03849, 3.01198, 3.14391,…
+#> $ `s(year_fac).12` <dbl> 3.20887, 3.23337, 3.24350, 3.16821, 3.23516, 3.18216,…
+#> $ `s(year_fac).13` <dbl> 2.18530, 2.15358, 2.39908, 2.21862, 2.14648, 2.17067,…
+#> $ `s(year_fac).14` <dbl> 2.66153, 2.67202, 2.64594, 2.57457, 2.38109, 2.44175,…
+#> $ `s(year_fac).15` <dbl> 2.24898, 2.24912, 2.03587, 2.33842, 2.27868, 2.24643,…
+#> $ `s(year_fac).16` <dbl> 2.20947, 2.21717, 2.03610, 2.17374, 2.16442, 2.14900,…
+#> $ `s(year_fac).17` <dbl> 0.1428430, 0.8005170, -0.0136294, 0.6880930, 0.192034…
+

With any model fitted in mvgam, the underlying +Stan code can be viewed using the code +function:

+
code(model1)
+#> // Stan model code generated by package mvgam
+#> data {
+#>   int<lower=0> total_obs; // total number of observations
+#>   int<lower=0> n; // number of timepoints per series
+#>   int<lower=0> n_series; // number of series
+#>   int<lower=0> num_basis; // total number of basis coefficients
+#>   matrix[total_obs, num_basis] X; // mgcv GAM design matrix
+#>   array[n, n_series] int<lower=0> ytimes; // time-ordered matrix (which col in X belongs to each [time, series] observation?)
+#>   int<lower=0> n_nonmissing; // number of nonmissing observations
+#>   array[n_nonmissing] int<lower=0> flat_ys; // flattened nonmissing observations
+#>   matrix[n_nonmissing, num_basis] flat_xs; // X values for nonmissing observations
+#>   array[n_nonmissing] int<lower=0> obs_ind; // indices of nonmissing observations
+#> }
+#> parameters {
+#>   // raw basis coefficients
+#>   vector[num_basis] b_raw;
+#>   
+#>   // random effect variances
+#>   vector<lower=0>[1] sigma_raw;
+#>   
+#>   // random effect means
+#>   vector[1] mu_raw;
+#> }
+#> transformed parameters {
+#>   // basis coefficients
+#>   vector[num_basis] b;
+#>   b[1 : 17] = mu_raw[1] + b_raw[1 : 17] * sigma_raw[1];
+#> }
+#> model {
+#>   // prior for random effect population variances
+#>   sigma_raw ~ student_t(3, 0, 2.5);
+#>   
+#>   // prior for random effect population means
+#>   mu_raw ~ std_normal();
+#>   
+#>   // prior (non-centred) for s(year_fac)...
+#>   b_raw[1 : 17] ~ std_normal();
+#>   {
+#>     // likelihood functions
+#>     flat_ys ~ poisson_log_glm(flat_xs, 0.0, b);
+#>   }
+#> }
+#> generated quantities {
+#>   vector[total_obs] eta;
+#>   matrix[n, n_series] mus;
+#>   array[n, n_series] int ypred;
+#>   
+#>   // posterior predictions
+#>   eta = X * b;
+#>   for (s in 1 : n_series) {
+#>     mus[1 : n, s] = eta[ytimes[1 : n, s]];
+#>     ypred[1 : n, s] = poisson_log_rng(mus[1 : n, s]);
+#>   }
+#> }
+
+

Plotting effects and residuals

+

Now for interrogating the model. We can get some sense of the +variation in yearly intercepts from the summary above, but it is easier +to understand them using targeted plots. Plot posterior distributions of +the temporal random effects using plot.mvgam with +type = 're'. See ?plot.mvgam for more details +about the types of plots that can be produced from fitted +mvgam objects

+
plot(model1, type = 're')
+

+
+
+

bayesplot support

+

We can also capitalize on most of the useful MCMC plotting functions +from the bayesplot package to visualize posterior +distributions and diagnostics (see ?mvgam::mcmc_plot.mvgam +for details):

+
mcmc_plot(object = model1,
+          variable = 'betas',
+          type = 'areas')
+

+

We can also use the wide range of posterior checking functions available in bayesplot (see ?mvgam::pp_check.mvgam for details):

+
pp_check(object = model1)
+#> Using 10 posterior draws for ppc type 'dens_overlay' by default.
+#> Warning in pp_check.mvgam(object = model1): NA responses are not shown in
+#> 'pp_check'.
+

+
pp_check(model1, type = "rootogram")
+#> Using all posterior draws for ppc type 'rootogram' by default.
+#> Warning in pp_check.mvgam(model1, type = "rootogram"): NA responses are not
+#> shown in 'pp_check'.
+

+

There is clearly some variation in these yearly intercept estimates. +But how do these translate into time-varying predictions? To understand +this, we can plot posterior hindcasts from this model for the training +period using plot.mvgam with +type = 'forecast'

+
plot(model1, type = 'forecast')
+

+

If you wish to extract these hindcasts for other downstream analyses, +the hindcast function can be used. This will return a list +object of class mvgam_forecast. In the +hindcasts slot, a matrix of posterior retrodictions will be +returned for each series in the data (only one series in our +example):

+
hc <- hindcast(model1)
+str(hc)
+#> List of 15
+#>  $ call              :Class 'formula'  language count ~ s(year_fac, bs = "re") - 1
+#>   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
+#>  $ trend_call        : NULL
+#>  $ family            : chr "poisson"
+#>  $ trend_model       : chr "None"
+#>  $ drift             : logi FALSE
+#>  $ use_lv            : logi FALSE
+#>  $ fit_engine        : chr "stan"
+#>  $ type              : chr "response"
+#>  $ series_names      : chr "PP"
+#>  $ train_observations:List of 1
+#>   ..$ PP: int [1:199] 0 1 2 NA 10 NA NA 16 18 12 ...
+#>  $ train_times       : num [1:199] 1 2 3 4 5 6 7 8 9 10 ...
+#>  $ test_observations : NULL
+#>  $ test_times        : NULL
+#>  $ hindcasts         :List of 1
+#>   ..$ PP: num [1:2000, 1:199] 9 6 10 6 12 8 10 5 8 6 ...
+#>   .. ..- attr(*, "dimnames")=List of 2
+#>   .. .. ..$ : NULL
+#>   .. .. ..$ : chr [1:199] "ypred[1,1]" "ypred[2,1]" "ypred[3,1]" "ypred[4,1]" ...
+#>  $ forecasts         : NULL
+#>  - attr(*, "class")= chr "mvgam_forecast"
+

You can also extract these hindcasts on the linear predictor scale, +which in this case is the log scale (our Poisson GLM used a log link +function). Sometimes this can be useful for asking more targeted +questions about drivers of variation:

+
hc <- hindcast(model1, type = 'link')
+range(hc$hindcasts$PP)
+#> [1] -1.51216  3.46162
+

Objects of class mvgam_forecast have an associated plot +function as well:

+
plot(hc)
+

+

This plot can look a bit confusing as it seems like there is linear +interpolation from the end of one year to the start of the next. But +this is just due to the way the lines are automatically connected in +base plots

+

In any regression analysis, a key question is whether the residuals +show any patterns that can be indicative of un-modelled sources of +variation. For GLMs, we can use a modified residual called the Dunn-Smyth, +or randomized quantile, residual. Inspect Dunn-Smyth residuals from +the model using plot.mvgam with +type = 'residuals'

+
plot(model1, type = 'residuals')
+

+
+
+
+

Automatic forecasting for new data

+

These temporal random effects do not have a sense of “time”. Because of this, each yearly random intercept is not restricted in some way to be similar to the previous yearly intercept. This drawback becomes evident when we predict for a new year. To do this, we can repeat the exercise above, but this time we will split the data into training and testing sets before re-running the model. We can then supply the test set as newdata. For splitting, we will make use of the filter function from dplyr.

+
model_data %>% 
+  dplyr::filter(time <= 160) -> data_train 
+model_data %>% 
+  dplyr::filter(time > 160) -> data_test
+
model1b <- mvgam(count ~ s(year_fac, bs = 're') - 1,
+                family = poisson(),
+                data = data_train,
+                newdata = data_test)
+

Repeating the plots above gives insight into how the model’s +hierarchical prior formulation provides all the structure needed to +sample values for un-modelled years

+
plot(model1b, type = 're')
+

+
plot(model1b, type = 'forecast')
+

+
#> Out of sample DRPS:
+#> [1] 182.6177
+

We can also view the test data in the forecast plot to see that the +predictions do not capture the temporal variation in the test set

+
plot(model1b, type = 'forecast', newdata = data_test)
+

+
#> Out of sample DRPS:
+#> [1] 182.6177
+

As with the hindcast function, we can use the +forecast function to automatically extract the posterior +distributions for these predictions. This also returns an object of +class mvgam_forecast, but now it will contain both the +hindcasts and forecasts for each series in the data:

+
fc <- forecast(model1b)
+str(fc)
+#> List of 16
+#>  $ call              :Class 'formula'  language count ~ s(year_fac, bs = "re") - 1
+#>   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
+#>  $ trend_call        : NULL
+#>  $ family            : chr "poisson"
+#>  $ family_pars       : NULL
+#>  $ trend_model       : chr "None"
+#>  $ drift             : logi FALSE
+#>  $ use_lv            : logi FALSE
+#>  $ fit_engine        : chr "stan"
+#>  $ type              : chr "response"
+#>  $ series_names      : Factor w/ 1 level "PP": 1
+#>  $ train_observations:List of 1
+#>   ..$ PP: int [1:160] 0 1 2 NA 10 NA NA 16 18 12 ...
+#>  $ train_times       : num [1:160] 1 2 3 4 5 6 7 8 9 10 ...
+#>  $ test_observations :List of 1
+#>   ..$ PP: int [1:39] NA 0 0 10 3 14 18 NA 28 46 ...
+#>  $ test_times        : num [1:39] 161 162 163 164 165 166 167 168 169 170 ...
+#>  $ hindcasts         :List of 1
+#>   ..$ PP: num [1:2000, 1:160] 10 7 8 11 6 11 9 11 5 2 ...
+#>   .. ..- attr(*, "dimnames")=List of 2
+#>   .. .. ..$ : NULL
+#>   .. .. ..$ : chr [1:160] "ypred[1,1]" "ypred[2,1]" "ypred[3,1]" "ypred[4,1]" ...
+#>  $ forecasts         :List of 1
+#>   ..$ PP: num [1:2000, 1:39] 5 12 10 7 7 8 11 14 8 12 ...
+#>   .. ..- attr(*, "dimnames")=List of 2
+#>   .. .. ..$ : NULL
+#>   .. .. ..$ : chr [1:39] "ypred[161,1]" "ypred[162,1]" "ypred[163,1]" "ypred[164,1]" ...
+#>  - attr(*, "class")= chr "mvgam_forecast"
+
+
+

Adding predictors as “fixed” effects

+

Any users familiar with GLMs will know that we nearly always wish to +include predictor variables that may explain some of the variation in +our observations. Predictors are easily incorporated into GLMs / GAMs. +Here, we will update the model from above by including a parametric +(fixed) effect of ndvi as a linear predictor:

+
model2 <- mvgam(count ~ s(year_fac, bs = 're') + 
+                  ndvi - 1,
+                family = poisson(),
+                data = data_train,
+                newdata = data_test)
+

The model can be described mathematically as follows: \[\begin{align*} +\boldsymbol{count}_t & \sim \text{Poisson}(\lambda_t) \\ +log(\lambda_t) & = \beta_{year[year_t]} + \beta_{ndvi} * +\boldsymbol{ndvi}_t \\ +\beta_{year} & \sim \text{Normal}(\mu_{year}, \sigma_{year}) \\ +\beta_{ndvi} & \sim \text{Normal}(0, 1) \end{align*}\]

+

Where the \(\beta_{year}\) effects +are the same as before but we now have another predictor \((\beta_{ndvi})\) that applies to the +ndvi value at each timepoint \(t\). Inspect the summary of this model

+
summary(model2)
+#> GAM formula:
+#> count ~ ndvi + s(year_fac, bs = "re") - 1
+#> 
+#> Family:
+#> poisson
+#> 
+#> Link function:
+#> log
+#> 
+#> Trend model:
+#> None
+#> 
+#> N series:
+#> 1 
+#> 
+#> N timepoints:
+#> 160 
+#> 
+#> Status:
+#> Fitted using Stan 
+#> 4 chains, each with iter = 1000; warmup = 500; thin = 1 
+#> Total post-warmup draws = 2000
+#> 
+#> 
+#> GAM coefficient (beta) estimates:
+#>                2.5%  50% 97.5% Rhat n_eff
+#> ndvi           0.32 0.39  0.46    1  1696
+#> s(year_fac).1  1.10 1.40  1.70    1  2512
+#> s(year_fac).2  1.80 2.00  2.20    1  2210
+#> s(year_fac).3  2.20 2.40  2.60    1  2109
+#> s(year_fac).4  2.30 2.50  2.70    1  1780
+#> s(year_fac).5  1.20 1.40  1.60    1  2257
+#> s(year_fac).6  1.00 1.30  1.50    1  2827
+#> s(year_fac).7  1.10 1.40  1.70    1  2492
+#> s(year_fac).8  2.10 2.30  2.50    1  2188
+#> s(year_fac).9  2.70 2.90  3.00    1  2014
+#> s(year_fac).10 2.00 2.20  2.40    1  2090
+#> s(year_fac).11 2.30 2.40  2.60    1  1675
+#> s(year_fac).12 2.50 2.70  2.80    1  2108
+#> s(year_fac).13 1.40 1.60  1.80    1  2161
+#> s(year_fac).14 0.46 2.00  3.20    1  1849
+#> s(year_fac).15 0.53 2.00  3.30    1  1731
+#> s(year_fac).16 0.53 2.00  3.30    1  1859
+#> s(year_fac).17 0.59 1.90  3.20    1  1761
+#> 
+#> GAM group-level estimates:
+#>                   2.5%  50% 97.5% Rhat n_eff
+#> mean(s(year_fac))  1.6 2.00   2.3 1.01   397
+#> sd(s(year_fac))    0.4 0.59   1.0 1.01   395
+#> 
+#> Approximate significance of GAM smooths:
+#>              edf Ref.df Chi.sq p-value    
+#> s(year_fac) 11.2     17   3096  <2e-16 ***
+#> ---
+#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+#> 
+#> Stan MCMC diagnostics:
+#> n_eff / iter looks reasonable for all parameters
+#> Rhat looks reasonable for all parameters
+#> 0 of 2000 iterations ended with a divergence (0%)
+#> 0 of 2000 iterations saturated the maximum tree depth of 12 (0%)
+#> E-FMI indicated no pathological behavior
+#> 
+#> Samples were drawn using NUTS(diag_e) at Tue Apr 16 1:00:50 PM 2024.
+#> For each parameter, n_eff is a crude measure of effective sample size,
+#> and Rhat is the potential scale reduction factor on split MCMC chains
+#> (at convergence, Rhat = 1)
+

Rather than printing the summary each time, we can also quickly look +at the posterior empirical quantiles for the fixed effect of +ndvi (and other linear predictor coefficients) using +coef:

+
coef(model2)
+#>                     2.5%       50%     97.5% Rhat n_eff
+#> ndvi           0.3198694 0.3899835 0.4571083    1  1696
+#> s(year_fac).1  1.1176373 1.4085900 1.6603838    1  2512
+#> s(year_fac).2  1.8008470 2.0005000 2.2003670    1  2210
+#> s(year_fac).3  2.1842727 2.3822950 2.5699363    1  2109
+#> s(year_fac).4  2.3267037 2.5022700 2.6847912    1  1780
+#> s(year_fac).5  1.1945853 1.4215950 1.6492038    1  2257
+#> s(year_fac).6  1.0332160 1.2743050 1.5091052    1  2827
+#> s(year_fac).7  1.1467567 1.4119100 1.6751850    1  2492
+#> s(year_fac).8  2.0710285 2.2713050 2.4596285    1  2188
+#> s(year_fac).9  2.7198967 2.8557300 2.9874662    1  2014
+#> s(year_fac).10 1.9798730 2.1799600 2.3932595    1  2090
+#> s(year_fac).11 2.2734940 2.4374700 2.6130482    1  1675
+#> s(year_fac).12 2.5421157 2.6935350 2.8431822    1  2108
+#> s(year_fac).13 1.3786087 1.6177850 1.8495872    1  2161
+#> s(year_fac).14 0.4621041 1.9744700 3.2480377    1  1849
+#> s(year_fac).15 0.5293684 2.0014200 3.2766722    1  1731
+#> s(year_fac).16 0.5285142 1.9786450 3.2859085    1  1859
+#> s(year_fac).17 0.5909969 1.9462850 3.2306940    1  1761
+

Look at the estimated effect of ndvi using +plot.mvgam with type = 'pterms'

+
plot(model2, type = 'pterms')
+

+

This plot indicates a positive linear effect of ndvi on +log(counts). But it may be easier to visualise using a +histogram, especially for parametric (linear) effects. This can be done +by first extracting the posterior coefficients as we did in the first +example:

+
beta_post <- as.data.frame(model2, variable = 'betas')
+dplyr::glimpse(beta_post)
+#> Rows: 2,000
+#> Columns: 18
+#> $ ndvi             <dbl> 0.330568, 0.398734, 0.357498, 0.484288, 0.380087, 0.3…
+#> $ `s(year_fac).1`  <dbl> 1.55868, 1.27949, 1.24414, 1.02997, 1.64712, 1.07519,…
+#> $ `s(year_fac).2`  <dbl> 1.98967, 2.00846, 2.07493, 1.84431, 2.01590, 2.16466,…
+#> $ `s(year_fac).3`  <dbl> 2.41434, 2.16020, 2.67324, 2.33332, 2.32415, 2.45516,…
+#> $ `s(year_fac).4`  <dbl> 2.62215, 2.53992, 2.50659, 2.23671, 2.56663, 2.40054,…
+#> $ `s(year_fac).5`  <dbl> 1.37221, 1.44795, 1.53019, 1.27623, 1.50771, 1.49515,…
+#> $ `s(year_fac).6`  <dbl> 1.323980, 1.220200, 1.165610, 1.271620, 1.193820, 1.3…
+#> $ `s(year_fac).7`  <dbl> 1.52005, 1.30735, 1.42566, 1.13335, 1.61581, 1.31853,…
+#> $ `s(year_fac).8`  <dbl> 2.40223, 2.20021, 2.44366, 2.17192, 2.20837, 2.33066,…
+#> $ `s(year_fac).9`  <dbl> 2.91580, 2.90942, 2.87679, 2.64941, 2.85401, 2.78744,…
+#> $ `s(year_fac).10` <dbl> 2.46559, 2.01466, 2.08319, 2.01400, 2.22965, 2.26523,…
+#> $ `s(year_fac).11` <dbl> 2.52118, 2.45406, 2.46667, 2.20664, 2.42495, 2.46256,…
+#> $ `s(year_fac).12` <dbl> 2.72360, 2.63546, 2.86718, 2.59035, 2.76576, 2.56130,…
+#> $ `s(year_fac).13` <dbl> 1.67388, 1.50790, 1.52463, 1.39004, 1.72927, 1.61023,…
+#> $ `s(year_fac).14` <dbl> 2.583650, 2.034240, 1.819820, 1.579280, 2.426880, 1.8…
+#> $ `s(year_fac).15` <dbl> 2.57365, 2.28723, 1.67404, 1.46796, 2.49512, 2.71230,…
+#> $ `s(year_fac).16` <dbl> 1.801660, 2.185540, 1.756500, 2.098760, 2.270640, 1.8…
+#> $ `s(year_fac).17` <dbl> 0.886081, 3.409300, -0.371795, 2.494990, 1.822150, 2.…
+

The posterior distribution for the effect of ndvi is +stored in the ndvi column. A quick histogram confirms our +inference that log(counts) respond positively to increases +in ndvi:

+
hist(beta_post$ndvi,
+     xlim = c(-1 * max(abs(beta_post$ndvi)),
+              max(abs(beta_post$ndvi))),
+     col = 'darkred',
+     border = 'white',
+     xlab = expression(beta[NDVI]),
+     ylab = '',
+     yaxt = 'n',
+     main = '',
+     lwd = 2)
+abline(v = 0, lwd = 2.5)
+

+
+

marginaleffects support

+

Given our model used a nonlinear link function (log link in this +example), it can still be difficult to fully understand what +relationship our model is estimating between a predictor and the +response. Fortunately, the marginaleffects package makes +this relatively straightforward. Objects of class mvgam can +be used with marginaleffects to inspect contrasts, +scenario-based predictions, conditional and marginal effects, all on the +outcome scale. Here we will use the plot_predictions +function from marginaleffects to inspect the conditional +effect of ndvi (use ?plot_predictions for +guidance on how to modify these plots):

+
plot_predictions(model2, 
+                 condition = "ndvi",
+                 # include the observed count values
+                 # as points, and show rugs for the observed
+                 # ndvi and count values on the axes
+                 points = 0.5, rug = TRUE)
+

+

Now it is easier to get a sense of the nonlinear but positive +relationship estimated between ndvi and count. +Like brms, mvgam has the simple +conditional_effects function to make quick and informative +plots for main effects. This will likely be your go-to function for +quickly understanding patterns from fitted mvgam models

+
plot(conditional_effects(model2), ask = FALSE)
+

+
+
+
+

Adding predictors as smooths

+

Smooth functions, using penalized splines, are a major feature of mvgam. Nonlinear splines are commonly viewed as variations of random effects in which the coefficients that control the shape of the spline are drawn from a joint, penalized distribution. This strategy is very often used in ecological time series analysis to capture smooth temporal variation in the processes we seek to study. When we construct smoothing splines, the workhorse package mgcv will calculate a set of basis functions that will collectively control the shape and complexity of the resulting spline. It is often helpful to visualize these basis functions to get a better sense of how splines work; we could, for example, create a small set of six basis functions to represent possible variation in the effect of time on our outcome (a sketch of this idea appears after the model description below). In addition to constructing the basis functions, mgcv also creates a penalty matrix \(S\), which contains known coefficients that work to constrain the wiggliness of the resulting smooth function. When fitting a GAM to data, we must estimate the smoothing parameters (\(\lambda\)) that will penalize these matrices, resulting in constrained basis coefficients and smoother functions that are less likely to overfit the data. This is the key to fitting GAMs in a Bayesian framework, as we can jointly estimate the \(\lambda\)’s using informative priors to prevent overfitting and expand the complexity of models we can tackle.

To see this in practice, we can now fit a model that replaces the yearly random effects with a smooth function of time. We will need a reasonably complex function (large k) to try and accommodate the temporal variation in our observations. Following some useful advice by Gavin Simpson, we will use a b-spline basis for the temporal smooth. Because we no longer have intercepts for each year, we also retain the primary intercept term in this model (there is no -1 in the formula now):

+
model3 <- mvgam(count ~ s(time, bs = 'bs', k = 15) + 
+                  ndvi,
+                family = poisson(),
+                data = data_train,
+                newdata = data_test)
+

The model can be described mathematically as follows: \[\begin{align*} +\boldsymbol{count}_t & \sim \text{Poisson}(\lambda_t) \\ +log(\lambda_t) & = f(\boldsymbol{time})_t + \beta_{ndvi} * +\boldsymbol{ndvi}_t \\ +f(\boldsymbol{time}) & = \sum_{k=1}^{K}b * \beta_{smooth} \\ +\beta_{smooth} & \sim \text{MVNormal}(0, (\Omega * \lambda)^{-1}) \\ +\beta_{ndvi} & \sim \text{Normal}(0, 1) \end{align*}\]

+

Where the smooth function \(f_{time}\) is built by summing across a set of weighted basis functions. The basis functions \((b)\) are constructed using a b-spline basis in mgcv, matching the bs = 'bs' argument in the formula above. The weights \((\beta_{smooth})\) are drawn from a penalized multivariate normal distribution where the precision matrix \((\Omega)\) is multiplied by a smoothing penalty \((\lambda)\). If \(\lambda\) becomes large, this acts to squeeze the covariances among the weights \((\beta_{smooth})\), leading to a less wiggly spline. Note that sometimes there are multiple smoothing penalties that contribute to the covariance matrix, but I am only showing one here for simplicity.
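To build intuition, here is a small standalone sketch (the k value and dummy time variable are arbitrary) that uses smoothCon() from mgcv to construct a handful of b-spline basis functions outside of any model, along with the penalty matrices that the \(\lambda\)'s act upon:

library(mgcv)
# construct a 6-function b-spline basis over a dummy time variable
sm <- smoothCon(s(time, bs = 'bs', k = 6),
                data = data.frame(time = 1:160))[[1]]
# each column of the design matrix X is one basis function
matplot(sm$X, type = 'l', lty = 1,
        xlab = 'time', ylab = 'basis function value')
# the penalty matrices that the smoothing parameters multiply
str(sm$S)

With that picture in mind, view the summary as before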

+
summary(model3)
+#> GAM formula:
+#> count ~ s(time, bs = "bs", k = 15) + ndvi
+#> 
+#> Family:
+#> poisson
+#> 
+#> Link function:
+#> log
+#> 
+#> Trend model:
+#> None
+#> 
+#> N series:
+#> 1 
+#> 
+#> N timepoints:
+#> 160 
+#> 
+#> Status:
+#> Fitted using Stan 
+#> 4 chains, each with iter = 1000; warmup = 500; thin = 1 
+#> Total post-warmup draws = 2000
+#> 
+#> 
+#> GAM coefficient (beta) estimates:
+#>              2.5%   50%  97.5% Rhat n_eff
+#> (Intercept)  2.00  2.10  2.200 1.00   903
+#> ndvi         0.26  0.33  0.390 1.00   942
+#> s(time).1   -2.10 -1.10  0.029 1.01   484
+#> s(time).2    0.45  1.30  2.400 1.01   411
+#> s(time).3   -0.43  0.45  1.500 1.02   347
+#> s(time).4    1.60  2.50  3.600 1.02   342
+#> s(time).5   -1.10 -0.22  0.880 1.02   375
+#> s(time).6   -0.53  0.36  1.600 1.01   352
+#> s(time).7   -1.50 -0.51  0.560 1.01   406
+#> s(time).8    0.63  1.50  2.600 1.02   340
+#> s(time).9    1.20  2.10  3.200 1.02   346
+#> s(time).10  -0.34  0.54  1.600 1.01   364
+#> s(time).11   0.92  1.80  2.900 1.02   332
+#> s(time).12   0.67  1.50  2.500 1.01   398
+#> s(time).13  -1.20 -0.32  0.700 1.01   420
+#> s(time).14  -7.90 -4.20 -1.200 1.01   414
+#> 
+#> Approximate significance of GAM smooths:
+#>          edf Ref.df Chi.sq p-value    
+#> s(time) 9.41     14    790  <2e-16 ***
+#> ---
+#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+#> 
+#> Stan MCMC diagnostics:
+#> n_eff / iter looks reasonable for all parameters
+#> Rhat looks reasonable for all parameters
+#> 0 of 2000 iterations ended with a divergence (0%)
+#> 0 of 2000 iterations saturated the maximum tree depth of 12 (0%)
+#> E-FMI indicated no pathological behavior
+#> 
+#> Samples were drawn using NUTS(diag_e) at Tue Apr 16 1:01:29 PM 2024.
+#> For each parameter, n_eff is a crude measure of effective sample size,
+#> and Rhat is the potential scale reduction factor on split MCMC chains
+#> (at convergence, Rhat = 1)
+

The summary above now contains posterior estimates for the smoothing +parameters as well as the basis coefficients for the nonlinear effect of +time. We can visualize the conditional time +effect using the plot function with +type = 'smooths':

+
plot(model3, type = 'smooths')
+

+

By default this plot shows posterior empirical quantiles, but it can also be helpful to view some realizations of the underlying function (here, each line is a different potential curve drawn from the posterior of all possible curves):

+
plot(model3, type = 'smooths', realisations = TRUE,
+     n_realisations = 30)
+

+
+

Derivatives of smooths

+

A useful exercise when modelling with GAMs is to identify where the function is changing most rapidly. To address this, we can plot estimated first derivatives of the spline:

+
plot(model3, type = 'smooths', derivatives = TRUE)
+

+

Here, values >0 indicate the function was increasing at that time point, while values <0 indicate the function was declining. The most rapid declines appear to have been happening around timepoint 50 and again toward the end of the training period, for example.

+

Use conditional_effects again for useful plots on the +outcome scale:

+
plot(conditional_effects(model3), ask = FALSE)
+

+

Or on the link scale:

+
plot(conditional_effects(model3, type = 'link'), ask = FALSE)
+

+

Inspect the underlying Stan code to gain some idea of +how the spline is being penalized:

+
code(model3)
+#> // Stan model code generated by package mvgam
+#> data {
+#>   int<lower=0> total_obs; // total number of observations
+#>   int<lower=0> n; // number of timepoints per series
+#>   int<lower=0> n_sp; // number of smoothing parameters
+#>   int<lower=0> n_series; // number of series
+#>   int<lower=0> num_basis; // total number of basis coefficients
+#>   vector[num_basis] zero; // prior locations for basis coefficients
+#>   matrix[total_obs, num_basis] X; // mgcv GAM design matrix
+#>   array[n, n_series] int<lower=0> ytimes; // time-ordered matrix (which col in X belongs to each [time, series] observation?)
+#>   matrix[14, 28] S1; // mgcv smooth penalty matrix S1
+#>   int<lower=0> n_nonmissing; // number of nonmissing observations
+#>   array[n_nonmissing] int<lower=0> flat_ys; // flattened nonmissing observations
+#>   matrix[n_nonmissing, num_basis] flat_xs; // X values for nonmissing observations
+#>   array[n_nonmissing] int<lower=0> obs_ind; // indices of nonmissing observations
+#> }
+#> parameters {
+#>   // raw basis coefficients
+#>   vector[num_basis] b_raw;
+#>   
+#>   // smoothing parameters
+#>   vector<lower=0>[n_sp] lambda;
+#> }
+#> transformed parameters {
+#>   // basis coefficients
+#>   vector[num_basis] b;
+#>   b[1 : num_basis] = b_raw[1 : num_basis];
+#> }
+#> model {
+#>   // prior for (Intercept)...
+#>   b_raw[1] ~ student_t(3, 2.6, 2.5);
+#>   
+#>   // prior for ndvi...
+#>   b_raw[2] ~ student_t(3, 0, 2);
+#>   
+#>   // prior for s(time)...
+#>   b_raw[3 : 16] ~ multi_normal_prec(zero[3 : 16],
+#>                                     S1[1 : 14, 1 : 14] * lambda[1]
+#>                                     + S1[1 : 14, 15 : 28] * lambda[2]);
+#>   
+#>   // priors for smoothing parameters
+#>   lambda ~ normal(5, 30);
+#>   {
+#>     // likelihood functions
+#>     flat_ys ~ poisson_log_glm(flat_xs, 0.0, b);
+#>   }
+#> }
+#> generated quantities {
+#>   vector[total_obs] eta;
+#>   matrix[n, n_series] mus;
+#>   vector[n_sp] rho;
+#>   array[n, n_series] int ypred;
+#>   rho = log(lambda);
+#>   
+#>   // posterior predictions
+#>   eta = X * b;
+#>   for (s in 1 : n_series) {
+#>     mus[1 : n, s] = eta[ytimes[1 : n, s]];
+#>     ypred[1 : n, s] = poisson_log_rng(mus[1 : n, s]);
+#>   }
+#> }
+

The line below // prior for s(time)... shows how the spline basis coefficients are drawn from a zero-centred multivariate normal distribution. The penalty matrix \(S\) is multiplied by two different smoothing parameters (the \(\lambda\)’s) to enforce smoothness and reduce overfitting.

+
+
+
+

Latent dynamics in mvgam

+

Forecasts from the above model are not ideal:

+
plot(model3, type = 'forecast', newdata = data_test)
+

+
#> Out of sample DRPS:
+#> [1] 288.3844
+

Why is this happening? The forecasts are driven almost entirely by variation in the temporal spline, which is extrapolating linearly forever beyond the edge of the training data. Any slight wiggles near the end of the training set will result in wildly different forecasts. To visualize this, we can plot the temporal smooth extrapolated into the out-of-sample test set. Here is the extrapolated function for this model, which used 15 basis functions:

+
plot_mvgam_smooth(model3, smooth = 's(time)',
+                  # feed newdata to the plot function to generate
+                  # predictions of the temporal smooth to the end of the 
+                  # testing period
+                  newdata = data.frame(time = 1:max(data_test$time),
+                                       ndvi = 0))
+abline(v = max(data_train$time), lty = 'dashed', lwd = 2)
+

+

This model is not doing well. Clearly we need to somehow account for the strong temporal autocorrelation when modelling these data without using a smooth function of time. Now onto another prominent feature of mvgam: the ability to include (possibly latent) autocorrelated residuals in regression models. To do so, we use the trend_model argument (see ?mvgam_trends for details of the different dynamic trend models that are supported). This model will use a separate sub-model for latent residuals that evolve as an AR1 process (i.e. the error in the current time point is a function of the error in the previous time point, plus some stochastic noise). We also include a smooth function of ndvi in this model, rather than the parametric term that was used above, to showcase that mvgam can include combinations of smooths and dynamic components:

+
model4 <- mvgam(count ~ s(ndvi, k = 6),
+                family = poisson(),
+                data = data_train,
+                newdata = data_test,
+                trend_model = 'AR1')
+

The model can be described mathematically as follows: \[\begin{align*}
\boldsymbol{count}_t & \sim \text{Poisson}(\lambda_t) \\
log(\lambda_t) & = f(\boldsymbol{ndvi})_t + z_t \\
z_t & \sim \text{Normal}(ar1 * z_{t-1}, \sigma_{error}) \\
ar1 & \sim \text{Normal}(0, 1)[-1, 1] \\
\sigma_{error} & \sim \text{Exponential}(2) \\
f(\boldsymbol{ndvi}) & = \sum_{k=1}^{K}b_k * \beta_{smooth,k} \\
\beta_{smooth} & \sim \text{MVNormal}(0, (\Omega * \lambda)^{-1})
\end{align*}\]
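To build intuition for the latent \(z_t\) process above, here is a minimal base-R simulation of an AR1 residual series (a sketch using example parameter values, not output from the fitted model):

```r
set.seed(1)
n_time <- 100
ar1 <- 0.8          # autoregressive coefficient, constrained to [-1, 1]
sigma_error <- 0.5  # standard deviation of the process noise
z <- numeric(n_time)
for(t in 2:n_time){
  # the current error is the previous error scaled by ar1, plus noise
  z[t] <- ar1 * z[t - 1] + rnorm(1, mean = 0, sd = sigma_error)
}
plot(z, type = 'l', xlab = 'Time', ylab = 'z',
     main = 'Simulated AR1 latent residuals')
```

Values of ar1 near 1 produce slowly wandering residuals, which is exactly the kind of persistent autocorrelation the spline-of-time model was struggling to extrapolate.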

+

Here the term \(z_t\) captures autocorrelated latent residuals, which are modelled using an AR1 process. Notice that this model estimates autocorrelated errors for the full time period, even though some of these time points have missing observations. This is useful for getting more realistic estimates of the residual autocorrelation parameters. Summarise the model to see how it now returns posterior summaries for the latent AR1 process:

+
summary(model4)
+#> GAM formula:
+#> count ~ s(ndvi, k = 6)
+#> 
+#> Family:
+#> poisson
+#> 
+#> Link function:
+#> log
+#> 
+#> Trend model:
+#> AR1
+#> 
+#> N series:
+#> 1 
+#> 
+#> N timepoints:
+#> 160 
+#> 
+#> Status:
+#> Fitted using Stan 
+#> 4 chains, each with iter = 1000; warmup = 500; thin = 1 
+#> Total post-warmup draws = 2000
+#> 
+#> 
+#> GAM coefficient (beta) estimates:
+#>               2.5%     50% 97.5% Rhat n_eff
+#> (Intercept)  1.200  1.9000 2.500 1.03    63
+#> s(ndvi).1   -0.066  0.0100 0.160 1.01   318
+#> s(ndvi).2   -0.110  0.0190 0.340 1.00   286
+#> s(ndvi).3   -0.048 -0.0019 0.051 1.00   560
+#> s(ndvi).4   -0.210  0.1200 1.500 1.01   198
+#> s(ndvi).5   -0.079  0.1500 0.360 1.01   350
+#> 
+#> Approximate significance of GAM smooths:
+#>          edf Ref.df Chi.sq p-value
+#> s(ndvi) 2.26      5   87.8     0.1
+#> 
+#> Latent trend parameter AR estimates:
+#>          2.5%  50% 97.5% Rhat n_eff
+#> ar1[1]   0.70 0.81  0.92 1.01   234
+#> sigma[1] 0.68 0.80  0.96 1.00   488
+#> 
+#> Stan MCMC diagnostics:
+#> n_eff / iter looks reasonable for all parameters
+#> Rhat looks reasonable for all parameters
+#> 0 of 2000 iterations ended with a divergence (0%)
+#> 0 of 2000 iterations saturated the maximum tree depth of 12 (0%)
+#> E-FMI indicated no pathological behavior
+#> 
+#> Samples were drawn using NUTS(diag_e) at Tue Apr 16 1:02:26 PM 2024.
+#> For each parameter, n_eff is a crude measure of effective sample size,
+#> and Rhat is the potential scale reduction factor on split MCMC chains
+#> (at convergence, Rhat = 1)
+

View conditional smooths for the ndvi effect:

+
plot_predictions(model4, 
+                 condition = "ndvi",
+                 points = 0.5, rug = TRUE)
+

+

View posterior hindcasts / forecasts and compare them against the out-of-sample test data:

+
plot(model4, type = 'forecast', newdata = data_test)
+

+
#> Out of sample DRPS:
+#> [1] 150.5241
+

The trend is evolving as an AR1 process, which we can also view:

+
plot(model4, type = 'trend', newdata = data_test)
+

+

In-sample model performance can be interrogated using leave-one-out cross-validation utilities from the loo package (a higher value is preferred for this metric):

+
loo_compare(model3, model4)
+#> Warning: Some Pareto k diagnostic values are too high. See help('pareto-k-diagnostic') for details.
+
+#> Warning: Some Pareto k diagnostic values are too high. See help('pareto-k-diagnostic') for details.
+#>        elpd_diff se_diff
+#> model4    0.0       0.0 
+#> model3 -558.9      66.4
+

The higher expected log predictive density (ELPD) value for the dynamic model suggests it provides a much better fit to the in-sample data.
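As a rough heuristic (not a formal test), the ELPD difference can be judged against its standard error; a ratio many standard errors from zero, as here, signals a clear preference:

```r
# elpd_diff and se_diff taken from the loo_compare() output above
-558.9 / 66.4
#> [1] -8.417169
```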

+

Though it should be obvious that this model provides better forecasts, we can quantify forecast performance for models 3 and 4 using the forecast and score functions. Here we will compare models based on their Discrete Rank Probability Scores (DRPS; a lower value is preferred for this metric):

+
fc_mod3 <- forecast(model3)
+fc_mod4 <- forecast(model4)
+score_mod3 <- score(fc_mod3, score = 'drps')
+score_mod4 <- score(fc_mod4, score = 'drps')
+sum(score_mod4$PP$score, na.rm = TRUE) - sum(score_mod3$PP$score, na.rm = TRUE)
+#> [1] -137.8603
+

A strongly negative value here suggests the score for the dynamic model (model 4) is much smaller than the score for the model with a smooth function of time (model 3).
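For readers curious about what this score measures, below is a minimal R sketch of the DRPS for a single observation, assuming a vector of posterior forecast draws. This illustrates the scoring rule only and is not mvgam's internal implementation:

```r
drps_one <- function(draws, truth, upper = 500){
  thresholds <- 0:upper
  # empirical forecast CDF evaluated at each integer threshold
  Fhat <- ecdf(draws)(thresholds)
  # step-function CDF of the observed value
  ind <- as.numeric(truth <= thresholds)
  # DRPS: sum of squared differences between the two CDFs
  sum((Fhat - ind)^2)
}

# A forecast concentrated near the truth scores lower (better)
set.seed(1)
drps_one(draws = rpois(1000, 10), truth = 10)
drps_one(draws = rpois(1000, 30), truth = 10)
```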

+
+
+

Interested in contributing?

+

I’m actively seeking PhD students and other researchers to work in the areas of ecological forecasting, multivariate model evaluation and development of mvgam. Please reach out if you are interested (n.clark’at’uq.edu.au).

+
+ + + + + + + + + + + diff --git a/doc/nmixtures.R b/doc/nmixtures.R new file mode 100644 index 00000000..a4e7a766 --- /dev/null +++ b/doc/nmixtures.R @@ -0,0 +1,396 @@ +## ----echo = FALSE------------------------------------------------------------- +NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true") +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + purl = NOT_CRAN, + eval = NOT_CRAN +) + +## ----setup, include=FALSE----------------------------------------------------- +knitr::opts_chunk$set( + echo = TRUE, + dpi = 150, + fig.asp = 0.8, + fig.width = 6, + out.width = "60%", + fig.align = "center") +library(mvgam) +library(ggplot2) +library(dplyr) +# A custom ggplot2 theme +theme_set(theme_classic(base_size = 12, base_family = 'serif') + + theme(axis.line.x.bottom = element_line(colour = "black", + size = 1), + axis.line.y.left = element_line(colour = "black", + size = 1))) +options(ggplot2.discrete.colour = c("#A25050", + "#00008b", + 'darkred', + "#010048"), + ggplot2.discrete.fill = c("#A25050", + "#00008b", + 'darkred', + "#010048")) + +## ----------------------------------------------------------------------------- +set.seed(999) +# Simulate observations for species 1, which shows a declining trend and 0.7 detection probability +data.frame(site = 1, + # five replicates per year; six years + replicate = rep(1:5, 6), + time = sort(rep(1:6, 5)), + species = 'sp_1', + # true abundance declines nonlinearly + truth = c(rep(28, 5), + rep(26, 5), + rep(23, 5), + rep(16, 5), + rep(14, 5), + rep(14, 5)), + # observations are taken with detection prob = 0.7 + obs = c(rbinom(5, 28, 0.7), + rbinom(5, 26, 0.7), + rbinom(5, 23, 0.7), + rbinom(5, 15, 0.7), + rbinom(5, 14, 0.7), + rbinom(5, 14, 0.7))) %>% + # add 'series' information, which is an identifier of site, replicate and species + dplyr::mutate(series = paste0('site_', site, + '_', species, + '_rep_', replicate), + time = as.numeric(time), + # add a 'cap' variable that defines the maximum latent N to + # marginalize over when estimating latent abundance; in other words + # how large do we realistically think the true abundance could be? 
+ cap = 100) %>% + dplyr::select(- replicate) -> testdat + +# Now add another species that has a different temporal trend and a smaller +# detection probability (0.45 for this species) +testdat = testdat %>% + dplyr::bind_rows(data.frame(site = 1, + replicate = rep(1:5, 6), + time = sort(rep(1:6, 5)), + species = 'sp_2', + truth = c(rep(4, 5), + rep(7, 5), + rep(15, 5), + rep(16, 5), + rep(19, 5), + rep(18, 5)), + obs = c(rbinom(5, 4, 0.45), + rbinom(5, 7, 0.45), + rbinom(5, 15, 0.45), + rbinom(5, 16, 0.45), + rbinom(5, 19, 0.45), + rbinom(5, 18, 0.45))) %>% + dplyr::mutate(series = paste0('site_', site, + '_', species, + '_rep_', replicate), + time = as.numeric(time), + cap = 50) %>% + dplyr::select(-replicate)) + +## ----------------------------------------------------------------------------- +testdat$species <- factor(testdat$species, + levels = unique(testdat$species)) +testdat$series <- factor(testdat$series, + levels = unique(testdat$series)) + +## ----------------------------------------------------------------------------- +dplyr::glimpse(testdat) +head(testdat, 12) + +## ----------------------------------------------------------------------------- +testdat %>% + # each unique combination of site*species is a separate process + dplyr::mutate(trend = as.numeric(factor(paste0(site, species)))) %>% + dplyr::select(trend, series) %>% + dplyr::distinct() -> trend_map +trend_map + +## ----include = FALSE, results='hide'------------------------------------------ +mod <- mvgam( + # the observation formula sets up linear predictors for + # detection probability on the logit scale + formula = obs ~ species - 1, + + # the trend_formula sets up the linear predictors for + # the latent abundance processes on the log scale + trend_formula = ~ s(time, by = trend, k = 4) + species, + + # the trend_map takes care of the mapping + trend_map = trend_map, + + # nmix() family and data + family = nmix(), + data = testdat, + + # priors can be set in the usual way + priors = c(prior(std_normal(), class = b), + prior(normal(1, 1.5), class = Intercept_trend)), + samples = 1000) + +## ----eval = FALSE------------------------------------------------------------- +# mod <- mvgam( +# # the observation formula sets up linear predictors for +# # detection probability on the logit scale +# formula = obs ~ species - 1, +# +# # the trend_formula sets up the linear predictors for +# # the latent abundance processes on the log scale +# trend_formula = ~ s(time, by = trend, k = 4) + species, +# +# # the trend_map takes care of the mapping +# trend_map = trend_map, +# +# # nmix() family and data +# family = nmix(), +# data = testdat, +# +# # priors can be set in the usual way +# priors = c(prior(std_normal(), class = b), +# prior(normal(1, 1.5), class = Intercept_trend)), +# samples = 1000) + +## ----------------------------------------------------------------------------- +code(mod) + +## ----------------------------------------------------------------------------- +summary(mod) + +## ----------------------------------------------------------------------------- +loo(mod) + +## ----------------------------------------------------------------------------- +plot(mod, type = 'smooths', trend_effects = TRUE) + +## ----------------------------------------------------------------------------- +plot_predictions(mod, condition = 'species', + type = 'detection') + + ylab('Pr(detection)') + + ylim(c(0, 1)) + + theme_classic() + + theme(legend.position = 'none') + +## 
----------------------------------------------------------------------------- +hc <- hindcast(mod, type = 'latent_N') + +# Function to plot latent abundance estimates vs truth +plot_latentN = function(hindcasts, data, species = 'sp_1'){ + all_series <- unique(data %>% + dplyr::filter(species == !!species) %>% + dplyr::pull(series)) + + # Grab the first replicate that represents this series + # so we can get the true simulated values + series <- as.numeric(all_series[1]) + truths <- data %>% + dplyr::arrange(time, series) %>% + dplyr::filter(series == !!levels(data$series)[series]) %>% + dplyr::pull(truth) + + # In case some replicates have missing observations, + # pull out predictions for ALL replicates and average over them + hcs <- do.call(rbind, lapply(all_series, function(x){ + ind <- which(names(hindcasts$hindcasts) %in% as.character(x)) + hindcasts$hindcasts[[ind]] + })) + + # Calculate posterior empirical quantiles of predictions + pred_quantiles <- data.frame(t(apply(hcs, 2, function(x) + quantile(x, probs = c(0.05, 0.2, 0.3, 0.4, + 0.5, 0.6, 0.7, 0.8, 0.95))))) + pred_quantiles$time <- 1:NROW(pred_quantiles) + pred_quantiles$truth <- truths + + # Grab observations + data %>% + dplyr::filter(series %in% all_series) %>% + dplyr::select(time, obs) -> observations + + # Plot + ggplot(pred_quantiles, aes(x = time, group = 1)) + + geom_ribbon(aes(ymin = X5., ymax = X95.), fill = "#DCBCBC") + + geom_ribbon(aes(ymin = X30., ymax = X70.), fill = "#B97C7C") + + geom_line(aes(x = time, y = truth), + colour = 'black', linewidth = 1) + + geom_point(aes(x = time, y = truth), + shape = 21, colour = 'white', fill = 'black', + size = 2.5) + + geom_jitter(data = observations, aes(x = time, y = obs), + width = 0.06, + shape = 21, fill = 'darkred', colour = 'white', size = 2.5) + + labs(y = 'Latent abundance (N)', + x = 'Time', + title = species) +} + +## ----------------------------------------------------------------------------- +plot_latentN(hc, testdat, species = 'sp_1') +plot_latentN(hc, testdat, species = 'sp_2') + +## ----------------------------------------------------------------------------- +# Date link +load(url('https://github.com/doserjef/spAbundance/raw/main/data/dataNMixSim.rda')) +data.one.sp <- dataNMixSim + +# Pull out observations for one species +data.one.sp$y <- data.one.sp$y[1, , ] + +# Abundance covariates that don't change across repeat sampling observations +abund.cov <- dataNMixSim$abund.covs[, 1] +abund.factor <- as.factor(dataNMixSim$abund.covs[, 2]) + +# Detection covariates that can change across repeat sampling observations +# Note that `NA`s are not allowed for covariates in mvgam, so we randomly +# impute them here +det.cov <- dataNMixSim$det.covs$det.cov.1[,] +det.cov[is.na(det.cov)] <- rnorm(length(which(is.na(det.cov)))) +det.cov2 <- dataNMixSim$det.covs$det.cov.2 +det.cov2[is.na(det.cov2)] <- rnorm(length(which(is.na(det.cov2)))) + +## ----------------------------------------------------------------------------- +mod_data <- do.call(rbind, + lapply(1:NROW(data.one.sp$y), function(x){ + data.frame(y = data.one.sp$y[x,], + abund_cov = abund.cov[x], + abund_fac = abund.factor[x], + det_cov = det.cov[x,], + det_cov2 = det.cov2[x,], + replicate = 1:NCOL(data.one.sp$y), + site = paste0('site', x)) + })) %>% + dplyr::mutate(species = 'sp_1', + series = as.factor(paste0(site, '_', species, '_', replicate))) %>% + dplyr::mutate(site = factor(site, levels = unique(site)), + species = factor(species, levels = unique(species)), + time = 1, + cap = max(data.one.sp$y, na.rm 
= TRUE) + 20) + +## ----------------------------------------------------------------------------- +NROW(mod_data) +dplyr::glimpse(mod_data) +head(mod_data) + +## ----------------------------------------------------------------------------- +mod_data %>% + # each unique combination of site*species is a separate process + dplyr::mutate(trend = as.numeric(factor(paste0(site, species)))) %>% + dplyr::select(trend, series) %>% + dplyr::distinct() -> trend_map + +trend_map %>% + dplyr::arrange(trend) %>% + head(12) + +## ----include = FALSE, results='hide'------------------------------------------ +mod <- mvgam( + # effects of covariates on detection probability; + # here we use penalized splines for both continuous covariates + formula = y ~ s(det_cov, k = 3) + s(det_cov2, k = 3), + + # effects of the covariates on latent abundance; + # here we use a penalized spline for the continuous covariate and + # hierarchical intercepts for the factor covariate + trend_formula = ~ s(abund_cov, k = 3) + + s(abund_fac, bs = 're'), + + # link multiple observations to each site + trend_map = trend_map, + + # nmix() family and supplied data + family = nmix(), + data = mod_data, + + # standard normal priors on key regression parameters + priors = c(prior(std_normal(), class = 'b'), + prior(std_normal(), class = 'Intercept'), + prior(std_normal(), class = 'Intercept_trend'), + prior(std_normal(), class = 'sigma_raw_trend')), + + # use Stan's variational inference for quicker results + algorithm = 'meanfield', + + # no need to compute "series-level" residuals + residuals = FALSE, + samples = 1000) + +## ----eval=FALSE--------------------------------------------------------------- +# mod <- mvgam( +# # effects of covariates on detection probability; +# # here we use penalized splines for both continuous covariates +# formula = y ~ s(det_cov, k = 4) + s(det_cov2, k = 4), +# +# # effects of the covariates on latent abundance; +# # here we use a penalized spline for the continuous covariate and +# # hierarchical intercepts for the factor covariate +# trend_formula = ~ s(abund_cov, k = 4) + +# s(abund_fac, bs = 're'), +# +# # link multiple observations to each site +# trend_map = trend_map, +# +# # nmix() family and supplied data +# family = nmix(), +# data = mod_data, +# +# # standard normal priors on key regression parameters +# priors = c(prior(std_normal(), class = 'b'), +# prior(std_normal(), class = 'Intercept'), +# prior(std_normal(), class = 'Intercept_trend'), +# prior(std_normal(), class = 'sigma_raw_trend')), +# +# # use Stan's variational inference for quicker results +# algorithm = 'meanfield', +# +# # no need to compute "series-level" residuals +# residuals = FALSE, +# samples = 1000) + +## ----------------------------------------------------------------------------- +summary(mod, include_betas = FALSE) + +## ----------------------------------------------------------------------------- +avg_predictions(mod, type = 'detection') + +## ----------------------------------------------------------------------------- +abund_plots <- plot(conditional_effects(mod, + type = 'link', + effects = c('abund_cov', + 'abund_fac')), + plot = FALSE) + +## ----------------------------------------------------------------------------- +abund_plots[[1]] + + ylab('Expected latent abundance') + +## ----------------------------------------------------------------------------- +abund_plots[[2]] + + ylab('Expected latent abundance') + +## ----------------------------------------------------------------------------- +det_plots <- 
plot(conditional_effects(mod, + type = 'detection', + effects = c('det_cov', + 'det_cov2')), + plot = FALSE) + +## ----------------------------------------------------------------------------- +det_plots[[1]] + + ylab('Pr(detection)') +det_plots[[2]] + + ylab('Pr(detection)') + +## ----------------------------------------------------------------------------- +fivenum_round = function(x)round(fivenum(x, na.rm = TRUE), 2) + +plot_predictions(mod, + newdata = datagrid(det_cov = unique, + det_cov2 = fivenum_round), + by = c('det_cov', 'det_cov2'), + type = 'detection') + + theme_classic() + + ylab('Pr(detection)') + diff --git a/doc/nmixtures.Rmd b/doc/nmixtures.Rmd new file mode 100644 index 00000000..62d4a08e --- /dev/null +++ b/doc/nmixtures.Rmd @@ -0,0 +1,509 @@ +--- +title: "N-mixtures in mvgam" +author: "Nicholas J Clark" +date: "`r Sys.Date()`" +output: + rmarkdown::html_vignette: + toc: yes +vignette: > + %\VignetteIndexEntry{N-mixtures in mvgam} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- +```{r, echo = FALSE} +NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true") +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + purl = NOT_CRAN, + eval = NOT_CRAN +) +``` + +```{r setup, include=FALSE} +knitr::opts_chunk$set( + echo = TRUE, + dpi = 150, + fig.asp = 0.8, + fig.width = 6, + out.width = "60%", + fig.align = "center") +library(mvgam) +library(ggplot2) +library(dplyr) +# A custom ggplot2 theme +theme_set(theme_classic(base_size = 12, base_family = 'serif') + + theme(axis.line.x.bottom = element_line(colour = "black", + size = 1), + axis.line.y.left = element_line(colour = "black", + size = 1))) +options(ggplot2.discrete.colour = c("#A25050", + "#00008b", + 'darkred', + "#010048"), + ggplot2.discrete.fill = c("#A25050", + "#00008b", + 'darkred', + "#010048")) +``` + +The purpose of this vignette is to show how the `mvgam` package can be used to fit and interrogate N-mixture models for population abundance counts made with imperfect detection. + +## N-mixture models +An N-mixture model is a fairly recent addition to the ecological modeller's toolkit that is designed to make inferences about variation in the abundance of species when observations are imperfect ([Royle 2004](https://onlinelibrary.wiley.com/doi/10.1111/j.0006-341X.2004.00142.x){target="_blank"}). Briefly, assume $\boldsymbol{Y_{i,r}}$ is the number of individuals recorded at site $i$ during replicate sampling observation $r$ (recorded as a non-negative integer). If multiple replicate surveys are done within a short enough period to satisfy the assumption that the population remained closed (i.e. there was no substantial change in true population size between replicate surveys), we can account for the fact that observations aren't perfect. 
This is done by assuming that these replicate observations are Binomial random variables that are parameterized by the true "latent" abundance $N$ and a detection probability $p$: + +\begin{align*} +\boldsymbol{Y_{i,r}} & \sim \text{Binomial}(N_i, p_r) \\ +N_{i} & \sim \text{Poisson}(\lambda_i) \end{align*} + +Using a set of linear predictors, we can estimate effects of covariates $\boldsymbol{X}$ on the expected latent abundance (with a log link for $\lambda$) and, jointly, effects of possibly different covariates (call them $\boldsymbol{Q}$) on detection probability (with a logit link for $p$): + +\begin{align*} +log(\lambda) & = \beta \boldsymbol{X} \\ +logit(p) & = \gamma \boldsymbol{Q}\end{align*} + +`mvgam` can handle this type of model because it is designed to propagate unobserved temporal processes that evolve independently of the observation process in a State-space format. This setup adapts well to N-mixture models because they can be thought of as State-space models in which the latent state is a discrete variable representing the "true" but unknown population size. This is very convenient because we can incorporate any of the package's diverse effect types (i.e. multidimensional splines, time-varying effects, monotonic effects, random effects etc...) into the linear predictors. All that is required for this to work is a marginalization trick that allows `Stan`'s sampling algorithms to handle discrete parameters (see more about how this method of "integrating out" discrete parameters works in [this nice blog post by Maxwell Joseph](https://mbjoseph.github.io/posts/2020-04-28-a-step-by-step-guide-to-marginalizing-over-discrete-parameters-for-ecologists-using-stan/){target="_blank"}). + +The family `nmix()` is used to set up N-mixture models in `mvgam`, but we still need to do a little bit of data wrangling to ensure the data are set up in the correct format (this is especially true when we have more than one replicate survey per time period). The most important aspects are: (1) how we set up the observation `series` and `trend_map` arguments to ensure replicate surveys are mapped to the correct latent abundance model and (2) the inclusion of a `cap` variable that defines the maximum possible integer value to use for each observation when estimating latent abundance. The two examples below give a reasonable overview of how this can be done. + +## Example 1: a two-species system with nonlinear trends +First we will use a simple simulation in which multiple replicate observations are taken at each timepoint for two different species. The simulation produces observations at a single site over six years, with five replicate surveys per year. Each species is simulated to have different nonlinear temporal trends and different detection probabilities. For now, detection probability is fixed (i.e. it does not change over time or in association with any covariates). Notice that we add the `cap` variable, which does not need to be static, to define the maximum possible value that we think the latent abundance could be for each timepoint. 
This simply needs to be large enough that we get a reasonable idea of which latent N values are most likely, without adding too much computational cost: + +```{r} +set.seed(999) +# Simulate observations for species 1, which shows a declining trend and 0.7 detection probability +data.frame(site = 1, + # five replicates per year; six years + replicate = rep(1:5, 6), + time = sort(rep(1:6, 5)), + species = 'sp_1', + # true abundance declines nonlinearly + truth = c(rep(28, 5), + rep(26, 5), + rep(23, 5), + rep(16, 5), + rep(14, 5), + rep(14, 5)), + # observations are taken with detection prob = 0.7 + obs = c(rbinom(5, 28, 0.7), + rbinom(5, 26, 0.7), + rbinom(5, 23, 0.7), + rbinom(5, 15, 0.7), + rbinom(5, 14, 0.7), + rbinom(5, 14, 0.7))) %>% + # add 'series' information, which is an identifier of site, replicate and species + dplyr::mutate(series = paste0('site_', site, + '_', species, + '_rep_', replicate), + time = as.numeric(time), + # add a 'cap' variable that defines the maximum latent N to + # marginalize over when estimating latent abundance; in other words + # how large do we realistically think the true abundance could be? + cap = 100) %>% + dplyr::select(- replicate) -> testdat + +# Now add another species that has a different temporal trend and a smaller +# detection probability (0.45 for this species) +testdat = testdat %>% + dplyr::bind_rows(data.frame(site = 1, + replicate = rep(1:5, 6), + time = sort(rep(1:6, 5)), + species = 'sp_2', + truth = c(rep(4, 5), + rep(7, 5), + rep(15, 5), + rep(16, 5), + rep(19, 5), + rep(18, 5)), + obs = c(rbinom(5, 4, 0.45), + rbinom(5, 7, 0.45), + rbinom(5, 15, 0.45), + rbinom(5, 16, 0.45), + rbinom(5, 19, 0.45), + rbinom(5, 18, 0.45))) %>% + dplyr::mutate(series = paste0('site_', site, + '_', species, + '_rep_', replicate), + time = as.numeric(time), + cap = 50) %>% + dplyr::select(-replicate)) +``` + +This data format isn't too difficult to set up, but it does differ from the traditional multidimensional array setup that is commonly used for fitting N-mixture models in other software packages. Next we ensure that species and series IDs are included as factor variables, in case we'd like to allow certain effects to vary by species +```{r} +testdat$species <- factor(testdat$species, + levels = unique(testdat$species)) +testdat$series <- factor(testdat$series, + levels = unique(testdat$series)) +``` + +Preview the dataset to get an idea of how it is structured: +```{r} +dplyr::glimpse(testdat) +head(testdat, 12) +``` + +### Setting up the `trend_map` + +Finally, we need to set up the `trend_map` object. This is crucial for allowing multiple observations to be linked to the same latent process model (see more information about this argument in the [Shared latent states vignette](https://nicholasjclark.github.io/mvgam/articles/shared_states.html){target="_blank"}). In this case, the mapping operates by species and site to state that each set of replicate observations from the same time point should all share the exact same latent abundance model: +```{r} +testdat %>% + # each unique combination of site*species is a separate process + dplyr::mutate(trend = as.numeric(factor(paste0(site, species)))) %>% + dplyr::select(trend, series) %>% + dplyr::distinct() -> trend_map +trend_map +``` + +Notice how all of the replicates for species 1 in site 1 share the same process (i.e. the same `trend`). This will ensure that all replicates are Binomial draws of the same latent N. 
+ +### Modelling with the `nmix()` family + +Now we are ready to fit a model using `mvgam()`. This model will allow each species to have different detection probabilities and different temporal trends. We will use `Cmdstan` as the backend, which by default will use Hamiltonian Monte Carlo for full Bayesian inference + +```{r include = FALSE, results='hide'} +mod <- mvgam( + # the observation formula sets up linear predictors for + # detection probability on the logit scale + formula = obs ~ species - 1, + + # the trend_formula sets up the linear predictors for + # the latent abundance processes on the log scale + trend_formula = ~ s(time, by = trend, k = 4) + species, + + # the trend_map takes care of the mapping + trend_map = trend_map, + + # nmix() family and data + family = nmix(), + data = testdat, + + # priors can be set in the usual way + priors = c(prior(std_normal(), class = b), + prior(normal(1, 1.5), class = Intercept_trend)), + samples = 1000) +``` + +```{r eval = FALSE} +mod <- mvgam( + # the observation formula sets up linear predictors for + # detection probability on the logit scale + formula = obs ~ species - 1, + + # the trend_formula sets up the linear predictors for + # the latent abundance processes on the log scale + trend_formula = ~ s(time, by = trend, k = 4) + species, + + # the trend_map takes care of the mapping + trend_map = trend_map, + + # nmix() family and data + family = nmix(), + data = testdat, + + # priors can be set in the usual way + priors = c(prior(std_normal(), class = b), + prior(normal(1, 1.5), class = Intercept_trend)), + samples = 1000) +``` + +View the automatically-generated `Stan` code to get a sense of how the marginalization over latent N works +```{r} +code(mod) +``` + +The posterior summary of this model shows that it has converged nicely +```{r} +summary(mod) +``` + +`loo()` functionality works just as it does for all `mvgam` models to aid in model comparison / selection (though note that Pareto K values often give warnings for mixture models so these may not be too helpful) +```{r} +loo(mod) +``` + +Plot the estimated smooths of time from each species' latent abundance process (on the log scale) +```{r} +plot(mod, type = 'smooths', trend_effects = TRUE) +``` + +`marginaleffects` support allows for more useful prediction-based interrogations on different scales (though note that at the time of writing this Vignette, you must have the development version of `marginaleffects` installed for `nmix()` models to be supported; use `remotes::install_github('vincentarelbundock/marginaleffects')` to install). Objects that use family `nmix()` have a few additional prediction scales that can be used (i.e. `link`, `response`, `detection` or `latent_N`). For example, here are the estimated detection probabilities per species, which show that the model has done a nice job of estimating these parameters: +```{r} +plot_predictions(mod, condition = 'species', + type = 'detection') + + ylab('Pr(detection)') + + ylim(c(0, 1)) + + theme_classic() + + theme(legend.position = 'none') +``` + +A common goal in N-mixture modelling is to estimate the true latent abundance. The model has automatically generated predictions for the unknown latent abundance that are conditional on the observations. 
We can extract these and produce decent plots using a small function +```{r} +hc <- hindcast(mod, type = 'latent_N') + +# Function to plot latent abundance estimates vs truth +plot_latentN = function(hindcasts, data, species = 'sp_1'){ + all_series <- unique(data %>% + dplyr::filter(species == !!species) %>% + dplyr::pull(series)) + + # Grab the first replicate that represents this series + # so we can get the true simulated values + series <- as.numeric(all_series[1]) + truths <- data %>% + dplyr::arrange(time, series) %>% + dplyr::filter(series == !!levels(data$series)[series]) %>% + dplyr::pull(truth) + + # In case some replicates have missing observations, + # pull out predictions for ALL replicates and average over them + hcs <- do.call(rbind, lapply(all_series, function(x){ + ind <- which(names(hindcasts$hindcasts) %in% as.character(x)) + hindcasts$hindcasts[[ind]] + })) + + # Calculate posterior empirical quantiles of predictions + pred_quantiles <- data.frame(t(apply(hcs, 2, function(x) + quantile(x, probs = c(0.05, 0.2, 0.3, 0.4, + 0.5, 0.6, 0.7, 0.8, 0.95))))) + pred_quantiles$time <- 1:NROW(pred_quantiles) + pred_quantiles$truth <- truths + + # Grab observations + data %>% + dplyr::filter(series %in% all_series) %>% + dplyr::select(time, obs) -> observations + + # Plot + ggplot(pred_quantiles, aes(x = time, group = 1)) + + geom_ribbon(aes(ymin = X5., ymax = X95.), fill = "#DCBCBC") + + geom_ribbon(aes(ymin = X30., ymax = X70.), fill = "#B97C7C") + + geom_line(aes(x = time, y = truth), + colour = 'black', linewidth = 1) + + geom_point(aes(x = time, y = truth), + shape = 21, colour = 'white', fill = 'black', + size = 2.5) + + geom_jitter(data = observations, aes(x = time, y = obs), + width = 0.06, + shape = 21, fill = 'darkred', colour = 'white', size = 2.5) + + labs(y = 'Latent abundance (N)', + x = 'Time', + title = species) +} +``` + +Latent abundance plots vs the simulated truths for each species are shown below. Here, the red points show the imperfect observations, the black line shows the true latent abundance, and the ribbons show credible intervals of our estimates: +```{r} +plot_latentN(hc, testdat, species = 'sp_1') +plot_latentN(hc, testdat, species = 'sp_2') +``` + +We can see that estimates for both species have correctly captured the true temporal variation and magnitudes in abundance + +## Example 2: a larger survey with possible nonlinear effects + +Now for another example with a larger dataset. We will use data from [Jeff Doser's simulation example from the wonderful `spAbundance` package](https://www.jeffdoser.com/files/spabundance-web/articles/nmixturemodels){target="_blank"}. The simulated data include one continuous site-level covariate, one factor site-level covariate and two continuous sample-level covariates. This example will allow us to examine how we can include possibly nonlinear effects in the latent process and detection probability models. 
+ +Download the data and grab observations / covariate measurements for one species +```{r} +# Date link +load(url('https://github.com/doserjef/spAbundance/raw/main/data/dataNMixSim.rda')) +data.one.sp <- dataNMixSim + +# Pull out observations for one species +data.one.sp$y <- data.one.sp$y[1, , ] + +# Abundance covariates that don't change across repeat sampling observations +abund.cov <- dataNMixSim$abund.covs[, 1] +abund.factor <- as.factor(dataNMixSim$abund.covs[, 2]) + +# Detection covariates that can change across repeat sampling observations +# Note that `NA`s are not allowed for covariates in mvgam, so we randomly +# impute them here +det.cov <- dataNMixSim$det.covs$det.cov.1[,] +det.cov[is.na(det.cov)] <- rnorm(length(which(is.na(det.cov)))) +det.cov2 <- dataNMixSim$det.covs$det.cov.2 +det.cov2[is.na(det.cov2)] <- rnorm(length(which(is.na(det.cov2)))) +``` + +Next we wrangle into the appropriate 'long' data format, adding indicators of `time` and `series` for working in `mvgam`. We also add the `cap` variable to represent the maximum latent N to marginalize over for each observation +```{r} +mod_data <- do.call(rbind, + lapply(1:NROW(data.one.sp$y), function(x){ + data.frame(y = data.one.sp$y[x,], + abund_cov = abund.cov[x], + abund_fac = abund.factor[x], + det_cov = det.cov[x,], + det_cov2 = det.cov2[x,], + replicate = 1:NCOL(data.one.sp$y), + site = paste0('site', x)) + })) %>% + dplyr::mutate(species = 'sp_1', + series = as.factor(paste0(site, '_', species, '_', replicate))) %>% + dplyr::mutate(site = factor(site, levels = unique(site)), + species = factor(species, levels = unique(species)), + time = 1, + cap = max(data.one.sp$y, na.rm = TRUE) + 20) +``` + +The data include observations for 225 sites with three replicates per site, though some observations are missing +```{r} +NROW(mod_data) +dplyr::glimpse(mod_data) +head(mod_data) +``` + +The final step for data preparation is of course the `trend_map`, which sets up the mapping between observation replicates and the latent abundance models. This is done in the same way as in the example above +```{r} +mod_data %>% + # each unique combination of site*species is a separate process + dplyr::mutate(trend = as.numeric(factor(paste0(site, species)))) %>% + dplyr::select(trend, series) %>% + dplyr::distinct() -> trend_map + +trend_map %>% + dplyr::arrange(trend) %>% + head(12) +``` + +Now we are ready to fit a model using `mvgam()`. Here we will use penalized splines for each of the continuous covariate effects to detect possible nonlinear associations. 
We also showcase how `mvgam` can make use of the different approximation algorithms available in `Stan` by using the meanfield variational Bayes approximator (this reduces computation time to around 12 seconds for this example) +```{r include = FALSE, results='hide'} +mod <- mvgam( + # effects of covariates on detection probability; + # here we use penalized splines for both continuous covariates + formula = y ~ s(det_cov, k = 3) + s(det_cov2, k = 3), + + # effects of the covariates on latent abundance; + # here we use a penalized spline for the continuous covariate and + # hierarchical intercepts for the factor covariate + trend_formula = ~ s(abund_cov, k = 3) + + s(abund_fac, bs = 're'), + + # link multiple observations to each site + trend_map = trend_map, + + # nmix() family and supplied data + family = nmix(), + data = mod_data, + + # standard normal priors on key regression parameters + priors = c(prior(std_normal(), class = 'b'), + prior(std_normal(), class = 'Intercept'), + prior(std_normal(), class = 'Intercept_trend'), + prior(std_normal(), class = 'sigma_raw_trend')), + + # use Stan's variational inference for quicker results + algorithm = 'meanfield', + + # no need to compute "series-level" residuals + residuals = FALSE, + samples = 1000) +``` + +```{r eval=FALSE} +mod <- mvgam( + # effects of covariates on detection probability; + # here we use penalized splines for both continuous covariates + formula = y ~ s(det_cov, k = 4) + s(det_cov2, k = 4), + + # effects of the covariates on latent abundance; + # here we use a penalized spline for the continuous covariate and + # hierarchical intercepts for the factor covariate + trend_formula = ~ s(abund_cov, k = 4) + + s(abund_fac, bs = 're'), + + # link multiple observations to each site + trend_map = trend_map, + + # nmix() family and supplied data + family = nmix(), + data = mod_data, + + # standard normal priors on key regression parameters + priors = c(prior(std_normal(), class = 'b'), + prior(std_normal(), class = 'Intercept'), + prior(std_normal(), class = 'Intercept_trend'), + prior(std_normal(), class = 'sigma_raw_trend')), + + # use Stan's variational inference for quicker results + algorithm = 'meanfield', + + # no need to compute "series-level" residuals + residuals = FALSE, + samples = 1000) +``` + +Inspect the model summary but don't bother looking at estimates for all individual spline coefficients. Notice how we no longer receive information on convergence because we did not use MCMC sampling for this model +```{r} +summary(mod, include_betas = FALSE) +``` + +Again we can make use of `marginaleffects` support for interrogating the model through targeted predictions. 
First, we can inspect the estimated average detection probability +```{r} +avg_predictions(mod, type = 'detection') +``` + +Next investigate estimated effects of covariates on latent abundance using the `conditional_effects()` function and specifying `type = 'link'`; this will return plots on the expectation scale +```{r} +abund_plots <- plot(conditional_effects(mod, + type = 'link', + effects = c('abund_cov', + 'abund_fac')), + plot = FALSE) +``` + +The effect of the continuous covariate on expected latent abundance +```{r} +abund_plots[[1]] + + ylab('Expected latent abundance') +``` + +The effect of the factor covariate on expected latent abundance, estimated as a hierarchical random effect +```{r} +abund_plots[[2]] + + ylab('Expected latent abundance') +``` + +Now we can investigate estimated effects of covariates on detection probability using `type = 'detection'` +```{r} +det_plots <- plot(conditional_effects(mod, + type = 'detection', + effects = c('det_cov', + 'det_cov2')), + plot = FALSE) +``` + +The covariate smooths were estimated to be somewhat nonlinear on the logit scale according to the model summary (based on their approximate significances). But inspecting conditional effects of each covariate on the probability scale is more intuitive and useful +```{r} +det_plots[[1]] + + ylab('Pr(detection)') +det_plots[[2]] + + ylab('Pr(detection)') +``` + +More targeted predictions are also easy with `marginaleffects` support. For example, we can ask: How does detection probability change as we change *both* detection covariates? +```{r} +fivenum_round = function(x)round(fivenum(x, na.rm = TRUE), 2) + +plot_predictions(mod, + newdata = datagrid(det_cov = unique, + det_cov2 = fivenum_round), + by = c('det_cov', 'det_cov2'), + type = 'detection') + + theme_classic() + + ylab('Pr(detection)') +``` + +The model has found support for some important covariate effects, but of course we'd want to interrogate how well the model predicts and think about possible spatial effects to capture unmodelled variation in latent abundance (which can easily be incorporated into both linear predictors using spatial smooths). + +## Further reading +The following papers and resources offer useful material about N-mixture models for ecological population dynamics investigations: + +Guélat, Jérôme, and Kéry, Marc. “[Effects of Spatial Autocorrelation and Imperfect Detection on Species Distribution Models.](https://besjournals.onlinelibrary.wiley.com/doi/full/10.1111/2041-210X.12983)” *Methods in Ecology and Evolution* 9 (2018): 1614–25. + +Kéry, Marc, and Royle Andrew J. "[Applied hierarchical modeling in ecology: Analysis of distribution, abundance and species richness in R and BUGS: Volume 2: Dynamic and advanced models](https://www.sciencedirect.com/book/9780128237687/applied-hierarchical-modeling-in-ecology-analysis-of-distribution-abundance-and-species-richness-in-r-and-bugs)". London, UK: Academic Press (2020). + +Royle, Andrew J. "[N‐mixture models for estimating population size from spatially replicated counts.](https://onlinelibrary.wiley.com/doi/full/10.1111/j.0006-341X.2004.00142.x)" *Biometrics* 60.1 (2004): 108-115. + +## Interested in contributing? +I'm actively seeking PhD students and other researchers to work in the areas of ecological forecasting, multivariate model evaluation and development of `mvgam`. 
Please reach out if you are interested (n.clark'at'uq.edu.au) diff --git a/doc/nmixtures.html b/doc/nmixtures.html new file mode 100644 index 00000000..f92147dd --- /dev/null +++ b/doc/nmixtures.html @@ -0,0 +1,1202 @@ + + + + + + + + + + + + + + + + +N-mixtures in mvgam + + + + + + + + + + + + + + + + + + + + + + + + + + +

N-mixtures in mvgam

+

Nicholas J Clark

+

2024-04-16

+ + + +

The purpose of this vignette is to show how the mvgam +package can be used to fit and interrogate N-mixture models for +population abundance counts made with imperfect detection.

+
+

N-mixture models

+

An N-mixture model is a fairly recent addition to the ecological modeller’s toolkit that is designed to make inferences about variation in the abundance of species when observations are imperfect (Royle 2004). Briefly, assume \(\boldsymbol{Y_{i,r}}\) is the number of individuals recorded at site \(i\) during replicate sampling observation \(r\) (recorded as a non-negative integer). If multiple replicate surveys are done within a short enough period to satisfy the assumption that the population remained closed (i.e. there was no substantial change in true population size between replicate surveys), we can account for the fact that observations aren’t perfect. This is done by assuming that these replicate observations are Binomial random variables that are parameterized by the true “latent” abundance \(N\) and a detection probability \(p\):

+

\[\begin{align*}
\boldsymbol{Y_{i,r}} & \sim \text{Binomial}(N_i, p_r) \\
N_{i} & \sim \text{Poisson}(\lambda_i)
\end{align*}\]

+

Using a set of linear predictors, we can estimate effects of covariates \(\boldsymbol{X}\) on the expected latent abundance (with a log link for \(\lambda\)) and, jointly, effects of possibly different covariates (call them \(\boldsymbol{Q}\)) on detection probability (with a logit link for \(p\)):

+

\[\begin{align*}
log(\lambda) & = \beta \boldsymbol{X} \\
logit(p) & = \gamma \boldsymbol{Q}
\end{align*}\]

+

mvgam can handle this type of model because it is designed to propagate unobserved temporal processes that evolve independently of the observation process in a State-space format. This setup adapts well to N-mixture models because they can be thought of as State-space models in which the latent state is a discrete variable representing the “true” but unknown population size. This is very convenient because we can incorporate any of the package’s diverse effect types (i.e. multidimensional splines, time-varying effects, monotonic effects, random effects etc…) into the linear predictors. All that is required for this to work is a marginalization trick that allows Stan’s sampling algorithms to handle discrete parameters (see more about how this method of “integrating out” discrete parameters works in this nice blog post by Maxwell Joseph).
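To make the marginalization concrete, here is a minimal R sketch of the likelihood for one site’s replicate counts, summing over every plausible value of the latent abundance \(N\) up to the cap. This is a hypothetical illustration; mvgam’s generated Stan code performs the equivalent computation on the log scale for numerical stability:

```r
nmix_lik <- function(y, lambda, p, cap){
  # latent N can be no smaller than the largest observed count
  Ns <- max(y):cap
  # marginalize: sum the joint probability over all candidate values of N
  sum(sapply(Ns, function(N){
    dpois(N, lambda) * prod(dbinom(y, N, p))
  }))
}

# Example: three replicate counts from one closed-population survey
nmix_lik(y = c(3, 2, 4), lambda = 5, p = 0.7, cap = 50)
```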

+

The family nmix() is used to set up N-mixture models in mvgam, but we still need to do a little bit of data wrangling to ensure the data are set up in the correct format (this is especially true when we have more than one replicate survey per time period). The most important aspects are: (1) how we set up the observation series and trend_map arguments to ensure replicate surveys are mapped to the correct latent abundance model and (2) the inclusion of a cap variable that defines the maximum possible integer value to use for each observation when estimating latent abundance. The two examples below give a reasonable overview of how this can be done.

+
+ +
+

Example 2: a larger survey with possible nonlinear effects

+

Now for another example with a larger dataset. We will use data from Jeff Doser’s simulation example from the wonderful spAbundance package. The simulated data include one continuous site-level covariate, one factor site-level covariate and two continuous sample-level covariates. This example will allow us to examine how we can include possibly nonlinear effects in the latent process and detection probability models.

+

Download the data and grab observations / covariate measurements for one species:

+
# Date link
+load(url('https://github.com/doserjef/spAbundance/raw/main/data/dataNMixSim.rda'))
+data.one.sp <- dataNMixSim
+
+# Pull out observations for one species
+data.one.sp$y <- data.one.sp$y[1, , ]
+
+# Abundance covariates that don't change across repeat sampling observations
+abund.cov <- dataNMixSim$abund.covs[, 1]
+abund.factor <- as.factor(dataNMixSim$abund.covs[, 2])
+
+# Detection covariates that can change across repeat sampling observations
+# Note that `NA`s are not allowed for covariates in mvgam, so we randomly
+# impute them here
+det.cov <- dataNMixSim$det.covs$det.cov.1[,]
+det.cov[is.na(det.cov)] <- rnorm(length(which(is.na(det.cov))))
+det.cov2 <- dataNMixSim$det.covs$det.cov.2
+det.cov2[is.na(det.cov2)] <- rnorm(length(which(is.na(det.cov2))))
+

Next we wrangle into the appropriate ‘long’ data format, adding indicators of time and series for working in mvgam. We also add the cap variable to represent the maximum latent N to marginalize over for each observation:

+
mod_data <- do.call(rbind,
+                    lapply(1:NROW(data.one.sp$y), function(x){
+                      data.frame(y = data.one.sp$y[x,],
+                                 abund_cov = abund.cov[x],
+                                 abund_fac = abund.factor[x],
+                                 det_cov = det.cov[x,],
+                                 det_cov2 = det.cov2[x,],
+                                 replicate = 1:NCOL(data.one.sp$y),
+                                 site = paste0('site', x))
+                    })) %>%
+  dplyr::mutate(species = 'sp_1',
+                series = as.factor(paste0(site, '_', species, '_', replicate))) %>%
+  dplyr::mutate(site = factor(site, levels = unique(site)),
+                species = factor(species, levels = unique(species)),
+                time = 1,
+                cap = max(data.one.sp$y, na.rm = TRUE) + 20)
+

The data include observations for 225 sites with three replicates per site, though some observations are missing:

+
NROW(mod_data)
+#> [1] 675
+dplyr::glimpse(mod_data)
+#> Rows: 675
+#> Columns: 11
+#> $ y         <int> 1, NA, NA, NA, 2, 2, NA, 1, NA, NA, 0, 1, 0, 0, 0, 0, NA, NA…
+#> $ abund_cov <dbl> -0.3734384, -0.3734384, -0.3734384, 0.7064305, 0.7064305, 0.…
+#> $ abund_fac <fct> 3, 3, 3, 4, 4, 4, 9, 9, 9, 2, 2, 2, 3, 3, 3, 2, 2, 2, 1, 1, …
+#> $ det_cov   <dbl> -1.28279990, -0.08474811, 0.44789392, 1.71731815, 0.19548086…
+#> $ det_cov2  <dbl> 2.03047314, -1.42459158, 1.68497337, 0.75026787, 1.04555361,…
+#> $ replicate <int> 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, …
+#> $ site      <fct> site1, site1, site1, site2, site2, site2, site3, site3, site…
+#> $ species   <fct> sp_1, sp_1, sp_1, sp_1, sp_1, sp_1, sp_1, sp_1, sp_1, sp_1, …
+#> $ series    <fct> site1_sp_1_1, site1_sp_1_2, site1_sp_1_3, site2_sp_1_1, site…
+#> $ time      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
+#> $ cap       <dbl> 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, …
+head(mod_data)
+#>    y  abund_cov abund_fac     det_cov   det_cov2 replicate  site species
+#> 1  1 -0.3734384         3 -1.28279990  2.0304731         1 site1    sp_1
+#> 2 NA -0.3734384         3 -0.08474811 -1.4245916         2 site1    sp_1
+#> 3 NA -0.3734384         3  0.44789392  1.6849734         3 site1    sp_1
+#> 4 NA  0.7064305         4  1.71731815  0.7502679         1 site2    sp_1
+#> 5  2  0.7064305         4  0.19548086  1.0455536         2 site2    sp_1
+#> 6  2  0.7064305         4  0.96730338  1.9197118         3 site2    sp_1
+#>         series time cap
+#> 1 site1_sp_1_1    1  33
+#> 2 site1_sp_1_2    1  33
+#> 3 site1_sp_1_3    1  33
+#> 4 site2_sp_1_1    1  33
+#> 5 site2_sp_1_2    1  33
+#> 6 site2_sp_1_3    1  33
+

The final step for data preparation is of course the trend_map, which sets up the mapping between observation replicates and the latent abundance models. This is done in the same way as in the example above:

+
mod_data %>%
+  # each unique combination of site*species is a separate process
+  dplyr::mutate(trend = as.numeric(factor(paste0(site, species)))) %>%
+  dplyr::select(trend, series) %>%
+  dplyr::distinct() -> trend_map
+
+trend_map %>%
+  dplyr::arrange(trend) %>%
+  head(12)
+#>    trend         series
+#> 1      1 site100_sp_1_1
+#> 2      1 site100_sp_1_2
+#> 3      1 site100_sp_1_3
+#> 4      2 site101_sp_1_1
+#> 5      2 site101_sp_1_2
+#> 6      2 site101_sp_1_3
+#> 7      3 site102_sp_1_1
+#> 8      3 site102_sp_1_2
+#> 9      3 site102_sp_1_3
+#> 10     4 site103_sp_1_1
+#> 11     4 site103_sp_1_2
+#> 12     4 site103_sp_1_3
+

Now we are ready to fit a model using mvgam(). Here we will use penalized splines for each of the continuous covariate effects to detect possible nonlinear associations. We also showcase how mvgam can make use of the different approximation algorithms available in Stan by using the meanfield variational Bayes approximator (this reduces computation time to around 12 seconds for this example):

+
mod <- mvgam(
+  # effects of covariates on detection probability;
+  # here we use penalized splines for both continuous covariates
+  formula = y ~ s(det_cov, k = 3) + s(det_cov2, k = 3),
+  
+  # effects of the covariates on latent abundance;
+  # here we use a penalized spline for the continuous covariate and
+  # hierarchical intercepts for the factor covariate
+  trend_formula = ~ s(abund_cov, k = 3) +
+    s(abund_fac, bs = 're'),
+  
+  # link multiple observations to each site
+  trend_map = trend_map,
+  
+  # nmix() family and supplied data
+  family = nmix(),
+  data = mod_data,
+  
+  # standard normal priors on key regression parameters
+  priors = c(prior(std_normal(), class = 'b'),
+             prior(std_normal(), class = 'Intercept'),
+             prior(std_normal(), class = 'Intercept_trend'),
+             prior(std_normal(), class = 'sigma_raw_trend')),
+  
+  # use Stan's variational inference for quicker results
+  algorithm = 'meanfield',
+  
+  # no need to compute "series-level" residuals
+  residuals = FALSE,
+  samples = 1000)
+

Inspect the model summary, but don’t bother looking at estimates for all individual spline coefficients. Notice how we no longer receive information on convergence because we did not use MCMC sampling for this model:

+
summary(mod, include_betas = FALSE)
+#> GAM observation formula:
+#> y ~ s(det_cov, k = 3) + s(det_cov2, k = 3)
+#> 
+#> GAM process formula:
+#> ~s(abund_cov, k = 3) + s(abund_fac, bs = "re")
+#> 
+#> Family:
+#> nmix
+#> 
+#> Link function:
+#> log
+#> 
+#> Trend model:
+#> None
+#> 
+#> N process models:
+#> 225 
+#> 
+#> N series:
+#> 675 
+#> 
+#> N timepoints:
+#> 1 
+#> 
+#> Status:
+#> Fitted using Stan 
+#> 1 chains, each with iter = 1000; warmup = ; thin = 1 
+#> Total post-warmup draws = 1000
+#> 
+#> 
+#> GAM observation model coefficient (beta) estimates:
+#>              2.5% 50% 97.5% Rhat n.eff
+#> (Intercept) 0.052 0.4  0.71  NaN   NaN
+#> 
+#> Approximate significance of GAM observation smooths:
+#>              edf Ref.df Chi.sq p-value    
+#> s(det_cov)  1.22      2   52.3  0.0011 ** 
+#> s(det_cov2) 1.07      2  307.1  <2e-16 ***
+#> ---
+#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+#> 
+#> GAM process model coefficient (beta) estimates:
+#>                    2.5%    50% 97.5% Rhat n.eff
+#> (Intercept)_trend -0.25 -0.081 0.079  NaN   NaN
+#> 
+#> GAM process model group-level estimates:
+#>                           2.5%    50% 97.5% Rhat n.eff
+#> mean(s(abund_fac))_trend -0.18 0.0038  0.19  NaN   NaN
+#> sd(s(abund_fac))_trend    0.26 0.3900  0.56  NaN   NaN
+#> 
+#> Approximate significance of GAM process smooths:
+#>               edf Ref.df    F p-value  
+#> s(abund_cov) 1.19      2 2.38   0.299  
+#> s(abund_fac) 8.82     10 2.79   0.025 *
+#> ---
+#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+#> 
+#> Posterior approximation used: no diagnostics to compute
+

Again we can make use of marginaleffects support for interrogating the model through targeted predictions. First, we can inspect the estimated average detection probability:

+
avg_predictions(mod, type = 'detection')
+#> 
+#>  Estimate 2.5 % 97.5 %
+#>     0.579  0.51  0.644
+#> 
+#> Columns: estimate, conf.low, conf.high 
+#> Type:  detection
+

Next investigate estimated effects of covariates on latent abundance using the conditional_effects() function and specifying type = 'link'; this will return plots on the expectation scale:

+
abund_plots <- plot(conditional_effects(mod,
+                                        type = 'link',
+                                        effects = c('abund_cov',
+                                                    'abund_fac')),
+                    plot = FALSE)
+

The effect of the continuous covariate on expected latent abundance:

+
abund_plots[[1]] +
+  ylab('Expected latent abundance')
+

+

The effect of the factor covariate on expected latent abundance, estimated as a hierarchical random effect:

+
abund_plots[[2]] +
+  ylab('Expected latent abundance')
+

+

Now we can investigate estimated effects of covariates on detection probability using type = 'detection':

+
det_plots <- plot(conditional_effects(mod,
+                                      type = 'detection',
+                                      effects = c('det_cov',
+                                                  'det_cov2')),
+                  plot = FALSE)
+

The covariate smooths were estimated to be somewhat nonlinear on the logit scale according to the model summary (based on their approximate significances). But inspecting conditional effects of each covariate on the probability scale is more intuitive and useful:

+
det_plots[[1]] +
+  ylab('Pr(detection)')
+

+
det_plots[[2]] +
+  ylab('Pr(detection)')
+

+

More targeted predictions are also easy with marginaleffects support. For example, we can ask: how does detection probability change as we change both detection covariates?

+
fivenum_round = function(x)round(fivenum(x, na.rm = TRUE), 2)
+
+plot_predictions(mod, 
+                 newdata = datagrid(det_cov = unique,
+                                    det_cov2 = fivenum_round),
+                 by = c('det_cov', 'det_cov2'),
+                 type = 'detection') +
+  theme_classic() +
+  ylab('Pr(detection)')
+

+

The model has found support for some important covariate effects, but +of course we'd want to interrogate how well the model predicts, and to +consider possible spatial effects for capturing unmodelled variation in +latent abundance. Such effects can easily be incorporated into both +linear predictors using spatial smooths, as in the sketch below.
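
What follows is a hypothetical sketch only: it assumes the data contain
site coordinates in columns named 'lat' and 'lon', that the response is
named 'y' and the data object 'model_dat', and it reuses the process
formula from the fitted model while adding a Gaussian Process smooth of
space to each linear predictor:

# a sketch only; 'lat', 'lon', 'y' and 'model_dat' are assumed names
# (N-mixture models in mvgam also require a 'cap' column in the data,
# giving the maximum possible abundance for each observation)
mod_sp <- mvgam(y ~ s(det_cov, k = 3) +
                  s(det_cov2, k = 3) +
                  s(lat, lon, bs = 'gp'),
                trend_formula = ~ s(abund_cov, k = 3) +
                  s(abund_fac, bs = 're') +
                  s(lat, lon, bs = 'gp'),
                family = nmix(),
                data = model_dat)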

+
+
+

Further reading

+

The following papers and resources offer useful material about +N-mixture models for investigating ecological population dynamics:

+

Guélat, Jérôme, and Marc Kéry. “Effects +of Spatial Autocorrelation and Imperfect Detection on Species +Distribution Models.” Methods in Ecology and Evolution 9 +(2018): 1614–25.

+

Kéry, Marc, and Andrew J. Royle. “Applied +hierarchical modeling in ecology: Analysis of distribution, abundance +and species richness in R and BUGS: Volume 2: Dynamic and advanced +models”. London, UK: Academic Press (2020).

+

Royle, Andrew J. “N‐mixture +models for estimating population size from spatially replicated +counts.” Biometrics 60.1 (2004): 108-115.

+
+
+

Interested in contributing?

+

I’m actively seeking PhD students and other researchers to work in +the areas of ecological forecasting, multivariate model evaluation and +development of mvgam. Please reach out if you are +interested (n.clark’at’uq.edu.au)

+
+ + + + + + + + + + + diff --git a/doc/shared_states.R b/doc/shared_states.R new file mode 100644 index 00000000..4eabc053 --- /dev/null +++ b/doc/shared_states.R @@ -0,0 +1,260 @@ +## ----echo = FALSE------------------------------------------------------------- +NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true") +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + purl = NOT_CRAN, + eval = NOT_CRAN +) + +## ----setup, include=FALSE----------------------------------------------------- +knitr::opts_chunk$set( + echo = TRUE, + dpi = 150, + fig.asp = 0.8, + fig.width = 6, + out.width = "60%", + fig.align = "center") +library(mvgam) +library(ggplot2) +theme_set(theme_bw(base_size = 12, base_family = 'serif')) + +## ----------------------------------------------------------------------------- +set.seed(122) +simdat <- sim_mvgam(trend_model = 'AR1', + prop_trend = 0.6, + mu = c(0, 1, 2), + family = poisson()) +trend_map <- data.frame(series = unique(simdat$data_train$series), + trend = c(1, 1, 2)) +trend_map + +## ----------------------------------------------------------------------------- +all.equal(levels(trend_map$series), levels(simdat$data_train$series)) + +## ----------------------------------------------------------------------------- +fake_mod <- mvgam(y ~ + # observation model formula, which has a + # different intercept per series + series - 1, + + # process model formula, which has a shared seasonal smooth + # (each latent process model shares the SAME smooth) + trend_formula = ~ s(season, bs = 'cc', k = 6), + + # AR1 dynamics (each latent process model has DIFFERENT) + # dynamics + trend_model = 'AR1', + + # supplied trend_map + trend_map = trend_map, + + # data and observation family + family = poisson(), + data = simdat$data_train, + run_model = FALSE) + +## ----------------------------------------------------------------------------- +code(fake_mod) + +## ----------------------------------------------------------------------------- +fake_mod$model_data$Z + +## ----full_mod, include = FALSE, results='hide'-------------------------------- +full_mod <- mvgam(y ~ series - 1, + trend_formula = ~ s(season, bs = 'cc', k = 6), + trend_model = 'AR1', + trend_map = trend_map, + family = poisson(), + data = simdat$data_train) + +## ----eval=FALSE--------------------------------------------------------------- +# full_mod <- mvgam(y ~ series - 1, +# trend_formula = ~ s(season, bs = 'cc', k = 6), +# trend_model = 'AR1', +# trend_map = trend_map, +# family = poisson(), +# data = simdat$data_train) + +## ----------------------------------------------------------------------------- +summary(full_mod) + +## ----------------------------------------------------------------------------- +plot(conditional_effects(full_mod, type = 'link'), ask = FALSE) + +## ----------------------------------------------------------------------------- +plot(full_mod, type = 'trend', series = 1) +plot(full_mod, type = 'trend', series = 2) +plot(full_mod, type = 'trend', series = 3) + +## ----------------------------------------------------------------------------- +plot(full_mod, type = 'forecast', series = 1) +plot(full_mod, type = 'forecast', series = 2) +plot(full_mod, type = 'forecast', series = 3) + +## ----------------------------------------------------------------------------- +set.seed(543210) +# simulate a nonlinear relationship using the mgcv function gamSim +signal_dat <- gamSim(n = 100, eg = 1, scale = 1) + +# productivity is one of the variables in the simulated data +productivity <- 
signal_dat$x2 + +# simulate the true signal, which already has a nonlinear relationship +# with productivity; we will add in a fairly strong AR1 process to +# contribute to the signal +true_signal <- as.vector(scale(signal_dat$y) + + arima.sim(100, model = list(ar = 0.8, sd = 0.1))) + +## ----------------------------------------------------------------------------- +plot(true_signal, type = 'l', + bty = 'l', lwd = 2, + ylab = 'True signal', + xlab = 'Time') + +## ----------------------------------------------------------------------------- +plot(true_signal ~ productivity, + pch = 16, bty = 'l', + ylab = 'True signal', + xlab = 'Productivity') + +## ----------------------------------------------------------------------------- +set.seed(543210) +sim_series = function(n_series = 3, true_signal){ + temp_effects <- gamSim(n = 100, eg = 7, scale = 0.1) + temperature <- temp_effects$y + alphas <- rnorm(n_series, sd = 2) + + do.call(rbind, lapply(seq_len(n_series), function(series){ + data.frame(observed = rnorm(length(true_signal), + mean = alphas[series] + + 1.5*as.vector(scale(temp_effects[, series + 1])) + + true_signal, + sd = runif(1, 1, 2)), + series = paste0('sensor_', series), + time = 1:length(true_signal), + temperature = temperature, + productivity = productivity, + true_signal = true_signal) + })) + } +model_dat <- sim_series(true_signal = true_signal) %>% + dplyr::mutate(series = factor(series)) + +## ----------------------------------------------------------------------------- +plot_mvgam_series(data = model_dat, y = 'observed', + series = 'all') + +## ----------------------------------------------------------------------------- + plot(observed ~ temperature, data = model_dat %>% + dplyr::filter(series == 'sensor_1'), + pch = 16, bty = 'l', + ylab = 'Sensor 1', + xlab = 'Temperature') + plot(observed ~ temperature, data = model_dat %>% + dplyr::filter(series == 'sensor_2'), + pch = 16, bty = 'l', + ylab = 'Sensor 2', + xlab = 'Temperature') + plot(observed ~ temperature, data = model_dat %>% + dplyr::filter(series == 'sensor_3'), + pch = 16, bty = 'l', + ylab = 'Sensor 3', + xlab = 'Temperature') + +## ----sensor_mod, include = FALSE, results='hide'------------------------------ +mod <- mvgam(formula = + # formula for observations, allowing for different + # intercepts and smooth effects of temperature + observed ~ series + + s(temperature, k = 10) + + s(series, temperature, bs = 'sz', k = 8), + + trend_formula = + # formula for the latent signal, which can depend + # nonlinearly on productivity + ~ s(productivity, k = 8), + + trend_model = + # in addition to productivity effects, the signal is + # assumed to exhibit temporal autocorrelation + 'AR1', + + trend_map = + # trend_map forces all sensors to track the same + # latent signal + data.frame(series = unique(model_dat$series), + trend = c(1, 1, 1)), + + # informative priors on process error + # and observation error will help with convergence + priors = c(prior(normal(2, 0.5), class = sigma), + prior(normal(1, 0.5), class = sigma_obs)), + + # Gaussian observations + family = gaussian(), + burnin = 600, + adapt_delta = 0.95, + data = model_dat) + +## ----eval=FALSE--------------------------------------------------------------- +# mod <- mvgam(formula = +# # formula for observations, allowing for different +# # intercepts and hierarchical smooth effects of temperature +# observed ~ series + +# s(temperature, k = 10) + +# s(series, temperature, bs = 'sz', k = 8), +# +# trend_formula = +# # formula for the latent signal, which can 
depend +# # nonlinearly on productivity +# ~ s(productivity, k = 8), +# +# trend_model = +# # in addition to productivity effects, the signal is +# # assumed to exhibit temporal autocorrelation +# 'AR1', +# +# trend_map = +# # trend_map forces all sensors to track the same +# # latent signal +# data.frame(series = unique(model_dat$series), +# trend = c(1, 1, 1)), +# +# # informative priors on process error +# # and observation error will help with convergence +# priors = c(prior(normal(2, 0.5), class = sigma), +# prior(normal(1, 0.5), class = sigma_obs)), +# +# # Gaussian observations +# family = gaussian(), +# data = model_dat) + +## ----------------------------------------------------------------------------- +summary(mod, include_betas = FALSE) + +## ----------------------------------------------------------------------------- +plot(mod, type = 'smooths', trend_effects = TRUE) + +## ----------------------------------------------------------------------------- +plot(mod, type = 'smooths') + +## ----------------------------------------------------------------------------- +plot(conditional_effects(mod, type = 'link'), ask = FALSE) + +## ----------------------------------------------------------------------------- +plot_predictions(mod, + condition = c('temperature', 'series', 'series'), + points = 0.5) + + theme(legend.position = 'none') + +## ----------------------------------------------------------------------------- +pairs(mod, variable = c('sigma[1]', 'sigma_obs[1]')) + +## ----------------------------------------------------------------------------- +plot(mod, type = 'trend') + +# Overlay the true simulated signal +points(true_signal, pch = 16, cex = 1, col = 'white') +points(true_signal, pch = 16, cex = 0.8) + diff --git a/doc/shared_states.Rmd b/doc/shared_states.Rmd new file mode 100644 index 00000000..e0f36689 --- /dev/null +++ b/doc/shared_states.Rmd @@ -0,0 +1,346 @@ +--- +title: "Shared latent states in mvgam" +author: "Nicholas J Clark" +date: "`r Sys.Date()`" +output: + rmarkdown::html_vignette: + toc: yes +vignette: > + %\VignetteIndexEntry{Shared latent states in mvgam} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- +```{r, echo = FALSE} +NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true") +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + purl = NOT_CRAN, + eval = NOT_CRAN +) +``` + +```{r setup, include=FALSE} +knitr::opts_chunk$set( + echo = TRUE, + dpi = 150, + fig.asp = 0.8, + fig.width = 6, + out.width = "60%", + fig.align = "center") +library(mvgam) +library(ggplot2) +theme_set(theme_bw(base_size = 12, base_family = 'serif')) +``` + +This vignette gives an example of how `mvgam` can be used to estimate models where multiple observed time series share the same latent process model. For full details on the basic `mvgam` functionality, please see [the introductory vignette](https://nicholasjclark.github.io/mvgam/articles/mvgam_overview.html). + +## The `trend_map` argument +The `trend_map` argument in the `mvgam()` function is an optional `data.frame` that can be used to specify which series should depend on which latent process models (called "trends" in `mvgam`). This can be particularly useful if we wish to force multiple observed time series to depend on the same latent trend process, but with different observation processes. If this argument is supplied, a latent factor model is set up by setting `use_lv = TRUE` and using the supplied `trend_map` to set up the shared trends. 
Users familiar with the `MARSS` family of packages will recognize this as a way of specifying the $Z$ matrix. This `data.frame` needs to have column names `series` and `trend`, with integer values in the `trend` column to state which trend each series should depend on. The `series` column should have a single unique entry for each time series in the data, with names that perfectly match the factor levels of the `series` variable in `data`). For example, if we were to simulate a collection of three integer-valued time series (using `sim_mvgam`), the following `trend_map` would force the first two series to share the same latent trend process: +```{r} +set.seed(122) +simdat <- sim_mvgam(trend_model = 'AR1', + prop_trend = 0.6, + mu = c(0, 1, 2), + family = poisson()) +trend_map <- data.frame(series = unique(simdat$data_train$series), + trend = c(1, 1, 2)) +trend_map +``` + +We can see that the factor levels in `trend_map` match those in the data: +```{r} +all.equal(levels(trend_map$series), levels(simdat$data_train$series)) +``` + +### Checking `trend_map` with `run_model = FALSE` +Supplying this `trend_map` to the `mvgam` function for a simple model, but setting `run_model = FALSE`, allows us to inspect the constructed `Stan` code and the data objects that would be used to condition the model. Here we will set up a model in which each series has a different observation process (with only a different intercept per series in this case), and the two latent dynamic process models evolve as independent AR1 processes that also contain a shared nonlinear smooth function to capture repeated seasonality. This model is not too complicated but it does show how we can learn shared and independent effects for collections of time series in the `mvgam` framework: +```{r} +fake_mod <- mvgam(y ~ + # observation model formula, which has a + # different intercept per series + series - 1, + + # process model formula, which has a shared seasonal smooth + # (each latent process model shares the SAME smooth) + trend_formula = ~ s(season, bs = 'cc', k = 6), + + # AR1 dynamics (each latent process model has DIFFERENT) + # dynamics + trend_model = 'AR1', + + # supplied trend_map + trend_map = trend_map, + + # data and observation family + family = poisson(), + data = simdat$data_train, + run_model = FALSE) +``` + +Inspecting the `Stan` code shows how this model is a dynamic factor model in which the loadings are constructed to reflect the supplied `trend_map`: +```{r} +code(fake_mod) +``` + +Notice the line that states "lv_coefs = Z;". This uses the supplied $Z$ matrix to construct the loading coefficients. 
The supplied matrix now looks exactly like what you'd use if you were to create a similar model in the `MARSS` package: +```{r} +fake_mod$model_data$Z +``` + +### Fitting and inspecting the model +Though this model doesn't perfectly match the data-generating process (which allowed each series to have different underlying dynamics), we can still fit it to show what the resulting inferences look like: +```{r full_mod, include = FALSE, results='hide'} +full_mod <- mvgam(y ~ series - 1, + trend_formula = ~ s(season, bs = 'cc', k = 6), + trend_model = 'AR1', + trend_map = trend_map, + family = poisson(), + data = simdat$data_train) +``` + +```{r eval=FALSE} +full_mod <- mvgam(y ~ series - 1, + trend_formula = ~ s(season, bs = 'cc', k = 6), + trend_model = 'AR1', + trend_map = trend_map, + family = poisson(), + data = simdat$data_train) +``` + +The summary of this model is informative as it shows that only two latent process models have been estimated, even though we have three observed time series. The model converges well +```{r} +summary(full_mod) +``` + +Quick plots of all main effects can be made using `conditional_effects()`: +```{r} +plot(conditional_effects(full_mod, type = 'link'), ask = FALSE) +``` + +Even more informative are the plots of the latent processes. Both series 1 and 2 share the exact same estimates, while the estimates for series 3 are different: +```{r} +plot(full_mod, type = 'trend', series = 1) +plot(full_mod, type = 'trend', series = 2) +plot(full_mod, type = 'trend', series = 3) +``` + +However, the forecasts for series' 1 and 2 differ because they have different intercepts in the observation model +```{r} +plot(full_mod, type = 'forecast', series = 1) +plot(full_mod, type = 'forecast', series = 2) +plot(full_mod, type = 'forecast', series = 3) +``` + +## Example: signal detection +Now we will explore a more complicated example. Here we simulate a true hidden signal that we are trying to track. This signal depends nonlinearly on some covariate (called `productivity`, which represents a measure of how productive the landscape is). The signal also demonstrates a fairly large amount of temporal autocorrelation: +```{r} +set.seed(543210) +# simulate a nonlinear relationship using the mgcv function gamSim +signal_dat <- gamSim(n = 100, eg = 1, scale = 1) + +# productivity is one of the variables in the simulated data +productivity <- signal_dat$x2 + +# simulate the true signal, which already has a nonlinear relationship +# with productivity; we will add in a fairly strong AR1 process to +# contribute to the signal +true_signal <- as.vector(scale(signal_dat$y) + + arima.sim(100, model = list(ar = 0.8, sd = 0.1))) +``` + +Plot the signal to inspect it's evolution over time +```{r} +plot(true_signal, type = 'l', + bty = 'l', lwd = 2, + ylab = 'True signal', + xlab = 'Time') +``` + +And plot the relationship between the signal and the `productivity` covariate: +```{r} +plot(true_signal ~ productivity, + pch = 16, bty = 'l', + ylab = 'True signal', + xlab = 'Productivity') +``` + +Next we simulate three sensors that are trying to track the same hidden signal. All of these sensors have different observation errors that can depend nonlinearly on a second external covariate, called `temperature` in this example. 
Again this makes use of `gamSim` +```{r} +set.seed(543210) +sim_series = function(n_series = 3, true_signal){ + temp_effects <- gamSim(n = 100, eg = 7, scale = 0.1) + temperature <- temp_effects$y + alphas <- rnorm(n_series, sd = 2) + + do.call(rbind, lapply(seq_len(n_series), function(series){ + data.frame(observed = rnorm(length(true_signal), + mean = alphas[series] + + 1.5*as.vector(scale(temp_effects[, series + 1])) + + true_signal, + sd = runif(1, 1, 2)), + series = paste0('sensor_', series), + time = 1:length(true_signal), + temperature = temperature, + productivity = productivity, + true_signal = true_signal) + })) + } +model_dat <- sim_series(true_signal = true_signal) %>% + dplyr::mutate(series = factor(series)) +``` + +Plot the sensor observations +```{r} +plot_mvgam_series(data = model_dat, y = 'observed', + series = 'all') +``` + +And now plot the observed relationships between the three sensors and the `temperature` covariate +```{r} + plot(observed ~ temperature, data = model_dat %>% + dplyr::filter(series == 'sensor_1'), + pch = 16, bty = 'l', + ylab = 'Sensor 1', + xlab = 'Temperature') + plot(observed ~ temperature, data = model_dat %>% + dplyr::filter(series == 'sensor_2'), + pch = 16, bty = 'l', + ylab = 'Sensor 2', + xlab = 'Temperature') + plot(observed ~ temperature, data = model_dat %>% + dplyr::filter(series == 'sensor_3'), + pch = 16, bty = 'l', + ylab = 'Sensor 3', + xlab = 'Temperature') +``` + +### The shared signal model +Now we can formulate and fit a model that allows each sensor's observation error to depend nonlinearly on `temperature` while allowing the true signal to depend nonlinearly on `productivity`. By fixing all of the values in the `trend` column to `1` in the `trend_map`, we are assuming that all observation sensors are tracking the same latent signal. 
We use informative priors on the two variance components (process error and observation error), which reflect our prior belief that the observation error is smaller overall than the true process error +```{r sensor_mod, include = FALSE, results='hide'} +mod <- mvgam(formula = + # formula for observations, allowing for different + # intercepts and smooth effects of temperature + observed ~ series + + s(temperature, k = 10) + + s(series, temperature, bs = 'sz', k = 8), + + trend_formula = + # formula for the latent signal, which can depend + # nonlinearly on productivity + ~ s(productivity, k = 8), + + trend_model = + # in addition to productivity effects, the signal is + # assumed to exhibit temporal autocorrelation + 'AR1', + + trend_map = + # trend_map forces all sensors to track the same + # latent signal + data.frame(series = unique(model_dat$series), + trend = c(1, 1, 1)), + + # informative priors on process error + # and observation error will help with convergence + priors = c(prior(normal(2, 0.5), class = sigma), + prior(normal(1, 0.5), class = sigma_obs)), + + # Gaussian observations + family = gaussian(), + burnin = 600, + adapt_delta = 0.95, + data = model_dat) +``` + +```{r eval=FALSE} +mod <- mvgam(formula = + # formula for observations, allowing for different + # intercepts and hierarchical smooth effects of temperature + observed ~ series + + s(temperature, k = 10) + + s(series, temperature, bs = 'sz', k = 8), + + trend_formula = + # formula for the latent signal, which can depend + # nonlinearly on productivity + ~ s(productivity, k = 8), + + trend_model = + # in addition to productivity effects, the signal is + # assumed to exhibit temporal autocorrelation + 'AR1', + + trend_map = + # trend_map forces all sensors to track the same + # latent signal + data.frame(series = unique(model_dat$series), + trend = c(1, 1, 1)), + + # informative priors on process error + # and observation error will help with convergence + priors = c(prior(normal(2, 0.5), class = sigma), + prior(normal(1, 0.5), class = sigma_obs)), + + # Gaussian observations + family = gaussian(), + data = model_dat) +``` + +View a reduced version of the model summary because there will be many spline coefficients in this model +```{r} +summary(mod, include_betas = FALSE) +``` + +### Inspecting effects on both process and observation models +Don't pay much attention to the approximate *p*-values of the smooth terms. The calculation for these values is incredibly sensitive to the estimates for the smoothing parameters so I don't tend to find them to be very meaningful. What are meaningful, however, are prediction-based plots of the smooth functions. For example, here is the estimated response of the underlying signal to `productivity`: +```{r} +plot(mod, type = 'smooths', trend_effects = TRUE) +``` + +And here are the estimated relationships between the sensor observations and the `temperature` covariate: +```{r} +plot(mod, type = 'smooths') +``` + +All main effects can be quickly plotted with `conditional_effects`: +```{r} +plot(conditional_effects(mod, type = 'link'), ask = FALSE) +``` + +`conditional_effects` is simply a wrapper to the more flexible `plot_predictions` function from the `marginaleffects` package. 
We can get more useful plots of these effects using this function for further customisation: +```{r} +plot_predictions(mod, + condition = c('temperature', 'series', 'series'), + points = 0.5) + + theme(legend.position = 'none') +``` + +We have successfully estimated effects, some of them nonlinear, that impact the hidden process AND the observations. All in a single joint model. But there can always be challenges with these models, particularly when estimating both process and observation error at the same time. For example, a `pairs` plot for the observation error for sensor 1 and the hidden process error shows some strong correlations that we might want to deal with by using a more structured prior: +```{r} +pairs(mod, variable = c('sigma[1]', 'sigma_obs[1]')) +``` + +But we will leave the model as-is for this example + +### Recovering the hidden signal +A final but very key question is whether we can successfully recover the true hidden signal. The `trend` slot in the returned model parameters has the estimates for this signal, which we can easily plot using the `mvgam` S3 method for `plot`. We can also overlay the true values for the hidden signal, which shows that our model has done a good job of recovering it: +```{r} +plot(mod, type = 'trend') + +# Overlay the true simulated signal +points(true_signal, pch = 16, cex = 1, col = 'white') +points(true_signal, pch = 16, cex = 0.8) +``` + +## Further reading +The following papers and resources offer a lot of useful material about other types of State-Space models and how they can be applied in practice: + +Holmes, Elizabeth E., Eric J. Ward, and Wills Kellie. "[MARSS: multivariate autoregressive state-space models for analyzing time-series data.](https://d1wqtxts1xzle7.cloudfront.net/30588864/rjournal_2012-1_holmes_et_al-libre.pdf?1391843792=&response-content-disposition=inline%3B+filename%3DMARSS_Multivariate_Autoregressive_State.pdf&Expires=1695861526&Signature=TCRXULs0mUKRM4m1pmvZxwE10bUqS6vzLcuKeUBCj57YIjx23iTxS1fEgBpV0fs2wb5XAw7ZkG84XyMaoS~vjiqZ-DpheQDHwAHpIWG-TcckHQjEjPWTNajFvAemToUdCiHnDa~yrhW9HRgXjgncdalkjzjvjT3HLSW8mcjBDhQN-WJ3MKQFSXtxoBpWfcuPYbf-HC1E1oSl7957y~w0I1gcIVdu6LHjP~CEKXa0BQzS4xuarL2nz~tHD2MverbNJYMrDGrAxIi-MX6i~lfHWuwV6UKRdoOZ0pXIcMYWBTv9V5xYey76aMKTICiJ~0NqXLZdXO5qlS4~~2nFEO7b7w__&Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA)" *R Journal*. 4.1 (2012): 11. + +Ward, Eric J., et al. "[Inferring spatial structure from time‐series data: using multivariate state‐space models to detect metapopulation structure of California sea lions in the Gulf of California, Mexico.](https://besjournals.onlinelibrary.wiley.com/doi/full/10.1111/j.1365-2664.2009.01745.x)" *Journal of Applied Ecology* 47.1 (2010): 47-56. + +Auger‐Méthé, Marie, et al. ["A guide to state–space modeling of ecological time series.](https://esajournals.onlinelibrary.wiley.com/doi/full/10.1002/ecm.1470)" *Ecological Monographs* 91.4 (2021): e01470. + +## Interested in contributing? +I'm actively seeking PhD students and other researchers to work in the areas of ecological forecasting, multivariate model evaluation and development of `mvgam`. Please reach out if you are interested (n.clark'at'uq.edu.au) diff --git a/doc/shared_states.html b/doc/shared_states.html new file mode 100644 index 00000000..77372995 --- /dev/null +++ b/doc/shared_states.html @@ -0,0 +1,993 @@ + + + + + + + + + + + + + + + + +Shared latent states in mvgam + + + + + + + + + + + + + + + + + + + + + + + + + + +

Shared latent states in mvgam

+

Nicholas J Clark

+

2024-04-16

+ + +
+ +
+ +

This vignette gives an example of how mvgam can be used +to estimate models where multiple observed time series share the same +latent process model. For full details on the basic mvgam +functionality, please see the +introductory vignette.

+
+

The trend_map argument

+

The trend_map argument in the mvgam() +function is an optional data.frame that can be used to +specify which series should depend on which latent process models +(called “trends” in mvgam). This can be particularly useful +if we wish to force multiple observed time series to depend on the same +latent trend process, but with different observation processes. If this +argument is supplied, a latent factor model is set up by setting +use_lv = TRUE and using the supplied trend_map +to set up the shared trends. Users familiar with the MARSS +family of packages will recognize this as a way of specifying the \(Z\) matrix. This data.frame +needs to have column names series and trend, +with integer values in the trend column to state which +trend each series should depend on. The series column +should have a single unique entry for each time series in the data, with +names that perfectly match the factor levels of the series +variable in data. For example, if we were to simulate a +collection of three integer-valued time series (using +sim_mvgam), the following trend_map would +force the first two series to share the same latent trend process:

+
set.seed(122)
+simdat <- sim_mvgam(trend_model = 'AR1',
+                    prop_trend = 0.6,
+                    mu = c(0, 1, 2),
+                    family = poisson())
+trend_map <- data.frame(series = unique(simdat$data_train$series),
+                        trend = c(1, 1, 2))
+trend_map
+#>     series trend
+#> 1 series_1     1
+#> 2 series_2     1
+#> 3 series_3     2
+

We can see that the factor levels in trend_map match +those in the data:

+
all.equal(levels(trend_map$series), levels(simdat$data_train$series))
+#> [1] TRUE
+
+

Checking trend_map with +run_model = FALSE

+

Supplying this trend_map to the mvgam +function for a simple model, but setting run_model = FALSE, +allows us to inspect the constructed Stan code and the data +objects that would be used to condition the model. Here we will set up a +model in which each series has a different observation process (with +only a different intercept per series in this case), and the two latent +dynamic process models evolve as independent AR1 processes that also +contain a shared nonlinear smooth function to capture repeated +seasonality. This model is not too complicated but it does show how we +can learn shared and independent effects for collections of time series +in the mvgam framework:

+
fake_mod <- mvgam(y ~ 
+                    # observation model formula, which has a 
+                    # different intercept per series
+                    series - 1,
+                  
+                  # process model formula, which has a shared seasonal smooth
+                  # (each latent process model shares the SAME smooth)
+                  trend_formula = ~ s(season, bs = 'cc', k = 6),
+                  
+                  # AR1 dynamics (each latent process model has
+                  # DIFFERENT dynamics)
+                  trend_model = 'AR1',
+                  
+                  # supplied trend_map
+                  trend_map = trend_map,
+                  
+                  # data and observation family
+                  family = poisson(),
+                  data = simdat$data_train,
+                  run_model = FALSE)
+

Inspecting the Stan code shows how this model is a +dynamic factor model in which the loadings are constructed to reflect +the supplied trend_map:

+
code(fake_mod)
+#> // Stan model code generated by package mvgam
+#> data {
+#>   int<lower=0> total_obs; // total number of observations
+#>   int<lower=0> n; // number of timepoints per series
+#>   int<lower=0> n_sp_trend; // number of trend smoothing parameters
+#>   int<lower=0> n_lv; // number of dynamic factors
+#>   int<lower=0> n_series; // number of series
+#>   matrix[n_series, n_lv] Z; // matrix mapping series to latent states
+#>   int<lower=0> num_basis; // total number of basis coefficients
+#>   int<lower=0> num_basis_trend; // number of trend basis coefficients
+#>   vector[num_basis_trend] zero_trend; // prior locations for trend basis coefficients
+#>   matrix[total_obs, num_basis] X; // mgcv GAM design matrix
+#>   matrix[n * n_lv, num_basis_trend] X_trend; // trend model design matrix
+#>   array[n, n_series] int<lower=0> ytimes; // time-ordered matrix (which col in X belongs to each [time, series] observation?)
+#>   array[n, n_lv] int ytimes_trend;
+#>   int<lower=0> n_nonmissing; // number of nonmissing observations
+#>   matrix[4, 4] S_trend1; // mgcv smooth penalty matrix S_trend1
+#>   array[n_nonmissing] int<lower=0> flat_ys; // flattened nonmissing observations
+#>   matrix[n_nonmissing, num_basis] flat_xs; // X values for nonmissing observations
+#>   array[n_nonmissing] int<lower=0> obs_ind; // indices of nonmissing observations
+#> }
+#> transformed data {
+#>   
+#> }
+#> parameters {
+#>   // raw basis coefficients
+#>   vector[num_basis] b_raw;
+#>   vector[num_basis_trend] b_raw_trend;
+#>   
+#>   // latent state SD terms
+#>   vector<lower=0>[n_lv] sigma;
+#>   
+#>   // latent state AR1 terms
+#>   vector<lower=-1.5, upper=1.5>[n_lv] ar1;
+#>   
+#>   // latent states
+#>   matrix[n, n_lv] LV;
+#>   
+#>   // smoothing parameters
+#>   vector<lower=0>[n_sp_trend] lambda_trend;
+#> }
+#> transformed parameters {
+#>   // latent states and loading matrix
+#>   vector[n * n_lv] trend_mus;
+#>   matrix[n, n_series] trend;
+#>   
+#>   // basis coefficients
+#>   vector[num_basis] b;
+#>   vector[num_basis_trend] b_trend;
+#>   
+#>   // observation model basis coefficients
+#>   b[1 : num_basis] = b_raw[1 : num_basis];
+#>   
+#>   // process model basis coefficients
+#>   b_trend[1 : num_basis_trend] = b_raw_trend[1 : num_basis_trend];
+#>   
+#>   // latent process linear predictors
+#>   trend_mus = X_trend * b_trend;
+#>   
+#>   // derived latent states
+#>   for (i in 1 : n) {
+#>     for (s in 1 : n_series) {
+#>       trend[i, s] = dot_product(Z[s,  : ], LV[i,  : ]);
+#>     }
+#>   }
+#> }
+#> model {
+#>   // prior for seriesseries_1...
+#>   b_raw[1] ~ student_t(3, 0, 2);
+#>   
+#>   // prior for seriesseries_2...
+#>   b_raw[2] ~ student_t(3, 0, 2);
+#>   
+#>   // prior for seriesseries_3...
+#>   b_raw[3] ~ student_t(3, 0, 2);
+#>   
+#>   // priors for AR parameters
+#>   ar1 ~ std_normal();
+#>   
+#>   // priors for latent state SD parameters
+#>   sigma ~ student_t(3, 0, 2.5);
+#>   
+#>   // dynamic process models
+#>   
+#>   // prior for s(season)_trend...
+#>   b_raw_trend[1 : 4] ~ multi_normal_prec(zero_trend[1 : 4],
+#>                                          S_trend1[1 : 4, 1 : 4]
+#>                                          * lambda_trend[1]);
+#>   lambda_trend ~ normal(5, 30);
+#>   for (j in 1 : n_lv) {
+#>     LV[1, j] ~ normal(trend_mus[ytimes_trend[1, j]], sigma[j]);
+#>     for (i in 2 : n) {
+#>       LV[i, j] ~ normal(trend_mus[ytimes_trend[i, j]]
+#>                         + ar1[j]
+#>                           * (LV[i - 1, j] - trend_mus[ytimes_trend[i - 1, j]]),
+#>                         sigma[j]);
+#>     }
+#>   }
+#>   {
+#>     // likelihood functions
+#>     vector[n_nonmissing] flat_trends;
+#>     flat_trends = to_vector(trend)[obs_ind];
+#>     flat_ys ~ poisson_log_glm(append_col(flat_xs, flat_trends), 0.0,
+#>                               append_row(b, 1.0));
+#>   }
+#> }
+#> generated quantities {
+#>   vector[total_obs] eta;
+#>   matrix[n, n_series] mus;
+#>   vector[n_sp_trend] rho_trend;
+#>   vector[n_lv] penalty;
+#>   array[n, n_series] int ypred;
+#>   penalty = 1.0 / (sigma .* sigma);
+#>   rho_trend = log(lambda_trend);
+#>   
+#>   matrix[n_series, n_lv] lv_coefs = Z;
+#>   // posterior predictions
+#>   eta = X * b;
+#>   for (s in 1 : n_series) {
+#>     mus[1 : n, s] = eta[ytimes[1 : n, s]] + trend[1 : n, s];
+#>     ypred[1 : n, s] = poisson_log_rng(mus[1 : n, s]);
+#>   }
+#> }
+

Notice the line that states “lv_coefs = Z;”. This uses the supplied +\(Z\) matrix to construct the loading +coefficients. The supplied matrix now looks exactly like what you’d use +if you were to create a similar model in the MARSS +package:

+
fake_mod$model_data$Z
+#>      [,1] [,2]
+#> [1,]    1    0
+#> [2,]    1    0
+#> [3,]    0    1
+
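
For readers coming from MARSS, a rough sketch of the analogous
specification is shown below. This is an illustration only; it assumes
a response matrix y_mat with the three series in rows and time in
columns, so the fit call is left commented out:

# sketch: the same 3-series, 2-state loading matrix in MARSS
library(MARSS)
Z <- matrix(c(1, 0,
              1, 0,
              0, 1), nrow = 3, byrow = TRUE)
# fit <- MARSS(y_mat, model = list(Z = Z))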
+
+

Fitting and inspecting the model

+

Though this model doesn’t perfectly match the data-generating process +(which allowed each series to have different underlying dynamics), we +can still fit it to show what the resulting inferences look like:

+
full_mod <- mvgam(y ~ series - 1,
+                  trend_formula = ~ s(season, bs = 'cc', k = 6),
+                  trend_model = 'AR1',
+                  trend_map = trend_map,
+                  family = poisson(),
+                  data = simdat$data_train)
+

The summary of this model is informative as it shows that only two +latent process models have been estimated, even though we have three +observed time series. The model converges well

+
summary(full_mod)
+#> GAM observation formula:
+#> y ~ series - 1
+#> 
+#> GAM process formula:
+#> ~s(season, bs = "cc", k = 6)
+#> 
+#> Family:
+#> poisson
+#> 
+#> Link function:
+#> log
+#> 
+#> Trend model:
+#> AR1
+#> 
+#> N process models:
+#> 2 
+#> 
+#> N series:
+#> 3 
+#> 
+#> N timepoints:
+#> 75 
+#> 
+#> Status:
+#> Fitted using Stan 
+#> 4 chains, each with iter = 1000; warmup = 500; thin = 1 
+#> Total post-warmup draws = 2000
+#> 
+#> 
+#> GAM observation model coefficient (beta) estimates:
+#>                 2.5%   50% 97.5% Rhat n_eff
+#> seriesseries_1 -0.14 0.087  0.31    1  1303
+#> seriesseries_2  0.91 1.100  1.20    1  1076
+#> seriesseries_3  1.90 2.100  2.30    1   456
+#> 
+#> Process model AR parameter estimates:
+#>         2.5%     50% 97.5% Rhat n_eff
+#> ar1[1] -0.72 -0.4300 -0.04 1.00   572
+#> ar1[2] -0.28 -0.0074  0.26 1.01  1838
+#> 
+#> Process error parameter estimates:
+#>          2.5%  50% 97.5% Rhat n_eff
+#> sigma[1] 0.33 0.48  0.68    1   343
+#> sigma[2] 0.59 0.73  0.90    1  1452
+#> 
+#> GAM process model coefficient (beta) estimates:
+#>                    2.5%     50% 97.5% Rhat n_eff
+#> s(season).1_trend -0.21 -0.0076  0.20    1  1917
+#> s(season).2_trend -0.27 -0.0470  0.18    1  1682
+#> s(season).3_trend -0.15  0.0670  0.29    1  1462
+#> s(season).4_trend -0.15  0.0630  0.27    1  1574
+#> 
+#> Approximate significance of GAM process smooths:
+#>            edf Ref.df    F p-value
+#> s(season) 2.49      4 0.09    0.93
+#> 
+#> Stan MCMC diagnostics:
+#> n_eff / iter looks reasonable for all parameters
+#> Rhat looks reasonable for all parameters
+#> 0 of 2000 iterations ended with a divergence (0%)
+#> 0 of 2000 iterations saturated the maximum tree depth of 12 (0%)
+#> E-FMI indicated no pathological behavior
+#> 
+#> Samples were drawn using NUTS(diag_e) at Tue Apr 16 1:09:57 PM 2024.
+#> For each parameter, n_eff is a crude measure of effective sample size,
+#> and Rhat is the potential scale reduction factor on split MCMC chains
+#> (at convergence, Rhat = 1)
+

Quick plots of all main effects can be made using +conditional_effects():

+
plot(conditional_effects(full_mod, type = 'link'), ask = FALSE)
+

+

Even more informative are the plots of the latent processes. Both +series 1 and 2 share the exact same estimates, while the estimates for +series 3 are different:

+
plot(full_mod, type = 'trend', series = 1)
+

+
plot(full_mod, type = 'trend', series = 2)
+

+
plot(full_mod, type = 'trend', series = 3)
+

+

However, the forecasts for series 1 and 2 differ because they have +different intercepts in the observation model

+
plot(full_mod, type = 'forecast', series = 1)
+

+
plot(full_mod, type = 'forecast', series = 2)
+

+
plot(full_mod, type = 'forecast', series = 3)
+
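
Because sim_mvgam also returns an out-of-sample split
(simdat$data_test), these forecasts can be scored as well. A brief
sketch using mvgam's forecast() and score() functions, assuming the
CRPS is an appropriate scoring rule here:

# sketch: probabilistic forecast evaluation with the CRPS
fc <- forecast(full_mod, newdata = simdat$data_test)
crps_scores <- score(fc, score = 'crps')
str(crps_scores, max.level = 1)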

+
+
+
+

Example: signal detection

+

Now we will explore a more complicated example. Here we simulate a +true hidden signal that we are trying to track. This signal depends +nonlinearly on some covariate (called productivity, which +represents a measure of how productive the landscape is). The signal +also demonstrates a fairly large amount of temporal autocorrelation:

+
set.seed(543210)
+# simulate a nonlinear relationship using the mgcv function gamSim
+signal_dat <- gamSim(n = 100, eg = 1, scale = 1)
+#> Gu & Wahba 4 term additive model
+
+# productivity is one of the variables in the simulated data
+productivity <- signal_dat$x2
+
+# simulate the true signal, which already has a nonlinear relationship
+# with productivity; we will add in a fairly strong AR1 process to 
+# contribute to the signal
+true_signal <- as.vector(scale(signal_dat$y) +
+                         arima.sim(100, model = list(ar = 0.8, sd = 0.1)))
+

Plot the signal to inspect its evolution over time

+
plot(true_signal, type = 'l',
+     bty = 'l', lwd = 2,
+     ylab = 'True signal',
+     xlab = 'Time')
+

+

And plot the relationship between the signal and the +productivity covariate:

+
plot(true_signal ~ productivity,
+     pch = 16, bty = 'l',
+     ylab = 'True signal',
+     xlab = 'Productivity')
+

+

Next, we simulate three sensors that each try to track the same +hidden signal. All of these sensors have different observation errors +that can depend nonlinearly on a second external covariate, called +temperature in this example. Again, this makes use of +gamSim

+
set.seed(543210)
+sim_series = function(n_series = 3, true_signal){
+  temp_effects <- gamSim(n = 100, eg = 7, scale = 0.1)
+  temperature <- temp_effects$y
+  alphas <- rnorm(n_series, sd = 2)
+
+  do.call(rbind, lapply(seq_len(n_series), function(series){
+    data.frame(observed = rnorm(length(true_signal),
+                                mean = alphas[series] +
+                                       1.5*as.vector(scale(temp_effects[, series + 1])) +
+                                       true_signal,
+                                sd = runif(1, 1, 2)),
+               series = paste0('sensor_', series),
+               time = 1:length(true_signal),
+               temperature = temperature,
+               productivity = productivity,
+               true_signal = true_signal)
+   }))
+  }
+model_dat <- sim_series(true_signal = true_signal) %>%
+  dplyr::mutate(series = factor(series))
+#> Gu & Wahba 4 term additive model, correlated predictors
+

Plot the sensor observations

+
plot_mvgam_series(data = model_dat, y = 'observed',
+                  series = 'all')
+

+

And now plot the observed relationships between the three sensors and +the temperature covariate

+
 plot(observed ~ temperature, data = model_dat %>%
+   dplyr::filter(series == 'sensor_1'),
+   pch = 16, bty = 'l',
+   ylab = 'Sensor 1',
+   xlab = 'Temperature')
+

+
 plot(observed ~ temperature, data = model_dat %>%
+   dplyr::filter(series == 'sensor_2'),
+   pch = 16, bty = 'l',
+   ylab = 'Sensor 2',
+   xlab = 'Temperature')
+

+
 plot(observed ~ temperature, data = model_dat %>%
+   dplyr::filter(series == 'sensor_3'),
+   pch = 16, bty = 'l',
+   ylab = 'Sensor 3',
+   xlab = 'Temperature')
+

+
+

The shared signal model

+

Now we can formulate and fit a model that allows each sensor’s +observation error to depend nonlinearly on temperature +while allowing the true signal to depend nonlinearly on +productivity. By fixing all of the values in the +trend column to 1 in the +trend_map, we are assuming that all observation sensors are +tracking the same latent signal. We use informative priors on the two +variance components (process error and observation error), which reflect +our prior belief that the observation error is smaller overall than the +true process error

+
mod <- mvgam(formula =
+               # formula for observations, allowing for different
+               # intercepts and hierarchical smooth effects of temperature
+               observed ~ series + 
+               s(temperature, k = 10) +
+               s(series, temperature, bs = 'sz', k = 8),
+             
+             trend_formula =
+               # formula for the latent signal, which can depend
+               # nonlinearly on productivity
+               ~ s(productivity, k = 8),
+             
+             trend_model =
+               # in addition to productivity effects, the signal is
+               # assumed to exhibit temporal autocorrelation
+               'AR1',
+             
+             trend_map =
+               # trend_map forces all sensors to track the same
+               # latent signal
+               data.frame(series = unique(model_dat$series),
+                          trend = c(1, 1, 1)),
+             
+             # informative priors on process error
+             # and observation error will help with convergence
+             priors = c(prior(normal(2, 0.5), class = sigma),
+                        prior(normal(1, 0.5), class = sigma_obs)),
+             
+             # Gaussian observations
+             family = gaussian(),
+             data = model_dat)
+
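
As an aside, if you are unsure which parameters accept prior
modifications for a given specification, get_mvgam_priors() takes the
same model arguments as mvgam() and lists them. A sketch for this model
(assuming the trend arguments are accepted in the same way):

get_mvgam_priors(observed ~ series +
                   s(temperature, k = 10) +
                   s(series, temperature, bs = 'sz', k = 8),
                 trend_formula = ~ s(productivity, k = 8),
                 trend_model = 'AR1',
                 trend_map = data.frame(series = unique(model_dat$series),
                                        trend = c(1, 1, 1)),
                 family = gaussian(),
                 data = model_dat)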

View a reduced version of the model summary because there will be +many spline coefficients in this model

+
summary(mod, include_betas = FALSE)
+#> GAM observation formula:
+#> observed ~ series + s(temperature, k = 10) + s(series, temperature, 
+#>     bs = "sz", k = 8)
+#> 
+#> GAM process formula:
+#> ~s(productivity, k = 8)
+#> 
+#> Family:
+#> gaussian
+#> 
+#> Link function:
+#> identity
+#> 
+#> Trend model:
+#> AR1
+#> 
+#> N process models:
+#> 1 
+#> 
+#> N series:
+#> 3 
+#> 
+#> N timepoints:
+#> 100 
+#> 
+#> Status:
+#> Fitted using Stan 
+#> 4 chains, each with iter = 1100; warmup = 600; thin = 1 
+#> Total post-warmup draws = 2000
+#> 
+#> 
+#> Observation error parameter estimates:
+#>              2.5% 50% 97.5% Rhat n_eff
+#> sigma_obs[1]  1.6 1.9   2.2    1  1757
+#> sigma_obs[2]  1.4 1.7   2.0    1  1090
+#> sigma_obs[3]  1.3 1.5   1.8    1  1339
+#> 
+#> GAM observation model coefficient (beta) estimates:
+#>                 2.5%   50% 97.5% Rhat n_eff
+#> (Intercept)     0.72  1.70  2.50 1.01   360
+#> seriessensor_2 -2.10 -0.96  0.32 1.00  1068
+#> seriessensor_3 -3.40 -2.00 -0.39 1.00  1154
+#> 
+#> Approximate significance of GAM observation smooths:
+#>                        edf Ref.df    F p-value    
+#> s(temperature)        1.22      9 12.7  <2e-16 ***
+#> s(series,temperature) 1.92     16  1.0   0.011 *  
+#> ---
+#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+#> 
+#> Process model AR parameter estimates:
+#>        2.5%  50% 97.5% Rhat n_eff
+#> ar1[1] 0.33 0.59  0.83    1   492
+#> 
+#> Process error parameter estimates:
+#>          2.5% 50% 97.5% Rhat n_eff
+#> sigma[1] 0.72   1   1.3 1.01   392
+#> 
+#> Approximate significance of GAM process smooths:
+#>                 edf Ref.df    F p-value    
+#> s(productivity) 3.6      7 9.34 0.00036 ***
+#> ---
+#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+#> 
+#> Stan MCMC diagnostics:
+#> n_eff / iter looks reasonable for all parameters
+#> Rhats above 1.05 found for 28 parameters
+#>  *Diagnose further to investigate why the chains have not mixed
+#> 0 of 2000 iterations ended with a divergence (0%)
+#> 0 of 2000 iterations saturated the maximum tree depth of 12 (0%)
+#> E-FMI indicated no pathological behavior
+#> 
+#> Samples were drawn using NUTS(diag_e) at Tue Apr 16 1:11:39 PM 2024.
+#> For each parameter, n_eff is a crude measure of effective sample size,
+#> and Rhat is the potential scale reduction factor on split MCMC chains
+#> (at convergence, Rhat = 1)
+
+
+

Inspecting effects on both process and observation models

+

Don’t pay much attention to the approximate p-values of the +smooth terms. The calculation of these values is incredibly sensitive +to the estimates of the smoothing parameters, so I don’t find them +very meaningful. What are meaningful, however, are +prediction-based plots of the smooth functions. For example, here is the +estimated response of the underlying signal to +productivity:

+
plot(mod, type = 'smooths', trend_effects = TRUE)
+

+

And here are the estimated relationships between the sensor +observations and the temperature covariate:

+
plot(mod, type = 'smooths')
+

+

All main effects can be quickly plotted with +conditional_effects:

+
plot(conditional_effects(mod, type = 'link'), ask = FALSE)
+

+

conditional_effects is simply a wrapper around the more +flexible plot_predictions function from the +marginaleffects package. We can produce more customised plots of +these effects by calling plot_predictions directly:

+
plot_predictions(mod, 
+                 condition = c('temperature', 'series', 'series'),
+                 points = 0.5) +
+  theme(legend.position = 'none')
+

+

We have successfully estimated effects, some of them nonlinear, that +impact the hidden process AND the observations, all in a single joint +model. But there can always be challenges with these models, +particularly when estimating both process and observation error at the +same time. For example, a pairs plot for the observation +error for sensor 1 and the hidden process error shows some strong +correlations that we might want to address by using a more structured +prior:

+
pairs(mod, variable = c('sigma[1]', 'sigma_obs[1]'))
+

+

But we will leave the model as-is for this example; one possible +refinement is sketched below.
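
The sketch below shows one way such a refinement could look, simply
tightening the two variance priors so the components are better
separated a priori. This is an illustration, not a recommendation from
the vignette:

# sketch: narrower, better-separated priors on the two error terms
priors <- c(prior(normal(2, 0.25), class = sigma),
            prior(normal(1, 0.25), class = sigma_obs))
# the model could then be refitted with 'priors = priors'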

+
+
+

Recovering the hidden signal

+

A final, key question is whether we can successfully recover +the true hidden signal. The trend slot in the returned +model parameters holds the estimates of this signal, which we can easily +plot using the mvgam S3 method for plot. We +can also overlay the true values of the hidden signal, which shows that +our model has done a good job of recovering it:

+
plot(mod, type = 'trend')
+
+# Overlay the true simulated signal
+points(true_signal, pch = 16, cex = 1, col = 'white')
+points(true_signal, pch = 16, cex = 0.8)
+

+
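
A quick numeric check can complement the visual one. The sketch below
assumes that posterior draws of the trend can be extracted with the
same as.data.frame() method used for other parameters, and that the
columns for series 1 come first:

# sketch: correlate the posterior median signal with the truth
trend_draws <- as.data.frame(mod, variable = 'trend', regex = TRUE)
post_median <- apply(trend_draws, 2, median)[1:length(true_signal)]
cor(post_median, true_signal)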
+
+
+

Further reading

+

The following papers and resources offer a lot of useful material +about other types of State-Space models and how they can be applied in +practice:

+

Holmes, Elizabeth E., Eric J. Ward, and Kellie Wills. “MARSS: +multivariate autoregressive state-space models for analyzing time-series +data.” R Journal 4.1 (2012): 11.

+

Ward, Eric J., et al. “Inferring +spatial structure from time‐series data: using multivariate state‐space +models to detect metapopulation structure of California sea lions in the +Gulf of California, Mexico.” Journal of Applied Ecology +47.1 (2010): 47-56.

+

Auger‐Méthé, Marie, et al. “A +guide to state–space modeling of ecological time series.” +Ecological Monographs 91.4 (2021): e01470.

+
+
+

Interested in contributing?

+

I’m actively seeking PhD students and other researchers to work in +the areas of ecological forecasting, multivariate model evaluation and +development of mvgam. Please reach out if you are +interested (n.clark’at’uq.edu.au)

+
+ + + + + + + + + + + diff --git a/doc/time_varying_effects.R b/doc/time_varying_effects.R new file mode 100644 index 00000000..f5415f2b --- /dev/null +++ b/doc/time_varying_effects.R @@ -0,0 +1,237 @@ +## ----echo = FALSE------------------------------------------------------------- +NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true") +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + purl = NOT_CRAN, + eval = NOT_CRAN +) + +## ----setup, include=FALSE----------------------------------------------------- +knitr::opts_chunk$set( + echo = TRUE, + dpi = 150, + fig.asp = 0.8, + fig.width = 6, + out.width = "60%", + fig.align = "center") +library(mvgam) +library(ggplot2) +theme_set(theme_bw(base_size = 12, base_family = 'serif')) + +## ----------------------------------------------------------------------------- +set.seed(1111) +N <- 200 +beta_temp <- mvgam:::sim_gp(rnorm(1), + alpha_gp = 0.75, + rho_gp = 10, + h = N) + 0.5 + +## ----fig.alt = "Simulating time-varying effects in mvgam and R"--------------- +plot(beta_temp, type = 'l', lwd = 3, + bty = 'l', xlab = 'Time', ylab = 'Coefficient', + col = 'darkred') +box(bty = 'l', lwd = 2) + +## ----------------------------------------------------------------------------- +temp <- rnorm(N, sd = 1) + +## ----fig.alt = "Simulating time-varying effects in mvgam and R"--------------- +out <- rnorm(N, mean = 4 + beta_temp * temp, + sd = 0.25) +time <- seq_along(temp) +plot(out, type = 'l', lwd = 3, + bty = 'l', xlab = 'Time', ylab = 'Outcome', + col = 'darkred') +box(bty = 'l', lwd = 2) + +## ----------------------------------------------------------------------------- +data <- data.frame(out, temp, time) +data_train <- data[1:190,] +data_test <- data[191:200,] + +## ----------------------------------------------------------------------------- +plot_mvgam_series(data = data_train, newdata = data_test, y = 'out') + +## ----include=FALSE------------------------------------------------------------ +mod <- mvgam(out ~ dynamic(temp, rho = 8, stationary = TRUE, k = 40), + family = gaussian(), + data = data_train) + +## ----eval=FALSE--------------------------------------------------------------- +# mod <- mvgam(out ~ dynamic(temp, rho = 8, stationary = TRUE, k = 40), +# family = gaussian(), +# data = data_train) + +## ----------------------------------------------------------------------------- +summary(mod, include_betas = FALSE) + +## ----------------------------------------------------------------------------- +plot(mod, type = 'smooths') + +## ----------------------------------------------------------------------------- +plot_mvgam_smooth(mod, smooth = 1, newdata = data) +abline(v = 190, lty = 'dashed', lwd = 2) +lines(beta_temp, lwd = 2.5, col = 'white') +lines(beta_temp, lwd = 2) + +## ----------------------------------------------------------------------------- +range_round = function(x){ + round(range(x, na.rm = TRUE), 2) +} +plot_predictions(mod, + newdata = datagrid(time = unique, + temp = range_round), + by = c('time', 'temp', 'temp'), + type = 'link') + +## ----------------------------------------------------------------------------- +fc <- forecast(mod, newdata = data_test) +plot(fc) + +## ----include=FALSE------------------------------------------------------------ +mod <- mvgam(out ~ dynamic(temp, k = 40), + family = gaussian(), + data = data_train) + +## ----eval=FALSE--------------------------------------------------------------- +# mod <- mvgam(out ~ dynamic(temp, k = 40), +# family = gaussian(), +# data = data_train) 
+ +## ----------------------------------------------------------------------------- +summary(mod, include_betas = FALSE) + +## ----------------------------------------------------------------------------- +plot_mvgam_smooth(mod, smooth = 1, newdata = data) +abline(v = 190, lty = 'dashed', lwd = 2) +lines(beta_temp, lwd = 2.5, col = 'white') +lines(beta_temp, lwd = 2) + +## ----------------------------------------------------------------------------- +plot_predictions(mod, + newdata = datagrid(time = unique, + temp = range_round), + by = c('time', 'temp', 'temp'), + type = 'link') + +## ----------------------------------------------------------------------------- +fc <- forecast(mod, newdata = data_test) +plot(fc) + +## ----------------------------------------------------------------------------- +load(url('https://github.com/atsa-es/MARSS/raw/master/data/SalmonSurvCUI.rda')) +dplyr::glimpse(SalmonSurvCUI) + +## ----------------------------------------------------------------------------- +SalmonSurvCUI %>% + # create a time variable + dplyr::mutate(time = dplyr::row_number()) %>% + + # create a series variable + dplyr::mutate(series = as.factor('salmon')) %>% + + # z-score the covariate CUI.apr + dplyr::mutate(CUI.apr = as.vector(scale(CUI.apr))) %>% + + # convert logit-transformed survival back to proportional + dplyr::mutate(survival = plogis(logit.s)) -> model_data + +## ----------------------------------------------------------------------------- +dplyr::glimpse(model_data) + +## ----------------------------------------------------------------------------- +plot_mvgam_series(data = model_data, y = 'survival') + +## ----include = FALSE---------------------------------------------------------- +mod0 <- mvgam(formula = survival ~ 1, + trend_model = 'RW', + family = betar(), + data = model_data) + +## ----eval = FALSE------------------------------------------------------------- +# mod0 <- mvgam(formula = survival ~ 1, +# trend_model = 'RW', +# family = betar(), +# data = model_data) + +## ----------------------------------------------------------------------------- +summary(mod0) + +## ----------------------------------------------------------------------------- +plot(mod0, type = 'trend') + +## ----------------------------------------------------------------------------- +plot(mod0, type = 'forecast') + +## ----include=FALSE------------------------------------------------------------ +mod1 <- mvgam(formula = survival ~ 1, + trend_formula = ~ dynamic(CUI.apr, k = 25, scale = FALSE), + trend_model = 'RW', + family = betar(), + data = model_data, + adapt_delta = 0.99) + +## ----eval=FALSE--------------------------------------------------------------- +# mod1 <- mvgam(formula = survival ~ 1, +# trend_formula = ~ dynamic(CUI.apr, k = 25, scale = FALSE), +# trend_model = 'RW', +# family = betar(), +# data = model_data) + +## ----------------------------------------------------------------------------- +summary(mod1, include_betas = FALSE) + +## ----------------------------------------------------------------------------- +plot(mod1, type = 'trend') + +## ----------------------------------------------------------------------------- +plot(mod1, type = 'forecast') + +## ----------------------------------------------------------------------------- +# Extract estimates of the process error 'sigma' for each model +mod0_sigma <- as.data.frame(mod0, variable = 'sigma', regex = TRUE) %>% + dplyr::mutate(model = 'Mod0') +mod1_sigma <- as.data.frame(mod1, variable = 'sigma', regex = TRUE) %>% + 
dplyr::mutate(model = 'Mod1') +sigmas <- rbind(mod0_sigma, mod1_sigma) + +# Plot using ggplot2 +library(ggplot2) +ggplot(sigmas, aes(y = `sigma[1]`, fill = model)) + + geom_density(alpha = 0.3, colour = NA) + + coord_flip() + +## ----------------------------------------------------------------------------- +plot(mod1, type = 'smooth', trend_effects = TRUE) + +## ----------------------------------------------------------------------------- +plot_predictions(mod1, newdata = datagrid(CUI.apr = range_round, + time = unique), + by = c('time', 'CUI.apr', 'CUI.apr')) + +## ----------------------------------------------------------------------------- +loo_compare(mod0, mod1) + +## ----include=FALSE------------------------------------------------------------ +lfo_mod0 <- lfo_cv(mod0, min_t = 30) +lfo_mod1 <- lfo_cv(mod1, min_t = 30) + +## ----eval=FALSE--------------------------------------------------------------- +# lfo_mod0 <- lfo_cv(mod0, min_t = 30) +# lfo_mod1 <- lfo_cv(mod1, min_t = 30) + +## ----------------------------------------------------------------------------- +sum(lfo_mod0$elpds) +sum(lfo_mod1$elpds) + +## ----fig.alt = "Comparing forecast skill for dynamic beta regression models in mvgam and R"---- +plot(x = 1:length(lfo_mod0$elpds) + 30, + y = lfo_mod0$elpds - lfo_mod1$elpds, + ylab = 'ELPDmod0 - ELPDmod1', + xlab = 'Evaluation time point', + pch = 16, + col = 'darkred', + bty = 'l') +abline(h = 0, lty = 'dashed') + diff --git a/doc/time_varying_effects.Rmd b/doc/time_varying_effects.Rmd new file mode 100644 index 00000000..271a907d --- /dev/null +++ b/doc/time_varying_effects.Rmd @@ -0,0 +1,357 @@ +--- +title: "Time-varying effects in mvgam" +author: "Nicholas J Clark" +date: "`r Sys.Date()`" +output: + rmarkdown::html_vignette: + toc: yes +vignette: > + %\VignetteIndexEntry{Time-varying effects in mvgam} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- +```{r, echo = FALSE} +NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true") +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + purl = NOT_CRAN, + eval = NOT_CRAN +) +``` + +```{r setup, include=FALSE} +knitr::opts_chunk$set( + echo = TRUE, + dpi = 150, + fig.asp = 0.8, + fig.width = 6, + out.width = "60%", + fig.align = "center") +library(mvgam) +library(ggplot2) +theme_set(theme_bw(base_size = 12, base_family = 'serif')) +``` + +The purpose of this vignette is to show how the `mvgam` package can be used to estimate and forecast regression coefficients that vary through time. + +## Time-varying effects +Dynamic fixed-effect coefficients (often referred to as dynamic linear models) can be readily incorporated into GAMs / DGAMs. In `mvgam`, the `dynamic()` formula wrapper offers a convenient interface to set these up. The plan is to incorporate a range of dynamic options (such as random walk, AR1 etc...) but for the moment only low-rank Gaussian Process (GP) smooths are allowed (making use either of the `gp` basis in `mgcv` or of Hilbert space approximate GPs). These are advantageous over splines or random walk effects for several reasons. First, GPs will force the time-varying effect to be smooth. This often makes sense in reality, where we would not expect a regression coefficient to change rapidly from one time point to the next. Second, GPs provide information on the 'global' dynamics of a time-varying effect through their length-scale parameters.
This means we can use them to provide accurate forecasts of how an effect is expected to change in the future, something that we couldn't do well if we used splines to estimate the effect. An example below illustrates. + +### Simulating time-varying effects +Simulate a time-varying coefficient using a squared exponential Gaussian Process function with length scale $\rho$=10. We will do this using an internal function from `mvgam` (the `sim_gp` function): +```{r} +set.seed(1111) +N <- 200 +beta_temp <- mvgam:::sim_gp(rnorm(1), + alpha_gp = 0.75, + rho_gp = 10, + h = N) + 0.5 +``` + +A plot of the time-varying coefficient shows that it changes smoothly through time: +```{r, fig.alt = "Simulating time-varying effects in mvgam and R"} +plot(beta_temp, type = 'l', lwd = 3, + bty = 'l', xlab = 'Time', ylab = 'Coefficient', + col = 'darkred') +box(bty = 'l', lwd = 2) +``` + +Next we need to simulate the values of the covariate, which we will call `temp` (to represent $temperature$). In this case we just use a standard normal distribution to simulate this covariate: +```{r} +temp <- rnorm(N, sd = 1) +``` + +Finally, simulate the outcome variable, which is a Gaussian observation process (with observation error) over the time-varying effect of $temperature$ +```{r, fig.alt = "Simulating time-varying effects in mvgam and R"} +out <- rnorm(N, mean = 4 + beta_temp * temp, + sd = 0.25) +time <- seq_along(temp) +plot(out, type = 'l', lwd = 3, + bty = 'l', xlab = 'Time', ylab = 'Outcome', + col = 'darkred') +box(bty = 'l', lwd = 2) +``` + +Gather the data into a `data.frame` for fitting models, and split the data into training and testing folds. +```{r} +data <- data.frame(out, temp, time) +data_train <- data[1:190,] +data_test <- data[191:200,] +``` + +Plot the series +```{r} +plot_mvgam_series(data = data_train, newdata = data_test, y = 'out') +``` + + +### The `dynamic()` function +Time-varying coefficients can be fairly easily set up using the `s()` or `gp()` wrapper functions in `mvgam` formulae by fitting a nonlinear effect of `time` and using the covariate of interest as the numeric `by` variable (see `?mgcv::s` or `?brms::gp` for more details). The `dynamic()` formula wrapper offers a way to automate this process, and will eventually allow for a broader variety of time-varying effects (such as random walk or AR processes). Depending on the arguments that are specified to `dynamic`, it will either set up a low-rank GP smooth function using `s()` with `bs = 'gp'` and a fixed value of the length scale parameter $\rho$, or it will set up a Hilbert space approximate GP using the `gp()` function with `c=5/4` so that $\rho$ is estimated (see `?dynamic` for more details). In this first example we will use the `s()` option, and will mis-specify the $\rho$ parameter here as, in practice, it is never known. This call to `dynamic()` will set up the following smooth: `s(time, by = temp, bs = "gp", m = c(-2, 8, 2), k = 40)` +```{r, include=FALSE} +mod <- mvgam(out ~ dynamic(temp, rho = 8, stationary = TRUE, k = 40), + family = gaussian(), + data = data_train) +``` + +```{r, eval=FALSE} +mod <- mvgam(out ~ dynamic(temp, rho = 8, stationary = TRUE, k = 40), + family = gaussian(), + data = data_train) +``` + +Inspect the model summary, which shows how the `dynamic()` wrapper was used to construct a low-rank Gaussian Process smooth function: +```{r} +summary(mod, include_betas = FALSE) +``` + +Because this model used a spline with a `gp` basis, its smooths can be visualised just like any other `gam`.
Plot the estimated time-varying coefficient for the in-sample training period +```{r} +plot(mod, type = 'smooths') +``` + +We can also plot the estimates for the in-sample and out-of-sample periods to see how the Gaussian Process function produces sensible smooth forecasts. Here we supply the full dataset to the `newdata` argument in `plot_mvgam_smooth` to inspect posterior forecasts of the time-varying smooth function. Overlay the true simulated function to see that the model has adequately estimated its dynamics in both the training and testing data partitions +```{r} +plot_mvgam_smooth(mod, smooth = 1, newdata = data) +abline(v = 190, lty = 'dashed', lwd = 2) +lines(beta_temp, lwd = 2.5, col = 'white') +lines(beta_temp, lwd = 2) +``` + +We can also use `plot_predictions` from the `marginaleffects` package to visualise how the time-varying effect would be estimated at different values of $temperature$: +```{r} +range_round = function(x){ + round(range(x, na.rm = TRUE), 2) +} +plot_predictions(mod, + newdata = datagrid(time = unique, + temp = range_round), + by = c('time', 'temp', 'temp'), + type = 'link') +``` + +This results in sensible forecasts of the observations as well +```{r} +fc <- forecast(mod, newdata = data_test) +plot(fc) +``` + +The syntax is very similar if we wish to estimate the parameters of the underlying Gaussian Process, this time using a Hilbert space approximation. We simply omit the `rho` argument in `dynamic` to make this happen. This will set up a call similar to `gp(time, by = 'temp', c = 5/4, k = 40)`. +```{r include=FALSE} +mod <- mvgam(out ~ dynamic(temp, k = 40), + family = gaussian(), + data = data_train) +``` + +```{r eval=FALSE} +mod <- mvgam(out ~ dynamic(temp, k = 40), + family = gaussian(), + data = data_train) +``` + +This model summary now contains estimates for the marginal deviation and length scale parameters of the underlying Gaussian Process function: +```{r} +summary(mod, include_betas = FALSE) +``` + +Effects for `gp()` terms can also be plotted as smooths: +```{r} +plot_mvgam_smooth(mod, smooth = 1, newdata = data) +abline(v = 190, lty = 'dashed', lwd = 2) +lines(beta_temp, lwd = 2.5, col = 'white') +lines(beta_temp, lwd = 2) +``` + +Both the above plot and the below `plot_predictions()` call show that the effect in this case is similar to what we estimated in the approximate GP smooth model above: +```{r} +plot_predictions(mod, + newdata = datagrid(time = unique, + temp = range_round), + by = c('time', 'temp', 'temp'), + type = 'link') +``` + +Forecasts are also similar: +```{r} +fc <- forecast(mod, newdata = data_test) +plot(fc) +``` + +## Salmon survival example +Here we will use openly available data on marine survival of Chinook salmon to illustrate how time-varying effects can be used to improve ecological time series models. [Scheuerell and Williams (2005)](https://onlinelibrary.wiley.com/doi/abs/10.1111/j.1365-2419.2005.00346.x) used a dynamic linear model to examine the relationship between marine survival of Chinook salmon and an index of ocean upwelling strength along the west coast of the USA. The authors hypothesized that stronger upwelling in April should create better growing conditions for phytoplankton, which would then translate into more zooplankton and provide better foraging opportunities for juvenile salmon entering the ocean.
The data on survival is measured as a proportional variable over 42 years (1964–2005) and is available in the `MARSS` package: +```{r} +load(url('https://github.com/atsa-es/MARSS/raw/master/data/SalmonSurvCUI.rda')) +dplyr::glimpse(SalmonSurvCUI) +``` + +First we need to prepare the data for modelling. The variable `CUI.apr` will be standardized to make it easier for the sampler to estimate underlying GP parameters for the time-varying effect. We also need to convert the survival back to a proportion, as in its current form it has been logit-transformed (this is because most time series packages cannot handle proportional data). As usual, we also need to create a `time` indicator and a `series` indicator for working in `mvgam`: +```{r} +SalmonSurvCUI %>% + # create a time variable + dplyr::mutate(time = dplyr::row_number()) %>% + + # create a series variable + dplyr::mutate(series = as.factor('salmon')) %>% + + # z-score the covariate CUI.apr + dplyr::mutate(CUI.apr = as.vector(scale(CUI.apr))) %>% + + # convert logit-transformed survival back to proportional + dplyr::mutate(survival = plogis(logit.s)) -> model_data +``` + +Inspect the data +```{r} +dplyr::glimpse(model_data) +``` + +Plot features of the outcome variable, which shows that it is a proportional variable with particular restrictions that we want to model: +```{r} +plot_mvgam_series(data = model_data, y = 'survival') +``` + + +### A State-Space Beta regression +`mvgam` can easily handle data that are bounded at 0 and 1 with a Beta observation model (using the `mgcv` function `betar()`, see `?mgcv::betar` for details). First we will fit a simple State-Space model that uses a Random Walk dynamic process model with no predictors and a Beta observation model: +```{r include = FALSE} +mod0 <- mvgam(formula = survival ~ 1, + trend_model = 'RW', + family = betar(), + data = model_data) +``` + +```{r eval = FALSE} +mod0 <- mvgam(formula = survival ~ 1, + trend_model = 'RW', + family = betar(), + data = model_data) +``` + +The summary of this model shows good behaviour of the Hamiltonian Monte Carlo sampler and provides useful summaries on the Beta observation model parameters: +```{r} +summary(mod0) +``` + +A plot of the underlying dynamic component shows how it has easily handled the temporal evolution of the time series: +```{r} +plot(mod0, type = 'trend') +``` + +Posterior hindcasts are also good and will automatically respect the observational data bounding at 0 and 1: +```{r} +plot(mod0, type = 'forecast') +``` + + +### Including time-varying upwelling effects +Now we can increase the complexity of our model by constructing and fitting a State-Space model with a time-varying effect of the coastal upwelling index in addition to the autoregressive dynamics. We again use a Beta observation model to capture the restrictions of our proportional observations, but this time will include a `dynamic()` effect of `CUI.apr` in the latent process model. 
We do not specify the $\rho$ parameter, instead opting to estimate it using a Hilbert space approximate GP: +```{r include=FALSE} +mod1 <- mvgam(formula = survival ~ 1, + trend_formula = ~ dynamic(CUI.apr, k = 25, scale = FALSE), + trend_model = 'RW', + family = betar(), + data = model_data, + adapt_delta = 0.99) +``` + +```{r eval=FALSE} +mod1 <- mvgam(formula = survival ~ 1, + trend_formula = ~ dynamic(CUI.apr, k = 25, scale = FALSE), + trend_model = 'RW', + family = betar(), + data = model_data) +``` + +The summary for this model now includes estimates for the time-varying GP parameters: +```{r} +summary(mod1, include_betas = FALSE) +``` + +The estimates for the underlying dynamic process, and for the hindcasts, haven't changed much: +```{r} +plot(mod1, type = 'trend') +``` + +```{r} +plot(mod1, type = 'forecast') +``` + +But the process error parameter $\sigma$ is slightly smaller for this model than for the first model: +```{r} +# Extract estimates of the process error 'sigma' for each model +mod0_sigma <- as.data.frame(mod0, variable = 'sigma', regex = TRUE) %>% + dplyr::mutate(model = 'Mod0') +mod1_sigma <- as.data.frame(mod1, variable = 'sigma', regex = TRUE) %>% + dplyr::mutate(model = 'Mod1') +sigmas <- rbind(mod0_sigma, mod1_sigma) + +# Plot using ggplot2 +library(ggplot2) +ggplot(sigmas, aes(y = `sigma[1]`, fill = model)) + + geom_density(alpha = 0.3, colour = NA) + + coord_flip() +``` + +Why does the process error not need to be as flexible in the second model? Because the estimates of this dynamic process are now informed partly by the time-varying effect of upwelling, which we can visualise on the link scale using `plot()` with `trend_effects = TRUE`: +```{r} +plot(mod1, type = 'smooth', trend_effects = TRUE) +``` + +Or on the outcome scale, at a range of possible `CUI.apr` values, using `plot_predictions()`: +```{r} +plot_predictions(mod1, newdata = datagrid(CUI.apr = range_round, + time = unique), + by = c('time', 'CUI.apr', 'CUI.apr')) +``` + + +### Comparing model predictive performances +A key question when fitting multiple time series models is whether one of them provides better predictions than the other. There are several options in `mvgam` for exploring this quantitatively. First, we can compare models based on in-sample approximate leave-one-out cross-validation as implemented in the popular `loo` package: +```{r} +loo_compare(mod0, mod1) +``` + +The second model has the larger Expected Log Predictive Density (ELPD), meaning that it is slightly favoured over the simpler model that did not include the time-varying upwelling effect. However, the two models certainly do not differ by much. But this metric only compares in-sample performance, and we are hoping to use our models to produce reasonable forecasts. Luckily, `mvgam` also has routines for comparing models using approximate leave-future-out cross-validation. Here we refit both models to a reduced training set (starting at time point 30) and produce approximate 1-step ahead forecasts. These forecasts are used to estimate forecast ELPD before expanding the training set one time point at a time. We use Pareto-smoothed importance sampling to reweight posterior predictions, acting as a kind of particle filter so that we don't need to refit the model too often (you can read more about how this process works in Bürkner et al. 2020). 
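To make the quantity being approximated explicit (an editor's note giving the standard definition from that paper, not mvgam-specific notation), the exact leave-future-out ELPD for 1-step-ahead prediction is

$$\text{ELPD}_{\text{LFO}} = \sum_{t = t_{min}}^{T - 1} \log p\left(y_{t+1} \mid y_{1:t}\right),$$

where each term would naively require refitting the model to $y_{1:t}$. PSIS instead reweights draws from an earlier fit, and the model is only refit when the Pareto-$k$ diagnostic of those importance weights exceeds a threshold.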
+```{r include=FALSE} +lfo_mod0 <- lfo_cv(mod0, min_t = 30) +lfo_mod1 <- lfo_cv(mod1, min_t = 30) +``` + +```{r eval=FALSE} +lfo_mod0 <- lfo_cv(mod0, min_t = 30) +lfo_mod1 <- lfo_cv(mod1, min_t = 30) +``` + +The model with the time-varying upwelling effect tends to provide better 1-step ahead forecasts, with a higher total forecast ELPD +```{r} +sum(lfo_mod0$elpds) +sum(lfo_mod1$elpds) +``` + +We can also plot the ELPDs for each model as a contrast. Here, values less than zero suggest the time-varying predictor model (Mod1) gives better 1-step ahead forecasts: +```{r, fig.alt = "Comparing forecast skill for dynamic beta regression models in mvgam and R"} +plot(x = 1:length(lfo_mod0$elpds) + 30, + y = lfo_mod0$elpds - lfo_mod1$elpds, + ylab = 'ELPDmod0 - ELPDmod1', + xlab = 'Evaluation time point', + pch = 16, + col = 'darkred', + bty = 'l') +abline(h = 0, lty = 'dashed') +``` + +A useful exercise to further expand this model would be to think about what kinds of predictors might impact measurement error, which could easily be implemented into the observation formula in `mvgam`. But for now, we will leave the model as-is. + +## Further reading +The following papers and resources offer a lot of useful material about dynamic linear models and how they can be applied / evaluated in practice: + +Bürkner, PC, Gabry, J and Vehtari, A [Approximate leave-future-out cross-validation for Bayesian time series models](https://www.tandfonline.com/doi/full/10.1080/00949655.2020.1783262). *Journal of Statistical Computation and Simulation*. 90:14 (2020) 2499-2523. + +Herrero, Asier, et al. [From the individual to the landscape and back: time‐varying effects of climate and herbivory on tree sapling growth at distribution limits](https://besjournals.onlinelibrary.wiley.com/doi/full/10.1111/1365-2745.12527). *Journal of Ecology* 104.2 (2016): 430-442. + +Holmes, Elizabeth E., Eric J. Ward, and Kellie Wills. "[MARSS: multivariate autoregressive state-space models for analyzing time-series data.](https://journal.r-project.org/archive/2012/RJ-2012-002/index.html)" *R Journal*. 4.1 (2012): 11. + +Scheuerell, Mark D., and John G. Williams. [Forecasting climate induced changes in the survival of Snake River Spring/Summer Chinook Salmon (*Oncorhynchus Tshawytscha*)](https://onlinelibrary.wiley.com/doi/10.1111/j.1365-2419.2005.00346.x). *Fisheries Oceanography* 14 (2005): 448–57. + +## Interested in contributing? +I'm actively seeking PhD students and other researchers to work in the areas of ecological forecasting, multivariate model evaluation and development of `mvgam`. Please reach out if you are interested (n.clark'at'uq.edu.au) diff --git a/doc/time_varying_effects.html b/doc/time_varying_effects.html new file mode 100644 index 00000000..8d85f76b --- /dev/null +++ b/doc/time_varying_effects.html @@ -0,0 +1,980 @@ + + + + + + + + + + + + + + + + +Time-varying effects in mvgam + + + + + + + + + + + + + + + + + + + + + + + + + + +

Time-varying effects in mvgam

+

Nicholas J Clark

+

2024-04-18

+ + +
+ +
+ +

The purpose of this vignette is to show how the mvgam +package can be used to estimate and forecast regression coefficients +that vary through time.

+
+

Time-varying effects

+

Dynamic fixed-effect coefficients (often referred to as dynamic +linear models) can be readily incorporated into GAMs / DGAMs. In +mvgam, the dynamic() formula wrapper offers a +convenient interface to set these up. The plan is to incorporate a range +of dynamic options (such as random walk, AR1 etc…) but for the moment +only low-rank Gaussian Process (GP) smooths are allowed (making use +either of the gp basis in mgcv or of Hilbert +space approximate GPs). These are advantageous over splines or random +walk effects for several reasons. First, GPs will force the time-varying +effect to be smooth. This often makes sense in reality, where we would +not expect a regression coefficient to change rapidly from one time +point to the next. Second, GPs provide information on the ‘global’ +dynamics of a time-varying effect through their length-scale parameters. +This means we can use them to provide accurate forecasts of how an +effect is expected to change in the future, something that we couldn’t +do well if we used splines to estimate the effect. An example below +illustrates.
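As a concrete sketch of the model class being described (an editor's addition, using standard dynamic linear model notation rather than anything mvgam-specific), a single time-varying effect can be written as

$$y_t = \alpha + \beta_t x_t + \varepsilon_t, \qquad \beta_t \sim \mathcal{GP}\left(0, \; \alpha_{gp}^2 \exp\left(-\frac{(t - t')^2}{2\rho^2}\right)\right),$$

where the marginal deviation $\alpha_{gp}$ controls how far the coefficient can stray from zero and the length scale $\rho$ controls how quickly it can change through time.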

+
+

Simulating time-varying effects

+

Simulate a time-varying coefficient using a squared exponential +Gaussian Process function with length scale \(\rho\)=10. We will do this using an +internal function from mvgam (the sim_gp +function):

+
set.seed(1111)
+N <- 200
+beta_temp <- mvgam:::sim_gp(rnorm(1),
+                            alpha_gp = 0.75,
+                            rho_gp = 10,
+                            h = N) + 0.5
+

A plot of the time-varying coefficient shows that it changes smoothly +through time:

+
plot(beta_temp, type = 'l', lwd = 3, 
+     bty = 'l', xlab = 'Time', ylab = 'Coefficient',
+     col = 'darkred')
+box(bty = 'l', lwd = 2)
+

Simulating time-varying effects in mvgam and R

+

Next we need to simulate the values of the covariate, which we will +call temp (to represent \(temperature\)). In this case we just use a +standard normal distribution to simulate this covariate:

+
temp <- rnorm(N, sd = 1)
+

Finally, simulate the outcome variable, which is a Gaussian +observation process (with observation error) over the time-varying +effect of \(temperature\)

+
out <- rnorm(N, mean = 4 + beta_temp * temp,
+             sd = 0.25)
+time <- seq_along(temp)
+plot(out,  type = 'l', lwd = 3, 
+     bty = 'l', xlab = 'Time', ylab = 'Outcome',
+     col = 'darkred')
+box(bty = 'l', lwd = 2)
+

Simulating time-varying effects in mvgam and R

+

Gather the data into a data.frame for fitting models, +and split the data into training and testing folds.

+
data <- data.frame(out, temp, time)
+data_train <- data[1:190,]
+data_test <- data[191:200,]
+

Plot the series

+
plot_mvgam_series(data = data_train, newdata = data_test, y = 'out')
+

+
+
+

The dynamic() function

+

Time-varying coefficients can be fairly easily set up using the +s() or gp() wrapper functions in +mvgam formulae by fitting a nonlinear effect of +time and using the covariate of interest as the numeric +by variable (see ?mgcv::s or +?brms::gp for more details). The dynamic() +formula wrapper offers a way to automate this process, and will +eventually allow for a broader variety of time-varying effects (such as +random walk or AR processes). Depending on the arguments that are +specified to dynamic, it will either set up a low-rank GP +smooth function using s() with bs = 'gp' and a +fixed value of the length scale parameter \(\rho\), or it will set up a Hilbert space +approximate GP using the gp() function with +c=5/4 so that \(\rho\) is +estimated (see ?dynamic for more details). In this first +example we will use the s() option, and will mis-specify +the \(\rho\) parameter here as, in +practice, it is never known. This call to dynamic() will +set up the following smooth: +s(time, by = temp, bs = "gp", m = c(-2, 8, 2), k = 40)
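Since the smooth printed above is standard `mgcv` syntax, one quick way to build intuition (a hedged sketch, not part of the original vignette, and only a point-estimate approximation of the fully Bayesian model) is to pass that exact smooth to `mgcv::gam()` directly:

```r
library(mgcv)
# Illustrative only: fit the same low-rank GP smooth that dynamic()
# constructs, using mgcv's penalized likelihood machinery instead of Stan
mod_check <- gam(out ~ s(time, by = temp, bs = 'gp',
                         m = c(-2, 8, 2), k = 40),
                 data = data_train,
                 family = gaussian())
summary(mod_check)
```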

+
mod <- mvgam(out ~ dynamic(temp, rho = 8, stationary = TRUE, k = 40),
+             family = gaussian(),
+             data = data_train)
+

Inspect the model summary, which shows how the dynamic() +wrapper was used to construct a low-rank Gaussian Process smooth +function:

+
summary(mod, include_betas = FALSE)
+#> GAM formula:
+#> out ~ s(time, by = temp, bs = "gp", m = c(-2, 8, 2), k = 40)
+#> <environment: 0x000001d9c44e71e0>
+#> 
+#> Family:
+#> gaussian
+#> 
+#> Link function:
+#> identity
+#> 
+#> Trend model:
+#> None
+#> 
+#> N series:
+#> 1 
+#> 
+#> N timepoints:
+#> 190 
+#> 
+#> Status:
+#> Fitted using Stan 
+#> 4 chains, each with iter = 1000; warmup = 500; thin = 1 
+#> Total post-warmup draws = 2000
+#> 
+#> 
+#> Observation error parameter estimates:
+#>              2.5%  50% 97.5% Rhat n_eff
+#> sigma_obs[1] 0.23 0.25  0.28    1  2222
+#> 
+#> GAM coefficient (beta) estimates:
+#>             2.5% 50% 97.5% Rhat n_eff
+#> (Intercept)    4   4   4.1    1  2893
+#> 
+#> Approximate significance of GAM smooths:
+#>              edf Ref.df    F p-value    
+#> s(time):temp  14     40 72.4  <2e-16 ***
+#> ---
+#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+#> 
+#> Stan MCMC diagnostics:
+#> n_eff / iter looks reasonable for all parameters
+#> Rhat looks reasonable for all parameters
+#> 0 of 2000 iterations ended with a divergence (0%)
+#> 0 of 2000 iterations saturated the maximum tree depth of 12 (0%)
+#> E-FMI indicated no pathological behavior
+#> 
+#> Samples were drawn using NUTS(diag_e) at Thu Apr 18 8:39:49 PM 2024.
+#> For each parameter, n_eff is a crude measure of effective sample size,
+#> and Rhat is the potential scale reduction factor on split MCMC chains
+#> (at convergence, Rhat = 1)
+

Because this model used a spline with a gp basis, its +smooths can be visualised just like any other gam. Plot the +estimated time-varying coefficient for the in-sample training period

+
plot(mod, type = 'smooths')
+

+

We can also plot the estimates for the in-sample and out-of-sample +periods to see how the Gaussian Process function produces sensible +smooth forecasts. Here we supply the full dataset to the +newdata argument in plot_mvgam_smooth to +inspect posterior forecasts of the time-varying smooth function. Overlay +the true simulated function to see that the model has adequately +estimated its dynamics in both the training and testing data +partitions

+
plot_mvgam_smooth(mod, smooth = 1, newdata = data)
+abline(v = 190, lty = 'dashed', lwd = 2)
+lines(beta_temp, lwd = 2.5, col = 'white')
+lines(beta_temp, lwd = 2)
+

+

We can also use plot_predictions from the +marginaleffects package to visualise how the +time-varying effect would be estimated at different +values of \(temperature\):

+
range_round = function(x){
+  round(range(x, na.rm = TRUE), 2)
+}
+plot_predictions(mod, 
+                 newdata = datagrid(time = unique,
+                                    temp = range_round),
+                 by = c('time', 'temp', 'temp'),
+                 type = 'link')
+

+

This results in sensible forecasts of the observations as well

+
fc <- forecast(mod, newdata = data_test)
+plot(fc)
+

+
#> Out of sample CRPS:
+#> [1] 1.280347
+

The syntax is very similar if we wish to estimate the parameters of +the underlying Gaussian Process, this time using a Hilbert space +approximation. We simply omit the rho argument in +dynamic to make this happen. This will set up a call +similar to gp(time, by = 'temp', c = 5/4, k = 40).
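For readers curious about what the Hilbert space approximation is actually doing, a brief sketch (an editor's addition following the general approach of Riutort-Mayol et al.; the exact implementation details in mvgam may differ): the GP function of time is replaced by a weighted sum of $k$ basis functions,

$$f(t) \approx \sum_{j=1}^{k} \sqrt{S\left(\sqrt{\lambda_j}\right)}\,\phi_j(t)\,\beta_j, \qquad \beta_j \sim \text{Normal}(0, 1),$$

where $\phi_j$ and $\lambda_j$ are eigenfunctions and eigenvalues of the Laplacian on a boundary-extended interval $[-L, L]$, $S(\cdot)$ is the spectral density of the squared exponential kernel (a function of the $\alpha$ and $\rho$ parameters reported in the summary below), and the boundary factor c = 5/4 sets $L$ as a multiple of the range of the scaled time covariate.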

+
mod <- mvgam(out ~ dynamic(temp, k = 40),
+             family = gaussian(),
+             data = data_train)
+

This model summary now contains estimates for the marginal deviation +and length scale parameters of the underlying Gaussian Process +function:

+
summary(mod, include_betas = FALSE)
+#> GAM formula:
+#> out ~ gp(time, by = temp, c = 5/4, k = 40, scale = TRUE)
+#> <environment: 0x000001d9c44e71e0>
+#> 
+#> Family:
+#> gaussian
+#> 
+#> Link function:
+#> identity
+#> 
+#> Trend model:
+#> None
+#> 
+#> N series:
+#> 1 
+#> 
+#> N timepoints:
+#> 190 
+#> 
+#> Status:
+#> Fitted using Stan 
+#> 4 chains, each with iter = 1000; warmup = 500; thin = 1 
+#> Total post-warmup draws = 2000
+#> 
+#> 
+#> Observation error parameter estimates:
+#>              2.5%  50% 97.5% Rhat n_eff
+#> sigma_obs[1] 0.24 0.26  0.29    1  2151
+#> 
+#> GAM coefficient (beta) estimates:
+#>             2.5% 50% 97.5% Rhat n_eff
+#> (Intercept)    4   4   4.1    1  2989
+#> 
+#> GAM gp term marginal deviation (alpha) and length scale (rho) estimates:
+#>                      2.5%   50% 97.5% Rhat n_eff
+#> alpha_gp(time):temp 0.640 0.890 1.400 1.01   745
+#> rho_gp(time):temp   0.028 0.053 0.069 1.00   888
+#> 
+#> Stan MCMC diagnostics:
+#> n_eff / iter looks reasonable for all parameters
+#> Rhat looks reasonable for all parameters
+#> 1 of 2000 iterations ended with a divergence (0.05%)
+#>  *Try running with larger adapt_delta to remove the divergences
+#> 0 of 2000 iterations saturated the maximum tree depth of 12 (0%)
+#> E-FMI indicated no pathological behavior
+#> 
+#> Samples were drawn using NUTS(diag_e) at Thu Apr 18 8:41:07 PM 2024.
+#> For each parameter, n_eff is a crude measure of effective sample size,
+#> and Rhat is the potential scale reduction factor on split MCMC chains
+#> (at convergence, Rhat = 1)
+

Effects for gp() terms can also be plotted as +smooths:

+
plot_mvgam_smooth(mod, smooth = 1, newdata = data)
+abline(v = 190, lty = 'dashed', lwd = 2)
+lines(beta_temp, lwd = 2.5, col = 'white')
+lines(beta_temp, lwd = 2)
+

+

Both the above plot and the below plot_predictions() +call show that the effect in this case is similar to what we estimated +in the approximate GP smooth model above:

+
plot_predictions(mod, 
+                 newdata = datagrid(time = unique,
+                                    temp = range_round),
+                 by = c('time', 'temp', 'temp'),
+                 type = 'link')
+

+

Forecasts are also similar:

+
fc <- forecast(mod, newdata = data_test)
+plot(fc)
+

+
#> Out of sample CRPS:
+#> [1] 1.667521
+
+
+
+

Salmon survival example

+

Here we will use openly available data on marine survival of Chinook +salmon to illustrate how time-varying effects can be used to improve +ecological time series models. Scheuerell +and Williams (2005) used a dynamic linear model to examine the +relationship between marine survival of Chinook salmon and an index of +ocean upwelling strength along the west coast of the USA. The authors +hypothesized that stronger upwelling in April should create better +growing conditions for phytoplankton, which would then translate into +more zooplankton and provide better foraging opportunities for juvenile +salmon entering the ocean. The data on survival is measured as a +proportional variable over 42 years (1964–2005) and is available in the +MARSS package:

+
load(url('https://github.com/atsa-es/MARSS/raw/master/data/SalmonSurvCUI.rda'))
+dplyr::glimpse(SalmonSurvCUI)
+#> Rows: 42
+#> Columns: 3
+#> $ year    <int> 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 19…
+#> $ logit.s <dbl> -3.46, -3.32, -3.58, -3.03, -3.61, -3.35, -3.93, -4.19, -4.82,…
+#> $ CUI.apr <int> 57, 5, 43, 11, 47, -21, 25, -2, -1, 43, 2, 35, 0, 1, -1, 6, -7…
+

First we need to prepare the data for modelling. The variable +CUI.apr will be standardized to make it easier for the +sampler to estimate underlying GP parameters for the time-varying +effect. We also need to convert the survival back to a proportion, as in +its current form it has been logit-transformed (this is because most +time series packages cannot handle proportional data). As usual, we also +need to create a time indicator and a series +indicator for working in mvgam:

+
SalmonSurvCUI %>%
+  # create a time variable
+  dplyr::mutate(time = dplyr::row_number()) %>%
+
+  # create a series variable
+  dplyr::mutate(series = as.factor('salmon')) %>%
+
+  # z-score the covariate CUI.apr
+  dplyr::mutate(CUI.apr = as.vector(scale(CUI.apr))) %>%
+
+  # convert logit-transformed survival back to proportional
+  dplyr::mutate(survival = plogis(logit.s)) -> model_data
+

Inspect the data

+
dplyr::glimpse(model_data)
+#> Rows: 42
+#> Columns: 6
+#> $ year     <int> 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1…
+#> $ logit.s  <dbl> -3.46, -3.32, -3.58, -3.03, -3.61, -3.35, -3.93, -4.19, -4.82…
+#> $ CUI.apr  <dbl> 2.37949804, 0.03330223, 1.74782994, 0.30401713, 1.92830654, -…
+#> $ time     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
+#> $ series   <fct> salmon, salmon, salmon, salmon, salmon, salmon, salmon, salmo…
+#> $ survival <dbl> 0.030472033, 0.034891409, 0.027119717, 0.046088827, 0.0263393…
+

Plot features of the outcome variable, which shows that it is a +proportional variable with particular restrictions that we want to +model:

+
plot_mvgam_series(data = model_data, y = 'survival')
+

+
+

A State-Space Beta regression

+

mvgam can easily handle data that are bounded at 0 and 1 +with a Beta observation model (using the mgcv function +betar(), see ?mgcv::betar for details). First +we will fit a simple State-Space model that uses a Random Walk dynamic +process model with no predictors and a Beta observation model:

+
mod0 <- mvgam(formula = survival ~ 1,
+             trend_model = 'RW',
+             family = betar(),
+             data = model_data)
+

The summary of this model shows good behaviour of the Hamiltonian +Monte Carlo sampler and provides useful summaries on the Beta +observation model parameters:

+
summary(mod0)
+#> GAM formula:
+#> survival ~ 1
+#> <environment: 0x000001d9c44e71e0>
+#> 
+#> Family:
+#> beta
+#> 
+#> Link function:
+#> logit
+#> 
+#> Trend model:
+#> RW
+#> 
+#> N series:
+#> 1 
+#> 
+#> N timepoints:
+#> 42 
+#> 
+#> Status:
+#> Fitted using Stan 
+#> 4 chains, each with iter = 1000; warmup = 500; thin = 1 
+#> Total post-warmup draws = 2000
+#> 
+#> 
+#> Observation precision parameter estimates:
+#>        2.5% 50% 97.5% Rhat n_eff
+#> phi[1]  160 310   580 1.01   612
+#> 
+#> GAM coefficient (beta) estimates:
+#>             2.5%  50% 97.5% Rhat n_eff
+#> (Intercept) -4.2 -3.4  -2.4 1.02   125
+#> 
+#> Latent trend variance estimates:
+#>          2.5%  50% 97.5% Rhat n_eff
+#> sigma[1] 0.18 0.33  0.55 1.02   276
+#> 
+#> Stan MCMC diagnostics:
+#> n_eff / iter looks reasonable for all parameters
+#> Rhat looks reasonable for all parameters
+#> 0 of 2000 iterations ended with a divergence (0%)
+#> 0 of 2000 iterations saturated the maximum tree depth of 12 (0%)
+#> E-FMI indicated no pathological behavior
+#> 
+#> Samples were drawn using NUTS(diag_e) at Thu Apr 18 8:42:35 PM 2024.
+#> For each parameter, n_eff is a crude measure of effective sample size,
+#> and Rhat is the potential scale reduction factor on split MCMC chains
+#> (at convergence, Rhat = 1)
+

A plot of the underlying dynamic component shows how it has easily +handled the temporal evolution of the time series:

+
plot(mod0, type = 'trend')
+

+

Posterior hindcasts are also good and will automatically respect the +observational data bounding at 0 and 1:

+
plot(mod0, type = 'forecast')
+

+
+
+

Including time-varying upwelling effects

+

Now we can increase the complexity of our model by constructing and +fitting a State-Space model with a time-varying effect of the coastal +upwelling index in addition to the autoregressive dynamics. We again use +a Beta observation model to capture the restrictions of our proportional +observations, but this time will include a dynamic() effect +of CUI.apr in the latent process model. We do not specify +the \(\rho\) parameter, instead opting +to estimate it using a Hilbert space approximate GP:

+
mod1 <- mvgam(formula = survival ~ 1,
+              trend_formula = ~ dynamic(CUI.apr, k = 25, scale = FALSE),
+              trend_model = 'RW',
+              family = betar(),
+              data = model_data)
+

The summary for this model now includes estimates for the +time-varying GP parameters:

+
summary(mod1, include_betas = FALSE)
+#> GAM observation formula:
+#> survival ~ 1
+#> <environment: 0x000001d9c44e71e0>
+#> 
+#> GAM process formula:
+#> ~dynamic(CUI.apr, k = 25, scale = FALSE)
+#> <environment: 0x000001d9c44e71e0>
+#> 
+#> Family:
+#> beta
+#> 
+#> Link function:
+#> logit
+#> 
+#> Trend model:
+#> RW
+#> 
+#> N process models:
+#> 1 
+#> 
+#> N series:
+#> 1 
+#> 
+#> N timepoints:
+#> 42 
+#> 
+#> Status:
+#> Fitted using Stan 
+#> 4 chains, each with iter = 1000; warmup = 500; thin = 1 
+#> Total post-warmup draws = 2000
+#> 
+#> 
+#> Observation precision parameter estimates:
+#>        2.5% 50% 97.5% Rhat n_eff
+#> phi[1]  190 360   670    1   858
+#> 
+#> GAM observation model coefficient (beta) estimates:
+#>             2.5%  50% 97.5% Rhat n_eff
+#> (Intercept) -4.1 -3.2  -2.2 1.07    64
+#> 
+#> Process error parameter estimates:
+#>          2.5%  50% 97.5% Rhat n_eff
+#> sigma[1] 0.18 0.31  0.51 1.02   274
+#> 
+#> GAM process model gp term marginal deviation (alpha) and length scale (rho) estimates:
+#>                                2.5%  50% 97.5% Rhat n_eff
+#> alpha_gp_time_byCUI_apr_trend 0.028 0.32   1.5 1.02   205
+#> rho_gp_time_byCUI_apr_trend   1.400 6.50  40.0 1.02   236
+#> 
+#> Stan MCMC diagnostics:
+#> n_eff / iter looks reasonable for all parameters
+#> Rhats above 1.05 found for 30 parameters
+#>  *Diagnose further to investigate why the chains have not mixed
+#> 89 of 2000 iterations ended with a divergence (4.45%)
+#>  *Try running with larger adapt_delta to remove the divergences
+#> 0 of 2000 iterations saturated the maximum tree depth of 12 (0%)
+#> E-FMI indicated no pathological behavior
+#> 
+#> Samples were drawn using NUTS(diag_e) at Thu Apr 18 8:44:05 PM 2024.
+#> For each parameter, n_eff is a crude measure of effective sample size,
+#> and Rhat is the potential scale reduction factor on split MCMC chains
+#> (at convergence, Rhat = 1)
+

The estimates for the underlying dynamic process, and for the +hindcasts, haven’t changed much:

+
plot(mod1, type = 'trend')
+

+
plot(mod1, type = 'forecast')
+

+

But the process error parameter \(\sigma\) is slightly smaller for this model +than for the first model:

+
# Extract estimates of the process error 'sigma' for each model
+mod0_sigma <- as.data.frame(mod0, variable = 'sigma', regex = TRUE) %>%
+  dplyr::mutate(model = 'Mod0')
+mod1_sigma <- as.data.frame(mod1, variable = 'sigma', regex = TRUE) %>%
+  dplyr::mutate(model = 'Mod1')
+sigmas <- rbind(mod0_sigma, mod1_sigma)
+
+# Plot using ggplot2
+library(ggplot2)
+ggplot(sigmas, aes(y = `sigma[1]`, fill = model)) +
+  geom_density(alpha = 0.3, colour = NA) +
+  coord_flip()
+

+

Why does the process error not need to be as flexible in the second +model? Because the estimates of this dynamic process are now informed +partly by the time-varying effect of upwelling, which we can visualise +on the link scale using plot() with +trend_effects = TRUE:

+
plot(mod1, type = 'smooth', trend_effects = TRUE)
+

+

Or on the outcome scale, at a range of possible CUI.apr +values, using plot_predictions():

+
plot_predictions(mod1, newdata = datagrid(CUI.apr = range_round,
+                                          time = unique),
+                 by = c('time', 'CUI.apr', 'CUI.apr'))
+

+
+
+

Comparing model predictive performances

+

A key question when fitting multiple time series models is whether +one of them provides better predictions than the other. There are +several options in mvgam for exploring this quantitatively. +First, we can compare models based on in-sample approximate +leave-one-out cross-validation as implemented in the popular +loo package:

+
loo_compare(mod0, mod1)
+#> Warning: Some Pareto k diagnostic values are too high. See help('pareto-k-diagnostic') for details.
+
+#> Warning: Some Pareto k diagnostic values are too high. See help('pareto-k-diagnostic') for details.
+#>      elpd_diff se_diff
+#> mod1  0.0       0.0   
+#> mod0 -2.3       1.6
+

The second model has the larger Expected Log Predictive Density +(ELPD), meaning that it is slightly favoured over the simpler model that +did not include the time-varying upwelling effect. However, the two +models certainly do not differ by much. But this metric only compares +in-sample performance, and we are hoping to use our models to produce +reasonable forecasts. Luckily, mvgam also has routines for +comparing models using approximate leave-future-out cross-validation. +Here we refit both models to a reduced training set (starting at time +point 30) and produce approximate 1-step ahead forecasts. These +forecasts are used to estimate forecast ELPD before expanding the +training set one time point at a time. We use Pareto-smoothed importance +sampling to reweight posterior predictions, acting as a kind of particle +filter so that we don’t need to refit the model too often (you can read +more about how this process works in Bürkner et al. 2020).

+
lfo_mod0 <- lfo_cv(mod0, min_t = 30)
+lfo_mod1 <- lfo_cv(mod1, min_t = 30)
+

The model with the time-varying upwelling effect tends to provide +better 1-step ahead forecasts, with a higher total forecast ELPD

+
sum(lfo_mod0$elpds)
+#> [1] 35.11835
+sum(lfo_mod1$elpds)
+#> [1] 36.77461
+

We can also plot the ELPDs for each model as a contrast. Here, values +less than zero suggest the time-varying predictor model (Mod1) gives +better 1-step ahead forecasts:

+
plot(x = 1:length(lfo_mod0$elpds) + 30,
+     y = lfo_mod0$elpds - lfo_mod1$elpds,
+     ylab = 'ELPDmod0 - ELPDmod1',
+     xlab = 'Evaluation time point',
+     pch = 16,
+     col = 'darkred',
+     bty = 'l')
+abline(h = 0, lty = 'dashed')
+

Comparing forecast skill for dynamic beta regression models in mvgam and R

+

A useful exercise to further expand this model would be to think +about what kinds of predictors might impact measurement error, which +could easily be implemented into the observation formula in +mvgam. But for now, we will leave the model as-is.
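To make that concrete with a hedged sketch (the covariate here is invented purely for illustration and is not in these data), such a predictor would simply enter the observation formula while the dynamic upwelling effect stays in the process formula:

```r
# 'sampling_effort' is a hypothetical covariate assumed to affect how
# accurately survival is measured, not the latent survival process itself
mod2 <- mvgam(formula = survival ~ s(sampling_effort, k = 5),
              trend_formula = ~ dynamic(CUI.apr, k = 25, scale = FALSE),
              trend_model = 'RW',
              family = betar(),
              data = model_data)
```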

+
+
+
+

Further reading

+

The following papers and resources offer a lot of useful material +about dynamic linear models and how they can be applied / evaluated in +practice:

+

Bürkner, PC, Gabry, J and Vehtari, A Approximate +leave-future-out cross-validation for Bayesian time series models. +Journal of Statistical Computation and Simulation. 90:14 (2020) +2499-2523.

+

Herrero, Asier, et al. From +the individual to the landscape and back: time‐varying effects of +climate and herbivory on tree sapling growth at distribution limits. +Journal of Ecology 104.2 (2016): 430-442.

+

Holmes, Elizabeth E., Eric J. Ward, and Kellie Wills. “MARSS: +multivariate autoregressive state-space models for analyzing time-series +data.” R Journal. 4.1 (2012): 11.

+

Scheuerell, Mark D., and John G. Williams. Forecasting +climate induced changes in the survival of Snake River Spring/Summer +Chinook Salmon (Oncorhynchus Tshawytscha). Fisheries +Oceanography 14 (2005): 448–57.

+
+
+

Interested in contributing?

+

I’m actively seeking PhD students and other researchers to work in +the areas of ecological forecasting, multivariate model evaluation and +development of mvgam. Please reach out if you are +interested (n.clark’at’uq.edu.au)

+
+ + + + + + + + + + + diff --git a/doc/trend_formulas.R new file mode 100644 index 00000000..4739e2e9 --- /dev/null +++ b/doc/trend_formulas.R @@ -0,0 +1,403 @@ +## ----echo = FALSE------------------------------------------------------------- +NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true") +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + purl = NOT_CRAN, + eval = NOT_CRAN +) + +## ----setup, include=FALSE----------------------------------------------------- +knitr::opts_chunk$set( + echo = TRUE, + dpi = 150, + fig.asp = 0.8, + fig.width = 6, + out.width = "60%", + fig.align = "center") +library(mvgam) +library(ggplot2) +theme_set(theme_bw(base_size = 12, base_family = 'serif')) + +## ----------------------------------------------------------------------------- +load(url('https://github.com/atsa-es/MARSS/raw/master/data/lakeWAplankton.rda')) + +## ----------------------------------------------------------------------------- +outcomes <- c('Greens', 'Bluegreens', 'Diatoms', 'Unicells', 'Other.algae') + +## ----------------------------------------------------------------------------- +# loop across each plankton group to create the long dataframe +plankton_data <- do.call(rbind, lapply(outcomes, function(x){ + + # create a group-specific dataframe with counts labelled 'y' + # and the group name in the 'series' variable + data.frame(year = lakeWAplanktonTrans[, 'Year'], + month = lakeWAplanktonTrans[, 'Month'], + y = lakeWAplanktonTrans[, x], + series = x, + temp = lakeWAplanktonTrans[, 'Temp'])})) %>% + + # change the 'series' label to a factor + dplyr::mutate(series = factor(series)) %>% + + # filter to only include some years in the data + dplyr::filter(year >= 1965 & year < 1975) %>% + dplyr::arrange(year, month) %>% + dplyr::group_by(series) %>% + + # z-score the counts so they are approximately standard normal + dplyr::mutate(y = as.vector(scale(y))) %>% + + # add the time indicator + dplyr::mutate(time = dplyr::row_number()) %>% + dplyr::ungroup() + +## ----------------------------------------------------------------------------- +head(plankton_data) + +## ----------------------------------------------------------------------------- +dplyr::glimpse(plankton_data) + +## ----------------------------------------------------------------------------- +plot_mvgam_series(data = plankton_data, series = 'all') + +## ----------------------------------------------------------------------------- +image(is.na(t(plankton_data)), axes = F, + col = c('grey80', 'darkred')) +axis(3, at = seq(0,1, len = NCOL(plankton_data)), + labels = colnames(plankton_data)) + +## ----------------------------------------------------------------------------- +plankton_data %>% + dplyr::filter(series == 'Other.algae') %>% + ggplot(aes(x = time, y = temp)) + + geom_line(size = 1.1) + + geom_line(aes(y = y), col = 'white', + size = 1.3) + + geom_line(aes(y = y), col = 'darkred', + size = 1.1) + + ylab('z-score') + + xlab('Time') + + ggtitle('Temperature (black) vs Other algae (red)') + +## ----------------------------------------------------------------------------- +plankton_data %>% + dplyr::filter(series == 'Diatoms') %>% + ggplot(aes(x = time, y = temp)) + + geom_line(size = 1.1) + + geom_line(aes(y = y), col = 'white', + size = 1.3) + + geom_line(aes(y = y), col = 'darkred', + size = 1.1) + + ylab('z-score') + + xlab('Time') + + ggtitle('Temperature (black) vs Diatoms (red)') + +## -----------------------------------------------------------------------------
+plankton_data %>% + dplyr::filter(series == 'Greens') %>% + ggplot(aes(x = time, y = temp)) + + geom_line(size = 1.1) + + geom_line(aes(y = y), col = 'white', + size = 1.3) + + geom_line(aes(y = y), col = 'darkred', + size = 1.1) + + ylab('z-score') + + xlab('Time') + + ggtitle('Temperature (black) vs Greens (red)') + +## ----------------------------------------------------------------------------- +plankton_train <- plankton_data %>% + dplyr::filter(time <= 112) +plankton_test <- plankton_data %>% + dplyr::filter(time > 112) + +## ----notrend_mod, include = FALSE, results='hide'----------------------------- +notrend_mod <- mvgam(y ~ + te(temp, month, k = c(4, 4)) + + te(temp, month, k = c(4, 4), by = series), + family = gaussian(), + data = plankton_train, + newdata = plankton_test, + trend_model = 'None') + +## ----eval=FALSE--------------------------------------------------------------- +# notrend_mod <- mvgam(y ~ +# # tensor of temp and month to capture +# # "global" seasonality +# te(temp, month, k = c(4, 4)) + +# +# # series-specific deviation tensor products +# te(temp, month, k = c(4, 4), by = series), +# family = gaussian(), +# data = plankton_train, +# newdata = plankton_test, +# trend_model = 'None') +# + +## ----------------------------------------------------------------------------- +plot_mvgam_smooth(notrend_mod, smooth = 1) + +## ----------------------------------------------------------------------------- +plot_mvgam_smooth(notrend_mod, smooth = 2) + +## ----------------------------------------------------------------------------- +plot_mvgam_smooth(notrend_mod, smooth = 3) + +## ----------------------------------------------------------------------------- +plot_mvgam_smooth(notrend_mod, smooth = 4) + +## ----------------------------------------------------------------------------- +plot_mvgam_smooth(notrend_mod, smooth = 5) + +## ----------------------------------------------------------------------------- +plot_mvgam_smooth(notrend_mod, smooth = 6) + +## ----------------------------------------------------------------------------- +plot(notrend_mod, type = 'forecast', series = 1) + +## ----------------------------------------------------------------------------- +plot(notrend_mod, type = 'forecast', series = 2) + +## ----------------------------------------------------------------------------- +plot(notrend_mod, type = 'forecast', series = 3) + +## ----------------------------------------------------------------------------- +plot(notrend_mod, type = 'forecast', series = 4) + +## ----------------------------------------------------------------------------- +plot(notrend_mod, type = 'forecast', series = 5) + +## ----------------------------------------------------------------------------- +plot(notrend_mod, type = 'residuals', series = 1) + +## ----------------------------------------------------------------------------- +plot(notrend_mod, type = 'residuals', series = 2) + +## ----------------------------------------------------------------------------- +plot(notrend_mod, type = 'residuals', series = 3) + +## ----------------------------------------------------------------------------- +plot(notrend_mod, type = 'residuals', series = 4) + +## ----------------------------------------------------------------------------- +plot(notrend_mod, type = 'residuals', series = 5) + +## ----------------------------------------------------------------------------- +priors <- get_mvgam_priors( + # observation formula, which has no terms in it + y ~ -1, + + # process model formula, 
which includes the smooth functions + trend_formula = ~ te(temp, month, k = c(4, 4)) + + te(temp, month, k = c(4, 4), by = trend), + + # VAR1 model with uncorrelated process errors + trend_model = 'VAR1', + family = gaussian(), + data = plankton_train) + +## ----------------------------------------------------------------------------- +priors[, 3] + +## ----------------------------------------------------------------------------- +priors[, 4] + +## ----------------------------------------------------------------------------- +priors <- c(prior(normal(0.5, 0.1), class = sigma_obs, lb = 0.2), + prior(normal(0.5, 0.25), class = sigma)) + +## ----var_mod, include = FALSE, results='hide'--------------------------------- +var_mod <- mvgam(y ~ -1, + trend_formula = ~ + # tensor of temp and month should capture + # seasonality + te(temp, month, k = c(4, 4)) + + # need to use 'trend' rather than series + # here + te(temp, month, k = c(4, 4), by = trend), + family = gaussian(), + data = plankton_train, + newdata = plankton_test, + trend_model = 'VAR1', + priors = priors, + burnin = 1000) + +## ----eval=FALSE--------------------------------------------------------------- +# var_mod <- mvgam( +# # observation formula, which is empty +# y ~ -1, +# +# # process model formula, which includes the smooth functions +# trend_formula = ~ te(temp, month, k = c(4, 4)) + +# te(temp, month, k = c(4, 4), by = trend), +# +# # VAR1 model with uncorrelated process errors +# trend_model = 'VAR1', +# family = gaussian(), +# data = plankton_train, +# newdata = plankton_test, +# +# # include the updated priors +# priors = priors) + +## ----------------------------------------------------------------------------- +summary(var_mod, include_betas = FALSE) + +## ----------------------------------------------------------------------------- +plot(var_mod, 'smooths', trend_effects = TRUE) + +## ----warning=FALSE, message=FALSE--------------------------------------------- +mcmc_plot(var_mod, variable = 'A', regex = TRUE, type = 'hist') + +## ----warning=FALSE, message=FALSE--------------------------------------------- +A_pars <- matrix(NA, nrow = 5, ncol = 5) +for(i in 1:5){ + for(j in 1:5){ + A_pars[i, j] <- paste0('A[', i, ',', j, ']') + } +} +mcmc_plot(var_mod, + variable = as.vector(t(A_pars)), + type = 'hist') + +## ----------------------------------------------------------------------------- +plot(var_mod, type = 'trend', series = 1) + +## ----------------------------------------------------------------------------- +plot(var_mod, type = 'trend', series = 3) + +## ----warning=FALSE, message=FALSE--------------------------------------------- +Sigma_pars <- matrix(NA, nrow = 5, ncol = 5) +for(i in 1:5){ + for(j in 1:5){ + Sigma_pars[i, j] <- paste0('Sigma[', i, ',', j, ']') + } +} +mcmc_plot(var_mod, + variable = as.vector(t(Sigma_pars)), + type = 'hist') + +## ----warning=FALSE, message=FALSE--------------------------------------------- +mcmc_plot(var_mod, variable = 'sigma_obs', regex = TRUE, type = 'hist') + +## ----------------------------------------------------------------------------- +priors <- c(prior(normal(0.5, 0.1), class = sigma_obs, lb = 0.2), + prior(normal(0.5, 0.25), class = sigma)) + +## ----varcor_mod, include = FALSE, results='hide'------------------------------ +varcor_mod <- mvgam(y ~ -1, + trend_formula = ~ + # tensor of temp and month should capture + # seasonality + te(temp, month, k = c(4, 4)) + + # need to use 'trend' rather than series + # here + te(temp, month, k = c(4, 4), by = trend), + family 
= gaussian(), + data = plankton_train, + newdata = plankton_test, + trend_model = 'VAR1cor', + burnin = 1000, + priors = priors) + +## ----eval=FALSE--------------------------------------------------------------- +# varcor_mod <- mvgam( +# # observation formula, which remains empty +# y ~ -1, +# +# # process model formula, which includes the smooth functions +# trend_formula = ~ te(temp, month, k = c(4, 4)) + +# te(temp, month, k = c(4, 4), by = trend), +# +# # VAR1 model with correlated process errors +# trend_model = 'VAR1cor', +# family = gaussian(), +# data = plankton_train, +# newdata = plankton_test, +# +# # include the updated priors +# priors = priors) + +## ----warning=FALSE, message=FALSE--------------------------------------------- +mcmc_plot(varcor_mod, type = 'rhat') + + labs(title = 'VAR1cor') +mcmc_plot(var_mod, type = 'rhat') + + labs(title = 'VAR1') + +## ----warning=FALSE, message=FALSE--------------------------------------------- +Sigma_pars <- matrix(NA, nrow = 5, ncol = 5) +for(i in 1:5){ + for(j in 1:5){ + Sigma_pars[i, j] <- paste0('Sigma[', i, ',', j, ']') + } +} +mcmc_plot(varcor_mod, + variable = as.vector(t(Sigma_pars)), + type = 'hist') + +## ----------------------------------------------------------------------------- +Sigma_post <- as.matrix(varcor_mod, variable = 'Sigma', regex = TRUE) +median_correlations <- cov2cor(matrix(apply(Sigma_post, 2, median), + nrow = 5, ncol = 5)) +rownames(median_correlations) <- colnames(median_correlations) <- levels(plankton_train$series) + +round(median_correlations, 2) + +## ----warning=FALSE, message=FALSE--------------------------------------------- +A_pars <- matrix(NA, nrow = 5, ncol = 5) +for(i in 1:5){ + for(j in 1:5){ + A_pars[i, j] <- paste0('A[', i, ',', j, ']') + } +} +mcmc_plot(varcor_mod, + variable = as.vector(t(A_pars)), + type = 'hist') + +## ----------------------------------------------------------------------------- +plot(var_mod, type = 'forecast', series = 1, newdata = plankton_test) + +## ----------------------------------------------------------------------------- +plot(varcor_mod, type = 'forecast', series = 1, newdata = plankton_test) + +## ----------------------------------------------------------------------------- +plot(var_mod, type = 'forecast', series = 2, newdata = plankton_test) + +## ----------------------------------------------------------------------------- +plot(varcor_mod, type = 'forecast', series = 2, newdata = plankton_test) + +## ----------------------------------------------------------------------------- +plot(var_mod, type = 'forecast', series = 3, newdata = plankton_test) + +## ----------------------------------------------------------------------------- +plot(varcor_mod, type = 'forecast', series = 3, newdata = plankton_test) + +## ----------------------------------------------------------------------------- +# create forecast objects for each model +fcvar <- forecast(var_mod) +fcvarcor <- forecast(varcor_mod) + +# plot the difference in variogram scores; a negative value means the VAR1cor model is better, while a positive value means the VAR1 model is better +diff_scores <- score(fcvarcor, score = 'variogram')$all_series$score - + score(fcvar, score = 'variogram')$all_series$score +plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred', + ylim = c(-1*max(abs(diff_scores), na.rm = TRUE), + max(abs(diff_scores), na.rm = TRUE)), + bty = 'l', + xlab = 'Forecast horizon', + ylab = expression(variogram[VAR1cor]~-~variogram[VAR1])) +abline(h = 0, lty = 'dashed') + +## 
----------------------------------------------------------------------------- +# plot the difference in energy scores; a negative value means the VAR1cor model is better, while a positive value means the VAR1 model is better +diff_scores <- score(fcvarcor, score = 'energy')$all_series$score - + score(fcvar, score = 'energy')$all_series$score +plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred', + ylim = c(-1*max(abs(diff_scores), na.rm = TRUE), + max(abs(diff_scores), na.rm = TRUE)), + bty = 'l', + xlab = 'Forecast horizon', + ylab = expression(energy[VAR1cor]~-~energy[VAR1])) +abline(h = 0, lty = 'dashed') + diff --git a/doc/trend_formulas.Rmd b/doc/trend_formulas.Rmd new file mode 100644 index 00000000..e65a4ce7 --- /dev/null +++ b/doc/trend_formulas.Rmd @@ -0,0 +1,561 @@ +--- +title: "State-Space models in mvgam" +author: "Nicholas J Clark" +date: "`r Sys.Date()`" +output: + rmarkdown::html_vignette: + toc: yes +vignette: > + %\VignetteIndexEntry{State-Space models in mvgam} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- +```{r, echo = FALSE} +NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true") +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + purl = NOT_CRAN, + eval = NOT_CRAN +) +``` + +```{r setup, include=FALSE} +knitr::opts_chunk$set( + echo = TRUE, + dpi = 150, + fig.asp = 0.8, + fig.width = 6, + out.width = "60%", + fig.align = "center") +library(mvgam) +library(ggplot2) +theme_set(theme_bw(base_size = 12, base_family = 'serif')) +``` + +The purpose of this vignette is to show how the `mvgam` package can be used to fit and interrogate State-Space models with nonlinear effects. + +## State-Space Models + +![Illustration of a basic State-Space model, which assumes that a latent dynamic *process* (X) can evolve independently from the way we take *observations* (Y) of that process](SS_model.svg){width=85%} + +
+
+State-Space models allow us to separately make inferences about the underlying dynamic *process model* that we are interested in (i.e. the evolution of a time series or a collection of time series) and the *observation model* (i.e. the way that we survey / measure this underlying process). This is extremely useful in ecology because our observations are always imperfect / noisy measurements of the thing we are interested in measuring. It is also helpful because we often know that some covariates will impact our ability to measure accurately (i.e. we cannot take accurate counts of rodents if there is a thunderstorm happening) while other covariates impact the underlying process (it is highly unlikely that rodent abundance responds to one storm, but instead probably responds to longer-term weather and climate variation). A State-Space model allows us to model both components in a single unified modelling framework. A major advantage of `mvgam` is that it can include nonlinear effects and random effects in BOTH model components while also capturing dynamic processes.
+
+### Lake Washington plankton data
+The data we will use to illustrate how we can fit State-Space models in `mvgam` are from a long-term monitoring study of plankton counts (cells per mL) taken from Lake Washington in Washington, USA. The data are available as part of the `MARSS` package and can be downloaded using the following:
+```{r}
+load(url('https://github.com/atsa-es/MARSS/raw/master/data/lakeWAplankton.rda'))
+```
+
+We will work with five different groups of plankton:
+```{r}
+outcomes <- c('Greens', 'Bluegreens', 'Diatoms', 'Unicells', 'Other.algae')
+```
+
+As usual, preparing the data into the correct format for `mvgam` modelling takes a little bit of wrangling in `dplyr`:
+```{r}
+# loop across each plankton group to create the long dataframe
+plankton_data <- do.call(rbind, lapply(outcomes, function(x){
+  
+  # create a group-specific dataframe with counts labelled 'y'
+  # and the group name in the 'series' variable
+  data.frame(year = lakeWAplanktonTrans[, 'Year'],
+             month = lakeWAplanktonTrans[, 'Month'],
+             y = lakeWAplanktonTrans[, x],
+             series = x,
+             temp = lakeWAplanktonTrans[, 'Temp'])})) %>%
+  
+  # change the 'series' label to a factor
+  dplyr::mutate(series = factor(series)) %>%
+  
+  # filter to only include some years in the data
+  dplyr::filter(year >= 1965 & year < 1975) %>%
+  dplyr::arrange(year, month) %>%
+  dplyr::group_by(series) %>%
+  
+  # z-score the counts so they are approximately standard normal
+  dplyr::mutate(y = as.vector(scale(y))) %>%
+  
+  # add the time indicator
+  dplyr::mutate(time = dplyr::row_number()) %>%
+  dplyr::ungroup()
+```
+
+Inspect the data structure:
+```{r}
+head(plankton_data)
+```
+
+```{r}
+dplyr::glimpse(plankton_data)
+```
+
+Note that we have z-scored the counts in this example as that will make it easier to specify priors (though this is not completely necessary; it is often better to build a model that respects the properties of the actual outcome variables):
+```{r}
+plot_mvgam_series(data = plankton_data, series = 'all')
+```
+
+It is always helpful to check the data for `NA`s before attempting any models:
+```{r}
+image(is.na(t(plankton_data)), axes = FALSE,
+      col = c('grey80', 'darkred'))
+axis(3, at = seq(0,1, len = NCOL(plankton_data)), 
+     labels = colnames(plankton_data))
+```
+
+We have some missing observations, but this isn't an issue for modelling in `mvgam`.
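+
+If you prefer exact numbers to a picture, a quick `dplyr` summary gives the count of missing observations per series (an optional, minimal sketch; it only uses functions already loaded for this vignette):
+```{r}
+# count missing observations in each series; purely descriptive,
+# as mvgam will handle these NAs during modelling
+plankton_data %>%
+  dplyr::group_by(series) %>%
+  dplyr::summarise(n_missing = sum(is.na(y)))
+```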
+
+A useful property to understand about these counts is that they tend to be highly seasonal. Below are some plots of z-scored counts against the z-scored temperature measurements in the lake for each month:
+```{r}
+plankton_data %>%
+  dplyr::filter(series == 'Other.algae') %>%
+  ggplot(aes(x = time, y = temp)) +
+  geom_line(size = 1.1) +
+  geom_line(aes(y = y), col = 'white',
+            size = 1.3) +
+  geom_line(aes(y = y), col = 'darkred',
+            size = 1.1) +
+  ylab('z-score') +
+  xlab('Time') +
+  ggtitle('Temperature (black) vs Other algae (red)')
+```
+
+```{r}
+plankton_data %>%
+  dplyr::filter(series == 'Diatoms') %>%
+  ggplot(aes(x = time, y = temp)) +
+  geom_line(size = 1.1) +
+  geom_line(aes(y = y), col = 'white',
+            size = 1.3) +
+  geom_line(aes(y = y), col = 'darkred',
+            size = 1.1) +
+  ylab('z-score') +
+  xlab('Time') +
+  ggtitle('Temperature (black) vs Diatoms (red)')
+```
+
+```{r}
+plankton_data %>%
+  dplyr::filter(series == 'Greens') %>%
+  ggplot(aes(x = time, y = temp)) +
+  geom_line(size = 1.1) +
+  geom_line(aes(y = y), col = 'white',
+            size = 1.3) +
+  geom_line(aes(y = y), col = 'darkred',
+            size = 1.1) +
+  ylab('z-score') +
+  xlab('Time') +
+  ggtitle('Temperature (black) vs Greens (red)')
+```
+
+We will have to try and capture this seasonality in our process model, which should be easy to do given the flexibility of GAMs. Next we will split the data into training and testing splits:
+```{r}
+plankton_train <- plankton_data %>%
+  dplyr::filter(time <= 112)
+plankton_test <- plankton_data %>%
+  dplyr::filter(time > 112)
+```
+
+Now time to fit some models. This requires a bit of thinking about how we can best tackle the seasonal variation and the likely dependence structure in the data. These algae are interacting as part of a complex system within the same lake, so we certainly expect there to be some lagged cross-dependencies underlying their dynamics. But if we do not capture the seasonal variation, our multivariate dynamic model will be forced to try and capture it, which could lead to poor convergence and unstable results (we could feasibly capture cyclic dynamics with a more complex multi-species Lotka-Volterra model, but ordinary differential equation approaches are beyond the scope of `mvgam`).
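+
+Before moving on, it can help to check just how strong that seasonality is (an optional sketch; any grouped summary would do). Here we measure how far the monthly means swing within each series, remembering that each z-scored series has unit variance:
+```{r}
+# monthly means per series; a large range relative to the unit
+# variance of the z-scores indicates strong seasonality
+plankton_data %>%
+  dplyr::group_by(series, month) %>%
+  dplyr::summarise(mean_y = mean(y, na.rm = TRUE),
+                   .groups = 'drop') %>%
+  dplyr::group_by(series) %>%
+  dplyr::summarise(seasonal_range = max(mean_y) - min(mean_y))
+```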
+
+### Capturing seasonality
+
+First we will fit a model that does not include a dynamic component, just to see if it can reproduce the seasonal variation in the observations. This model introduces hierarchical multidimensional smooths, where all time series share a "global" tensor product of the `month` and `temp` variables, capturing our expectation that algal seasonality responds to temperature variation. But this response should depend on when in the year these temperatures are recorded (i.e. a response to warm temperatures in Spring should be different to a response to warm temperatures in Autumn). The model also fits series-specific deviation smooths (i.e. one tensor product per series) to capture how each algal group's seasonality differs from the overall "global" seasonality. Note that we do not include series-specific intercepts in this model because each series was z-scored to have a mean of 0.
+```{r notrend_mod, include = FALSE, results='hide'}
+notrend_mod <- mvgam(y ~ 
+                       te(temp, month, k = c(4, 4)) +
+                       te(temp, month, k = c(4, 4), by = series),
+                     family = gaussian(),
+                     data = plankton_train,
+                     newdata = plankton_test,
+                     trend_model = 'None')
+```
+
+```{r eval=FALSE}
+notrend_mod <- mvgam(y ~ 
+                       # tensor of temp and month to capture
+                       # "global" seasonality
+                       te(temp, month, k = c(4, 4)) +
+                       
+                       # series-specific deviation tensor products
+                       te(temp, month, k = c(4, 4), by = series),
+                     family = gaussian(),
+                     data = plankton_train,
+                     newdata = plankton_test,
+                     trend_model = 'None')
+```
+
+The "global" tensor product smooth function can be quickly visualized:
+```{r}
+plot_mvgam_smooth(notrend_mod, smooth = 1)
+```
+
+On this plot, red indicates below-average linear predictors and white indicates above-average. We can then plot the deviation smooths for each algal group to see how they vary from the "global" pattern:
+```{r}
+plot_mvgam_smooth(notrend_mod, smooth = 2)
+```
+
+```{r}
+plot_mvgam_smooth(notrend_mod, smooth = 3)
+```
+
+```{r}
+plot_mvgam_smooth(notrend_mod, smooth = 4)
+```
+
+```{r}
+plot_mvgam_smooth(notrend_mod, smooth = 5)
+```
+
+```{r}
+plot_mvgam_smooth(notrend_mod, smooth = 6)
+```
+
+These multidimensional smooths have done a good job of capturing the seasonal variation in our observations:
+```{r}
+plot(notrend_mod, type = 'forecast', series = 1)
+```
+
+```{r}
+plot(notrend_mod, type = 'forecast', series = 2)
+```
+
+```{r}
+plot(notrend_mod, type = 'forecast', series = 3)
+```
+
+```{r}
+plot(notrend_mod, type = 'forecast', series = 4)
+```
+
+```{r}
+plot(notrend_mod, type = 'forecast', series = 5)
+```
+
+This basic model gives us confidence that we can capture the seasonal variation in the observations. But the model has not captured the remaining temporal dynamics, which is obvious when we inspect Dunn-Smyth residuals for each series:
+```{r}
+plot(notrend_mod, type = 'residuals', series = 1)
+```
+
+```{r}
+plot(notrend_mod, type = 'residuals', series = 2)
+```
+
+```{r}
+plot(notrend_mod, type = 'residuals', series = 3)
+```
+
+```{r}
+plot(notrend_mod, type = 'residuals', series = 4)
+```
+
+```{r}
+plot(notrend_mod, type = 'residuals', series = 5)
+```
+
+### Multiseries dynamics
+Now it is time to get into multivariate State-Space models. We will fit two models that can both incorporate lagged cross-dependencies in the latent process models. The first model assumes that the process errors operate independently from one another, while the second assumes that there may be contemporaneous correlations in the process errors. Both models include a Vector Autoregressive component for the process means, and so both can model complex community dynamics. The models can be described mathematically as follows:
+
+\begin{align*}
+\boldsymbol{count}_t & \sim \text{Normal}(\mu_{obs[t]}, \sigma_{obs}) \\
+\mu_{obs[t]} & = process_t \\
+process_t & \sim \text{MVNormal}(\mu_{process[t]}, \Sigma_{process}) \\
+\mu_{process[t]} & = VAR * process_{t-1} + f_{global}(\boldsymbol{month},\boldsymbol{temp})_t + f_{series}(\boldsymbol{month},\boldsymbol{temp})_t \\
+f_{global}(\boldsymbol{month},\boldsymbol{temp}) & = \sum_{k=1}^{K}b_{global,k} * \beta_{global,k} \\
+f_{series}(\boldsymbol{month},\boldsymbol{temp}) & = \sum_{k=1}^{K}b_{series,k} * \beta_{series,k} \end{align*}
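+
+To make this notation concrete, here is a single simulation step of the process model written in base `R` (a toy sketch with made-up values for the VAR matrix, the process errors and the seasonal smooths; it is not the model we are about to fit):
+```{r}
+set.seed(1)
+A <- diag(0.6, 5)            # toy VAR matrix: self-dependencies only
+Sigma <- diag(0.1, 5)        # independent process errors (off-diagonals = 0)
+f_t <- rnorm(5, 0, 0.2)      # stand-in for the summed seasonal smooths at time t
+process_tm1 <- rnorm(5)      # the process vector at time t - 1
+
+# the process mean evolves through the VAR term plus the smooths
+mu_t <- as.vector(A %*% process_tm1) + f_t
+
+# draw the new process state with multivariate normal noise
+process_t <- mu_t + as.vector(t(chol(Sigma)) %*% rnorm(5))
+process_t
+```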
+
+Here you can see that there are no terms in the observation model apart from the underlying process model. But we could easily add covariates into the observation model if we felt that they could explain some of the systematic observation errors. We also assume independent observation processes (there is no covariance structure in the observation errors $\sigma_{obs}$). At present, `mvgam` does not support multivariate observation models, but this feature will be added in future versions. The underlying process model, however, is multivariate, and there is a lot going on here. This component has a Vector Autoregressive part, where the process mean at time $t$ $(\mu_{process[t]})$ is a vector that evolves as a function of where the vector-valued process model was at time $t-1$. The $VAR$ matrix captures these dynamics with self-dependencies on the diagonal and possibly asymmetric cross-dependencies on the off-diagonals, while also incorporating the nonlinear smooth functions that capture seasonality for each series. The contemporaneous process errors are modeled by $\Sigma_{process}$, which can be constrained so that process errors are independent (i.e. setting the off-diagonals to 0) or can be fully parameterized using a Cholesky decomposition (using `Stan`'s $LKJcorr$ distribution to place a prior on the strength of inter-species correlations). For those who are interested in the inner workings, `mvgam` makes use of a recent breakthrough by [Sarah Heaps to enforce stationarity of Bayesian VAR processes](https://www.tandfonline.com/doi/full/10.1080/10618600.2022.2079648). This is advantageous as we often don't expect forecast variance to increase without bound forever into the future, but many estimated VARs tend to behave this way.
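+
+To see why stationarity matters, a tiny standalone simulation helps (a sketch in base `R`; nothing here touches `mvgam`). When the eigenvalues of the VAR matrix all lie inside the unit circle the process mean-reverts, whereas an eigenvalue with modulus greater than 1 produces explosive trajectories and ever-growing forecast variance:
+```{r}
+set.seed(123)
+# simulate a simple VAR(1) process with independent errors
+sim_var1 <- function(A, n = 200){
+  x <- matrix(0, nrow = n, ncol = ncol(A))
+  for(t in 2:n){
+    x[t, ] <- A %*% x[t - 1, ] + rnorm(ncol(A), sd = 0.1)
+  }
+  x
+}
+
+# a stable VAR matrix: all eigenvalues have modulus < 1
+A_stable <- matrix(c(0.7, 0.1, -0.2, 0.5), nrow = 2, ncol = 2)
+max(abs(eigen(A_stable)$values))
+
+# an explosive VAR matrix: one eigenvalue exceeds 1
+A_explosive <- matrix(c(1.05, 0, 0, 0.5), nrow = 2, ncol = 2)
+max(abs(eigen(A_explosive)$values))
+
+# the stable process wanders but keeps returning to its mean
+matplot(sim_var1(A_stable), type = 'l', bty = 'l',
+        xlab = 'Time', ylab = 'Process')
+```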
+
+Ok that was a lot to take in. Let's fit some models to try and inspect what is going on and what they assume. But first, we need to update `mvgam`'s default priors for the observation and process errors. By default, `mvgam` uses a fairly wide Student-T prior on these parameters to avoid being overly informative. But our observations are z-scored and so we do not expect very large process or observation errors. However, we also do not expect very small observation errors either as we know these measurements are not perfect. So let's update the priors for these parameters. In doing so, you will get to see how the formula for the latent process (i.e. trend) model is used in `mvgam`:
+```{r}
+priors <- get_mvgam_priors(
+  # observation formula, which has no terms in it
+  y ~ -1,
+  
+  # process model formula, which includes the smooth functions
+  trend_formula = ~ te(temp, month, k = c(4, 4)) +
+    te(temp, month, k = c(4, 4), by = trend),
+  
+  # VAR1 model with uncorrelated process errors
+  trend_model = 'VAR1',
+  family = gaussian(),
+  data = plankton_train)
+```
+
+Get names of all parameters whose priors can be modified:
+```{r}
+priors[, 3]
+```
+
+And their default prior distributions:
+```{r}
+priors[, 4]
+```
+
+Setting priors is easy in `mvgam` as you can use `brms` routines. Here we use more informative Normal priors for both error components, but we impose a lower bound of 0.2 for the observation errors:
+```{r}
+priors <- c(prior(normal(0.5, 0.1), class = sigma_obs, lb = 0.2),
+            prior(normal(0.5, 0.25), class = sigma))
+```
+
+You may have noticed something else unique about this model: there is no intercept term in the observation formula. This is because a shared intercept parameter can sometimes be unidentifiable with respect to the latent VAR process, particularly if our series have similar long-run averages (which they do in this case because they were z-scored). We will often get better convergence in these State-Space models if we drop this parameter. `mvgam` accomplishes this by fixing the coefficient for the intercept to zero. Now we can fit the first model, which assumes that process errors are contemporaneously uncorrelated:
+```{r var_mod, include = FALSE, results='hide'}
+var_mod <- mvgam(y ~ -1,
+                 trend_formula = ~
+                   # tensor of temp and month should capture
+                   # seasonality
+                   te(temp, month, k = c(4, 4)) +
+                   # need to use 'trend' rather than series
+                   # here
+                   te(temp, month, k = c(4, 4), by = trend),
+                 family = gaussian(),
+                 data = plankton_train,
+                 newdata = plankton_test,
+                 trend_model = 'VAR1',
+                 priors = priors,
+                 burnin = 1000)
+```
+
+```{r eval=FALSE}
+var_mod <- mvgam(
+  # observation formula, which is empty
+  y ~ -1,
+  
+  # process model formula, which includes the smooth functions
+  trend_formula = ~ te(temp, month, k = c(4, 4)) +
+    te(temp, month, k = c(4, 4), by = trend),
+  
+  # VAR1 model with uncorrelated process errors
+  trend_model = 'VAR1',
+  family = gaussian(),
+  data = plankton_train,
+  newdata = plankton_test,
+  
+  # include the updated priors
+  priors = priors)
+```
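+
+It can be reassuring to see just how much more informative these priors are than the defaults. Here is a quick base `R` sketch comparing the two observation error priors (the default `student_t(3, 0, 2.5)` printed by `get_mvgam_priors()` above, folded at zero for plotting, versus our Normal prior truncated below at 0.2):
+```{r}
+# our informative prior: Normal(0.5, 0.1) truncated below at 0.2
+curve(dnorm(x, 0.5, 0.1) / (1 - pnorm(0.2, 0.5, 0.1)),
+      from = 0.2, to = 2, xlim = c(0, 2), lwd = 2,
+      xlab = expression(sigma[obs]), ylab = 'Density')
+
+# the wide default: a Student-t(3, 0, 2.5) folded at zero
+curve(2 * dt(x / 2.5, df = 3) / 2.5,
+      from = 0, to = 2, add = TRUE, lwd = 2, lty = 'dashed')
+```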
+
+### Inspecting SS models
+This model's summary is a bit different to other `mvgam` summaries. It separates parameters based on whether they belong to the observation model or to the latent process model. This is because we may often have covariates that impact the observations but not the latent process, so we can have fairly complex models for each component. You will notice that some parameters have not fully converged, particularly for the VAR coefficients (called `A` in the output) and for the process errors (`Sigma`). Note that we set `include_betas = FALSE` to stop the summary from printing output for all of the spline coefficients, which can be dense and hard to interpret:
+```{r}
+summary(var_mod, include_betas = FALSE)
+```
+
+The convergence of this model isn't fabulous (more on this in a moment). But we can again plot the smooth functions, which this time operate on the process model. We can see the same plot using `trend_effects = TRUE` in the plotting functions:
+```{r}
+plot(var_mod, 'smooths', trend_effects = TRUE)
+```
+
+The VAR matrix is of particular interest here, as it captures lagged dependencies and cross-dependencies in the latent process model:
+```{r warning=FALSE, message=FALSE}
+mcmc_plot(var_mod, variable = 'A', regex = TRUE, type = 'hist')
+```
+
+Unfortunately `bayesplot` doesn't know this is a matrix of parameters so what we see is actually the transpose of the VAR matrix. A little bit of wrangling gives us these histograms in the correct order:
+```{r warning=FALSE, message=FALSE}
+A_pars <- matrix(NA, nrow = 5, ncol = 5)
+for(i in 1:5){
+  for(j in 1:5){
+    A_pars[i, j] <- paste0('A[', i, ',', j, ']')
+  }
+}
+mcmc_plot(var_mod, 
+          variable = as.vector(t(A_pars)), 
+          type = 'hist')
+```
+
+There is a lot happening in this matrix. Each cell captures the lagged effect of the process in the column on the process in the row in the next timestep. So for example, the effect in cell [1,3], which is quite strongly negative, means that an *increase* in the process for series 3 (Greens) at time $t$ is expected to lead to a subsequent *decrease* in the process for series 1 (Bluegreens) at time $t+1$. The latent process model is now capturing these effects and the smooth seasonal effects, so the trend plot shows our best estimate of what the *true* count should have been at each time point:
+```{r}
+plot(var_mod, type = 'trend', series = 1)
+```
+
+```{r}
+plot(var_mod, type = 'trend', series = 3)
+```
+
+The process error $(\Sigma)$ captures unmodelled variation in the process models. Again, we fixed the off-diagonals to 0, so the histograms for these will look like flat boxes:
+```{r warning=FALSE, message=FALSE}
+Sigma_pars <- matrix(NA, nrow = 5, ncol = 5)
+for(i in 1:5){
+  for(j in 1:5){
+    Sigma_pars[i, j] <- paste0('Sigma[', i, ',', j, ']')
+  }
+}
+mcmc_plot(var_mod, 
+          variable = as.vector(t(Sigma_pars)), 
+          type = 'hist')
+```
+
+The observation error estimates $(\sigma_{obs})$ represent how much the model thinks we might miss the true count when we take our imperfect measurements:
+```{r warning=FALSE, message=FALSE}
+mcmc_plot(var_mod, variable = 'sigma_obs', regex = TRUE, type = 'hist')
+```
+
+These are still a bit hard to identify overall, especially when trying to estimate both process and observation error. Often we need to make some strong assumptions about which of these is more important for determining unexplained variation in our observations.
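+
+Given the convergence warnings in the summary above, it is also worth eyeballing traceplots for the slowest-mixing parameters before trusting these estimates (a quick sketch; `mcmc_plot()` passes plot types through to `bayesplot`, so the `'trace'` type is assumed to behave just like the `'hist'` and `'rhat'` types used elsewhere in this vignette):
+```{r warning=FALSE, message=FALSE}
+# traceplots for the observation error parameters; well-mixed chains
+# should overlap with no obvious trends or steps
+mcmc_plot(var_mod, variable = 'sigma_obs', regex = TRUE, type = 'trace')
+```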
+
+### Correlated process errors
+
+Let's see if these estimates improve when we allow the process errors to be correlated. Once again, we need to first update the priors for the observation errors:
+```{r}
+priors <- c(prior(normal(0.5, 0.1), class = sigma_obs, lb = 0.2),
+            prior(normal(0.5, 0.25), class = sigma))
+```
+
+And now we can fit the correlated process error model:
+```{r varcor_mod, include = FALSE, results='hide'}
+varcor_mod <- mvgam(y ~ -1,
+                    trend_formula = ~
+                      # tensor of temp and month should capture
+                      # seasonality
+                      te(temp, month, k = c(4, 4)) +
+                      # need to use 'trend' rather than series
+                      # here
+                      te(temp, month, k = c(4, 4), by = trend),
+                    family = gaussian(),
+                    data = plankton_train,
+                    newdata = plankton_test,
+                    trend_model = 'VAR1cor',
+                    burnin = 1000,
+                    priors = priors)
+```
+
+```{r eval=FALSE}
+varcor_mod <- mvgam(
+  # observation formula, which remains empty
+  y ~ -1,
+  
+  # process model formula, which includes the smooth functions
+  trend_formula = ~ te(temp, month, k = c(4, 4)) +
+    te(temp, month, k = c(4, 4), by = trend),
+  
+  # VAR1 model with correlated process errors
+  trend_model = 'VAR1cor',
+  family = gaussian(),
+  data = plankton_train,
+  newdata = plankton_test,
+  
+  # include the updated priors
+  priors = priors)
+```
+
+Plot convergence diagnostics for the two models, which shows that both models display similar levels of convergence:
+```{r warning=FALSE, message=FALSE}
+mcmc_plot(varcor_mod, type = 'rhat') +
+  labs(title = 'VAR1cor')
+mcmc_plot(var_mod, type = 'rhat') +
+  labs(title = 'VAR1')
+```
+
+The $(\Sigma)$ matrix now captures any evidence of contemporaneously correlated process error:
+```{r warning=FALSE, message=FALSE}
+Sigma_pars <- matrix(NA, nrow = 5, ncol = 5)
+for(i in 1:5){
+  for(j in 1:5){
+    Sigma_pars[i, j] <- paste0('Sigma[', i, ',', j, ']')
+  }
+}
+mcmc_plot(varcor_mod, 
+          variable = as.vector(t(Sigma_pars)), 
+          type = 'hist')
+```
+
+This symmetric matrix tells us there is support for correlated process errors. For example, series 2 and 4 (Diatoms and Other.algae) show positively correlated process errors, while series 2 and 3 (Diatoms and Greens) show negatively correlated errors. But it is easier to interpret these estimates if we convert the covariance matrix to a correlation matrix. Here we compute the posterior median process error correlations:
+```{r}
+Sigma_post <- as.matrix(varcor_mod, variable = 'Sigma', regex = TRUE)
+median_correlations <- cov2cor(matrix(apply(Sigma_post, 2, median),
+                                      nrow = 5, ncol = 5))
+rownames(median_correlations) <- colnames(median_correlations) <- levels(plankton_train$series)
+
+round(median_correlations, 2)
+```
+
+Because this model is able to capture correlated errors, the VAR matrix has changed slightly:
+```{r warning=FALSE, message=FALSE}
+A_pars <- matrix(NA, nrow = 5, ncol = 5)
+for(i in 1:5){
+  for(j in 1:5){
+    A_pars[i, j] <- paste0('A[', i, ',', j, ']')
+  }
+}
+mcmc_plot(varcor_mod, 
+          variable = as.vector(t(A_pars)), 
+          type = 'hist')
+```
+
+We still have some evidence of lagged cross-dependence, but some of these interactions have now been pulled more toward zero.
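+
+One way to quantify this shrinkage (a quick sketch; it extracts the `A` draws with the same `as.matrix()` pattern used for `Sigma` above, and assumes the draws share that column-major ordering) is to compare posterior median VAR matrices between the two models:
+```{r}
+# posterior draws of the VAR matrix from each model
+A_var_post <- as.matrix(var_mod, variable = 'A', regex = TRUE)
+A_varcor_post <- as.matrix(varcor_mod, variable = 'A', regex = TRUE)
+
+# posterior medians, arranged as 5 x 5 matrices
+A_med_var <- matrix(apply(A_var_post, 2, median), nrow = 5, ncol = 5)
+A_med_varcor <- matrix(apply(A_varcor_post, 2, median), nrow = 5, ncol = 5)
+
+# cells with large absolute differences show where allowing correlated
+# errors has changed the estimated lagged dependencies
+round(A_med_var - A_med_varcor, 2)
+```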
+
+But which model is better? Forecasts don't appear to differ very much, at least qualitatively (here are forecasts for three of the series, for each model):
+```{r}
+plot(var_mod, type = 'forecast', series = 1, newdata = plankton_test)
+```
+
+```{r}
+plot(varcor_mod, type = 'forecast', series = 1, newdata = plankton_test)
+```
+
+```{r}
+plot(var_mod, type = 'forecast', series = 2, newdata = plankton_test)
+```
+
+```{r}
+plot(varcor_mod, type = 'forecast', series = 2, newdata = plankton_test)
+```
+
+```{r}
+plot(var_mod, type = 'forecast', series = 3, newdata = plankton_test)
+```
+
+```{r}
+plot(varcor_mod, type = 'forecast', series = 3, newdata = plankton_test)
+```
+
+We can compute the variogram score for out of sample forecasts to get a sense of which model does a better job of capturing the dependence structure in the true evaluation set:
+```{r}
+# create forecast objects for each model
+fcvar <- forecast(var_mod)
+fcvarcor <- forecast(varcor_mod)
+
+# plot the difference in variogram scores; a negative value means the
+# VAR1cor model is better, while a positive value means the VAR1 model is better
+diff_scores <- score(fcvarcor, score = 'variogram')$all_series$score -
+  score(fcvar, score = 'variogram')$all_series$score
+plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred', 
+     ylim = c(-1*max(abs(diff_scores), na.rm = TRUE),
+              max(abs(diff_scores), na.rm = TRUE)),
+     bty = 'l',
+     xlab = 'Forecast horizon',
+     ylab = expression(variogram[VAR1cor]~-~variogram[VAR1]))
+abline(h = 0, lty = 'dashed')
+```
+
+And we can also compute the energy score for out of sample forecasts to get a sense of which model provides forecasts that are better calibrated:
+```{r}
+# plot the difference in energy scores; a negative value means the
+# VAR1cor model is better, while a positive value means the VAR1 model is better
+diff_scores <- score(fcvarcor, score = 'energy')$all_series$score -
+  score(fcvar, score = 'energy')$all_series$score
+plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred', 
+     ylim = c(-1*max(abs(diff_scores), na.rm = TRUE),
+              max(abs(diff_scores), na.rm = TRUE)),
+     bty = 'l',
+     xlab = 'Forecast horizon',
+     ylab = expression(energy[VAR1cor]~-~energy[VAR1]))
+abline(h = 0, lty = 'dashed')
+```
+
+The models tend to provide similar forecasts, though the correlated error model does slightly better overall. We would probably need a more extensive rolling forecast evaluation exercise if we had to choose just one of these models for production. `mvgam` offers some utilities for doing this (i.e. see `?lfo_cv` for guidance).
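+
+If you want a single headline number from this comparison, summing the per-horizon scores works (a rough sketch that simply aggregates the values plotted above; lower totals are better):
+```{r}
+# total out of sample energy score for each model
+sum(score(fcvarcor, score = 'energy')$all_series$score, na.rm = TRUE)
+sum(score(fcvar, score = 'energy')$all_series$score, na.rm = TRUE)
+```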
"[MARSS: multivariate autoregressive state-space models for analyzing time-series data.](https://d1wqtxts1xzle7.cloudfront.net/30588864/rjournal_2012-1_holmes_et_al-libre.pdf?1391843792=&response-content-disposition=inline%3B+filename%3DMARSS_Multivariate_Autoregressive_State.pdf&Expires=1695861526&Signature=TCRXULs0mUKRM4m1pmvZxwE10bUqS6vzLcuKeUBCj57YIjx23iTxS1fEgBpV0fs2wb5XAw7ZkG84XyMaoS~vjiqZ-DpheQDHwAHpIWG-TcckHQjEjPWTNajFvAemToUdCiHnDa~yrhW9HRgXjgncdalkjzjvjT3HLSW8mcjBDhQN-WJ3MKQFSXtxoBpWfcuPYbf-HC1E1oSl7957y~w0I1gcIVdu6LHjP~CEKXa0BQzS4xuarL2nz~tHD2MverbNJYMrDGrAxIi-MX6i~lfHWuwV6UKRdoOZ0pXIcMYWBTv9V5xYey76aMKTICiJ~0NqXLZdXO5qlS4~~2nFEO7b7w__&Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA)" *R Journal*. 4.1 (2012): 11. + +Ward, Eric J., et al. "[Inferring spatial structure from time‐series data: using multivariate state‐space models to detect metapopulation structure of California sea lions in the Gulf of California, Mexico.](https://besjournals.onlinelibrary.wiley.com/doi/full/10.1111/j.1365-2664.2009.01745.x)" *Journal of Applied Ecology* 47.1 (2010): 47-56. + +Auger‐Méthé, Marie, et al. ["A guide to state–space modeling of ecological time series.](https://esajournals.onlinelibrary.wiley.com/doi/full/10.1002/ecm.1470)" *Ecological Monographs* 91.4 (2021): e01470. + +## Interested in contributing? +I'm actively seeking PhD students and other researchers to work in the areas of ecological forecasting, multivariate model evaluation and development of `mvgam`. Please reach out if you are interested (n.clark'at'uq.edu.au) diff --git a/doc/trend_formulas.html b/doc/trend_formulas.html new file mode 100644 index 00000000..e6e67bc0 --- /dev/null +++ b/doc/trend_formulas.html @@ -0,0 +1,1134 @@ + + + + + + + + + + + + + + + + +State-Space models in mvgam + + + + + + + + + + + + + + + + + + + + + + + + + + +

State-Space models in mvgam
Nicholas J Clark
2024-04-16
The purpose of this vignette is to show how the mvgam +package can be used to fit and interrogate State-Space models with +nonlinear effects.

+
+

State-Space Models

+Illustration of a basic State-Space model, which assumes that a latent dynamic process (X) can evolve independently from the way we take observations (Y) of that process


+

State-Space models allow us to separately make inferences about the +underlying dynamic process model that we are interested in +(i.e. the evolution of a time series or a collection of time series) and +the observation model (i.e. the way that we survey / measure +this underlying process). This is extremely useful in ecology because +our observations are always imperfect / noisy measurements of the thing +we are interested in measuring. It is also helpful because we often know +that some covariates will impact our ability to measure accurately +(i.e. we cannot take accurate counts of rodents if there is a +thunderstorm happening) while other covariates impact the underlying +process (it is highly unlikely that rodent abundance responds to one +storm, but instead probably responds to longer-term weather and climate +variation). A State-Space model allows us to model both components in a +single unified modelling framework. A major advantage of +mvgam is that it can include nonlinear effects and random +effects in BOTH model components while also capturing dynamic +processes.

+
+

Lake Washington plankton data

+

The data we will use to illustrate how we can fit State-Space models +in mvgam are from a long-term monitoring study of plankton +counts (cells per mL) taken from Lake Washington in Washington, USA. The +data are available as part of the MARSS package and can be +downloaded using the following:

+
load(url('https://github.com/atsa-es/MARSS/raw/master/data/lakeWAplankton.rda'))
+

We will work with five different groups of plankton:

+
outcomes <- c('Greens', 'Bluegreens', 'Diatoms', 'Unicells', 'Other.algae')
+

As usual, preparing the data into the correct format for +mvgam modelling takes a little bit of wrangling in +dplyr:

+
# loop across each plankton group to create the long dataframe
+plankton_data <- do.call(rbind, lapply(outcomes, function(x){
+  
+  # create a group-specific dataframe with counts labelled 'y'
+  # and the group name in the 'series' variable
+  data.frame(year = lakeWAplanktonTrans[, 'Year'],
+             month = lakeWAplanktonTrans[, 'Month'],
+             y = lakeWAplanktonTrans[, x],
+             series = x,
+             temp = lakeWAplanktonTrans[, 'Temp'])})) %>%
+  
+  # change the 'series' label to a factor
+  dplyr::mutate(series = factor(series)) %>%
+  
+  # filter to only include some years in the data
+  dplyr::filter(year >= 1965 & year < 1975) %>%
+  dplyr::arrange(year, month) %>%
+  dplyr::group_by(series) %>%
+  
+  # z-score the counts so they are approximately standard normal
+  dplyr::mutate(y = as.vector(scale(y))) %>%
+  
+  # add the time indicator
+  dplyr::mutate(time = dplyr::row_number()) %>%
+  dplyr::ungroup()
+

Inspect the data structure

+
head(plankton_data)
+#> # A tibble: 6 × 6
+#>    year month       y series       temp  time
+#>   <dbl> <dbl>   <dbl> <fct>       <dbl> <int>
+#> 1  1965     1 -0.542  Greens      -1.23     1
+#> 2  1965     1 -0.344  Bluegreens  -1.23     1
+#> 3  1965     1 -0.0768 Diatoms     -1.23     1
+#> 4  1965     1 -1.52   Unicells    -1.23     1
+#> 5  1965     1 -0.491  Other.algae -1.23     1
+#> 6  1965     2 NA      Greens      -1.32     2
+
dplyr::glimpse(plankton_data)
+#> Rows: 600
+#> Columns: 6
+#> $ year   <dbl> 1965, 1965, 1965, 1965, 1965, 1965, 1965, 1965, 1965, 1965, 196…
+#> $ month  <dbl> 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, …
+#> $ y      <dbl> -0.54241769, -0.34410776, -0.07684901, -1.52243490, -0.49055442…
+#> $ series <fct> Greens, Bluegreens, Diatoms, Unicells, Other.algae, Greens, Blu…
+#> $ temp   <dbl> -1.2306562, -1.2306562, -1.2306562, -1.2306562, -1.2306562, -1.…
+#> $ time   <int> 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, …
+

Note that we have z-scored the counts in this example as that will +make it easier to specify priors (though this is not completely +necessary; it is often better to build a model that respects the +properties of the actual outcome variables)

+
plot_mvgam_series(data = plankton_data, series = 'all')
+

+

It is always helpful to check the data for NAs before +attempting any models:

+
image(is.na(t(plankton_data)), axes = F,
+      col = c('grey80', 'darkred'))
+axis(3, at = seq(0,1, len = NCOL(plankton_data)), 
+     labels = colnames(plankton_data))
+

+

We have some missing observations, but this isn’t an issue for +modelling in mvgam. A useful property to understand about +these counts is that they tend to be highly seasonal. Below are some +plots of z-scored counts against the z-scored temperature measurements +in the lake for each month:

+
plankton_data %>%
+  dplyr::filter(series == 'Other.algae') %>%
+  ggplot(aes(x = time, y = temp)) +
+  geom_line(size = 1.1) +
+  geom_line(aes(y = y), col = 'white',
+            size = 1.3) +
+  geom_line(aes(y = y), col = 'darkred',
+            size = 1.1) +
+  ylab('z-score') +
+  xlab('Time') +
+  ggtitle('Temperature (black) vs Other algae (red)')
+#> Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
+#> ℹ Please use `linewidth` instead.
+#> This warning is displayed once every 8 hours.
+#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
+#> generated.
+

+
plankton_data %>%
+  dplyr::filter(series == 'Diatoms') %>%
+  ggplot(aes(x = time, y = temp)) +
+  geom_line(size = 1.1) +
+  geom_line(aes(y = y), col = 'white',
+            size = 1.3) +
+  geom_line(aes(y = y), col = 'darkred',
+            size = 1.1) +
+  ylab('z-score') +
+  xlab('Time') +
+  ggtitle('Temperature (black) vs Diatoms (red)')
+

+
plankton_data %>%
+  dplyr::filter(series == 'Greens') %>%
+  ggplot(aes(x = time, y = temp)) +
+  geom_line(size = 1.1) +
+  geom_line(aes(y = y), col = 'white',
+            size = 1.3) +
+  geom_line(aes(y = y), col = 'darkred',
+            size = 1.1) +
+  ylab('z-score') +
+  xlab('Time') +
+  ggtitle('Temperature (black) vs Greens (red)')
+

+

We will have to try and capture this seasonality in our process +model, which should be easy to do given the flexibility of GAMs. Next we +will split the data into training and testing splits:

+
plankton_train <- plankton_data %>%
+  dplyr::filter(time <= 112)
+plankton_test <- plankton_data %>%
+  dplyr::filter(time > 112)
+

Now time to fit some models. This requires a bit of thinking about +how we can best tackle the seasonal variation and the likely dependence +structure in the data. These algae are interacting as part of a complex +system within the same lake, so we certainly expect there to be some +lagged cross-dependencies underlying their dynamics. But if we do not +capture the seasonal variation, our multivariate dynamic model will be +forced to try and capture it, which could lead to poor convergence and +unstable results (we could feasibly capture cyclic dynamics with a more +complex multi-species Lotka-Volterra model, but ordinary differential +equation approaches are beyond the scope of mvgam).

+
+
+

Capturing seasonality

+

First we will fit a model that does not include a dynamic component, +just to see if it can reproduce the seasonal variation in the +observations. This model introduces hierarchical multidimensional +smooths, where all time series share a “global” tensor product of the +month and temp variables, capturing our +expectation that algal seasonality responds to temperature variation. +But this response should depend on when in the year these temperatures +are recorded (i.e. a response to warm temperatures in Spring should be +different to a response to warm temperatures in Autumn). The model also +fits series-specific deviation smooths (i.e. one tensor product per +series) to capture how each algal group’s seasonality differs from the +overall “global” seasonality. Note that we do not include +series-specific intercepts in this model because each series was +z-scored to have a mean of 0.

+
notrend_mod <- mvgam(y ~ 
+                       # tensor of temp and month to capture
+                       # "global" seasonality
+                       te(temp, month, k = c(4, 4)) +
+                       
+                       # series-specific deviation tensor products
+                       te(temp, month, k = c(4, 4), by = series),
+                     family = gaussian(),
+                     data = plankton_train,
+                     newdata = plankton_test,
+                     trend_model = 'None')
+

The “global” tensor product smooth function can be quickly +visualized:

+
plot_mvgam_smooth(notrend_mod, smooth = 1)
+

+

On this plot, red indicates below-average linear predictors and white +indicates above-average. We can then plot the deviation smooths for each +algal group to see how they vary from the “global” pattern:

+
plot_mvgam_smooth(notrend_mod, smooth = 2)
+

+
plot_mvgam_smooth(notrend_mod, smooth = 3)
+

+
plot_mvgam_smooth(notrend_mod, smooth = 4)
+

+
plot_mvgam_smooth(notrend_mod, smooth = 5)
+

+
plot_mvgam_smooth(notrend_mod, smooth = 6)
+

+

These multidimensional smooths have done a good job of capturing the +seasonal variation in our observations:

+
plot(notrend_mod, type = 'forecast', series = 1)
+

+
#> Out of sample CRPS:
+#> [1] 6.795543
+
plot(notrend_mod, type = 'forecast', series = 2)
+

+
#> Out of sample CRPS:
+#> [1] 6.841293
+
plot(notrend_mod, type = 'forecast', series = 3)
+

+
#> Out of sample CRPS:
+#> [1] 4.109977
+
plot(notrend_mod, type = 'forecast', series = 4)
+

+
#> Out of sample CRPS:
+#> [1] 3.533645
+
plot(notrend_mod, type = 'forecast', series = 5)
+

+
#> Out of sample CRPS:
+#> [1] 2.859834
+

This basic model gives us confidence that we can capture the seasonal +variation in the observations. But the model has not captured the +remaining temporal dynamics, which is obvious when we inspect Dunn-Smyth +residuals for each series:

+
plot(notrend_mod, type = 'residuals', series = 1)
+

+
plot(notrend_mod, type = 'residuals', series = 2)
+

+
plot(notrend_mod, type = 'residuals', series = 3)
+

+
plot(notrend_mod, type = 'residuals', series = 4)
+

+
plot(notrend_mod, type = 'residuals', series = 5)
+

+
+
+

Multiseries dynamics

+

Now it is time to get into multivariate State-Space models. We will +fit two models that can both incorporate lagged cross-dependencies in +the latent process models. The first model assumes that the process +errors operate independently from one another, while the second assumes +that there may be contemporaneous correlations in the process errors. +Both models include a Vector Autoregressive component for the process +means, and so both can model complex community dynamics. The models can +be described mathematically as follows:

+

\[\begin{align*} +\boldsymbol{count}_t & \sim \text{Normal}(\mu_{obs[t]}, +\sigma_{obs}) \\ +\mu_{obs[t]} & = process_t \\ +process_t & \sim \text{MVNormal}(\mu_{process[t]}, \Sigma_{process}) +\\ +\mu_{process[t]} & = VAR * process_{t-1} + +f_{global}(\boldsymbol{month},\boldsymbol{temp})_t + +f_{series}(\boldsymbol{month},\boldsymbol{temp})_t \\ +f_{global}(\boldsymbol{month},\boldsymbol{temp}) & = +\sum_{k=1}^{K}b_{global,k} * \beta_{global,k} \\ +f_{series}(\boldsymbol{month},\boldsymbol{temp}) & = +\sum_{k=1}^{K}b_{series,k} * \beta_{series,k} \end{align*}\]

+

Here you can see that there are no terms in the observation model +apart from the underlying process model. But we could easily add +covariates into the observation model if we felt that they could explain +some of the systematic observation errors. We also assume independent +observation processes (there is no covariance structure in the +observation errors \(\sigma_{obs}\)). +At present, mvgam does not support multivariate observation +models, but this feature will be added in future versions. The +underlying process model, however, is multivariate, and there is a lot going on +here. This component has a Vector Autoregressive part, where the process +mean at time \(t\) \((\mu_{process[t]})\) is a vector that +evolves as a function of where the vector-valued process model was at +time \(t-1\). The \(VAR\) matrix captures these dynamics with +self-dependencies on the diagonal and possibly asymmetric +cross-dependencies on the off-diagonals, while also incorporating the +nonlinear smooth functions that capture seasonality for each series. The +contemporaneous process errors are modeled by \(\Sigma_{process}\), which can be +constrained so that process errors are independent (i.e. setting the +off-diagonals to 0) or can be fully parameterized using a Cholesky +decomposition (using Stan’s \(LKJcorr\) distribution to place a prior on +the strength of inter-species correlations). For those who are +interested in the inner workings, mvgam makes use of a +recent breakthrough by Sarah +Heaps to enforce stationarity of Bayesian VAR processes. This is +advantageous as we often don’t expect forecast variance to increase +without bound forever into the future, but many estimated VARs tend to +behave this way.

+


Ok that was a lot to take in. Let’s fit some models to try and +inspect what is going on and what they assume. But first, we need to +update mvgam’s default priors for the observation and +process errors. By default, mvgam uses a fairly wide +Student-T prior on these parameters to avoid being overly informative. +But our observations are z-scored and so we do not expect very large +process or observation errors. However, we also do not expect very small +observation errors either as we know these measurements are not perfect. +So let’s update the priors for these parameters. In doing so, you will +get to see how the formula for the latent process (i.e. trend) model is +used in mvgam:

+
priors <- get_mvgam_priors(
+  # observation formula, which has no terms in it
+  y ~ -1,
+  
+  # process model formula, which includes the smooth functions
+  trend_formula = ~ te(temp, month, k = c(4, 4)) +
+    te(temp, month, k = c(4, 4), by = trend),
+  
+  # VAR1 model with uncorrelated process errors
+  trend_model = 'VAR1',
+  family = gaussian(),
+  data = plankton_train)
+

Get names of all parameters whose priors can be modified:

+
priors[, 3]
+#>  [1] "(Intercept)"                                                                                                                                                                                                                                                           
+#>  [2] "process error sd"                                                                                                                                                                                                                                                      
+#>  [3] "diagonal autocorrelation population mean"                                                                                                                                                                                                                              
+#>  [4] "off-diagonal autocorrelation population mean"                                                                                                                                                                                                                          
+#>  [5] "diagonal autocorrelation population variance"                                                                                                                                                                                                                          
+#>  [6] "off-diagonal autocorrelation population variance"                                                                                                                                                                                                                      
+#>  [7] "shape1 for diagonal autocorrelation precision"                                                                                                                                                                                                                         
+#>  [8] "shape1 for off-diagonal autocorrelation precision"                                                                                                                                                                                                                     
+#>  [9] "shape2 for diagonal autocorrelation precision"                                                                                                                                                                                                                         
+#> [10] "shape2 for off-diagonal autocorrelation precision"                                                                                                                                                                                                                     
+#> [11] "observation error sd"                                                                                                                                                                                                                                                  
+#> [12] "te(temp,month) smooth parameters, te(temp,month):trendtrend1 smooth parameters, te(temp,month):trendtrend2 smooth parameters, te(temp,month):trendtrend3 smooth parameters, te(temp,month):trendtrend4 smooth parameters, te(temp,month):trendtrend5 smooth parameters"
+

And their default prior distributions:

+
priors[, 4]
+#>  [1] "(Intercept) ~ student_t(3, -0.1, 2.5);"
+#>  [2] "sigma ~ student_t(3, 0, 2.5);"         
+#>  [3] "es[1] = 0;"                            
+#>  [4] "es[2] = 0;"                            
+#>  [5] "fs[1] = sqrt(0.455);"                  
+#>  [6] "fs[2] = sqrt(0.455);"                  
+#>  [7] "gs[1] = 1.365;"                        
+#>  [8] "gs[2] = 1.365;"                        
+#>  [9] "hs[1] = 0.071175;"                     
+#> [10] "hs[2] = 0.071175;"                     
+#> [11] "sigma_obs ~ student_t(3, 0, 2.5);"     
+#> [12] "lambda_trend ~ normal(5, 30);"
+

Setting priors is easy in mvgam as you can use +brms routines. Here we use more informative Normal priors +for both error components, but we impose a lower bound of 0.2 for the +observation errors:

+
priors <- c(prior(normal(0.5, 0.1), class = sigma_obs, lb = 0.2),
+            prior(normal(0.5, 0.25), class = sigma))
+

You may have noticed something else unique about this model: there is +no intercept term in the observation formula. This is because a shared +intercept parameter can sometimes be unidentifiable with respect to the +latent VAR process, particularly if our series have similar long-run +averages (which they do in this case because they were z-scored). We +will often get better convergence in these State-Space models if we drop +this parameter. mvgam accomplishes this by fixing the +coefficient for the intercept to zero. Now we can fit the first model, +which assumes that process errors are contemporaneously uncorrelated:

+
var_mod <- mvgam(  
+  # observation formula, which is empty
+  y ~ -1,
+  
+  # process model formula, which includes the smooth functions
+  trend_formula = ~ te(temp, month, k = c(4, 4)) +
+    te(temp, month, k = c(4, 4), by = trend),
+  
+  # VAR1 model with uncorrelated process errors
+  trend_model = 'VAR1',
+  family = gaussian(),
+  data = plankton_train,
+  newdata = plankton_test,
+  
+  # include the updated priors
+  priors = priors)
+
+
+

Inspecting SS models

+

This model’s summary is a bit different to other mvgam +summaries. It separates parameters based on whether they belong to the +observation model or to the latent process model. This is because we may +often have covariates that impact the observations but not the latent +process, so we can have fairly complex models for each component. You +will notice that some parameters have not fully converged, particularly +for the VAR coefficients (called A in the output) and for +the process errors (Sigma). Note that we set +include_betas = FALSE to stop the summary from printing +output for all of the spline coefficients, which can be dense and hard +to interpret:

+
summary(var_mod, include_betas = FALSE)
+#> GAM observation formula:
+#> y ~ 1
+#> 
+#> GAM process formula:
+#> ~te(temp, month, k = c(4, 4)) + te(temp, month, k = c(4, 4), 
+#>     by = trend)
+#> 
+#> Family:
+#> gaussian
+#> 
+#> Link function:
+#> identity
+#> 
+#> Trend model:
+#> VAR1
+#> 
+#> N process models:
+#> 5 
+#> 
+#> N series:
+#> 5 
+#> 
+#> N timepoints:
+#> 112 
+#> 
+#> Status:
+#> Fitted using Stan 
+#> 4 chains, each with iter = 1500; warmup = 1000; thin = 1 
+#> Total post-warmup draws = 2000
+#> 
+#> 
+#> Observation error parameter estimates:
+#>              2.5%  50% 97.5% Rhat n_eff
+#> sigma_obs[1] 0.21 0.26  0.35 1.01   376
+#> sigma_obs[2] 0.24 0.39  0.53 1.03   152
+#> sigma_obs[3] 0.43 0.64  0.83 1.10    43
+#> sigma_obs[4] 0.24 0.38  0.50 1.01   219
+#> sigma_obs[5] 0.29 0.42  0.54 1.03   173
+#> 
+#> GAM observation model coefficient (beta) estimates:
+#>             2.5% 50% 97.5% Rhat n_eff
+#> (Intercept)    0   0     0  NaN   NaN
+#> 
+#> Process model VAR parameter estimates:
+#>          2.5%    50% 97.5% Rhat n_eff
+#> A[1,1] -0.055  0.480 0.900 1.07    64
+#> A[1,2] -0.370 -0.039 0.210 1.01   295
+#> A[1,3] -0.500 -0.053 0.330 1.01   188
+#> A[1,4] -0.270  0.023 0.410 1.01   488
+#> A[1,5] -0.092  0.140 0.580 1.04   135
+#> A[2,1] -0.170  0.012 0.270 1.00   514
+#> A[2,2]  0.610  0.800 0.920 1.01   231
+#> A[2,3] -0.390 -0.120 0.048 1.01   223
+#> A[2,4] -0.044  0.100 0.340 1.01   241
+#> A[2,5] -0.063  0.061 0.210 1.01   371
+#> A[3,1] -0.300  0.014 0.820 1.06    54
+#> A[3,2] -0.540 -0.210 0.016 1.02   137
+#> A[3,3]  0.072  0.410 0.700 1.02   221
+#> A[3,4] -0.015  0.230 0.650 1.01   151
+#> A[3,5] -0.073  0.130 0.420 1.02   139
+#> A[4,1] -0.160  0.065 0.560 1.04    83
+#> A[4,2] -0.130  0.058 0.260 1.03   165
+#> A[4,3] -0.440 -0.120 0.120 1.03   143
+#> A[4,4]  0.480  0.730 0.960 1.03   144
+#> A[4,5] -0.230 -0.035 0.130 1.02   426
+#> A[5,1] -0.230  0.082 0.900 1.06    56
+#> A[5,2] -0.420 -0.120 0.079 1.02   128
+#> A[5,3] -0.650 -0.200 0.120 1.03    90
+#> A[5,4] -0.061  0.180 0.580 1.01   153
+#> A[5,5]  0.510  0.740 0.980 1.02   156
+#> 
+#> Process error parameter estimates:
+#>             2.5%  50% 97.5% Rhat n_eff
+#> Sigma[1,1] 0.019 0.28  0.65 1.11    32
+#> Sigma[1,2] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[1,3] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[1,4] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[1,5] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[2,1] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[2,2] 0.063 0.11  0.18 1.01   371
+#> Sigma[2,3] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[2,4] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[2,5] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[3,1] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[3,2] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[3,3] 0.056 0.16  0.31 1.03   106
+#> Sigma[3,4] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[3,5] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[4,1] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[4,2] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[4,3] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[4,4] 0.048 0.14  0.27 1.03   111
+#> Sigma[4,5] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[5,1] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[5,2] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[5,3] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[5,4] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[5,5] 0.098 0.20  0.36 1.02   131
+#> 
+#> Approximate significance of GAM process smooths:
+#>                              edf Ref.df    F p-value
+#> te(temp,month)              3.93     15 2.73    0.25
+#> te(temp,month):seriestrend1 1.22     15 0.16    1.00
+#> te(temp,month):seriestrend2 1.25     15 0.48    1.00
+#> te(temp,month):seriestrend3 3.98     15 3.53    0.24
+#> te(temp,month):seriestrend4 1.60     15 1.02    0.95
+#> te(temp,month):seriestrend5 1.97     15 0.32    1.00
+#> 
+#> Stan MCMC diagnostics:
+#> n_eff / iter looks reasonable for all parameters
+#> Rhats above 1.05 found for 24 parameters
+#>  *Diagnose further to investigate why the chains have not mixed
+#> 0 of 2000 iterations ended with a divergence (0%)
+#> 0 of 2000 iterations saturated the maximum tree depth of 12 (0%)
+#> Chain 1: E-FMI = 0.179
+#> Chain 3: E-FMI = 0.1616
+#> Chain 4: E-FMI = 0.1994
+#>  *E-FMI below 0.2 indicates you may need to reparameterize your model
+#> 
+#> Samples were drawn using NUTS(diag_e) at Tue Apr 16 1:19:23 PM 2024.
+#> For each parameter, n_eff is a crude measure of effective sample size,
+#> and Rhat is the potential scale reduction factor on split MCMC chains
+#> (at convergence, Rhat = 1)
+

The convergence of this model isn’t fabulous (more on this in a +moment). But we can again plot the smooth functions, which this time +operate on the process model. We can see the same plot using +trend_effects = TRUE in the plotting functions:

+
plot(var_mod, 'smooths', trend_effects = TRUE)
+

+

The VAR matrix is of particular interest here, as it captures lagged +dependencies and cross-dependencies in the latent process model:

+
mcmc_plot(var_mod, variable = 'A', regex = TRUE, type = 'hist')
+

+

Unfortunately bayesplot doesn’t know this is a matrix of +parameters so what we see is actually the transpose of the VAR matrix. A +little bit of wrangling gives us these histograms in the correct +order:

+
A_pars <- matrix(NA, nrow = 5, ncol = 5)
+for(i in 1:5){
+  for(j in 1:5){
+    A_pars[i, j] <- paste0('A[', i, ',', j, ']')
+  }
+}
+mcmc_plot(var_mod, 
+          variable = as.vector(t(A_pars)), 
+          type = 'hist')
+

+

There is a lot happening in this matrix. Each cell captures the +lagged effect of the process in the column on the process in the row in +the next timestep. So for example, the effect in cell [1,3], which is +quite strongly negative, means that an increase in the process +for series 3 (Greens) at time \(t\) is +expected to lead to a subsequent decrease in the process for +series 1 (Bluegreens) at time \(t+1\). +The latent process model is now capturing these effects and the smooth +seasonal effects, so the trend plot shows our best estimate of what the +true count should have been at each time point:

+
plot(var_mod, type = 'trend', series = 1)
+

+
plot(var_mod, type = 'trend', series = 3)
+

+

The process error \((\Sigma)\) +captures unmodelled variation in the process models. Again, we fixed the +off-diagonals to 0, so the histograms for these will look like flat +boxes:

+
Sigma_pars <- matrix(NA, nrow = 5, ncol = 5)
+for(i in 1:5){
+  for(j in 1:5){
+    Sigma_pars[i, j] <- paste0('Sigma[', i, ',', j, ']')
+  }
+}
+mcmc_plot(var_mod, 
+          variable = as.vector(t(Sigma_pars)), 
+          type = 'hist')
+

+

The observation error estimates \((\sigma_{obs})\) represent how much the +model thinks we might miss the true count when we take our imperfect +measurements:

+
mcmc_plot(var_mod, variable = 'sigma_obs', regex = TRUE, type = 'hist')
+

+

These are still a bit hard to identify overall, especially when +trying to estimate both process and observation error. Often we need to +make some strong assumptions about which of these is more important for +determining unexplained variation in our observations.

+
+
+

Correlated process errors

+

Let’s see if these estimates improve when we allow the process errors +to be correlated. Once again, we need to first update the priors for the +observation errors:

+
priors <- c(prior(normal(0.5, 0.1), class = sigma_obs, lb = 0.2),
+            prior(normal(0.5, 0.25), class = sigma))
+

And now we can fit the correlated process error model:

+
varcor_mod <- mvgam(  
+  # observation formula, which remains empty
+  y ~ -1,
+  
+  # process model formula, which includes the smooth functions
+  trend_formula = ~ te(temp, month, k = c(4, 4)) +
+    te(temp, month, k = c(4, 4), by = trend),
+  
+  # VAR1 model with correlated process errors
+  trend_model = 'VAR1cor',
+  family = gaussian(),
+  data = plankton_train,
+  newdata = plankton_test,
+  
+  # include the updated priors
+  priors = priors)
+

Plotting convergence diagnostics for the two models shows that both display similar levels of convergence:

+
mcmc_plot(varcor_mod, type = 'rhat') +
+  labs(title = 'VAR1cor')
+

+
mcmc_plot(var_mod, type = 'rhat') +
+  labs(title = 'VAR1')
+

+

The \((\Sigma)\) matrix now captures any evidence of contemporaneously correlated process errors:

+
Sigma_pars <- matrix(NA, nrow = 5, ncol = 5)
+for(i in 1:5){
+  for(j in 1:5){
+    Sigma_pars[i, j] <- paste0('Sigma[', i, ',', j, ']')
+  }
+}
+mcmc_plot(varcor_mod, 
+          variable = as.vector(t(Sigma_pars)), 
+          type = 'hist')
+

+

This symmetric matrix tells us there is support for correlated process errors. For example, series 2 and 3 (Diatoms and Greens) show negatively correlated process errors, while series 2 and 4 (Diatoms and Other.algae) show positively correlated errors. But it is easier to interpret these estimates if we convert the covariance matrix to a correlation matrix. Here we compute the posterior median process error correlations:

+
Sigma_post <- as.matrix(varcor_mod, variable = 'Sigma', regex = TRUE)
+median_correlations <- cov2cor(matrix(apply(Sigma_post, 2, median),
+                                      nrow = 5, ncol = 5))
+rownames(median_correlations) <- colnames(median_correlations) <- levels(plankton_train$series)
+
+round(median_correlations, 2)
+#>             Bluegreens Diatoms Greens Other.algae Unicells
+#> Bluegreens        1.00   -0.03   0.15       -0.05     0.32
+#> Diatoms          -0.03    1.00  -0.20        0.48     0.17
+#> Greens            0.15   -0.20   1.00        0.18     0.46
+#> Other.algae      -0.05    0.48   0.18        1.00     0.28
+#> Unicells          0.32    0.17   0.46        0.28     1.00
+
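We can also attach a posterior probability to any particular pairwise dependence using the draws extracted above. A small sketch, assuming Stan’s usual column-major naming so that the Diatoms-Other.algae covariance sits in the column named 'Sigma[2,4]':

# Posterior support for a positive Diatoms-Other.algae error covariance
mean(Sigma_post[, 'Sigma[2,4]'] > 0)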

Because this model is able to capture correlated errors, the VAR matrix has changed slightly:

+
A_pars <- matrix(NA, nrow = 5, ncol = 5)
+for(i in 1:5){
+  for(j in 1:5){
+    A_pars[i, j] <- paste0('A[', i, ',', j, ']')
+  }
+}
+mcmc_plot(varcor_mod, 
+          variable = as.vector(t(A_pars)), 
+          type = 'hist')
+

+

We still have some evidence of lagged cross-dependence, but some of these interactions have now been pulled more toward zero. So which model is better? The forecasts don’t appear to differ very much, at least qualitatively (below are forecasts for three of the series from each model, along with their out-of-sample CRPS values):

+
plot(var_mod, type = 'forecast', series = 1, newdata = plankton_test)
+

+
#> Out of sample CRPS:
+#> [1] 2.954187
+
plot(varcor_mod, type = 'forecast', series = 1, newdata = plankton_test)
+

+
#> Out of sample CRPS:
+#> [1] 3.114382
+
plot(var_mod, type = 'forecast', series = 2, newdata = plankton_test)
+

+
#> Out of sample CRPS:
+#> [1] 5.827133
+
plot(varcor_mod, type = 'forecast', series = 2, newdata = plankton_test)
+

+
#> Out of sample CRPS:
+#> [1] 5.503073
+
plot(var_mod, type = 'forecast', series = 3, newdata = plankton_test)
+

+
#> Out of sample CRPS:
+#> [1] 4.04051
+
plot(varcor_mod, type = 'forecast', series = 3, newdata = plankton_test)
+

+
#> Out of sample CRPS:
+#> [1] 4.007712
+

We can compute the variogram score for the out-of-sample forecasts to get a sense of which model does a better job of capturing the dependence structure in the true evaluation set:

+
# create forecast objects for each model
+fcvar <- forecast(var_mod)
+fcvarcor <- forecast(varcor_mod)
+
+# plot the difference in variogram scores; a negative value means the VAR1cor model is better, while a positive value means the VAR1 model is better
+diff_scores <- score(fcvarcor, score = 'variogram')$all_series$score -
+  score(fcvar, score = 'variogram')$all_series$score
+plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred', 
+     ylim = c(-1*max(abs(diff_scores), na.rm = TRUE),
+              max(abs(diff_scores), na.rm = TRUE)),
+     bty = 'l',
+     xlab = 'Forecast horizon',
+     ylab = expression(variogram[VAR1cor]~-~variogram[VAR1]))
+abline(h = 0, lty = 'dashed')
+

+

And we can also compute the energy score for the out-of-sample forecasts to get a sense of which model provides better-calibrated forecasts:

+
# plot the difference in energy scores; a negative value means the VAR1cor model is better, while a positive value means the VAR1 model is better
+diff_scores <- score(fcvarcor, score = 'energy')$all_series$score -
+  score(fcvar, score = 'energy')$all_series$score
+plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred', 
+     ylim = c(-1*max(abs(diff_scores), na.rm = TRUE),
+              max(abs(diff_scores), na.rm = TRUE)),
+     bty = 'l',
+     xlab = 'Forecast horizon',
+     ylab = expression(energy[VAR1cor]~-~energy[VAR1]))
+abline(h = 0, lty = 'dashed')
+

+
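If we want a single headline CRPS per model rather than the per-series values printed above, we can aggregate the scores ourselves. This is only a sketch: it assumes score() with score = 'crps' returns a list holding one data.frame of per-horizon scores for each series, alongside the all_series summary used earlier:

# Sum out-of-sample CRPS over series and horizons for each model
crps_total = function(fc_object){
  sc <- score(fc_object, score = 'crps')
  # Assumes the remaining list slots are named after the series
  series_slots <- setdiff(names(sc), 'all_series')
  sum(sapply(sc[series_slots], function(x) sum(x$score, na.rm = TRUE)))
}
crps_total(fcvar)
crps_total(fcvarcor)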

The models tend to provide similar forecasts, though the correlated error model does slightly better overall. We would probably need a more extensive rolling forecast evaluation exercise if we had to choose just one model for production. mvgam offers some utilities for doing this (see ?lfo_cv for guidance).
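A minimal sketch of what that exercise might look like is below; the min_t argument and the elpds element we extract are assumptions based on ?lfo_cv, so check the documentation for the actual interface:

# Approximate leave-future-out CV, refitting only when necessary
lfo_var <- lfo_cv(var_mod, min_t = 30)
lfo_varcor <- lfo_cv(varcor_mod, min_t = 30)

# A larger mean ELPD suggests better incremental forecasts
mean(lfo_var$elpds)
mean(lfo_varcor$elpds)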

+
+
+

Further reading

+

The following papers and resources offer a lot of useful material about multivariate State-Space models and how they can be applied in practice:

+

Heaps, Sarah E. “Enforcing stationarity through the prior in vector autoregressions.” Journal of Computational and Graphical Statistics 32.1 (2023): 74-83.

+

Hannaford, Naomi E., et al. “A sparse Bayesian hierarchical vector autoregressive model for microbial dynamics in a wastewater treatment plant.” Computational Statistics & Data Analysis 179 (2023): 107659.

+

Holmes, Elizabeth E., Eric J. Ward, and Kellie Wills. “MARSS: multivariate autoregressive state-space models for analyzing time-series data.” R Journal 4.1 (2012): 11.

+

Ward, Eric J., et al. “Inferring spatial structure from time-series data: using multivariate state-space models to detect metapopulation structure of California sea lions in the Gulf of California, Mexico.” Journal of Applied Ecology 47.1 (2010): 47-56.

+

Auger-Méthé, Marie, et al. “A guide to state-space modeling of ecological time series.” Ecological Monographs 91.4 (2021): e01470.

+
+
+
+

Interested in contributing?

+

I’m actively seeking PhD students and other researchers to work in the areas of ecological forecasting, multivariate model evaluation and development of mvgam. Please reach out if you are interested (n.clark’at’uq.edu.au).

+
+ + + + + + + + + + + diff --git a/inst/doc/SS_model.svg b/inst/doc/SS_model.svg new file mode 100644 index 00000000..f6441719 --- /dev/null +++ b/inst/doc/SS_model.svg @@ -0,0 +1,255 @@ + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + X + 1 + + X + 2 + + X + T + + + X + 3 + + Y + 1 + + + Y + 3 + + + Y + T + + Process model + Observation model + diff --git a/inst/doc/data_in_mvgam.R b/inst/doc/data_in_mvgam.R new file mode 100644 index 00000000..29efbeba --- /dev/null +++ b/inst/doc/data_in_mvgam.R @@ -0,0 +1,248 @@ +## ----echo = FALSE------------------------------------------------------------- +NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true") +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + purl = NOT_CRAN, + eval = NOT_CRAN +) + +## ----setup, include=FALSE----------------------------------------------------- +knitr::opts_chunk$set( + echo = TRUE, + dpi = 150, + fig.asp = 0.8, + fig.width = 6, + out.width = "60%", + fig.align = "center") +library(mvgam) +library(ggplot2) +theme_set(theme_bw(base_size = 12, base_family = 'serif')) + +## ----------------------------------------------------------------------------- +simdat <- sim_mvgam(n_series = 4, T = 24, prop_missing = 0.2) +head(simdat$data_train, 16) + +## ----------------------------------------------------------------------------- +class(simdat$data_train$series) +levels(simdat$data_train$series) + +## ----------------------------------------------------------------------------- +all(levels(simdat$data_train$series) %in% unique(simdat$data_train$series)) + +## ----------------------------------------------------------------------------- +summary(glm(y ~ series + time, + data = simdat$data_train, + family = poisson())) + +## ----------------------------------------------------------------------------- +summary(gam(y ~ series + s(time, by = series), + data = simdat$data_train, + family = poisson())) + +## ----------------------------------------------------------------------------- +gauss_dat <- data.frame(outcome = rnorm(10), + series = factor('series1', + levels = 'series1'), + time = 1:10) +gauss_dat + +## ----------------------------------------------------------------------------- +gam(outcome ~ time, + family = betar(), + data = gauss_dat) + +## ----error=TRUE--------------------------------------------------------------- +mvgam(outcome ~ time, + family = betar(), + data = gauss_dat) + +## ----------------------------------------------------------------------------- +# A function to ensure all timepoints within a sequence are identical +all_times_avail = function(time, min_time, max_time){ + identical(as.numeric(sort(time)), + as.numeric(seq.int(from = min_time, to = max_time))) +} + +# Get min and max times from the data +min_time <- min(simdat$data_train$time) +max_time <- max(simdat$data_train$time) + +# Check that all times are recorded for each series +data.frame(series = simdat$data_train$series, + time = simdat$data_train$time) %>% + dplyr::group_by(series) %>% + dplyr::summarise(all_there = all_times_avail(time, + min_time, + max_time)) -> checked_times +if(any(checked_times$all_there == FALSE)){ + warning("One or more series in is missing observations for one or more timepoints") +} else { + cat('All series have observations at all timepoints :)') +} + +## ----------------------------------------------------------------------------- +bad_times <- data.frame(time = seq(1, 16, by = 2), + series = factor('series_1'), + outcome = rnorm(8)) +bad_times + +## ----error = 
TRUE------------------------------------------------------------- +get_mvgam_priors(outcome ~ 1, + data = bad_times, + family = gaussian()) + +## ----------------------------------------------------------------------------- +bad_times %>% + dplyr::right_join(expand.grid(time = seq(min(bad_times$time), + max(bad_times$time)), + series = factor(unique(bad_times$series), + levels = levels(bad_times$series)))) %>% + dplyr::arrange(time) -> good_times +good_times + +## ----error = TRUE------------------------------------------------------------- +get_mvgam_priors(outcome ~ 1, + data = good_times, + family = gaussian()) + +## ----------------------------------------------------------------------------- +bad_levels <- data.frame(time = 1:8, + series = factor('series_1', + levels = c('series_1', + 'series_2')), + outcome = rnorm(8)) + +levels(bad_levels$series) + +## ----error = TRUE------------------------------------------------------------- +get_mvgam_priors(outcome ~ 1, + data = bad_levels, + family = gaussian()) + +## ----------------------------------------------------------------------------- +setdiff(levels(bad_levels$series), unique(bad_levels$series)) + +## ----------------------------------------------------------------------------- +bad_levels %>% + dplyr::mutate(series = droplevels(series)) -> good_levels +levels(good_levels$series) + +## ----error = TRUE------------------------------------------------------------- +get_mvgam_priors(outcome ~ 1, + data = good_levels, + family = gaussian()) + +## ----------------------------------------------------------------------------- +miss_dat <- data.frame(outcome = rnorm(10), + cov = c(NA, rnorm(9)), + series = factor('series1', + levels = 'series1'), + time = 1:10) +miss_dat + +## ----error = TRUE------------------------------------------------------------- +get_mvgam_priors(outcome ~ cov, + data = miss_dat, + family = gaussian()) + +## ----------------------------------------------------------------------------- +miss_dat <- list(outcome = rnorm(10), + series = factor('series1', + levels = 'series1'), + time = 1:10) +miss_dat$cov <- matrix(rnorm(50), ncol = 5, nrow = 10) +miss_dat$cov[2,3] <- NA + +## ----error=TRUE--------------------------------------------------------------- +get_mvgam_priors(outcome ~ cov, + data = miss_dat, + family = gaussian()) + +## ----fig.alt = "Plotting time series features for GAM models in mvgam"-------- +plot_mvgam_series(data = simdat$data_train, + y = 'y', + series = 'all') + +## ----fig.alt = "Plotting time series features for GAM models in mvgam"-------- +plot_mvgam_series(data = simdat$data_train, + y = 'y', + series = 1) + +## ----fig.alt = "Plotting time series features for GAM models in mvgam"-------- +plot_mvgam_series(data = simdat$data_train, + newdata = simdat$data_test, + y = 'y', + series = 1) + +## ----------------------------------------------------------------------------- +data("all_neon_tick_data") +str(dplyr::ungroup(all_neon_tick_data)) + +## ----------------------------------------------------------------------------- +plotIDs <- c('SCBI_013','SCBI_002', + 'SERC_001','SERC_005', + 'SERC_006','SERC_012', + 'BLAN_012','BLAN_005') + +## ----------------------------------------------------------------------------- +model_dat <- all_neon_tick_data %>% + dplyr::ungroup() %>% + dplyr::mutate(target = ixodes_scapularis) %>% + dplyr::filter(plotID %in% plotIDs) %>% + dplyr::select(Year, epiWeek, plotID, target) %>% + dplyr::mutate(epiWeek = as.numeric(epiWeek)) + +## 
----------------------------------------------------------------------------- +model_dat %>% + # Create all possible combos of plotID, Year and epiWeek; + # missing outcomes will be filled in as NA + dplyr::full_join(expand.grid(plotID = unique(model_dat$plotID), + Year = unique(model_dat$Year), + epiWeek = seq(1, 52))) %>% + + # left_join back to original data so plotID and siteID will + # match up, in case you need the siteID for anything else later on + dplyr::left_join(all_neon_tick_data %>% + dplyr::select(siteID, plotID) %>% + dplyr::distinct()) -> model_dat + +## ----------------------------------------------------------------------------- +model_dat %>% + dplyr::mutate(series = plotID, + y = target) %>% + dplyr::mutate(siteID = factor(siteID), + series = factor(series)) %>% + dplyr::select(-target, -plotID) %>% + dplyr::arrange(Year, epiWeek, series) -> model_dat + +## ----------------------------------------------------------------------------- +model_dat %>% + dplyr::ungroup() %>% + dplyr::group_by(series) %>% + dplyr::arrange(Year, epiWeek) %>% + dplyr::mutate(time = seq(1, dplyr::n())) %>% + dplyr::ungroup() -> model_dat + +## ----------------------------------------------------------------------------- +levels(model_dat$series) + +## ----error=TRUE--------------------------------------------------------------- +get_mvgam_priors(y ~ 1, + data = model_dat, + family = poisson()) + +## ----------------------------------------------------------------------------- +testmod <- mvgam(y ~ s(epiWeek, by = series, bs = 'cc') + + s(series, bs = 're'), + trend_model = 'AR1', + data = model_dat, + backend = 'cmdstanr', + run_model = FALSE) + +## ----------------------------------------------------------------------------- +str(testmod$model_data) + +## ----------------------------------------------------------------------------- +code(testmod) + diff --git a/inst/doc/data_in_mvgam.Rmd b/inst/doc/data_in_mvgam.Rmd new file mode 100644 index 00000000..b8240edd --- /dev/null +++ b/inst/doc/data_in_mvgam.Rmd @@ -0,0 +1,354 @@ +--- +title: "Formatting data for use in mvgam" +author: "Nicholas J Clark" +date: "`r Sys.Date()`" +output: + rmarkdown::html_vignette: + toc: yes +vignette: > + %\VignetteIndexEntry{Formatting data for use in mvgam} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- + +```{r, echo = FALSE} +NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true") +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + purl = NOT_CRAN, + eval = NOT_CRAN +) +``` + +```{r setup, include=FALSE} +knitr::opts_chunk$set( + echo = TRUE, + dpi = 150, + fig.asp = 0.8, + fig.width = 6, + out.width = "60%", + fig.align = "center") +library(mvgam) +library(ggplot2) +theme_set(theme_bw(base_size = 12, base_family = 'serif')) +``` + +This vignette gives an example of how to take raw data and format it for use in `mvgam`. This is not an exhaustive example, as data can be recorded and stored in a variety of ways, which requires different approaches to wrangle the data into the necessary format for `mvgam`. For full details on the basic `mvgam` functionality, please see [the introductory vignette](https://nicholasjclark.github.io/mvgam/articles/mvgam_overview.html). + +## Required *long* data format +Manipulating the data into a 'long' format is necessary for modelling in `mvgam`. By 'long' format, we mean that each `series x time` observation needs to have its own entry in the `dataframe` or `list` object that we wish to use as data for modelling. 
A simple example can be viewed by simulating data using the `sim_mvgam` function. See `?sim_mvgam` for more details +```{r} +simdat <- sim_mvgam(n_series = 4, T = 24, prop_missing = 0.2) +head(simdat$data_train, 16) +``` + +### `series` as a `factor` variable +Notice how we have four different time series in these simulated data, and we have identified the series-level indicator as a `factor` variable. +```{r} +class(simdat$data_train$series) +levels(simdat$data_train$series) +``` + +It is important that the number of levels matches the number of unique series in the data to ensure indexing across series works properly in the underlying modelling functions. Several of the main workhorse functions in the package (including `mvgam()` and `get_mvgam_priors()`) will give an error if this is not the case, but it may be worth checking anyway: +```{r} +all(levels(simdat$data_train$series) %in% unique(simdat$data_train$series)) +``` + +Note that you can technically supply data that does not have a `series` indicator, and the package will assume that you are only using a single time series. But again, it is better to have this included so there is no confusion. + +### A single outcome variable +You may also have notices that we do not spread the `numeric / integer`-classed outcome variable into different columns. Rather, there is only a single column for the outcome variable, labelled `y` in these simulated data (though the outcome does not have to be labelled `y`). This is another important requirement in `mvgam`, but it shouldn't be too unfamiliar to `R` users who frequently use modelling packages such as `lme4`, `mgcv`, `brms` or the many other regression modelling packages out there. The advantage of this format is that it is now very easy to specify effects that vary among time series: +```{r} +summary(glm(y ~ series + time, + data = simdat$data_train, + family = poisson())) +``` + +```{r} +summary(gam(y ~ series + s(time, by = series), + data = simdat$data_train, + family = poisson())) +``` + +Depending on the observation families you plan to use when building models, there may be some restrictions that need to be satisfied within the outcome variable. For example, a Beta regression can only handle proportional data, so values `>= 1` or `<= 0` are not allowed. Likewise, a Poisson regression can only handle non-negative integers. Most regression functions in `R` will assume the user knows all of this and so will not issue any warnings or errors if you choose the wrong distribution, but often this ends up leading to some unhelpful error from an optimizer that is difficult to interpret and diagnose. `mvgam` will attempt to provide some errors if you do something that is simply not allowed. 
For example, we can simulate data from a zero-centred Gaussian distribution (ensuring that some of our values will be `< 1`) and attempt a Beta regression in `mvgam` using the `betar` family: +```{r} +gauss_dat <- data.frame(outcome = rnorm(10), + series = factor('series1', + levels = 'series1'), + time = 1:10) +gauss_dat +``` + +A call to `gam` using the `mgcv` package leads to a model that actually fits (though it does give an unhelpful warning message): +```{r} +gam(outcome ~ time, + family = betar(), + data = gauss_dat) +``` + +But the same call to `mvgam` gives us something more useful: +```{r error=TRUE} +mvgam(outcome ~ time, + family = betar(), + data = gauss_dat) +``` + +Please see `?mvgam_families` for more information on the types of responses that the package can handle and their restrictions + +### A `time` variable +The other requirement for most models that can be fit in `mvgam` is a `numeric / integer`-classed variable labelled `time`. This ensures the modelling software knows how to arrange the time series when building models. This setup still allows us to formulate multivariate time series models. If you plan to use any of the autoregressive dynamic trend functions available in `mvgam` (see `?mvgam_trends` for details of available dynamic processes), you will need to ensure your time series are entered with a fixed sampling interval (i.e. the time between timesteps 1 and 2 should be the same as the time between timesteps 2 and 3, etc...). But note that you can have missing observations for some (or all) series. `mvgam` will check this for you, but again it is useful to ensure you have no missing timepoint x series combinations in your data. You can generally do this with a simple `dplyr` call: +```{r} +# A function to ensure all timepoints within a sequence are identical +all_times_avail = function(time, min_time, max_time){ + identical(as.numeric(sort(time)), + as.numeric(seq.int(from = min_time, to = max_time))) +} + +# Get min and max times from the data +min_time <- min(simdat$data_train$time) +max_time <- max(simdat$data_train$time) + +# Check that all times are recorded for each series +data.frame(series = simdat$data_train$series, + time = simdat$data_train$time) %>% + dplyr::group_by(series) %>% + dplyr::summarise(all_there = all_times_avail(time, + min_time, + max_time)) -> checked_times +if(any(checked_times$all_there == FALSE)){ + warning("One or more series in is missing observations for one or more timepoints") +} else { + cat('All series have observations at all timepoints :)') +} +``` + +Note that models which use dynamic components will assume that smaller values of `time` are *older* (i.e. `time = 1` came *before* `time = 2`, etc...) + +### Irregular sampling intervals? +Most `mvgam` trend models expect `time` to be measured in discrete, evenly-spaced intervals (i.e. one measurement per week, or one per year, for example; though missing values are allowed). But please note that irregularly sampled time intervals are allowed, in which case the `CAR()` trend model (continuous time autoregressive) is appropriate. You can see an example of this kind of model in the **Examples** section in `?CAR`. You can also use `trend_model = 'None'` (the default in `mvgam()`) and instead use a Gaussian Process to model temporal variation for irregularly-sampled time series. 
See the `?brms::gp` for details + +## Checking data with `get_mvgam_priors` +The `get_mvgam_priors` function is designed to return information about the parameters in a model whose prior distributions can be modified by the user. But in doing so, it will perform a series of checks to ensure the data are formatted properly. It can therefore be very useful to new users for ensuring there isn't anything strange going on in the data setup. For example, we can replicate the steps taken above (to check factor levels and timepoint x series combinations) with a single call to `get_mvgam_priors`. Here we first simulate some data in which some of the timepoints in the `time` variable are not included in the data: +```{r} +bad_times <- data.frame(time = seq(1, 16, by = 2), + series = factor('series_1'), + outcome = rnorm(8)) +bad_times +``` + +Next we call `get_mvgam_priors` by simply specifying an intercept-only model, which is enough to trigger all the checks: +```{r error = TRUE} +get_mvgam_priors(outcome ~ 1, + data = bad_times, + family = gaussian()) +``` + +This error is useful as it tells us where the problem is. There are many ways to fill in missing timepoints, so the correct way will have to be left up to the user. But if you don't have any covariates, it should be pretty easy using `expand.grid`: +```{r} +bad_times %>% + dplyr::right_join(expand.grid(time = seq(min(bad_times$time), + max(bad_times$time)), + series = factor(unique(bad_times$series), + levels = levels(bad_times$series)))) %>% + dplyr::arrange(time) -> good_times +good_times +``` + +Now the call to `get_mvgam_priors`, using our filled in data, should work: +```{r error = TRUE} +get_mvgam_priors(outcome ~ 1, + data = good_times, + family = gaussian()) +``` + +This function should also pick up on misaligned factor levels for the `series` variable. We can check this by again simulating, this time adding an additional factor level that is not included in the data: +```{r} +bad_levels <- data.frame(time = 1:8, + series = factor('series_1', + levels = c('series_1', + 'series_2')), + outcome = rnorm(8)) + +levels(bad_levels$series) +``` + +Another call to `get_mvgam_priors` brings up a useful error: +```{r error = TRUE} +get_mvgam_priors(outcome ~ 1, + data = bad_levels, + family = gaussian()) +``` + +Following the message's advice tells us there is a level for `series_2` in the `series` variable, but there are no observations for this series in the data: +```{r} +setdiff(levels(bad_levels$series), unique(bad_levels$series)) +``` + +Re-assigning the levels fixes the issue: +```{r} +bad_levels %>% + dplyr::mutate(series = droplevels(series)) -> good_levels +levels(good_levels$series) +``` + +```{r error = TRUE} +get_mvgam_priors(outcome ~ 1, + data = good_levels, + family = gaussian()) +``` + +### Covariates with no `NA`s +Covariates can be used in models just as you would when using `mgcv` (see `?formula.gam` for details of the formula syntax). But although the outcome variable can have `NA`s, covariates cannot. Most regression software will silently drop any raws in the model matrix that have `NA`s, which is not helpful when debugging. 
Both the `mvgam` and `get_mvgam_priors` functions will run some simple checks for you, and hopefully will return useful errors if it finds in missing values: +```{r} +miss_dat <- data.frame(outcome = rnorm(10), + cov = c(NA, rnorm(9)), + series = factor('series1', + levels = 'series1'), + time = 1:10) +miss_dat +``` + +```{r error = TRUE} +get_mvgam_priors(outcome ~ cov, + data = miss_dat, + family = gaussian()) +``` + +Just like with the `mgcv` package, `mvgam` can also accept data as a `list` object. This is useful if you want to set up [linear functional predictors](https://rdrr.io/cran/mgcv/man/linear.functional.terms.html) or even distributed lag predictors. The checks run by `mvgam` should still work on these data. Here we change the `cov` predictor to be a `matrix`: +```{r} +miss_dat <- list(outcome = rnorm(10), + series = factor('series1', + levels = 'series1'), + time = 1:10) +miss_dat$cov <- matrix(rnorm(50), ncol = 5, nrow = 10) +miss_dat$cov[2,3] <- NA +``` + +A call to `mvgam` returns the same error: +```{r error=TRUE} +get_mvgam_priors(outcome ~ cov, + data = miss_dat, + family = gaussian()) +``` + +## Plotting with `plot_mvgam_series` +Plotting the data is a useful way to ensure everything looks ok, once you've gone throug the above checks on factor levels and timepoint x series combinations. The `plot_mvgam_series` function will take supplied data and plot either a series of line plots (if you choose `series = 'all'`) or a set of plots to describe the distribution for a single time series. For example, to plot all of the time series in our data, and highlight a single series in each plot, we can use: +```{r, fig.alt = "Plotting time series features for GAM models in mvgam"} +plot_mvgam_series(data = simdat$data_train, + y = 'y', + series = 'all') +``` + +Or we can look more closely at the distribution for the first time series: +```{r, fig.alt = "Plotting time series features for GAM models in mvgam"} +plot_mvgam_series(data = simdat$data_train, + y = 'y', + series = 1) +``` + +If you have split your data into training and testing folds (i.e. for forecast evaluation), you can include the test data in your plots: +```{r, fig.alt = "Plotting time series features for GAM models in mvgam"} +plot_mvgam_series(data = simdat$data_train, + newdata = simdat$data_test, + y = 'y', + series = 1) +``` + +## Example with NEON tick data +To give one example of how data can be reformatted for `mvgam` modelling, we will use observations from the National Ecological Observatory Network (NEON) tick drag cloth samples. *Ixodes scapularis* is a widespread tick species capable of transmitting a diversity of parasites to animals and humans, many of which are zoonotic. Due to the medical and ecological importance of this tick species, a common goal is to understand factors that influence their abundances. The NEON field team carries out standardised [long-term monitoring of tick abundances as well as other important indicators of ecological change](https://www.neonscience.org/data-collection/ticks){target="_blank"}. Nymphal abundance of *I. scapularis* is routinely recorded across NEON plots using a field sampling method called drag cloth sampling, which is a common method for sampling ticks in the landscape. Field researchers sample ticks by dragging a large cloth behind themselves through terrain that is suspected of harboring ticks, usually working in a grid-like pattern. The sites have been sampled since 2014, resulting in a rich dataset of nymph abundance time series. 
These tick time series show strong seasonality and incorporate many of the challenging features associated with ecological data including overdispersion, high proportions of missingness and irregular sampling in time, making them useful for exploring the utility of dynamic GAMs. + +We begin by loading NEON tick data for the years 2014 - 2021, which were downloaded from NEON and prepared as described in [Clark & Wells 2022](https://besjournals.onlinelibrary.wiley.com/doi/full/10.1111/2041-210X.13974){target="_blank"}. You can read a bit about the data using the call `?all_neon_tick_data` +```{r} +data("all_neon_tick_data") +str(dplyr::ungroup(all_neon_tick_data)) +``` + +For this exercise, we will use the `epiWeek` variable as an index of seasonality, and we will only work with observations from a few sampling plots (labelled in the `plotID` column): +```{r} +plotIDs <- c('SCBI_013','SCBI_002', + 'SERC_001','SERC_005', + 'SERC_006','SERC_012', + 'BLAN_012','BLAN_005') +``` + +Now we can select the target species we want (*I. scapularis*), filter to the correct plot IDs and convert the `epiWeek` variable from `character` to `numeric`: +```{r} +model_dat <- all_neon_tick_data %>% + dplyr::ungroup() %>% + dplyr::mutate(target = ixodes_scapularis) %>% + dplyr::filter(plotID %in% plotIDs) %>% + dplyr::select(Year, epiWeek, plotID, target) %>% + dplyr::mutate(epiWeek = as.numeric(epiWeek)) +``` + +Now is the tricky part: we need to fill in missing observations with `NA`s. The tick data are sparse in that field observers do not go out and sample in each possible `epiWeek`. So there are many particular weeks in which observations are not included in the data. But we can use `expand.grid` again to take care of this: +```{r} +model_dat %>% + # Create all possible combos of plotID, Year and epiWeek; + # missing outcomes will be filled in as NA + dplyr::full_join(expand.grid(plotID = unique(model_dat$plotID), + Year = unique(model_dat$Year), + epiWeek = seq(1, 52))) %>% + + # left_join back to original data so plotID and siteID will + # match up, in case you need the siteID for anything else later on + dplyr::left_join(all_neon_tick_data %>% + dplyr::select(siteID, plotID) %>% + dplyr::distinct()) -> model_dat +``` + +Create the `series` variable needed for `mvgam` modelling: +```{r} +model_dat %>% + dplyr::mutate(series = plotID, + y = target) %>% + dplyr::mutate(siteID = factor(siteID), + series = factor(series)) %>% + dplyr::select(-target, -plotID) %>% + dplyr::arrange(Year, epiWeek, series) -> model_dat +``` + +Now create the `time` variable, which needs to track `Year` and `epiWeek` for each unique series. The `n` function from `dplyr` is often useful if generating a `time` index for grouped dataframes: +```{r} +model_dat %>% + dplyr::ungroup() %>% + dplyr::group_by(series) %>% + dplyr::arrange(Year, epiWeek) %>% + dplyr::mutate(time = seq(1, dplyr::n())) %>% + dplyr::ungroup() -> model_dat +``` + +Check factor levels for the `series`: +```{r} +levels(model_dat$series) +``` + +This looks good, as does a more rigorous check using `get_mvgam_priors`: +```{r error=TRUE} +get_mvgam_priors(y ~ 1, + data = model_dat, + family = poisson()) +``` + +We can also set up a model in `mvgam` but use `run_model = FALSE` to further ensure all of the necessary steps for creating the modelling code and objects will run. 
It is recommended that you use the `cmdstanr` backend if possible, as the auto-formatting options available in this package are very useful for checking the package-generated `Stan` code for any inefficiencies that can be fixed to lead to sampling performance improvements: +```{r} +testmod <- mvgam(y ~ s(epiWeek, by = series, bs = 'cc') + + s(series, bs = 're'), + trend_model = 'AR1', + data = model_dat, + backend = 'cmdstanr', + run_model = FALSE) +``` + +This call runs without issue, and the resulting object now contains the model code and data objects that are needed to initiate sampling: +```{r} +str(testmod$model_data) +``` + +```{r} +code(testmod) +``` + +## Interested in contributing? +I'm actively seeking PhD students and other researchers to work in the areas of ecological forecasting, multivariate model evaluation and development of `mvgam`. Please reach out if you are interested (n.clark'at'uq.edu.au) diff --git a/inst/doc/data_in_mvgam.html b/inst/doc/data_in_mvgam.html new file mode 100644 index 00000000..369d4844 --- /dev/null +++ b/inst/doc/data_in_mvgam.html @@ -0,0 +1,1148 @@ + + + + + + + + + + + + + + + + +Formatting data for use in mvgam + + + + + + + + + + + + + + + + + + + + + + + + + + +

Formatting data for use in mvgam

+

Nicholas J Clark

+

2024-04-16

+ + +
+ +
+ +

This vignette gives an example of how to take raw data and format it for use in mvgam. This is not an exhaustive example, as data can be recorded and stored in a variety of ways, which may require different approaches to wrangle the data into the necessary format for mvgam. For full details on the basic mvgam functionality, please see the introductory vignette.

+
+

Required long data format

+

Manipulating the data into a ‘long’ format is necessary for modelling in mvgam. By ‘long’ format, we mean that each series x time observation needs to have its own entry in the dataframe or list object that we wish to use as data for modelling. A simple example can be viewed by simulating data using the sim_mvgam function. See ?sim_mvgam for more details.

+
simdat <- sim_mvgam(n_series = 4, T = 24, prop_missing = 0.2)
+head(simdat$data_train, 16)
+#>     y season year   series time
+#> 1  NA      1    1 series_1    1
+#> 2   0      1    1 series_2    1
+#> 3   0      1    1 series_3    1
+#> 4   0      1    1 series_4    1
+#> 5   0      2    1 series_1    2
+#> 6   1      2    1 series_2    2
+#> 7   1      2    1 series_3    2
+#> 8   1      2    1 series_4    2
+#> 9   0      3    1 series_1    3
+#> 10 NA      3    1 series_2    3
+#> 11  0      3    1 series_3    3
+#> 12 NA      3    1 series_4    3
+#> 13  1      4    1 series_1    4
+#> 14  0      4    1 series_2    4
+#> 15  0      4    1 series_3    4
+#> 16  2      4    1 series_4    4
+
+

series as a factor variable

+

Notice how we have four different time series in these simulated data, and we have identified the series-level indicator as a factor variable.

+
class(simdat$data_train$series)
+#> [1] "factor"
+levels(simdat$data_train$series)
+#> [1] "series_1" "series_2" "series_3" "series_4"
+

It is important that the number of levels matches the number of unique series in the data to ensure indexing across series works properly in the underlying modelling functions. Several of the main workhorse functions in the package (including mvgam() and get_mvgam_priors()) will give an error if this is not the case, but it may be worth checking anyway:

+
all(levels(simdat$data_train$series) %in% unique(simdat$data_train$series))
+#> [1] TRUE
+

Note that you can technically supply data that does not have a series indicator, and the package will assume that you are only using a single time series. But again, it is better to have this included so there is no confusion.
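For instance, a minimal intercept-only check like the following should run and be treated as a single series (the one_series data are hypothetical):

# With no 'series' column, the data are assumed to be a single series
one_series <- data.frame(y = rnorm(10), time = 1:10)
get_mvgam_priors(y ~ 1, data = one_series, family = gaussian())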

+
+
+

A single outcome variable

+

You may also have noticed that we do not spread the numeric / integer-classed outcome variable into different columns. Rather, there is only a single column for the outcome variable, labelled y in these simulated data (though the outcome does not have to be labelled y). This is another important requirement in mvgam, but it shouldn’t be too unfamiliar to R users who frequently use modelling packages such as lme4, mgcv, brms or the many other regression modelling packages out there. The advantage of this format is that it is now very easy to specify effects that vary among time series:

+
summary(glm(y ~ series + time,
+            data = simdat$data_train,
+            family = poisson()))
+#> 
+#> Call:
+#> glm(formula = y ~ series + time, family = poisson(), data = simdat$data_train)
+#> 
+#> Coefficients:
+#>                Estimate Std. Error z value Pr(>|z|)  
+#> (Intercept)    -0.05275    0.38870  -0.136   0.8920  
+#> seriesseries_2 -0.80716    0.45417  -1.777   0.0755 .
+#> seriesseries_3 -1.21614    0.51290  -2.371   0.0177 *
+#> seriesseries_4  0.55084    0.31854   1.729   0.0838 .
+#> time            0.01725    0.02701   0.639   0.5229  
+#> ---
+#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+#> 
+#> (Dispersion parameter for poisson family taken to be 1)
+#> 
+#>     Null deviance: 120.029  on 56  degrees of freedom
+#> Residual deviance:  96.641  on 52  degrees of freedom
+#>   (15 observations deleted due to missingness)
+#> AIC: 166.83
+#> 
+#> Number of Fisher Scoring iterations: 6
+
summary(gam(y ~ series + s(time, by = series),
+            data = simdat$data_train,
+            family = poisson()))
+#> 
+#> Family: poisson 
+#> Link function: log 
+#> 
+#> Formula:
+#> y ~ series + s(time, by = series)
+#> 
+#> Parametric coefficients:
+#>                Estimate Std. Error z value Pr(>|z|)
+#> (Intercept)      -4.293      5.500  -0.781    0.435
+#> seriesseries_2    3.001      5.533   0.542    0.588
+#> seriesseries_3    3.193      5.518   0.579    0.563
+#> seriesseries_4    4.795      5.505   0.871    0.384
+#> 
+#> Approximate significance of smooth terms:
+#>                          edf Ref.df Chi.sq p-value  
+#> s(time):seriesseries_1 7.737  8.181  6.541  0.5585  
+#> s(time):seriesseries_2 3.444  4.213  4.739  0.3415  
+#> s(time):seriesseries_3 1.000  1.000  0.006  0.9365  
+#> s(time):seriesseries_4 3.958  4.832 11.636  0.0363 *
+#> ---
+#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+#> 
+#> R-sq.(adj) =  0.605   Deviance explained = 66.2%
+#> UBRE = 0.4193  Scale est. = 1         n = 57
+

Depending on the observation families you plan to use when building models, there may be some restrictions that need to be satisfied within the outcome variable. For example, a Beta regression can only handle proportional data, so values >= 1 or <= 0 are not allowed. Likewise, a Poisson regression can only handle non-negative integers. Most regression functions in R will assume the user knows all of this and so will not issue any warnings or errors if you choose the wrong distribution, but often this ends up leading to some unhelpful error from an optimizer that is difficult to interpret and diagnose. mvgam will attempt to provide some useful errors if you do something that is simply not allowed. To see this, we can simulate data from a zero-centred Gaussian distribution (ensuring that some of our values will be < 0) and attempt a Beta regression in mvgam using the betar family:

+
gauss_dat <- data.frame(outcome = rnorm(10),
+                        series = factor('series1',
+                                        levels = 'series1'),
+                        time = 1:10)
+gauss_dat
+#>        outcome  series time
+#> 1  -1.51807964 series1    1
+#> 2  -0.12895041 series1    2
+#> 3   0.91902592 series1    3
+#> 4  -0.78329254 series1    4
+#> 5   0.28469724 series1    5
+#> 6   0.07481887 series1    6
+#> 7   0.03770728 series1    7
+#> 8  -0.37485636 series1    8
+#> 9   0.23694172 series1    9
+#> 10 -0.53988302 series1   10
+

A call to gam using the mgcv package leads to a model that actually fits (though it does give an unhelpful warning message):

+
gam(outcome ~ time,
+    family = betar(),
+    data = gauss_dat)
+#> Warning in family$saturated.ll(y, prior.weights, theta): saturated likelihood
+#> may be inaccurate
+#> 
+#> Family: Beta regression(0.44) 
+#> Link function: logit 
+#> 
+#> Formula:
+#> outcome ~ time
+#> Total model degrees of freedom 2 
+#> 
+#> REML score: -127.2706
+

But the same call to mvgam gives us something more useful:

+
mvgam(outcome ~ time,
+      family = betar(),
+      data = gauss_dat)
+#> Error: Values <= 0 not allowed for beta responses
+

Please see ?mvgam_families for more information on the types of responses that the package can handle and their restrictions.

+
+
+

A time variable

+

The other requirement for most models that can be fit in mvgam is a numeric / integer-classed variable labelled time. This ensures the modelling software knows how to arrange the time series when building models. This setup still allows us to formulate multivariate time series models. If you plan to use any of the autoregressive dynamic trend functions available in mvgam (see ?mvgam_trends for details of available dynamic processes), you will need to ensure your time series are entered with a fixed sampling interval (i.e. the time between timesteps 1 and 2 should be the same as the time between timesteps 2 and 3, etc…). But note that you can have missing observations for some (or all) series. mvgam will check this for you, but again it is useful to ensure you have no missing timepoint x series combinations in your data. You can generally do this with a simple dplyr call:

+
# A function to check that all timepoints in a sequence are observed
+all_times_avail = function(time, min_time, max_time){
+    identical(as.numeric(sort(time)),
+              as.numeric(seq.int(from = min_time, to = max_time)))
+}
+
+# Get min and max times from the data
+min_time <- min(simdat$data_train$time)
+max_time <- max(simdat$data_train$time)
+
+# Check that all times are recorded for each series
+data.frame(series = simdat$data_train$series,
+           time = simdat$data_train$time) %>%
+    dplyr::group_by(series) %>%
+    dplyr::summarise(all_there = all_times_avail(time,
+                                                 min_time,
+                                                 max_time)) -> checked_times
+if(any(checked_times$all_there == FALSE)){
+  warning("One or more series in is missing observations for one or more timepoints")
+} else {
+  cat('All series have observations at all timepoints :)')
+}
+#> All series have observations at all timepoints :)
+

Note that models which use dynamic components will assume that smaller values of time are older (i.e. time = 1 came before time = 2, etc…).

+
+
+

Irregular sampling intervals?

+

Most mvgam trend models expect time to be measured in discrete, evenly-spaced intervals (i.e. one measurement per week, or one per year, for example; though missing values are allowed). But please note that irregularly sampled time intervals are allowed, in which case the CAR() trend model (continuous time autoregressive) is appropriate. You can see an example of this kind of model in the Examples section in ?CAR. You can also use trend_model = 'None' (the default in mvgam()) and instead use a Gaussian Process to model temporal variation for irregularly-sampled time series. See ?brms::gp for details.
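As a sketch of the irregular-sampling case (the irreg_dat object is hypothetical, and the model call is left commented out rather than evaluated here):

# Unevenly spaced sampling times, e.g. days since the study began
irreg_dat <- data.frame(y = rnorm(8),
                        time = c(1, 3, 4, 8, 11, 12, 17, 20),
                        series = factor(rep('series1', 8)))

# A continuous-time AR(1) trend can handle the uneven gaps
# mod <- mvgam(y ~ 1, trend_model = CAR(), data = irreg_dat,
#              family = gaussian())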

+
+
+
+

Checking data with get_mvgam_priors

+

The get_mvgam_priors function is designed to return information about the parameters in a model whose prior distributions can be modified by the user. But in doing so, it will perform a series of checks to ensure the data are formatted properly. It can therefore be very useful to new users for ensuring there isn’t anything strange going on in the data setup. For example, we can replicate the steps taken above (to check factor levels and timepoint x series combinations) with a single call to get_mvgam_priors. Here we first simulate some data in which some of the timepoints in the time variable are not included in the data:

+
bad_times <- data.frame(time = seq(1, 16, by = 2),
+                        series = factor('series_1'),
+                        outcome = rnorm(8))
+bad_times
+#>   time   series    outcome
+#> 1    1 series_1  1.4681068
+#> 2    3 series_1  0.1796627
+#> 3    5 series_1 -0.4204020
+#> 4    7 series_1 -1.0729359
+#> 5    9 series_1 -0.1738239
+#> 6   11 series_1 -0.5463268
+#> 7   13 series_1  0.8275198
+#> 8   15 series_1  2.2085085
+

Next we call get_mvgam_priors by simply specifying an intercept-only model, which is enough to trigger all the checks:

+
get_mvgam_priors(outcome ~ 1,
+                 data = bad_times,
+                 family = gaussian())
+#> Error: One or more series in data is missing observations for one or more timepoints
+

This error is useful as it tells us where the problem is. There are many ways to fill in missing timepoints, so the correct way will have to be left up to the user. But if you don’t have any covariates, it should be pretty easy using expand.grid:

+
bad_times %>%
+  dplyr::right_join(expand.grid(time = seq(min(bad_times$time),
+                                           max(bad_times$time)),
+                                series = factor(unique(bad_times$series),
+                                                levels = levels(bad_times$series)))) %>%
+  dplyr::arrange(time) -> good_times
+#> Joining with `by = join_by(time, series)`
+good_times
+#>    time   series    outcome
+#> 1     1 series_1  1.4681068
+#> 2     2 series_1         NA
+#> 3     3 series_1  0.1796627
+#> 4     4 series_1         NA
+#> 5     5 series_1 -0.4204020
+#> 6     6 series_1         NA
+#> 7     7 series_1 -1.0729359
+#> 8     8 series_1         NA
+#> 9     9 series_1 -0.1738239
+#> 10   10 series_1         NA
+#> 11   11 series_1 -0.5463268
+#> 12   12 series_1         NA
+#> 13   13 series_1  0.8275198
+#> 14   14 series_1         NA
+#> 15   15 series_1  2.2085085
+

Now the call to get_mvgam_priors, using our filled-in data, should work:

+
get_mvgam_priors(outcome ~ 1,
+                 data = good_times,
+                 family = gaussian())
+#>                             param_name param_length           param_info
+#> 1                          (Intercept)            1          (Intercept)
+#> 2 vector<lower=0>[n_series] sigma_obs;            1 observation error sd
+#>                                 prior                   example_change
+#> 1 (Intercept) ~ student_t(3, 0, 2.5);      (Intercept) ~ normal(0, 1);
+#> 2   sigma_obs ~ student_t(3, 0, 2.5); sigma_obs ~ normal(-0.22, 0.33);
+#>   new_lowerbound new_upperbound
+#> 1             NA             NA
+#> 2             NA             NA
+

This function should also pick up on misaligned factor levels for the series variable. We can check this by again simulating, this time adding an additional factor level that is not included in the data:

+
bad_levels <- data.frame(time = 1:8,
+                        series = factor('series_1',
+                                        levels = c('series_1',
+                                                   'series_2')),
+                        outcome = rnorm(8))
+
+levels(bad_levels$series)
+#> [1] "series_1" "series_2"
+

Another call to get_mvgam_priors brings up a useful error:

+
get_mvgam_priors(outcome ~ 1,
+                 data = bad_levels,
+                 family = gaussian())
+#> Error: Mismatch between factor levels of "series" and unique values of "series"
+#> Use
+#>   `setdiff(levels(data$series), unique(data$series))` 
+#> and
+#>   `intersect(levels(data$series), unique(data$series))`
+#> for guidance
+

Following the message’s advice tells us there is a level for series_2 in the series variable, but there are no observations for this series in the data:

+
setdiff(levels(bad_levels$series), unique(bad_levels$series))
+#> [1] "series_2"
+

Re-assigning the levels fixes the issue:

+
bad_levels %>%
+  dplyr::mutate(series = droplevels(series)) -> good_levels
+levels(good_levels$series)
+#> [1] "series_1"
+
get_mvgam_priors(outcome ~ 1,
+                 data = good_levels,
+                 family = gaussian())
+#>                             param_name param_length           param_info
+#> 1                          (Intercept)            1          (Intercept)
+#> 2 vector<lower=0>[n_series] sigma_obs;            1 observation error sd
+#>                                  prior                  example_change
+#> 1 (Intercept) ~ student_t(3, -1, 2.5);     (Intercept) ~ normal(0, 1);
+#> 2    sigma_obs ~ student_t(3, 0, 2.5); sigma_obs ~ normal(0.98, 0.91);
+#>   new_lowerbound new_upperbound
+#> 1             NA             NA
+#> 2             NA             NA
+
+

Covariates with no NAs

+

Covariates can be used in models just as you would when using mgcv (see ?formula.gam for details of the formula syntax). But although the outcome variable can have NAs, covariates cannot. Most regression software will silently drop any rows in the model matrix that have NAs, which is not helpful when debugging. Both the mvgam and get_mvgam_priors functions will run some simple checks for you, and hopefully will return useful errors if they find missing values:

+
miss_dat <- data.frame(outcome = rnorm(10),
+                       cov = c(NA, rnorm(9)),
+                       series = factor('series1',
+                                       levels = 'series1'),
+                       time = 1:10)
+miss_dat
+#>        outcome        cov  series time
+#> 1   0.77436859         NA series1    1
+#> 2   0.33222199 -0.2653819 series1    2
+#> 3   0.50385503  0.6658354 series1    3
+#> 4  -0.99577591  0.3541730 series1    4
+#> 5  -1.09812817 -2.3125954 series1    5
+#> 6  -0.49687774 -1.0778578 series1    6
+#> 7  -1.26666072 -0.1973507 series1    7
+#> 8  -0.11638041 -3.0585179 series1    8
+#> 9   0.08890432  1.7964928 series1    9
+#> 10 -0.64375459  0.7894733 series1   10
+
get_mvgam_priors(outcome ~ cov,
+                 data = miss_dat,
+                 family = gaussian())
+#> Error: Missing values found in data predictors:
+#>  Error in na.fail.default(structure(list(outcome = c(0.774368589907313, : missing values in object
+

Just like with the mgcv package, mvgam can also accept data as a list object. This is useful if you want to set up linear functional predictors or even distributed lag predictors. The checks run by mvgam should still work on these data. Here we change the cov predictor to be a matrix:

+
miss_dat <- list(outcome = rnorm(10),
+                 series = factor('series1',
+                                 levels = 'series1'),
+                 time = 1:10)
+miss_dat$cov <- matrix(rnorm(50), ncol = 5, nrow = 10)
+miss_dat$cov[2,3] <- NA
+

A call to get_mvgam_priors returns the same error:

+
get_mvgam_priors(outcome ~ cov,
+                 data = miss_dat,
+                 family = gaussian())
+#> Error: Missing values found in data predictors:
+#>  Error in na.fail.default(structure(list(outcome = c(-0.708736388395862, : missing values in object
+
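To make the distributed lag idea mentioned above a little more concrete, here is a minimal sketch. The lag_matrix helper is hypothetical (it is not an mvgam function), and the NAs in the first rows would need to be trimmed or filled before any model could be fit, since covariates cannot contain NAs:

# Column l of the lag matrix holds the covariate lagged by l - 1 steps
# (hypothetical helper, for illustration only)
lag_matrix <- function(x, n_lag = 3){
  sapply(seq_len(n_lag),
         function(l) c(rep(NA, l - 1), head(x, length(x) - l + 1)))
}
lag_dat <- list(outcome = rnorm(10),
                series = factor(rep('series1', 10)),
                time = 1:10)
lag_dat$cov <- lag_matrix(rnorm(10))
str(lag_dat$cov)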
+
+
+

Plotting with plot_mvgam_series

+

Plotting the data is a useful way to ensure everything looks OK, once you’ve gone through the above checks on factor levels and timepoint x series combinations. The plot_mvgam_series function will take supplied data and plot either a series of line plots (if you choose series = 'all') or a set of plots to describe the distribution for a single time series. For example, to plot all of the time series in our data, and highlight a single series in each plot, we can use:

+
plot_mvgam_series(data = simdat$data_train, 
+                  y = 'y', 
+                  series = 'all')
+

Plotting time series features for GAM models in mvgam

+

Or we can look more closely at the distribution for the first time series:

+
plot_mvgam_series(data = simdat$data_train, 
+                  y = 'y', 
+                  series = 1)
+

Plotting time series features for GAM models in mvgam

+

If you have split your data into training and testing folds (i.e. for forecast evaluation), you can include the test data in your plots:

+
plot_mvgam_series(data = simdat$data_train,
+                  newdata = simdat$data_test,
+                  y = 'y', 
+                  series = 1)
+

Plotting time series features for GAM models in mvgam

+
+
+

Example with NEON tick data

+

To give one example of how data can be reformatted for mvgam modelling, we will use observations from the National Ecological Observatory Network (NEON) tick drag cloth samples. Ixodes scapularis is a widespread tick species capable of transmitting a diversity of parasites to animals and humans, many of which are zoonotic. Due to the medical and ecological importance of this tick species, a common goal is to understand factors that influence their abundances. The NEON field team carries out standardised long-term monitoring of tick abundances as well as other important indicators of ecological change. Nymphal abundance of I. scapularis is routinely recorded across NEON plots using a field sampling method called drag cloth sampling, a common method for sampling ticks in the landscape. Field researchers sample ticks by dragging a large cloth behind themselves through terrain that is suspected of harboring ticks, usually working in a grid-like pattern. The sites have been sampled since 2014, resulting in a rich dataset of nymph abundance time series. These tick time series show strong seasonality and incorporate many of the challenging features associated with ecological data, including overdispersion, high proportions of missingness and irregular sampling in time, making them useful for exploring the utility of dynamic GAMs.

+

We begin by loading NEON tick data for the years 2014 - 2021, which were downloaded from NEON and prepared as described in Clark & Wells 2022. You can read a bit about the data using the call ?all_neon_tick_data.

+
data("all_neon_tick_data")
+str(dplyr::ungroup(all_neon_tick_data))
+#> tibble [3,505 × 24] (S3: tbl_df/tbl/data.frame)
+#>  $ Year                : num [1:3505] 2015 2015 2015 2015 2015 ...
+#>  $ epiWeek             : chr [1:3505] "37" "38" "39" "40" ...
+#>  $ yearWeek            : chr [1:3505] "201537" "201538" "201539" "201540" ...
+#>  $ plotID              : chr [1:3505] "BLAN_005" "BLAN_005" "BLAN_005" "BLAN_005" ...
+#>  $ siteID              : chr [1:3505] "BLAN" "BLAN" "BLAN" "BLAN" ...
+#>  $ nlcdClass           : chr [1:3505] "deciduousForest" "deciduousForest" "deciduousForest" "deciduousForest" ...
+#>  $ decimalLatitude     : num [1:3505] 39.1 39.1 39.1 39.1 39.1 ...
+#>  $ decimalLongitude    : num [1:3505] -78 -78 -78 -78 -78 ...
+#>  $ elevation           : num [1:3505] 168 168 168 168 168 ...
+#>  $ totalSampledArea    : num [1:3505] 162 NA NA NA 162 NA NA NA NA 164 ...
+#>  $ amblyomma_americanum: num [1:3505] NA NA NA NA NA NA NA NA NA NA ...
+#>  $ ixodes_scapularis   : num [1:3505] 2 NA NA NA 0 NA NA NA NA 0 ...
+#>  $ time                : Date[1:3505], format: "2015-09-13" "2015-09-20" ...
+#>  $ RHMin_precent       : num [1:3505] NA NA NA NA NA NA NA NA NA NA ...
+#>  $ RHMin_variance      : num [1:3505] NA NA NA NA NA NA NA NA NA NA ...
+#>  $ RHMax_precent       : num [1:3505] NA NA NA NA NA NA NA NA NA NA ...
+#>  $ RHMax_variance      : num [1:3505] NA NA NA NA NA NA NA NA NA NA ...
+#>  $ airTempMin_degC     : num [1:3505] NA NA NA NA NA NA NA NA NA NA ...
+#>  $ airTempMin_variance : num [1:3505] NA NA NA NA NA NA NA NA NA NA ...
+#>  $ airTempMax_degC     : num [1:3505] NA NA NA NA NA NA NA NA NA NA ...
+#>  $ airTempMax_variance : num [1:3505] NA NA NA NA NA NA NA NA NA NA ...
+#>  $ soi                 : num [1:3505] -18.4 -17.9 -23.5 -28.4 -25.9 ...
+#>  $ cum_sdd             : num [1:3505] 173 173 173 173 173 ...
+#>  $ cum_gdd             : num [1:3505] 1129 1129 1129 1129 1129 ...
+

For this exercise, we will use the epiWeek variable as +an index of seasonality, and we will only work with observations from a +few sampling plots (labelled in the plotID column):

+
plotIDs <- c('SCBI_013','SCBI_002',
+             'SERC_001','SERC_005',
+             'SERC_006','SERC_012',
+             'BLAN_012','BLAN_005')
+

Now we can select the target species we want (I. +scapularis), filter to the correct plot IDs and convert the +epiWeek variable from character to +numeric:

+
model_dat <- all_neon_tick_data %>%
+  dplyr::ungroup() %>%
+  dplyr::mutate(target = ixodes_scapularis) %>%
+  dplyr::filter(plotID %in% plotIDs) %>%
+  dplyr::select(Year, epiWeek, plotID, target) %>%
+  dplyr::mutate(epiWeek = as.numeric(epiWeek))
+

Now comes the tricky part: we need to fill in missing observations with NAs. The tick data are sparse in that field observers do not go out and sample in every possible epiWeek, so many weeks are simply absent from the data. But we can use expand.grid again to take care of this:

+
model_dat %>%
+  # Create all possible combos of plotID, Year and epiWeek; 
+  # missing outcomes will be filled in as NA
+  dplyr::full_join(expand.grid(plotID = unique(model_dat$plotID),
+                               Year = unique(model_dat$Year),
+                               epiWeek = seq(1, 52))) %>%
+  
+  # left_join back to original data so plotID and siteID will
+  # match up, in case you need the siteID for anything else later on
+  dplyr::left_join(all_neon_tick_data %>%
+                     dplyr::select(siteID, plotID) %>%
+                     dplyr::distinct()) -> model_dat
+#> Joining with `by = join_by(Year, epiWeek, plotID)`
+#> Joining with `by = join_by(plotID)`
+
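
As a quick optional sanity check that the expansion worked, we can count the rows for each plot and year; every combination should now hold a full sequence of weeks (52, or 53 in years with an extra epidemiological week). A sketch using dplyr:

+
model_dat %>%
+  dplyr::count(plotID, Year) %>%
+  dplyr::distinct(n)
+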

Create the series variable needed for mvgam +modelling:

+
model_dat %>%
+  dplyr::mutate(series = plotID,
+                y = target) %>%
+  dplyr::mutate(siteID = factor(siteID),
+                series = factor(series)) %>%
+  dplyr::select(-target, -plotID) %>%
+  dplyr::arrange(Year, epiWeek, series) -> model_dat 
+

Now create the time variable, which needs to track Year and epiWeek for each unique series. The n function from dplyr is often useful when generating a time index for grouped dataframes:

+
model_dat %>%
+  dplyr::ungroup() %>%
+  dplyr::group_by(series) %>%
+  dplyr::arrange(Year, epiWeek) %>%
+  dplyr::mutate(time = seq(1, dplyr::n())) %>%
+  dplyr::ungroup() -> model_dat
+
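
To verify that the new index starts at 1 and runs without gaps for each series, a quick optional check:

+
model_dat %>%
+  dplyr::group_by(series) %>%
+  dplyr::summarise(first = min(time),
+                   last = max(time),
+                   n_obs = dplyr::n())
+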

Check factor levels for the series:

+
levels(model_dat$series)
+#> [1] "BLAN_005" "BLAN_012" "SCBI_002" "SCBI_013" "SERC_001" "SERC_005" "SERC_006"
+#> [8] "SERC_012"
+
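
As with the simulated example at the start of this vignette, we can also confirm that every factor level is associated with observations in the data:

+
all(levels(model_dat$series) %in% unique(model_dat$series))
+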

This looks good, as does a more rigorous check using +get_mvgam_priors:

+
get_mvgam_priors(y ~ 1,
+                 data = model_dat,
+                 family = poisson())
+#>    param_name param_length  param_info                                  prior
+#> 1 (Intercept)            1 (Intercept) (Intercept) ~ student_t(3, -2.3, 2.5);
+#>                example_change new_lowerbound new_upperbound
+#> 1 (Intercept) ~ normal(0, 1);             NA             NA
+

We can also set up a model in mvgam but use run_model = FALSE to further ensure that all of the necessary steps for creating the modelling code and objects will run. It is recommended that you use the cmdstanr backend if possible, as the auto-formatting options available in that package are very useful for checking the package-generated Stan code for any inefficiencies that can be fixed to improve sampling performance:

+
testmod <- mvgam(y ~ s(epiWeek, by = series, bs = 'cc') +
+                   s(series, bs = 're'),
+                 trend_model = 'AR1',
+                 data = model_dat,
+                 backend = 'cmdstanr',
+                 run_model = FALSE)
+

This call runs without issue, and the resulting object now contains +the model code and data objects that are needed to initiate +sampling:

+
str(testmod$model_data)
+#> List of 25
+#>  $ y           : num [1:416, 1:8] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
+#>  $ n           : int 416
+#>  $ X           : num [1:3328, 1:73] 1 1 1 1 1 1 1 1 1 1 ...
+#>   ..- attr(*, "dimnames")=List of 2
+#>   .. ..$ : chr [1:3328] "1" "2" "3" "4" ...
+#>   .. ..$ : chr [1:73] "X.Intercept." "V2" "V3" "V4" ...
+#>  $ S1          : num [1:8, 1:8] 1.037 -0.416 0.419 0.117 0.188 ...
+#>  $ zero        : num [1:73] 0 0 0 0 0 0 0 0 0 0 ...
+#>  $ S2          : num [1:8, 1:8] 1.037 -0.416 0.419 0.117 0.188 ...
+#>  $ S3          : num [1:8, 1:8] 1.037 -0.416 0.419 0.117 0.188 ...
+#>  $ S4          : num [1:8, 1:8] 1.037 -0.416 0.419 0.117 0.188 ...
+#>  $ S5          : num [1:8, 1:8] 1.037 -0.416 0.419 0.117 0.188 ...
+#>  $ S6          : num [1:8, 1:8] 1.037 -0.416 0.419 0.117 0.188 ...
+#>  $ S7          : num [1:8, 1:8] 1.037 -0.416 0.419 0.117 0.188 ...
+#>  $ S8          : num [1:8, 1:8] 1.037 -0.416 0.419 0.117 0.188 ...
+#>  $ p_coefs     : Named num 0.806
+#>   ..- attr(*, "names")= chr "(Intercept)"
+#>  $ p_taus      : num 301
+#>  $ ytimes      : int [1:416, 1:8] 1 9 17 25 33 41 49 57 65 73 ...
+#>  $ n_series    : int 8
+#>  $ sp          : Named num [1:9] 4.68 59.57 1.11 2.73 6.5 ...
+#>   ..- attr(*, "names")= chr [1:9] "s(epiWeek):seriesBLAN_005" "s(epiWeek):seriesBLAN_012" "s(epiWeek):seriesSCBI_002" "s(epiWeek):seriesSCBI_013" ...
+#>  $ y_observed  : num [1:416, 1:8] 0 0 0 0 0 0 0 0 0 0 ...
+#>  $ total_obs   : int 3328
+#>  $ num_basis   : int 73
+#>  $ n_sp        : num 9
+#>  $ n_nonmissing: int 400
+#>  $ obs_ind     : int [1:400] 89 93 98 101 115 118 121 124 127 130 ...
+#>  $ flat_ys     : num [1:400] 2 0 0 0 0 0 0 25 36 14 ...
+#>  $ flat_xs     : num [1:400, 1:73] 1 1 1 1 1 1 1 1 1 1 ...
+#>   ..- attr(*, "dimnames")=List of 2
+#>   .. ..$ : chr [1:400] "705" "737" "777" "801" ...
+#>   .. ..$ : chr [1:73] "X.Intercept." "V2" "V3" "V4" ...
+#>  - attr(*, "trend_model")= chr "AR1"
+
code(testmod)
+#> // Stan model code generated by package mvgam
+#> data {
+#>   int<lower=0> total_obs; // total number of observations
+#>   int<lower=0> n; // number of timepoints per series
+#>   int<lower=0> n_sp; // number of smoothing parameters
+#>   int<lower=0> n_series; // number of series
+#>   int<lower=0> num_basis; // total number of basis coefficients
+#>   vector[num_basis] zero; // prior locations for basis coefficients
+#>   matrix[total_obs, num_basis] X; // mgcv GAM design matrix
+#>   array[n, n_series] int<lower=0> ytimes; // time-ordered matrix (which col in X belongs to each [time, series] observation?)
+#>   matrix[8, 8] S1; // mgcv smooth penalty matrix S1
+#>   matrix[8, 8] S2; // mgcv smooth penalty matrix S2
+#>   matrix[8, 8] S3; // mgcv smooth penalty matrix S3
+#>   matrix[8, 8] S4; // mgcv smooth penalty matrix S4
+#>   matrix[8, 8] S5; // mgcv smooth penalty matrix S5
+#>   matrix[8, 8] S6; // mgcv smooth penalty matrix S6
+#>   matrix[8, 8] S7; // mgcv smooth penalty matrix S7
+#>   matrix[8, 8] S8; // mgcv smooth penalty matrix S8
+#>   int<lower=0> n_nonmissing; // number of nonmissing observations
+#>   array[n_nonmissing] int<lower=0> flat_ys; // flattened nonmissing observations
+#>   matrix[n_nonmissing, num_basis] flat_xs; // X values for nonmissing observations
+#>   array[n_nonmissing] int<lower=0> obs_ind; // indices of nonmissing observations
+#> }
+#> parameters {
+#>   // raw basis coefficients
+#>   vector[num_basis] b_raw;
+#>   
+#>   // random effect variances
+#>   vector<lower=0>[1] sigma_raw;
+#>   
+#>   // random effect means
+#>   vector[1] mu_raw;
+#>   
+#>   // latent trend AR1 terms
+#>   vector<lower=-1.5, upper=1.5>[n_series] ar1;
+#>   
+#>   // latent trend variance parameters
+#>   vector<lower=0>[n_series] sigma;
+#>   
+#>   // latent trends
+#>   matrix[n, n_series] trend;
+#>   
+#>   // smoothing parameters
+#>   vector<lower=0>[n_sp] lambda;
+#> }
+#> transformed parameters {
+#>   // basis coefficients
+#>   vector[num_basis] b;
+#>   b[1 : 65] = b_raw[1 : 65];
+#>   b[66 : 73] = mu_raw[1] + b_raw[66 : 73] * sigma_raw[1];
+#> }
+#> model {
+#>   // prior for random effect population variances
+#>   sigma_raw ~ student_t(3, 0, 2.5);
+#>   
+#>   // prior for random effect population means
+#>   mu_raw ~ std_normal();
+#>   
+#>   // prior for (Intercept)...
+#>   b_raw[1] ~ student_t(3, -2.3, 2.5);
+#>   
+#>   // prior for s(epiWeek):seriesBLAN_005...
+#>   b_raw[2 : 9] ~ multi_normal_prec(zero[2 : 9], S1[1 : 8, 1 : 8] * lambda[1]);
+#>   
+#>   // prior for s(epiWeek):seriesBLAN_012...
+#>   b_raw[10 : 17] ~ multi_normal_prec(zero[10 : 17],
+#>                                      S2[1 : 8, 1 : 8] * lambda[2]);
+#>   
+#>   // prior for s(epiWeek):seriesSCBI_002...
+#>   b_raw[18 : 25] ~ multi_normal_prec(zero[18 : 25],
+#>                                      S3[1 : 8, 1 : 8] * lambda[3]);
+#>   
+#>   // prior for s(epiWeek):seriesSCBI_013...
+#>   b_raw[26 : 33] ~ multi_normal_prec(zero[26 : 33],
+#>                                      S4[1 : 8, 1 : 8] * lambda[4]);
+#>   
+#>   // prior for s(epiWeek):seriesSERC_001...
+#>   b_raw[34 : 41] ~ multi_normal_prec(zero[34 : 41],
+#>                                      S5[1 : 8, 1 : 8] * lambda[5]);
+#>   
+#>   // prior for s(epiWeek):seriesSERC_005...
+#>   b_raw[42 : 49] ~ multi_normal_prec(zero[42 : 49],
+#>                                      S6[1 : 8, 1 : 8] * lambda[6]);
+#>   
+#>   // prior for s(epiWeek):seriesSERC_006...
+#>   b_raw[50 : 57] ~ multi_normal_prec(zero[50 : 57],
+#>                                      S7[1 : 8, 1 : 8] * lambda[7]);
+#>   
+#>   // prior for s(epiWeek):seriesSERC_012...
+#>   b_raw[58 : 65] ~ multi_normal_prec(zero[58 : 65],
+#>                                      S8[1 : 8, 1 : 8] * lambda[8]);
+#>   
+#>   // prior (non-centred) for s(series)...
+#>   b_raw[66 : 73] ~ std_normal();
+#>   
+#>   // priors for AR parameters
+#>   ar1 ~ std_normal();
+#>   
+#>   // priors for smoothing parameters
+#>   lambda ~ normal(5, 30);
+#>   
+#>   // priors for latent trend variance parameters
+#>   sigma ~ student_t(3, 0, 2.5);
+#>   
+#>   // trend estimates
+#>   trend[1, 1 : n_series] ~ normal(0, sigma);
+#>   for (s in 1 : n_series) {
+#>     trend[2 : n, s] ~ normal(ar1[s] * trend[1 : (n - 1), s], sigma[s]);
+#>   }
+#>   {
+#>     // likelihood functions
+#>     vector[n_nonmissing] flat_trends;
+#>     flat_trends = to_vector(trend)[obs_ind];
+#>     flat_ys ~ poisson_log_glm(append_col(flat_xs, flat_trends), 0.0,
+#>                               append_row(b, 1.0));
+#>   }
+#> }
+#> generated quantities {
+#>   vector[total_obs] eta;
+#>   matrix[n, n_series] mus;
+#>   vector[n_sp] rho;
+#>   vector[n_series] tau;
+#>   array[n, n_series] int ypred;
+#>   rho = log(lambda);
+#>   for (s in 1 : n_series) {
+#>     tau[s] = pow(sigma[s], -2.0);
+#>   }
+#>   
+#>   // posterior predictions
+#>   eta = X * b;
+#>   for (s in 1 : n_series) {
+#>     mus[1 : n, s] = eta[ytimes[1 : n, s]] + trend[1 : n, s];
+#>     ypred[1 : n, s] = poisson_log_rng(mus[1 : n, s]);
+#>   }
+#> }
+
+
+
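
If you would like to experiment with the generated Stan code before sampling, one option is to compile and sample it yourself with cmdstanr. The snippet below is only a sketch, not an official mvgam workflow; it assumes the unfitted object stores its Stan program in the model_file slot alongside the model_data shown above:

+
# Write the generated Stan program to a temporary file, compile it,
# and sample using the data objects stored in the unfitted model
stan_file <- cmdstanr::write_stan_file(testmod$model_file)
+fit <- cmdstanr::cmdstan_model(stan_file)$sample(data = testmod$model_data,
+                                                 chains = 4)
+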

Interested in contributing?

+

I’m actively seeking PhD students and other researchers to work in the areas of ecological forecasting, multivariate model evaluation and development of mvgam. Please reach out if you are interested (n.clark’at’uq.edu.au).

+
diff --git a/inst/doc/forecast_evaluation.R b/inst/doc/forecast_evaluation.R
new file mode 100644
index 00000000..28942b03
--- /dev/null
+++ b/inst/doc/forecast_evaluation.R
@@ -0,0 +1,221 @@
+## ----echo = FALSE-------------------------------------------------------------
+NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true")
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>",
+  purl = NOT_CRAN,
+  eval = NOT_CRAN
+)
+
+## ----setup, include=FALSE-----------------------------------------------------
+knitr::opts_chunk$set(
+  echo = TRUE,
+  dpi = 150,
+  fig.asp = 0.8,
+  fig.width = 6,
+  out.width = "60%",
+  fig.align = "center")
+library(mvgam)
+library(ggplot2)
+theme_set(theme_bw(base_size = 12, base_family = 'serif'))
+
+## -----------------------------------------------------------------------------
+set.seed(2345)
+simdat <- sim_mvgam(T = 100,
+                    n_series = 3,
+                    trend_model = 'GP',
+                    prop_trend = 0.75,
+                    family = poisson(),
+                    prop_missing = 0.10)
+
+## -----------------------------------------------------------------------------
+str(simdat)
+
+## ----fig.alt = "Simulating data for dynamic GAM models in mvgam"--------------
+plot(simdat$global_seasonality[1:12],
+     type = 'l', lwd = 2,
+     ylab = 'Relative effect',
+     xlab = 'Season',
+     bty = 'l')
+
+## ----fig.alt = "Plotting time series features for GAM models in mvgam"--------
+plot_mvgam_series(data = simdat$data_train,
+                  series = 'all')
+
+## ----fig.alt = "Plotting time series features for GAM models in mvgam"--------
+plot_mvgam_series(data = simdat$data_train,
+                  newdata = simdat$data_test,
+                  series = 1)
+plot_mvgam_series(data = simdat$data_train,
+                  newdata = simdat$data_test,
+                  series = 2)
+plot_mvgam_series(data = simdat$data_train,
+                  newdata = simdat$data_test,
+                  series = 3)
+
+## ----include=FALSE------------------------------------------------------------
+mod1 <- mvgam(y ~ s(season, bs = 'cc', k = 8) +
+                s(time, by = series, bs = 'cr', k = 20),
+              knots = list(season = c(0.5, 12.5)),
+              trend_model = 'None',
+              data = simdat$data_train)
+
+## ----eval=FALSE---------------------------------------------------------------
+# mod1 <- mvgam(y ~ s(season, bs = 'cc', k = 8) +
+#                 s(time, by = series, bs = 'cr', k = 20),
+#               knots = list(season = c(0.5, 12.5)),
+#               trend_model = 'None',
+#               data = simdat$data_train)
+
+## -----------------------------------------------------------------------------
+summary(mod1, include_betas = FALSE)
+
+## ----fig.alt = "Plotting GAM smooth functions using mvgam"--------------------
+plot(mod1, type = 'smooths')
+
+## ----include=FALSE------------------------------------------------------------
+mod2 <- mvgam(y ~ s(season, bs = 'cc', k = 8) +
+                gp(time, by = series, c = 5/4, k = 20,
+                   scale = FALSE),
+              knots = list(season = c(0.5, 12.5)),
+              trend_model = 'None',
+              data = simdat$data_train,
+              adapt_delta = 0.98)
+
+## ----eval=FALSE---------------------------------------------------------------
+# mod2 <- mvgam(y ~ s(season, bs = 'cc', k = 8) +
+#                 gp(time, by = series, c = 5/4, k = 20,
+#                    scale = FALSE),
+#               knots = list(season = c(0.5, 12.5)),
+#               trend_model = 'None',
+#               data = simdat$data_train)
+
+## -----------------------------------------------------------------------------
+summary(mod2, include_betas = FALSE)
+
+## ----fig.alt = "Summarising latent Gaussian Process parameters in mvgam"------
+mcmc_plot(mod2, variable = c('alpha_gp'), regex = TRUE, type = 'areas')
+
+## ----fig.alt = "Summarising latent Gaussian Process parameters in mvgam"------
+mcmc_plot(mod2, variable = c('rho_gp'), regex = TRUE, type = 'areas')
+
+## ----fig.alt = "Plotting Gaussian Process effects in mvgam"-------------------
+plot(mod2, type = 'smooths')
+
+## ----fig.alt = "Summarising latent Gaussian Process parameters in mvgam and marginaleffects"----
+require('ggplot2')
+plot_predictions(mod2,
+                 condition = c('time', 'series', 'series'),
+                 type = 'link') +
+  theme(legend.position = 'none')
+
+## -----------------------------------------------------------------------------
+fc_mod1 <- forecast(mod1, newdata = simdat$data_test)
+fc_mod2 <- forecast(mod2, newdata = simdat$data_test)
+
+## -----------------------------------------------------------------------------
+str(fc_mod1)
+
+## -----------------------------------------------------------------------------
+plot(fc_mod1, series = 1)
+plot(fc_mod2, series = 1)
+
+plot(fc_mod1, series = 2)
+plot(fc_mod2, series = 2)
+
+plot(fc_mod1, series = 3)
+plot(fc_mod2, series = 3)
+
+## ----include=FALSE------------------------------------------------------------
+mod2 <- mvgam(y ~ s(season, bs = 'cc', k = 8) +
+                gp(time, by = series, c = 5/4, k = 20,
+                   scale = FALSE),
+              knots = list(season = c(0.5, 12.5)),
+              trend_model = 'None',
+              data = simdat$data_train,
+              newdata = simdat$data_test,
+              adapt_delta = 0.98)
+
+## ----eval=FALSE---------------------------------------------------------------
+# mod2 <- mvgam(y ~ s(season, bs = 'cc', k = 8) +
+#                 gp(time, by = series, c = 5/4, k = 20,
+#                    scale = FALSE),
+#               knots = list(season = c(0.5, 12.5)),
+#               trend_model = 'None',
+#               data = simdat$data_train,
+#               newdata = simdat$data_test)
+
+## -----------------------------------------------------------------------------
+fc_mod2 <- forecast(mod2)
+
+## ----warning=FALSE, fig.alt = "Plotting posterior forecast distributions using mvgam and R"----
+plot(fc_mod2, series = 1)
+
+## ----warning=FALSE------------------------------------------------------------
+crps_mod1 <- score(fc_mod1, score = 'crps')
+str(crps_mod1)
+crps_mod1$series_1
+
+## ----warning=FALSE------------------------------------------------------------
+crps_mod1 <- score(fc_mod1, score = 'crps', interval_width = 0.6)
+crps_mod1$series_1
+
+## -----------------------------------------------------------------------------
+link_mod1 <- forecast(mod1, newdata = simdat$data_test, type = 'link')
+score(link_mod1, score = 'elpd')$series_1
+
+## -----------------------------------------------------------------------------
+energy_mod2 <- score(fc_mod2, score = 'energy')
+str(energy_mod2)
+
+## -----------------------------------------------------------------------------
+energy_mod2$all_series
+
+## -----------------------------------------------------------------------------
+crps_mod1 <- score(fc_mod1, score = 'crps')
+crps_mod2 <- score(fc_mod2, score = 'crps')
+
+diff_scores <- crps_mod2$series_1$score -
+  crps_mod1$series_1$score
+plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred',
+     ylim = c(-1*max(abs(diff_scores), na.rm = TRUE),
+              max(abs(diff_scores), na.rm = TRUE)),
+     bty = 'l',
+     xlab = 'Forecast horizon',
+     ylab = expression(CRPS[GP]~-~CRPS[spline]))
+abline(h = 0, lty = 'dashed', lwd = 2)
+gp_better <- length(which(diff_scores < 0))
+title(main = paste0('GP better in ', gp_better, ' of 25 evaluations',
+                    '\nMean difference = ',
+                    round(mean(diff_scores, na.rm = TRUE), 2)))
+
+
+diff_scores <- crps_mod2$series_2$score -
+  crps_mod1$series_2$score
+plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred',
+     ylim = c(-1*max(abs(diff_scores), na.rm = TRUE),
+              max(abs(diff_scores), na.rm = TRUE)),
+     bty = 'l',
+     xlab = 'Forecast horizon',
+     ylab = expression(CRPS[GP]~-~CRPS[spline]))
+abline(h = 0, lty = 'dashed', lwd = 2)
+gp_better <- length(which(diff_scores < 0))
+title(main = paste0('GP better in ', gp_better, ' of 25 evaluations',
+                    '\nMean difference = ',
+                    round(mean(diff_scores, na.rm = TRUE), 2)))
+
+diff_scores <- crps_mod2$series_3$score -
+  crps_mod1$series_3$score
+plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred',
+     ylim = c(-1*max(abs(diff_scores), na.rm = TRUE),
+              max(abs(diff_scores), na.rm = TRUE)),
+     bty = 'l',
+     xlab = 'Forecast horizon',
+     ylab = expression(CRPS[GP]~-~CRPS[spline]))
+abline(h = 0, lty = 'dashed', lwd = 2)
+gp_better <- length(which(diff_scores < 0))
+title(main = paste0('GP better in ', gp_better, ' of 25 evaluations',
+                    '\nMean difference = ',
+                    round(mean(diff_scores, na.rm = TRUE), 2)))
+
diff --git a/inst/doc/forecast_evaluation.Rmd b/inst/doc/forecast_evaluation.Rmd
new file mode 100644
index 00000000..36ea6227
--- /dev/null
+++ b/inst/doc/forecast_evaluation.Rmd
@@ -0,0 +1,314 @@
+---
+title: "Forecasting and forecast evaluation in mvgam"
+author: "Nicholas J Clark"
+date: "`r Sys.Date()`"
+output:
+  rmarkdown::html_vignette:
+    toc: yes
+vignette: >
+  %\VignetteIndexEntry{Forecasting and forecast evaluation in mvgam}
+  %\VignetteEngine{knitr::rmarkdown}
+  \usepackage[utf8]{inputenc}
+---
+```{r, echo = FALSE}
+NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true")
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>",
+  purl = NOT_CRAN,
+  eval = NOT_CRAN
+)
+```
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(
+  echo = TRUE,
+  dpi = 150,
+  fig.asp = 0.8,
+  fig.width = 6,
+  out.width = "60%",
+  fig.align = "center")
+library(mvgam)
+library(ggplot2)
+theme_set(theme_bw(base_size = 12, base_family = 'serif'))
+```
+
+The purpose of this vignette is to show how the `mvgam` package can be used to produce probabilistic forecasts and to evaluate those forecasts using a variety of proper scoring rules.
+
+## Simulating discrete time series
+We begin by simulating some data to show how forecasts are computed and evaluated in `mvgam`. The `sim_mvgam()` function can be used to simulate series that come from a variety of response distributions as well as seasonal patterns and/or dynamic temporal patterns. Here we simulate a collection of three count-valued time series. These series all share the same seasonal pattern but have different temporal dynamics. By setting `trend_model = 'GP'` and `prop_trend = 0.75`, we are generating time series that have smooth underlying temporal trends (evolving as Gaussian Processes with squared exponential kernels) and moderate seasonal patterns. The observations are Poisson-distributed and we allow 10% of observations to be missing.
+```{r}
+set.seed(2345)
+simdat <- sim_mvgam(T = 100,
+                    n_series = 3,
+                    trend_model = 'GP',
+                    prop_trend = 0.75,
+                    family = poisson(),
+                    prop_missing = 0.10)
+```
+
+The returned object is a `list` containing training and testing data (`sim_mvgam()` automatically splits the data into these folds for us) together with some other information about the data generating process that was used to simulate the data:
+```{r}
+str(simdat)
+```
+
+Each series in this case has a shared seasonal pattern, which we can visualise:
+```{r, fig.alt = "Simulating data for dynamic GAM models in mvgam"}
+plot(simdat$global_seasonality[1:12],
+     type = 'l', lwd = 2,
+     ylab = 'Relative effect',
+     xlab = 'Season',
+     bty = 'l')
+```
+
+The resulting time series are similar to what we might encounter when dealing with count-valued data dominated by small counts:
+```{r, fig.alt = "Plotting time series features for GAM models in mvgam"}
+plot_mvgam_series(data = simdat$data_train,
+                  series = 'all')
+```
+
+For each individual series, we can plot the training and testing data, as well as some more specific features of the observed data:
+```{r, fig.alt = "Plotting time series features for GAM models in mvgam"}
+plot_mvgam_series(data = simdat$data_train,
+                  newdata = simdat$data_test,
+                  series = 1)
+plot_mvgam_series(data = simdat$data_train,
+                  newdata = simdat$data_test,
+                  series = 2)
+plot_mvgam_series(data = simdat$data_train,
+                  newdata = simdat$data_test,
+                  series = 3)
+```
+
+### Modelling dynamics with splines
+The first model we will fit uses a shared cyclic spline to capture the repeated seasonality, as well as series-specific splines of time to capture the long-term dynamics. We allow the temporal splines to be fairly complex so they can capture as much of the temporal variation as possible:
+```{r include=FALSE}
+mod1 <- mvgam(y ~ s(season, bs = 'cc', k = 8) +
+                s(time, by = series, bs = 'cr', k = 20),
+              knots = list(season = c(0.5, 12.5)),
+              trend_model = 'None',
+              data = simdat$data_train)
+```
+
+```{r eval=FALSE}
+mod1 <- mvgam(y ~ s(season, bs = 'cc', k = 8) +
+                s(time, by = series, bs = 'cr', k = 20),
+              knots = list(season = c(0.5, 12.5)),
+              trend_model = 'None',
+              data = simdat$data_train)
+```
+
+The model fits without issue:
+```{r}
+summary(mod1, include_betas = FALSE)
+```
+
+And we can plot the partial effects of the splines to see that they are estimated to be highly nonlinear:
+```{r, fig.alt = "Plotting GAM smooth functions using mvgam"}
+plot(mod1, type = 'smooths')
+```
+
+### Modelling dynamics with GPs
+Before showing how to produce and evaluate forecasts, we will fit a second model to these data so the two models can be compared. This model is equivalent to the above, except we now use Gaussian Processes to model series-specific dynamics. This makes use of the `gp()` function from `brms`, which can fit Hilbert space approximate GPs. See `?brms::gp` for more details.
+```{r include=FALSE}
+mod2 <- mvgam(y ~ s(season, bs = 'cc', k = 8) +
+                gp(time, by = series, c = 5/4, k = 20,
+                   scale = FALSE),
+              knots = list(season = c(0.5, 12.5)),
+              trend_model = 'None',
+              data = simdat$data_train,
+              adapt_delta = 0.98)
+```
+
+```{r eval=FALSE}
+mod2 <- mvgam(y ~ s(season, bs = 'cc', k = 8) +
+                gp(time, by = series, c = 5/4, k = 20,
+                   scale = FALSE),
+              knots = list(season = c(0.5, 12.5)),
+              trend_model = 'None',
+              data = simdat$data_train)
+```
+
+The summary for this model now contains information on the GP parameters for each time series:
+```{r}
+summary(mod2, include_betas = FALSE)
+```
+
+We can plot the posteriors for these parameters, and for any other parameter for that matter, using `bayesplot` routines. First the marginal deviation ($\alpha$) parameters:
+```{r, fig.alt = "Summarising latent Gaussian Process parameters in mvgam"}
+mcmc_plot(mod2, variable = c('alpha_gp'), regex = TRUE, type = 'areas')
+```
+
+And now the length scale ($\rho$) parameters:
+```{r, fig.alt = "Summarising latent Gaussian Process parameters in mvgam"}
+mcmc_plot(mod2, variable = c('rho_gp'), regex = TRUE, type = 'areas')
+```
+
+We can also plot the nonlinear effects as before:
+```{r, fig.alt = "Plotting Gaussian Process effects in mvgam"}
+plot(mod2, type = 'smooths')
+```
+
+These can also be plotted using `marginaleffects` utilities:
+```{r, fig.alt = "Summarising latent Gaussian Process parameters in mvgam and marginaleffects"}
+require('ggplot2')
+plot_predictions(mod2,
+                 condition = c('time', 'series', 'series'),
+                 type = 'link') +
+  theme(legend.position = 'none')
+```
+
+The estimates for the temporal trends are fairly similar for the two models, but below we will see if they produce similar forecasts.
+
+## Forecasting with the `forecast()` function
+Probabilistic forecasts can be computed in two main ways in `mvgam`. The first is to take a model that was fit only to training data (as we did above in the two example models) and produce temporal predictions from the posterior predictive distribution by feeding `newdata` to the `forecast()` function. It is crucial that any `newdata` fed to the `forecast()` function follows on sequentially from the data that were used to fit the model (the package does not check this internally, as doing so is difficult when data are not supplied in a strict time order). When calling the `forecast()` function, you have the option to generate different kinds of predictions (i.e. predicting on the link scale, response scale or to produce expectations; see `?forecast.mvgam` for details). We will use the default and produce forecasts on the response scale, which is the most common way to evaluate forecast distributions:
+```{r}
+fc_mod1 <- forecast(mod1, newdata = simdat$data_test)
+fc_mod2 <- forecast(mod2, newdata = simdat$data_test)
+```
+
+The objects we have created are of class `mvgam_forecast`, which contain information on hindcast distributions, forecast distributions and true observations for each series in the data:
+```{r}
+str(fc_mod1)
+```
+
+We can plot the forecasts for each series from each model using the `S3 plot` method for objects of this class:
+```{r}
+plot(fc_mod1, series = 1)
+plot(fc_mod2, series = 1)
+
+plot(fc_mod1, series = 2)
+plot(fc_mod2, series = 2)
+
+plot(fc_mod1, series = 3)
+plot(fc_mod2, series = 3)
+```
+
+Clearly the two models do not produce equivalent forecasts. We will come back to scoring these forecasts in a moment.
+
+## Forecasting with `newdata` in `mvgam()`
+The second way we can produce forecasts in `mvgam` is to feed the testing data directly to the `mvgam()` function as `newdata`. This will include the testing data as missing observations so that they are automatically predicted from the posterior predictive distribution using the `generated quantities` block in `Stan`. As an example, we can refit `mod2` but include the testing data for automatic forecasts:
+```{r include=FALSE}
+mod2 <- mvgam(y ~ s(season, bs = 'cc', k = 8) +
+                gp(time, by = series, c = 5/4, k = 20,
+                   scale = FALSE),
+              knots = list(season = c(0.5, 12.5)),
+              trend_model = 'None',
+              data = simdat$data_train,
+              newdata = simdat$data_test,
+              adapt_delta = 0.98)
+```
+
+```{r eval=FALSE}
+mod2 <- mvgam(y ~ s(season, bs = 'cc', k = 8) +
+                gp(time, by = series, c = 5/4, k = 20,
+                   scale = FALSE),
+              knots = list(season = c(0.5, 12.5)),
+              trend_model = 'None',
+              data = simdat$data_train,
+              newdata = simdat$data_test)
+```
+
+Because the model already contains a forecast distribution, we do not need to feed `newdata` to the `forecast()` function:
+```{r}
+fc_mod2 <- forecast(mod2)
+```
+
+The forecasts will be nearly identical to those calculated previously:
+```{r warning=FALSE, fig.alt = "Plotting posterior forecast distributions using mvgam and R"}
+plot(fc_mod2, series = 1)
+```
+
+## Scoring forecast distributions
+A primary purpose of the `mvgam_forecast` class is to readily allow forecast evaluations for each series in the data, using a variety of possible scoring functions. See `?mvgam::score.mvgam_forecast` to view the types of scores that are available. A useful scoring metric is the [Continuous Ranked Probability Score (CRPS)](https://www.annualreviews.org/content/journals/10.1146/annurev-statistics-062713-085831){target="_blank"}. A CRPS value is similar to what we might get if we calculated a weighted absolute error using the full forecast distribution.
+```{r warning=FALSE}
+crps_mod1 <- score(fc_mod1, score = 'crps')
+str(crps_mod1)
+crps_mod1$series_1
+```
+
+The returned list contains a `data.frame` for each series in the data that shows the CRPS score for each evaluation in the testing data, along with some other useful information about the fit of the forecast distribution. In particular, we are given a logical value (1s and 0s) telling us whether the true value was within a pre-specified credible interval (i.e. the coverage of the forecast distribution). The default interval width is 0.9, so we would hope that the values in the `in_interval` column take a 1 approximately 90% of the time. This value can be changed if you wish to compute different coverages, say using a 60% interval:
+```{r warning=FALSE}
+crps_mod1 <- score(fc_mod1, score = 'crps', interval_width = 0.6)
+crps_mod1$series_1
+```
+
+We can also compare forecasts against out of sample observations using the [Expected Log Predictive Density (ELPD; also known as the log score)](https://link.springer.com/article/10.1007/s11222-016-9696-4){target="_blank"}. The ELPD is a strictly proper scoring rule that can be applied to any distributional forecast, but to compute it we need predictions on the link scale rather than on the outcome scale. This is where it is advantageous to change the type of prediction we can get using the `forecast()` function:
+```{r}
+link_mod1 <- forecast(mod1, newdata = simdat$data_test, type = 'link')
+score(link_mod1, score = 'elpd')$series_1
+```
+
+Finally, when we have multiple time series it may also make sense to use a multivariate proper scoring rule. `mvgam` offers two such options: the Energy score and the Variogram score. The first penalizes forecast distributions that are less well calibrated against the truth, while the second penalizes forecasts that do not capture the observed true correlation structure. Which score to use depends on your goals, but both are very easy to compute:
+```{r}
+energy_mod2 <- score(fc_mod2, score = 'energy')
+str(energy_mod2)
+```
+
+The returned object still provides information on interval coverage for each individual series, but there is only a single score per horizon now (which is provided in the `all_series` slot):
+```{r}
+energy_mod2$all_series
+```
+
+You can use your score(s) of choice to compare different models. For example, we can compute and plot the difference in CRPS scores for each series in the data. Here, a negative value means the Gaussian Process model (`mod2`) is better, while a positive value means the spline model (`mod1`) is better.
+```{r}
+crps_mod1 <- score(fc_mod1, score = 'crps')
+crps_mod2 <- score(fc_mod2, score = 'crps')
+
+diff_scores <- crps_mod2$series_1$score -
+  crps_mod1$series_1$score
+plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred',
+     ylim = c(-1*max(abs(diff_scores), na.rm = TRUE),
+              max(abs(diff_scores), na.rm = TRUE)),
+     bty = 'l',
+     xlab = 'Forecast horizon',
+     ylab = expression(CRPS[GP]~-~CRPS[spline]))
+abline(h = 0, lty = 'dashed', lwd = 2)
+gp_better <- length(which(diff_scores < 0))
+title(main = paste0('GP better in ', gp_better, ' of 25 evaluations',
+                    '\nMean difference = ',
+                    round(mean(diff_scores, na.rm = TRUE), 2)))
+
+
+diff_scores <- crps_mod2$series_2$score -
+  crps_mod1$series_2$score
+plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred',
+     ylim = c(-1*max(abs(diff_scores), na.rm = TRUE),
+              max(abs(diff_scores), na.rm = TRUE)),
+     bty = 'l',
+     xlab = 'Forecast horizon',
+     ylab = expression(CRPS[GP]~-~CRPS[spline]))
+abline(h = 0, lty = 'dashed', lwd = 2)
+gp_better <- length(which(diff_scores < 0))
+title(main = paste0('GP better in ', gp_better, ' of 25 evaluations',
+                    '\nMean difference = ',
+                    round(mean(diff_scores, na.rm = TRUE), 2)))
+
+diff_scores <- crps_mod2$series_3$score -
+  crps_mod1$series_3$score
+plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred',
+     ylim = c(-1*max(abs(diff_scores), na.rm = TRUE),
+              max(abs(diff_scores), na.rm = TRUE)),
+     bty = 'l',
+     xlab = 'Forecast horizon',
+     ylab = expression(CRPS[GP]~-~CRPS[spline]))
+abline(h = 0, lty = 'dashed', lwd = 2)
+gp_better <- length(which(diff_scores < 0))
+title(main = paste0('GP better in ', gp_better, ' of 25 evaluations',
+                    '\nMean difference = ',
+                    round(mean(diff_scores, na.rm = TRUE), 2)))
+
+```
+
+The GP model consistently gives better forecasts, and the difference between scores grows quickly as the forecast horizon increases. This is not unexpected given the way that splines linearly extrapolate outside the range of training data.
+
+## Further reading
+The following papers and resources offer useful material about Bayesian forecasting and proper scoring rules:
+
+Hyndman, Rob J., and George Athanasopoulos. [Forecasting: principles and practice](https://otexts.com/fpp3/distaccuracy.html). *OTexts*, 2018.
+
+Gneiting, Tilmann, and Adrian E. Raftery. [Strictly proper scoring rules, prediction, and estimation](https://www.tandfonline.com/doi/abs/10.1198/016214506000001437). *Journal of the American Statistical Association* 102.477 (2007): 359-378.
+
+Simonis, Juniper L., Ethan P. White, and S. K. Morgan Ernest. [Evaluating probabilistic ecological forecasts](https://esajournals.onlinelibrary.wiley.com/doi/full/10.1002/ecy.3431). *Ecology* 102.8 (2021): e03431.
+
+## Interested in contributing?
+I'm actively seeking PhD students and other researchers to work in the areas of ecological forecasting, multivariate model evaluation and development of `mvgam`. Please reach out if you are interested (n.clark'at'uq.edu.au).
diff --git a/inst/doc/forecast_evaluation.html b/inst/doc/forecast_evaluation.html
new file mode 100644
index 00000000..570bab86
--- /dev/null
+++ b/inst/doc/forecast_evaluation.html
@@ -0,0 +1,1020 @@
+
+Forecasting and forecast evaluation in mvgam
+

Forecasting and forecast evaluation in +mvgam

+

Nicholas J Clark

+

2024-04-18

+

The purpose of this vignette is to show how the mvgam +package can be used to produce probabilistic forecasts and to evaluate +those forecasts using a variety of proper scoring rules.

+
+

Simulating discrete time series

+

We begin by simulating some data to show how forecasts are computed and evaluated in mvgam. The sim_mvgam() function can be used to simulate series that come from a variety of response distributions as well as seasonal patterns and/or dynamic temporal patterns. Here we simulate a collection of three count-valued time series. These series all share the same seasonal pattern but have different temporal dynamics. By setting trend_model = 'GP' and prop_trend = 0.75, we are generating time series that have smooth underlying temporal trends (evolving as Gaussian Processes with squared exponential kernels) and moderate seasonal patterns. The observations are Poisson-distributed and we allow 10% of observations to be missing.

+
set.seed(2345)
+simdat <- sim_mvgam(T = 100, 
+                    n_series = 3, 
+                    trend_model = 'GP',
+                    prop_trend = 0.75,
+                    family = poisson(),
+                    prop_missing = 0.10)
+

The returned object is a list containing training and testing data (sim_mvgam() automatically splits the data into these folds for us) together with some other information about the data generating process that was used to simulate the data:

+
str(simdat)
+#> List of 6
+#>  $ data_train        :'data.frame':  225 obs. of  5 variables:
+#>   ..$ y     : int [1:225] 0 1 3 0 0 0 1 0 3 1 ...
+#>   ..$ season: int [1:225] 1 1 1 2 2 2 3 3 3 4 ...
+#>   ..$ year  : int [1:225] 1 1 1 1 1 1 1 1 1 1 ...
+#>   ..$ series: Factor w/ 3 levels "series_1","series_2",..: 1 2 3 1 2 3 1 2 3 1 ...
+#>   ..$ time  : int [1:225] 1 1 1 2 2 2 3 3 3 4 ...
+#>  $ data_test         :'data.frame':  75 obs. of  5 variables:
+#>   ..$ y     : int [1:75] 0 1 1 0 0 0 2 2 0 NA ...
+#>   ..$ season: int [1:75] 4 4 4 5 5 5 6 6 6 7 ...
+#>   ..$ year  : int [1:75] 7 7 7 7 7 7 7 7 7 7 ...
+#>   ..$ series: Factor w/ 3 levels "series_1","series_2",..: 1 2 3 1 2 3 1 2 3 1 ...
+#>   ..$ time  : int [1:75] 76 76 76 77 77 77 78 78 78 79 ...
+#>  $ true_corrs        : num [1:3, 1:3] 1 0.465 -0.577 0.465 1 ...
+#>  $ true_trends       : num [1:100, 1:3] -1.45 -1.54 -1.61 -1.67 -1.73 ...
+#>  $ global_seasonality: num [1:100] 0.0559 0.6249 1.3746 1.6805 0.5246 ...
+#>  $ trend_params      :List of 2
+#>   ..$ alpha: num [1:3] 0.767 0.988 0.897
+#>   ..$ rho  : num [1:3] 6.02 6.94 5.04
+

Each series in this case has a shared seasonal pattern, which we can +visualise:

+
plot(simdat$global_seasonality[1:12], 
+     type = 'l', lwd = 2,
+     ylab = 'Relative effect',
+     xlab = 'Season',
+     bty = 'l')
+

Simulating data for dynamic GAM models in mvgam

+

The resulting time series are similar to what we might encounter when dealing with count-valued data dominated by small counts:

+
plot_mvgam_series(data = simdat$data_train, 
+                  series = 'all')
+

Plotting time series features for GAM models in mvgam

+

For each individual series, we can plot the training and testing +data, as well as some more specific features of the observed data:

+
plot_mvgam_series(data = simdat$data_train, 
+                  newdata = simdat$data_test,
+                  series = 1)
+

Plotting time series features for GAM models in mvgam

+
plot_mvgam_series(data = simdat$data_train, 
+                  newdata = simdat$data_test,
+                  series = 2)
+

Plotting time series features for GAM models in mvgam

+
plot_mvgam_series(data = simdat$data_train, 
+                  newdata = simdat$data_test,
+                  series = 3)
+

Plotting time series features for GAM models in mvgam

+
+

Modelling dynamics with splines

+

The first model we will fit uses a shared cyclic spline to capture +the repeated seasonality, as well as series-specific splines of time to +capture the long-term dynamics. We allow the temporal splines to be +fairly complex so they can capture as much of the temporal variation as +possible:

+
mod1 <- mvgam(y ~ s(season, bs = 'cc', k = 8) + 
+                s(time, by = series, bs = 'cr', k = 20),
+              knots = list(season = c(0.5, 12.5)),
+              trend_model = 'None',
+              data = simdat$data_train)
+

The model fits without issue:

+
summary(mod1, include_betas = FALSE)
+#> GAM formula:
+#> y ~ s(season, bs = "cc", k = 8) + s(time, by = series, bs = "cr", 
+#>     k = 20)
+#> <environment: 0x0000029cbf2b3570>
+#> 
+#> Family:
+#> poisson
+#> 
+#> Link function:
+#> log
+#> 
+#> Trend model:
+#> None
+#> 
+#> N series:
+#> 3 
+#> 
+#> N timepoints:
+#> 75 
+#> 
+#> Status:
+#> Fitted using Stan 
+#> 4 chains, each with iter = 1000; warmup = 500; thin = 1 
+#> Total post-warmup draws = 2000
+#> 
+#> 
+#> GAM coefficient (beta) estimates:
+#>              2.5%   50%  97.5% Rhat n_eff
+#> (Intercept) -0.41 -0.21 -0.039    1   813
+#> 
+#> Approximate significance of GAM smooths:
+#>                         edf Ref.df Chi.sq p-value    
+#> s(season)              3.77      6   21.8   0.004 ** 
+#> s(time):seriesseries_1 6.50     19   15.3   0.848    
+#> s(time):seriesseries_2 9.49     19  226.0  <2e-16 ***
+#> s(time):seriesseries_3 5.93     19   18.3   0.867    
+#> ---
+#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+#> 
+#> Stan MCMC diagnostics:
+#> n_eff / iter looks reasonable for all parameters
+#> Rhat looks reasonable for all parameters
+#> 0 of 2000 iterations ended with a divergence (0%)
+#> 0 of 2000 iterations saturated the maximum tree depth of 12 (0%)
+#> E-FMI indicated no pathological behavior
+#> 
+#> Samples were drawn using NUTS(diag_e) at Thu Apr 18 8:31:33 PM 2024.
+#> For each parameter, n_eff is a crude measure of effective sample size,
+#> and Rhat is the potential scale reduction factor on split MCMC chains
+#> (at convergence, Rhat = 1)
+

And we can plot the partial effects of the splines to see that they are estimated to be highly nonlinear:

+
plot(mod1, type = 'smooths')
+

Plotting GAM smooth functions using mvgam

+
+
+

Modelling dynamics with GPs

+

Before showing how to produce and evaluate forecasts, we will fit a +second model to these data so the two models can be compared. This model +is equivalent to the above, except we now use Gaussian Processes to +model series-specific dynamics. This makes use of the gp() +function from brms, which can fit Hilbert space approximate +GPs. See ?brms::gp for more details.

+
mod2 <- mvgam(y ~ s(season, bs = 'cc', k = 8) + 
+                gp(time, by = series, c = 5/4, k = 20,
+                   scale = FALSE),
+              knots = list(season = c(0.5, 12.5)),
+              trend_model = 'None',
+              data = simdat$data_train)
+

The summary for this model now contains information on the GP +parameters for each time series:

+
summary(mod2, include_betas = FALSE)
+#> GAM formula:
+#> y ~ s(season, bs = "cc", k = 8) + gp(time, by = series, c = 5/4, 
+#>     k = 20, scale = FALSE)
+#> <environment: 0x0000029cbf2b3570>
+#> 
+#> Family:
+#> poisson
+#> 
+#> Link function:
+#> log
+#> 
+#> Trend model:
+#> None
+#> 
+#> N series:
+#> 3 
+#> 
+#> N timepoints:
+#> 75 
+#> 
+#> Status:
+#> Fitted using Stan 
+#> 4 chains, each with iter = 1000; warmup = 500; thin = 1 
+#> Total post-warmup draws = 2000
+#> 
+#> 
+#> GAM coefficient (beta) estimates:
+#>             2.5%   50% 97.5% Rhat n_eff
+#> (Intercept) -1.1 -0.52  0.31    1   768
+#> 
+#> GAM gp term marginal deviation (alpha) and length scale (rho) estimates:
+#>                               2.5%  50% 97.5% Rhat n_eff
+#> alpha_gp(time):seriesseries_1 0.21  0.8   2.1 1.01   763
+#> alpha_gp(time):seriesseries_2 0.74  1.4   2.9 1.00  1028
+#> alpha_gp(time):seriesseries_3 0.50  1.1   2.8 1.00  1026
+#> rho_gp(time):seriesseries_1   1.20  5.1  23.0 1.00   681
+#> rho_gp(time):seriesseries_2   2.20 10.0  17.0 1.00   644
+#> rho_gp(time):seriesseries_3   1.50  8.8  23.0 1.00   819
+#> 
+#> Approximate significance of GAM smooths:
+#>            edf Ref.df Chi.sq p-value    
+#> s(season) 4.12      6   25.9 0.00053 ***
+#> ---
+#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+#> 
+#> Stan MCMC diagnostics:
+#> n_eff / iter looks reasonable for all parameters
+#> Rhat looks reasonable for all parameters
+#> 4 of 2000 iterations ended with a divergence (0.2%)
+#>  *Try running with larger adapt_delta to remove the divergences
+#> 0 of 2000 iterations saturated the maximum tree depth of 12 (0%)
+#> E-FMI indicated no pathological behavior
+#> 
+#> Samples were drawn using NUTS(diag_e) at Thu Apr 18 8:33:03 PM 2024.
+#> For each parameter, n_eff is a crude measure of effective sample size,
+#> and Rhat is the potential scale reduction factor on split MCMC chains
+#> (at convergence, Rhat = 1)
+

We can plot the posteriors for these parameters, and for any other +parameter for that matter, using bayesplot routines. First +the marginal deviation (\(\alpha\)) +parameters:

+
mcmc_plot(mod2, variable = c('alpha_gp'), regex = TRUE, type = 'areas')
+

Summarising latent Gaussian Process parameters in mvgam

+

And now the length scale (\(\rho\)) +parameters:

+
mcmc_plot(mod2, variable = c('rho_gp'), regex = TRUE, type = 'areas')
+

Summarising latent Gaussian Process parameters in mvgam

+

We can also plot the nonlinear effects as before:

+
plot(mod2, type = 'smooths')
+

Plotting Gaussian Process effects in mvgam

These can also be plotted using marginaleffects utilities:

+
require('ggplot2')
+plot_predictions(mod2, 
+                 condition = c('time', 'series', 'series'),
+                 type = 'link') +
+  theme(legend.position = 'none')
+

Summarising latent Gaussian Process parameters in mvgam and marginaleffects

+

The estimates for the temporal trends are fairly similar for the two models, but below we will see if they produce similar forecasts.

+
+
+
+

Forecasting with the forecast() function

+

Probabilistic forecasts can be computed in two main ways in mvgam. The first is to take a model that was fit only to training data (as we did above in the two example models) and produce temporal predictions from the posterior predictive distribution by feeding newdata to the forecast() function. It is crucial that any newdata fed to the forecast() function follows on sequentially from the data that were used to fit the model (the package does not check this internally, as doing so is difficult when data are not supplied in a strict time order). When calling the forecast() function, you have the option to generate different kinds of predictions (i.e. predicting on the link scale, response scale or to produce expectations; see ?forecast.mvgam for details). We will use the default and produce forecasts on the response scale, which is the most common way to evaluate forecast distributions:

+
fc_mod1 <- forecast(mod1, newdata = simdat$data_test)
+fc_mod2 <- forecast(mod2, newdata = simdat$data_test)
+

The objects we have created are of class mvgam_forecast, +which contain information on hindcast distributions, forecast +distributions and true observations for each series in the data:

+
str(fc_mod1)
+#> List of 16
+#>  $ call              :Class 'formula'  language y ~ s(season, bs = "cc", k = 8) + s(time, by = series, bs = "cr", k = 20)
+#>   .. ..- attr(*, ".Environment")=<environment: 0x0000029cbf2b3570> 
+#>  $ trend_call        : NULL
+#>  $ family            : chr "poisson"
+#>  $ family_pars       : NULL
+#>  $ trend_model       : chr "None"
+#>  $ drift             : logi FALSE
+#>  $ use_lv            : logi FALSE
+#>  $ fit_engine        : chr "stan"
+#>  $ type              : chr "response"
+#>  $ series_names      : Factor w/ 3 levels "series_1","series_2",..: 1 2 3
+#>  $ train_observations:List of 3
+#>   ..$ series_1: int [1:75] 0 0 1 1 0 0 0 0 0 0 ...
+#>   ..$ series_2: int [1:75] 1 0 0 1 1 0 1 0 1 2 ...
+#>   ..$ series_3: int [1:75] 3 0 3 NA 2 1 1 1 1 3 ...
+#>  $ train_times       : int [1:75] 1 2 3 4 5 6 7 8 9 10 ...
+#>  $ test_observations :List of 3
+#>   ..$ series_1: int [1:25] 0 0 2 NA 0 2 2 1 1 1 ...
+#>   ..$ series_2: int [1:25] 1 0 2 1 1 3 0 1 0 NA ...
+#>   ..$ series_3: int [1:25] 1 0 0 1 0 0 1 0 1 0 ...
+#>  $ test_times        : int [1:25] 76 77 78 79 80 81 82 83 84 85 ...
+#>  $ hindcasts         :List of 3
+#>   ..$ series_1: num [1:2000, 1:75] 1 1 0 0 0 1 1 1 0 0 ...
+#>   .. ..- attr(*, "dimnames")=List of 2
+#>   .. .. ..$ : NULL
+#>   .. .. ..$ : chr [1:75] "ypred[1,1]" "ypred[2,1]" "ypred[3,1]" "ypred[4,1]" ...
+#>   ..$ series_2: num [1:2000, 1:75] 0 0 0 0 0 0 0 1 0 0 ...
+#>   .. ..- attr(*, "dimnames")=List of 2
+#>   .. .. ..$ : NULL
+#>   .. .. ..$ : chr [1:75] "ypred[1,2]" "ypred[2,2]" "ypred[3,2]" "ypred[4,2]" ...
+#>   ..$ series_3: num [1:2000, 1:75] 3 0 2 1 0 1 2 1 5 1 ...
+#>   .. ..- attr(*, "dimnames")=List of 2
+#>   .. .. ..$ : NULL
+#>   .. .. ..$ : chr [1:75] "ypred[1,3]" "ypred[2,3]" "ypred[3,3]" "ypred[4,3]" ...
+#>  $ forecasts         :List of 3
+#>   ..$ series_1: num [1:2000, 1:25] 1 3 2 1 0 0 1 1 0 0 ...
+#>   ..$ series_2: num [1:2000, 1:25] 6 0 0 0 0 2 0 0 0 0 ...
+#>   ..$ series_3: num [1:2000, 1:25] 0 1 1 3 3 1 3 2 4 2 ...
+#>  - attr(*, "class")= chr "mvgam_forecast"
+
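
The posterior draws themselves are easy to work with directly. For example, the forecasts slot stores one matrix per series, with a row for each posterior draw and a column for each forecast horizon:

+
dim(fc_mod1$forecasts$series_1)
+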

We can plot the forecasts for each series from each model using the +S3 plot method for objects of this class:

+
plot(fc_mod1, series = 1)
+

+
#> Out of sample CRPS:
+#> [1] 14.62964
+plot(fc_mod2, series = 1)
+

+
#> Out of sample DRPS:
+#> [1] 10.92516
+
+plot(fc_mod1, series = 2)
+

+
#> Out of sample CRPS:
+#> [1] 84201962708
+plot(fc_mod2, series = 2)
+

+
#> Out of sample DRPS:
+#> [1] 14.31152
+
+plot(fc_mod1, series = 3)
+

+
#> Out of sample CRPS:
+#> [1] 32.44136
+plot(fc_mod2, series = 3)
+

+
#> Out of sample DRPS:
+#> [1] 15.44332
+

Clearly the two models do not produce equivalent forecasts. We will +come back to scoring these forecasts in a moment.

+
+
+

Forecasting with newdata in mvgam()

+

The second way we can produce forecasts in mvgam is to +feed the testing data directly to the mvgam() function as +newdata. This will include the testing data as missing +observations so that they are automatically predicted from the posterior +predictive distribution using the generated quantities +block in Stan. As an example, we can refit +mod2 but include the testing data for automatic +forecasts:

+
mod2 <- mvgam(y ~ s(season, bs = 'cc', k = 8) + 
+                gp(time, by = series, c = 5/4, k = 20,
+                   scale = FALSE),
+              knots = list(season = c(0.5, 12.5)),
+              trend_model = 'None',
+              data = simdat$data_train,
+              newdata = simdat$data_test)
+

Because the model already contains a forecast distribution, we do not +need to feed newdata to the forecast() +function:

+
fc_mod2 <- forecast(mod2)
+

The forecasts will be nearly identical to those calculated +previously:

+
plot(fc_mod2, series = 1)
+

Plotting posterior forecast distributions using mvgam and R

+
#> Out of sample DRPS:
+#> [1] 10.85762
+
+
+

Scoring forecast distributions

+

A primary purpose of the mvgam_forecast class is to readily allow forecast evaluations for each series in the data, using a variety of possible scoring functions. See ?mvgam::score.mvgam_forecast to view the types of scores that are available. A useful scoring metric is the Continuous Ranked Probability Score (CRPS). A CRPS value is similar to what we might get if we calculated a weighted absolute error using the full forecast distribution.

+
crps_mod1 <- score(fc_mod1, score = 'crps')
+str(crps_mod1)
+#> List of 4
+#>  $ series_1  :'data.frame':  25 obs. of  5 variables:
+#>   ..$ score         : num [1:25] 0.1938 0.1366 1.355 NA 0.0348 ...
+#>   ..$ in_interval   : num [1:25] 1 1 1 NA 1 1 1 1 1 1 ...
+#>   ..$ interval_width: num [1:25] 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 ...
+#>   ..$ eval_horizon  : int [1:25] 1 2 3 4 5 6 7 8 9 10 ...
+#>   ..$ score_type    : chr [1:25] "crps" "crps" "crps" "crps" ...
+#>  $ series_2  :'data.frame':  25 obs. of  5 variables:
+#>   ..$ score         : num [1:25] 0.379 0.306 0.941 0.5 0.573 ...
+#>   ..$ in_interval   : num [1:25] 1 1 1 1 1 1 1 1 1 NA ...
+#>   ..$ interval_width: num [1:25] 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 ...
+#>   ..$ eval_horizon  : int [1:25] 1 2 3 4 5 6 7 8 9 10 ...
+#>   ..$ score_type    : chr [1:25] "crps" "crps" "crps" "crps" ...
+#>  $ series_3  :'data.frame':  25 obs. of  5 variables:
+#>   ..$ score         : num [1:25] 0.32 0.556 0.379 0.362 0.219 ...
+#>   ..$ in_interval   : num [1:25] 1 1 1 1 1 1 1 1 1 1 ...
+#>   ..$ interval_width: num [1:25] 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 ...
+#>   ..$ eval_horizon  : int [1:25] 1 2 3 4 5 6 7 8 9 10 ...
+#>   ..$ score_type    : chr [1:25] "crps" "crps" "crps" "crps" ...
+#>  $ all_series:'data.frame':  25 obs. of  3 variables:
+#>   ..$ score       : num [1:25] 0.892 0.999 2.675 NA 0.827 ...
+#>   ..$ eval_horizon: int [1:25] 1 2 3 4 5 6 7 8 9 10 ...
+#>   ..$ score_type  : chr [1:25] "sum_crps" "sum_crps" "sum_crps" "sum_crps" ...
+crps_mod1$series_1
+#>         score in_interval interval_width eval_horizon score_type
+#> 1  0.19375525           1            0.9            1       crps
+#> 2  0.13663925           1            0.9            2       crps
+#> 3  1.35502175           1            0.9            3       crps
+#> 4          NA          NA            0.9            4       crps
+#> 5  0.03482775           1            0.9            5       crps
+#> 6  1.55416700           1            0.9            6       crps
+#> 7  1.51028900           1            0.9            7       crps
+#> 8  0.62121225           1            0.9            8       crps
+#> 9  0.62630125           1            0.9            9       crps
+#> 10 0.59853100           1            0.9           10       crps
+#> 11 1.30998625           1            0.9           11       crps
+#> 12 2.04829775           1            0.9           12       crps
+#> 13 0.61251800           1            0.9           13       crps
+#> 14 0.14052300           1            0.9           14       crps
+#> 15 0.65110800           1            0.9           15       crps
+#> 16 0.07973125           1            0.9           16       crps
+#> 17 0.07675600           1            0.9           17       crps
+#> 18 0.09382375           1            0.9           18       crps
+#> 19 0.12356725           1            0.9           19       crps
+#> 20         NA          NA            0.9           20       crps
+#> 21 0.20173600           1            0.9           21       crps
+#> 22 0.84066825           1            0.9           22       crps
+#> 23         NA          NA            0.9           23       crps
+#> 24 1.06489225           1            0.9           24       crps
+#> 25 0.75528825           1            0.9           25       crps
+

The returned list contains a data.frame for each series +in the data that shows the CRPS score for each evaluation in the testing +data, along with some other useful information about the fit of the +forecast distribution. In particular, we are given a logical value (1s +and 0s) telling us whether the true value was within a pre-specified +credible interval (i.e. the coverage of the forecast distribution). The +default interval width is 0.9, so we would hope that the values in the +in_interval column take a 1 approximately 90% of the time. +This value can be changed if you wish to compute different coverages, +say using a 60% interval:

+
crps_mod1 <- score(fc_mod1, score = 'crps', interval_width = 0.6)
+crps_mod1$series_1
+#>         score in_interval interval_width eval_horizon score_type
+#> 1  0.19375525           1            0.6            1       crps
+#> 2  0.13663925           1            0.6            2       crps
+#> 3  1.35502175           0            0.6            3       crps
+#> 4          NA          NA            0.6            4       crps
+#> 5  0.03482775           1            0.6            5       crps
+#> 6  1.55416700           0            0.6            6       crps
+#> 7  1.51028900           0            0.6            7       crps
+#> 8  0.62121225           1            0.6            8       crps
+#> 9  0.62630125           1            0.6            9       crps
+#> 10 0.59853100           1            0.6           10       crps
+#> 11 1.30998625           0            0.6           11       crps
+#> 12 2.04829775           0            0.6           12       crps
+#> 13 0.61251800           1            0.6           13       crps
+#> 14 0.14052300           1            0.6           14       crps
+#> 15 0.65110800           1            0.6           15       crps
+#> 16 0.07973125           1            0.6           16       crps
+#> 17 0.07675600           1            0.6           17       crps
+#> 18 0.09382375           1            0.6           18       crps
+#> 19 0.12356725           1            0.6           19       crps
+#> 20         NA          NA            0.6           20       crps
+#> 21 0.20173600           1            0.6           21       crps
+#> 22 0.84066825           1            0.6           22       crps
+#> 23         NA          NA            0.6           23       crps
+#> 24 1.06489225           1            0.6           24       crps
+#> 25 0.75528825           1            0.6           25       crps
+

We can also compare forecasts against out-of-sample observations using the Expected Log Predictive Density (ELPD; also known as the log score). The ELPD is a strictly proper scoring rule that can be applied to any distributional forecast, but to compute it we need predictions on the link scale rather than on the outcome scale. This is where it is advantageous to change the type of prediction returned by the forecast() function:

+
link_mod1 <- forecast(mod1, newdata = simdat$data_test, type = 'link')
+score(link_mod1, score = 'elpd')$series_1
+#>         score eval_horizon score_type
+#> 1  -0.5304414            1       elpd
+#> 2  -0.4298955            2       elpd
+#> 3  -2.9617583            3       elpd
+#> 4          NA            4       elpd
+#> 5  -0.2007644            5       elpd
+#> 6  -3.3781408            6       elpd
+#> 7  -3.2729088            7       elpd
+#> 8  -2.0363750            8       elpd
+#> 9  -2.0670612            9       elpd
+#> 10 -2.0844818           10       elpd
+#> 11 -3.0576463           11       elpd
+#> 12 -3.6291058           12       elpd
+#> 13 -2.1692669           13       elpd
+#> 14 -0.2960899           14       elpd
+#> 15 -2.3738851           15       elpd
+#> 16 -0.2160804           16       elpd
+#> 17 -0.2036782           17       elpd
+#> 18 -0.2115539           18       elpd
+#> 19 -0.2235072           19       elpd
+#> 20         NA           20       elpd
+#> 21 -0.2413680           21       elpd
+#> 22 -2.6791984           22       elpd
+#> 23         NA           23       elpd
+#> 24 -2.6851981           24       elpd
+#> 25 -0.2836901           25       elpd
+
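Per-horizon ELPD values can also be aggregated into a single number for comparing models, in the same spirit as the CRPS comparisons below. A sketch, with the object name elpd_mod1 chosen for illustration:

elpd_mod1 <- score(link_mod1, score = 'elpd')
# Total ELPD over the evaluation horizon for series_1 (higher is better);
# horizons with missing test observations are dropped
sum(elpd_mod1$series_1$score, na.rm = TRUE)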

Finally, when we have multiple time series it may also make sense to use a multivariate proper scoring rule. mvgam offers two such options: the Energy score and the Variogram score. The first penalizes forecast distributions that are less well calibrated against the truth, while the second penalizes forecasts that do not capture the observed correlation structure. Which score to use depends on your goals, but both are very easy to compute:

+
energy_mod2 <- score(fc_mod2, score = 'energy')
+str(energy_mod2)
+#> List of 4
+#>  $ series_1  :'data.frame':  25 obs. of  3 variables:
+#>   ..$ in_interval   : num [1:25] 1 1 1 NA 1 1 1 1 1 1 ...
+#>   ..$ interval_width: num [1:25] 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 ...
+#>   ..$ eval_horizon  : int [1:25] 1 2 3 4 5 6 7 8 9 10 ...
+#>  $ series_2  :'data.frame':  25 obs. of  3 variables:
+#>   ..$ in_interval   : num [1:25] 1 1 1 1 1 1 1 1 1 NA ...
+#>   ..$ interval_width: num [1:25] 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 ...
+#>   ..$ eval_horizon  : int [1:25] 1 2 3 4 5 6 7 8 9 10 ...
+#>  $ series_3  :'data.frame':  25 obs. of  3 variables:
+#>   ..$ in_interval   : num [1:25] 1 1 1 1 1 1 1 1 1 1 ...
+#>   ..$ interval_width: num [1:25] 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 ...
+#>   ..$ eval_horizon  : int [1:25] 1 2 3 4 5 6 7 8 9 10 ...
+#>  $ all_series:'data.frame':  25 obs. of  3 variables:
+#>   ..$ score       : num [1:25] 0.773 1.147 1.226 NA 0.458 ...
+#>   ..$ eval_horizon: int [1:25] 1 2 3 4 5 6 7 8 9 10 ...
+#>   ..$ score_type  : chr [1:25] "energy" "energy" "energy" "energy" ...
+

The returned object still provides information on interval coverage for each individual series, but there is only a single score per horizon now (which is provided in the all_series slot):

+
energy_mod2$all_series
+#>        score eval_horizon score_type
+#> 1  0.7728517            1     energy
+#> 2  1.1469836            2     energy
+#> 3  1.2258781            3     energy
+#> 4         NA            4     energy
+#> 5  0.4577536            5     energy
+#> 6  1.8094487            6     energy
+#> 7  1.4887317            7     energy
+#> 8  0.7651593            8     energy
+#> 9  1.1180634            9     energy
+#> 10        NA           10     energy
+#> 11 1.5008324           11     energy
+#> 12 3.2142460           12     energy
+#> 13 1.6129732           13     energy
+#> 14 1.2704438           14     energy
+#> 15 1.1335958           15     energy
+#> 16 1.8717420           16     energy
+#> 17        NA           17     energy
+#> 18 0.7953392           18     energy
+#> 19 0.9919119           19     energy
+#> 20        NA           20     energy
+#> 21 1.2461964           21     energy
+#> 22 1.5170615           22     energy
+#> 23        NA           23     energy
+#> 24 2.3824552           24     energy
+#> 25 1.5314557           25     energy
+
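The Variogram score mentioned above is requested in the same way. A sketch mirroring the energy-score call (assuming score = 'variogram' is an accepted option in score()):

variogram_mod2 <- score(fc_mod2, score = 'variogram')
# As with the energy score, a single multivariate score per horizon
# is returned in the all_series slot
variogram_mod2$all_series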

You can use your score(s) of choice to compare different models. For example, we can compute and plot the difference in CRPS scores for each series in the data. Here, a negative value means the Gaussian Process model (mod2) is better, while a positive value means the spline model (mod1) is better.

+
crps_mod1 <- score(fc_mod1, score = 'crps')
+crps_mod2 <- score(fc_mod2, score = 'crps')
+
+diff_scores <- crps_mod2$series_1$score -
+  crps_mod1$series_1$score
+plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred', 
+     ylim = c(-1*max(abs(diff_scores), na.rm = TRUE),
+              max(abs(diff_scores), na.rm = TRUE)),
+     bty = 'l',
+     xlab = 'Forecast horizon',
+     ylab = expression(CRPS[GP]~-~CRPS[spline]))
+abline(h = 0, lty = 'dashed', lwd = 2)
+gp_better <- length(which(diff_scores < 0))
+title(main = paste0('GP better in ', gp_better, ' of 25 evaluations',
+                    '\nMean difference = ', 
+                    round(mean(diff_scores, na.rm = TRUE), 2)))
+

+

+
+diff_scores <- crps_mod2$series_2$score -
+  crps_mod1$series_2$score
+plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred', 
+     ylim = c(-1*max(abs(diff_scores), na.rm = TRUE),
+              max(abs(diff_scores), na.rm = TRUE)),
+     bty = 'l',
+     xlab = 'Forecast horizon',
+     ylab = expression(CRPS[GP]~-~CRPS[spline]))
+abline(h = 0, lty = 'dashed', lwd = 2)
+gp_better <- length(which(diff_scores < 0))
+title(main = paste0('GP better in ', gp_better, ' of 25 evaluations',
+                    '\nMean difference = ', 
+                    round(mean(diff_scores, na.rm = TRUE), 2)))
+

+

+diff_scores <- crps_mod2$series_3$score -
+  crps_mod1$series_3$score
+plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred', 
+     ylim = c(-1*max(abs(diff_scores), na.rm = TRUE),
+              max(abs(diff_scores), na.rm = TRUE)),
+     bty = 'l',
+     xlab = 'Forecast horizon',
+     ylab = expression(CRPS[GP]~-~CRPS[spline]))
+abline(h = 0, lty = 'dashed', lwd = 2)
+gp_better <- length(which(diff_scores < 0))
+title(main = paste0('GP better in ', gp_better, ' of 25 evaluations',
+                    '\nMean difference = ', 
+                    round(mean(diff_scores, na.rm = TRUE), 2)))
+

+
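The comparison block above is repeated verbatim for each series, so wrapping it in a small helper keeps the analysis tidy. A sketch under the same assumptions as above (the function name plot_crps_diff is illustrative):

plot_crps_diff <- function(crps_a, crps_b, series) {
  # Per-horizon difference in CRPS; negative values favour model a
  diff_scores <- crps_a[[series]]$score - crps_b[[series]]$score
  lim <- max(abs(diff_scores), na.rm = TRUE)
  plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred',
       ylim = c(-lim, lim), bty = 'l',
       xlab = 'Forecast horizon',
       ylab = expression(CRPS[GP]~-~CRPS[spline]))
  abline(h = 0, lty = 'dashed', lwd = 2)
  gp_better <- sum(diff_scores < 0, na.rm = TRUE)
  title(main = paste0('GP better in ', gp_better, ' of ',
                      length(diff_scores), ' evaluations',
                      '\nMean difference = ',
                      round(mean(diff_scores, na.rm = TRUE), 2)))
}

# Reproduces each of the series-level plots above in a single call
plot_crps_diff(crps_mod2, crps_mod1, 'series_1')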

The GP model consistently gives better forecasts, and the difference between scores grows quickly as the forecast horizon increases. This is not unexpected given the way that splines linearly extrapolate outside the range of the training data.

+
+
+

Further reading

+

The following papers and resources offer useful material about Bayesian forecasting and proper scoring rules:

+

Hyndman, Rob J., and George Athanasopoulos. Forecasting: Principles and Practice. OTexts, 2018.

+

Gneiting, Tilmann, and Adrian E. Raftery. "Strictly proper scoring rules, prediction, and estimation." Journal of the American Statistical Association 102.477 (2007): 359-378.

+

Simonis, Juniper L., Ethan P. White, and S. K. Morgan Ernest. "Evaluating probabilistic ecological forecasts." Ecology 102.8 (2021): e03431.

+
+
+

Interested in contributing?

+

I'm actively seeking PhD students and other researchers to work in the areas of ecological forecasting, multivariate model evaluation and development of mvgam. Please reach out if you are interested (n.clark'at'uq.edu.au).

+
+ + + + + + + + + + + diff --git a/inst/doc/mvgam_overview.R b/inst/doc/mvgam_overview.R new file mode 100644 index 00000000..c8b40d2d --- /dev/null +++ b/inst/doc/mvgam_overview.R @@ -0,0 +1,311 @@ +## ----echo = FALSE------------------------------------------------------------- +NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true") +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + purl = NOT_CRAN, + eval = NOT_CRAN +) + +## ----setup, include=FALSE----------------------------------------------------- +knitr::opts_chunk$set( + echo = TRUE, + dpi = 150, + fig.asp = 0.8, + fig.width = 6, + out.width = "60%", + fig.align = "center") +library(mvgam) +library(ggplot2) +theme_set(theme_bw(base_size = 12, base_family = 'serif')) + +## ----Access time series data-------------------------------------------------- +data("portal_data") + +## ----Inspect data format and structure---------------------------------------- +head(portal_data) + +## ----------------------------------------------------------------------------- +dplyr::glimpse(portal_data) + +## ----------------------------------------------------------------------------- +data <- sim_mvgam(n_series = 4, T = 24) +head(data$data_train, 12) + +## ----Wrangle data for modelling----------------------------------------------- +portal_data %>% + + # mvgam requires a 'time' variable be present in the data to index + # the temporal observations. This is especially important when tracking + # multiple time series. In the Portal data, the 'moon' variable indexes the + # lunar monthly timestep of the trapping sessions + dplyr::mutate(time = moon - (min(moon)) + 1) %>% + + # We can also provide a more informative name for the outcome variable, which + # is counts of the 'PP' species (Chaetodipus penicillatus) across all control + # plots + dplyr::mutate(count = PP) %>% + + # The other requirement for mvgam is a 'series' variable, which needs to be a + # factor variable to index which time series each row in the data belongs to. 
+ # Again, this is more useful when you have multiple time series in the data + dplyr::mutate(series = as.factor('PP')) %>% + + # Select the variables of interest to keep in the model_data + dplyr::select(series, year, time, count, mintemp, ndvi) -> model_data + +## ----------------------------------------------------------------------------- +head(model_data) + +## ----------------------------------------------------------------------------- +dplyr::glimpse(model_data) + +## ----Summarise variables------------------------------------------------------ +summary(model_data) + +## ----------------------------------------------------------------------------- +image(is.na(t(model_data %>% + dplyr::arrange(dplyr::desc(time)))), axes = F, + col = c('grey80', 'darkred')) +axis(3, at = seq(0,1, len = NCOL(model_data)), labels = colnames(model_data)) + +## ----------------------------------------------------------------------------- +plot_mvgam_series(data = model_data, series = 1, y = 'count') + +## ----------------------------------------------------------------------------- +model_data %>% + + # Create a 'year_fac' factor version of 'year' + dplyr::mutate(year_fac = factor(year)) -> model_data + +## ----------------------------------------------------------------------------- +dplyr::glimpse(model_data) +levels(model_data$year_fac) + +## ----model1, include=FALSE, results='hide'------------------------------------ +model1 <- mvgam(count ~ s(year_fac, bs = 're') - 1, + family = poisson(), + data = model_data, + parallel = FALSE) + +## ----eval=FALSE--------------------------------------------------------------- +# model1 <- mvgam(count ~ s(year_fac, bs = 're') - 1, +# family = poisson(), +# data = model_data) + +## ----------------------------------------------------------------------------- +get_mvgam_priors(count ~ s(year_fac, bs = 're') - 1, + family = poisson(), + data = model_data) + +## ----------------------------------------------------------------------------- +summary(model1) + +## ----Extract coefficient posteriors------------------------------------------- +beta_post <- as.data.frame(model1, variable = 'betas') +dplyr::glimpse(beta_post) + +## ----------------------------------------------------------------------------- +code(model1) + +## ----Plot random effect estimates--------------------------------------------- +plot(model1, type = 're') + +## ----------------------------------------------------------------------------- +mcmc_plot(object = model1, + variable = 'betas', + type = 'areas') + +## ----------------------------------------------------------------------------- +pp_check(object = model1) +pp_check(model1, type = "rootogram") + +## ----Plot posterior hindcasts------------------------------------------------- +plot(model1, type = 'forecast') + +## ----Extract posterior hindcast----------------------------------------------- +hc <- hindcast(model1) +str(hc) + +## ----Extract hindcasts on the linear predictor scale-------------------------- +hc <- hindcast(model1, type = 'link') +range(hc$hindcasts$PP) + +## ----Plot hindcasts on the linear predictor scale----------------------------- +plot(hc) + +## ----Plot posterior residuals------------------------------------------------- +plot(model1, type = 'residuals') + +## ----------------------------------------------------------------------------- +model_data %>% + dplyr::filter(time <= 160) -> data_train +model_data %>% + dplyr::filter(time > 160) -> data_test + +## ----include=FALSE, message=FALSE, 
warning=FALSE------------------------------ +model1b <- mvgam(count ~ s(year_fac, bs = 're') - 1, + family = poisson(), + data = data_train, + newdata = data_test, + parallel = FALSE) + +## ----eval=FALSE--------------------------------------------------------------- +# model1b <- mvgam(count ~ s(year_fac, bs = 're') - 1, +# family = poisson(), +# data = data_train, +# newdata = data_test) + +## ----------------------------------------------------------------------------- +plot(model1b, type = 're') + +## ----------------------------------------------------------------------------- +plot(model1b, type = 'forecast') + +## ----Plotting predictions against test data----------------------------------- +plot(model1b, type = 'forecast', newdata = data_test) + +## ----Extract posterior forecasts---------------------------------------------- +fc <- forecast(model1b) +str(fc) + +## ----model2, include=FALSE, message=FALSE, warning=FALSE---------------------- +model2 <- mvgam(count ~ s(year_fac, bs = 're') + + ndvi - 1, + family = poisson(), + data = data_train, + newdata = data_test, + parallel = FALSE) + +## ----eval=FALSE--------------------------------------------------------------- +# model2 <- mvgam(count ~ s(year_fac, bs = 're') + +# ndvi - 1, +# family = poisson(), +# data = data_train, +# newdata = data_test) + +## ----class.output="scroll-300"------------------------------------------------ +summary(model2) + +## ----Posterior quantiles of model coefficients-------------------------------- +coef(model2) + +## ----Plot NDVI effect--------------------------------------------------------- +plot(model2, type = 'pterms') + +## ----------------------------------------------------------------------------- +beta_post <- as.data.frame(model2, variable = 'betas') +dplyr::glimpse(beta_post) + +## ----Histogram of NDVI effects------------------------------------------------ +hist(beta_post$ndvi, + xlim = c(-1 * max(abs(beta_post$ndvi)), + max(abs(beta_post$ndvi))), + col = 'darkred', + border = 'white', + xlab = expression(beta[NDVI]), + ylab = '', + yaxt = 'n', + main = '', + lwd = 2) +abline(v = 0, lwd = 2.5) + +## ----warning=FALSE------------------------------------------------------------ +plot_predictions(model2, + condition = "ndvi", + # include the observed count values + # as points, and show rugs for the observed + # ndvi and count values on the axes + points = 0.5, rug = TRUE) + +## ----warning=FALSE------------------------------------------------------------ +plot(conditional_effects(model2), ask = FALSE) + +## ----model3, include=FALSE, message=FALSE, warning=FALSE---------------------- +model3 <- mvgam(count ~ s(time, bs = 'bs', k = 15) + + ndvi, + family = poisson(), + data = data_train, + newdata = data_test, + parallel = FALSE) + +## ----eval=FALSE--------------------------------------------------------------- +# model3 <- mvgam(count ~ s(time, bs = 'bs', k = 15) + +# ndvi, +# family = poisson(), +# data = data_train, +# newdata = data_test) + +## ----------------------------------------------------------------------------- +summary(model3) + +## ----------------------------------------------------------------------------- +plot(model3, type = 'smooths') + +## ----------------------------------------------------------------------------- +plot(model3, type = 'smooths', realisations = TRUE, + n_realisations = 30) + +## ----Plot smooth term derivatives, warning = FALSE, fig.asp = 1--------------- +plot(model3, type = 'smooths', derivatives = TRUE) + +## 
----warning=FALSE------------------------------------------------------------ +plot(conditional_effects(model3), ask = FALSE) + +## ----warning=FALSE------------------------------------------------------------ +plot(conditional_effects(model3, type = 'link'), ask = FALSE) + +## ----class.output="scroll-300"------------------------------------------------ +code(model3) + +## ----------------------------------------------------------------------------- +plot(model3, type = 'forecast', newdata = data_test) + +## ----Plot extrapolated temporal functions using newdata----------------------- +plot_mvgam_smooth(model3, smooth = 's(time)', + # feed newdata to the plot function to generate + # predictions of the temporal smooth to the end of the + # testing period + newdata = data.frame(time = 1:max(data_test$time), + ndvi = 0)) +abline(v = max(data_train$time), lty = 'dashed', lwd = 2) + +## ----model4, include=FALSE---------------------------------------------------- +model4 <- mvgam(count ~ s(ndvi, k = 6), + family = poisson(), + data = data_train, + newdata = data_test, + trend_model = 'AR1', + parallel = FALSE) + +## ----eval=FALSE--------------------------------------------------------------- +# model4 <- mvgam(count ~ s(ndvi, k = 6), +# family = poisson(), +# data = data_train, +# newdata = data_test, +# trend_model = 'AR1') + +## ----Summarise the mvgam autocorrelated error model, class.output="scroll-300"---- +summary(model4) + +## ----warning=FALSE, message=FALSE--------------------------------------------- +plot_predictions(model4, + condition = "ndvi", + points = 0.5, rug = TRUE) + +## ----------------------------------------------------------------------------- +plot(model4, type = 'forecast', newdata = data_test) + +## ----------------------------------------------------------------------------- +plot(model4, type = 'trend', newdata = data_test) + +## ----------------------------------------------------------------------------- +loo_compare(model3, model4) + +## ----------------------------------------------------------------------------- +fc_mod3 <- forecast(model3) +fc_mod4 <- forecast(model4) +score_mod3 <- score(fc_mod3, score = 'drps') +score_mod4 <- score(fc_mod4, score = 'drps') +sum(score_mod4$PP$score, na.rm = TRUE) - sum(score_mod3$PP$score, na.rm = TRUE) + diff --git a/inst/doc/mvgam_overview.Rmd b/inst/doc/mvgam_overview.Rmd new file mode 100644 index 00000000..508c63d3 --- /dev/null +++ b/inst/doc/mvgam_overview.Rmd @@ -0,0 +1,630 @@ +--- +title: "Overview of the mvgam package" +author: "Nicholas J Clark" +date: "`r Sys.Date()`" +output: + rmarkdown::html_vignette: + toc: yes +vignette: > + %\VignetteIndexEntry{Overview of the mvgam package} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- +```{r, echo = FALSE} +NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true") +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + purl = NOT_CRAN, + eval = NOT_CRAN +) +``` + +```{r setup, include=FALSE} +knitr::opts_chunk$set( + echo = TRUE, + dpi = 150, + fig.asp = 0.8, + fig.width = 6, + out.width = "60%", + fig.align = "center") +library(mvgam) +library(ggplot2) +theme_set(theme_bw(base_size = 12, base_family = 'serif')) +``` + +The purpose of this vignette is to give a general overview of the `mvgam` package and its primary functions. + +## Dynamic GAMs +`mvgam` is designed to propagate unobserved temporal processes to capture latent dynamics in the observed time series. 
This works in a state-space format, with the temporal *trend* evolving independently of the observation process. An introduction to the package and some worked examples are also shown in this seminar: [Ecological Forecasting with Dynamic Generalized Additive Models](https://www.youtube.com/watch?v=0zZopLlomsQ){target="_blank"}. Briefly, assume $\tilde{\boldsymbol{y}}_{i,t}$ is the conditional expectation of response variable $\boldsymbol{i}$ at time $\boldsymbol{t}$. Assuming $\boldsymbol{y_i}$ is drawn from an exponential distribution with an invertible link function, the linear predictor for a multivariate Dynamic GAM can be written as: + +$$for~i~in~1:N_{series}~...$$ +$$for~t~in~1:N_{timepoints}~...$$ + +$$g^{-1}(\tilde{\boldsymbol{y}}_{i,t})=\alpha_{i}+\sum\limits_{j=1}^J\boldsymbol{s}_{i,j,t}\boldsymbol{x}_{j,t}+\boldsymbol{z}_{i,t}\,,$$ +Here $\alpha$ are the unknown intercepts, the $\boldsymbol{s}$'s are unknown smooth functions of covariates ($\boldsymbol{x}$'s), which can potentially vary among the response series, and $\boldsymbol{z}$ are dynamic latent processes. Each smooth function $\boldsymbol{s_j}$ is composed of basis expansions whose coefficients, which must be estimated, control the functional relationship between $\boldsymbol{x}_{j}$ and $g^{-1}(\tilde{\boldsymbol{y}})$. The size of the basis expansion limits the smooth’s potential complexity. A larger set of basis functions allows greater flexibility. For more information on GAMs and how they can smooth through data, see [this blogpost on how to interpret nonlinear effects from Generalized Additive Models](https://ecogambler.netlify.app/blog/interpreting-gams/){target="_blank"}. + +Several advantages of GAMs are that they can model a diversity of response families, including discrete distributions (i.e. Poisson, Negative Binomial, Gamma) that accommodate common ecological features such as zero-inflation or overdispersion, and that they can be formulated to include hierarchical smoothing for multivariate responses. `mvgam` supports a number of different observation families, which are summarized below: + +## Supported observation families + +|Distribution | Function | Support | Extra parameter(s) | +|:----------------:|:---------------:| :------------------------------------------------:|:--------------------:| +|Gaussian (identity link) | `gaussian()` | Real values in $(-\infty, \infty)$ | $\sigma$ | +|Student's T (identity link) | `student-t()` | Heavy-tailed real values in $(-\infty, \infty)$ | $\sigma$, $\nu$ | +|LogNormal (identity link) | `lognormal()` | Positive real values in $[0, \infty)$ | $\sigma$ | +|Gamma (log link) | `Gamma()` | Positive real values in $[0, \infty)$ | $\alpha$ | +|Beta (logit link) | `betar()` | Real values (proportional) in $[0,1]$ | $\phi$ | +|Bernoulli (logit link) | `bernoulli()` | Binary data in ${0,1}$ | - | +|Poisson (log link) | `poisson()` | Non-negative integers in $(0,1,2,...)$ | - | +|Negative Binomial2 (log link)| `nb()` | Non-negative integers in $(0,1,2,...)$ | $\phi$ | +|Binomial (logit link) | `binomial()` | Non-negative integers in $(0,1,2,...)$ | - | +|Beta-Binomial (logit link) | `beta_binomial()` | Non-negative integers in $(0,1,2,...)$ | $\phi$ | +|Poisson Binomial N-mixture (log link)| `nmix()` | Non-negative integers in $(0,1,2,...)$ | - | + +For all supported observation families, any extra parameters that need to be estimated (i.e. the $\sigma$ in a Gaussian model or the $\phi$ in a Negative Binomial model) are by default estimated independently for each series. 
However, users can opt to force all series to share extra observation parameters using `share_obs_params = TRUE` in `mvgam()`. Note that default link functions cannot currently be changed. + +## Supported temporal dynamic processes +The dynamic processes can take a wide variety of forms, some of which can be multivariate to allow the different time series to interact or be correlated. When using the `mvgam()` function, the user chooses between different process models with the `trend_model` argument. Available process models are described in detail below. + +### Independent Random Walks +Use `trend_model = 'RW'` or `trend_model = RW()` to set up a model where each series in `data` has independent latent temporal dynamics of the form: + + +\begin{align*} +z_{i,t} & \sim \text{Normal}(z_{i,t-1}, \sigma_i) \end{align*} + +Process error parameters $\sigma$ are modeled independently for each series. If a moving average process is required, use `trend_model = RW(ma = TRUE)` to set up the following: + +\begin{align*} +z_{i,t} & = z_{i,t-1} + \theta_i * error_{i,t-1} + error_{i,t} \\ +error_{i,t} & \sim \text{Normal}(0, \sigma_i) \end{align*} + +Moving average coefficients $\theta$ are independently estimated for each series and will be forced to be stationary by default $(abs(\theta)<1)$. Only moving averages of order $q=1$ are currently allowed. + +### Multivariate Random Walks +If more than one series is included in `data` $(N_{series} > 1)$, a multivariate Random Walk can be set up using `trend_model = RW(cor = TRUE)`, resulting in the following: + +\begin{align*} +z_{t} & \sim \text{MVNormal}(z_{t-1}, \Sigma) \end{align*} + +Where the latent process estimate $z_t$ now takes the form of a vector. The covariance matrix $\Sigma$ will capture contemporaneously correlated process errors. It is parameterised using a Cholesky factorization, which requires priors on the series-level variances $\sigma$ and on the strength of correlations using `Stan`'s `lkj_corr_cholesky` distribution. + +Moving average terms can also be included for multivariate random walks, in which case the moving average coefficients $\theta$ will be parameterised as an $N_{series} * N_{series}$ matrix + +### Autoregressive processes +Autoregressive models up to $p=3$, in which the autoregressive coefficients are estimated independently for each series, can be used by specifying `trend_model = 'AR1'`, `trend_model = 'AR2'`, `trend_model = 'AR3'`, or `trend_model = AR(p = 1, 2, or 3)`. For example, a univariate AR(1) model takes the form: + +\begin{align*} +z_{i,t} & \sim \text{Normal}(ar1_i * z_{i,t-1}, \sigma_i) \end{align*} + + +All options are the same as for Random Walks, but additional options will be available for placing priors on the autoregressive coefficients. By default, these coefficients will not be forced into stationarity, but users can impose this restriction by changing the upper and lower bounds on their priors. See `?get_mvgam_priors` for more details. + +### Vector Autoregressive processes +A Vector Autoregression of order $p=1$ can be specified if $N_{series} > 1$ using `trend_model = 'VAR1'` or `trend_model = VAR()`. A VAR(1) model takes the form: + +\begin{align*} +z_{t} & \sim \text{Normal}(A * z_{t-1}, \Sigma) \end{align*} + +Where $A$ is an $N_{series} * N_{series}$ matrix of autoregressive coefficients in which the diagonals capture lagged self-dependence (i.e. the effect of a process at time $t$ on its own estimate at time $t+1$), while off-diagonals capture lagged cross-dependence (i.e. 
the effect of a process at time $t$ on the process for another series at time $t+1$). By default, the covariance matrix $\Sigma$ will assume no process error covariance by fixing the off-diagonals to $0$. To allow for correlated errors, use `trend_model = 'VAR1cor'` or `trend_model = VAR(cor = TRUE)`. A moving average of order $q=1$ can also be included using `trend_model = VAR(ma = TRUE, cor = TRUE)`. + +Note that for all VAR models, stationarity of the process is enforced with a structured prior distribution that is described in detail in [Heaps 2022](https://www.tandfonline.com/doi/full/10.1080/10618600.2022.2079648) + +Heaps, Sarah E. "[Enforcing stationarity through the prior in vector autoregressions.](https://www.tandfonline.com/doi/full/10.1080/10618600.2022.2079648)" *Journal of Computational and Graphical Statistics* 32.1 (2023): 74-83. + +### Gaussian Processes +The final option for modelling temporal dynamics is to use a Gaussian Process with squared exponential kernel. These are set up independently for each series (there is currently no multivariate GP option), using `trend_model = 'GP'`. The dynamics for each latent process are modelled as: + +\begin{align*} +z & \sim \text{MVNormal}(0, \Sigma_{error}) \\ +\Sigma_{error}[t_i, t_j] & = \alpha^2 * exp(-0.5 * ((|t_i - t_j| / \rho))^2) \end{align*} + +The latent dynamic process evolves from a complex, high-dimensional Multivariate Normal distribution which depends on $\rho$ (often called the length scale parameter) to control how quickly the correlations between the model's errors decay as a function of time. For these models, covariance decays exponentially fast with the squared distance (in time) between the observations. The functions also depend on a parameter $\alpha$, which controls the marginal variability of the temporal function at all points; in other words it controls how much the GP term contributes to the linear predictor. `mvgam` capitalizes on some advances that allow GPs to be approximated using Hilbert space basis functions, which [considerably speed up computation at little cost to accuracy or prediction performance](https://link.springer.com/article/10.1007/s11222-022-10167-2){target="_blank"}. + +### Piecewise logistic and linear trends +Modeling growth for many types of time series is often similar to modeling population growth in natural ecosystems, where there series exhibits nonlinear growth that saturates at some particular carrying capacity. The logistic trend model available in {`mvgam`} allows for a time-varying capacity $C(t)$ as well as a non-constant growth rate. Changes in the base growth rate $k$ are incorporated by explicitly defining changepoints throughout the training period where the growth rate is allowed to vary. The changepoint vector $a$ is represented as a vector of `1`s and `0`s, and the rate of growth at time $t$ is represented as $k+a(t)^T\delta$. Potential changepoints are selected uniformly across the training period, and the number of changepoints, as well as the flexibility of the potential rate changes at these changepoints, can be controlled using `trend_model = PW()`. The full piecewise logistic growth model is then: + +\begin{align*} +z_t & = \frac{C_t}{1 + \exp(-(k+a(t)^T\delta)(t-(m+a(t)^T\gamma)))} \end{align*} + +For time series that do not appear to exhibit saturating growth, a piece-wise constant rate of growth can often provide a useful trend model. 
The piecewise linear trend is defined as: + +\begin{align*} +z_t & = (k+a(t)^T\delta)t + (m+a(t)^T\gamma) \end{align*} + +In both trend models, $m$ is an offset parameter that controls the trend intercept. Because of this parameter, it is not recommended that you include an intercept in your observation formula because this will not be identifiable. You can read about the full description of piecewise linear and logistic trends [in this paper by Taylor and Letham](https://www.tandfonline.com/doi/abs/10.1080/00031305.2017.1380080){target="_blank"}. + +Sean J. Taylor and Benjamin Letham. "[Forecasting at scale.](https://www.tandfonline.com/doi/full/10.1080/00031305.2017.1380080)" *The American Statistician* 72.1 (2018): 37-45. + +### Continuous time AR(1) processes +Most trend models in the `mvgam()` function expect time to be measured in regularly-spaced, discrete intervals (i.e. one measurement per week, or one per year for example). But some time series are taken at irregular intervals and we'd like to model autoregressive properties of these. The `trend_model = CAR()` can be useful to set up these models, which currently only support autoregressive processes of order `1`. The evolution of the latent dynamic process follows the form: + +\begin{align*} +z_{i,t} & \sim \text{Normal}(ar1_i^{distance} * z_{i,t-1}, \sigma_i) \end{align*} + +Where $distance$ is a vector of non-negative measurements of the time differences between successive observations. See the **Examples** section in `?CAR` for an illustration of how to set these models up. + +## Regression formulae +`mvgam` supports an observation model regression formula, built off the `mvgcv` package, as well as an optional process model regression formula. The formulae supplied to \code{\link{mvgam}} are exactly like those supplied to `glm()` except that smooth terms, `s()`, +`te()`, `ti()` and `t2()`, time-varying effects using `dynamic()`, monotonically increasing (using `s(x, bs = 'moi')`) or decreasing splines (using `s(x, bs = 'mod')`; see `?smooth.construct.moi.smooth.spec` for details), as well as Gaussian Process functions using `gp()`, can be added to the right hand side (and `.` is not supported in `mvgam` formulae). See `?mvgam_formulae` for more guidance. + +For setting up State-Space models, the optional process model formula can be used (see [the State-Space model vignette](https://nicholasjclark.github.io/mvgam/articles/trend_formulas.html) and [the shared latent states vignette](https://nicholasjclark.github.io/mvgam/articles/trend_formulas.html) for guidance on using trend formulae). + +## Example time series data +The 'portal_data' object contains time series of rodent captures from the Portal Project, [a long-term monitoring study based near the town of Portal, Arizona](https://portal.weecology.org/){target="_blank"}. Researchers have been operating a standardized set of baited traps within 24 experimental plots at this site since the 1970's. Sampling follows the lunar monthly cycle, with observations occurring on average about 28 days apart. However, missing observations do occur due to difficulties accessing the site (weather events, COVID disruptions etc...). You can read about the full sampling protocol [in this preprint by Ernest et al on the Biorxiv](https://www.biorxiv.org/content/10.1101/332783v3.full){target="_blank"}. +```{r Access time series data} +data("portal_data") +``` + +As the data come pre-loaded with the `mvgam` package, you can read a little about it in the help page using `?portal_data`. 
Before working with data, it is important to inspect how the data are structured, first using `head`: +```{r Inspect data format and structure} +head(portal_data) +``` + +But the `glimpse` function in `dplyr` is also useful for understanding how variables are structured +```{r} +dplyr::glimpse(portal_data) +``` + +We will focus analyses on the time series of captures for one specific rodent species, the Desert Pocket Mouse *Chaetodipus penicillatus*. This species is interesting in that it goes into a kind of "hibernation" during the colder months, leading to very low captures during the winter period + +## Manipulating data for modelling + +Manipulating the data into a 'long' format is necessary for modelling in `mvgam`. By 'long' format, we mean that each `series x time` observation needs to have its own entry in the `dataframe` or `list` object that we wish to use as data for modelling. A simple example can be viewed by simulating data using the `sim_mvgam` function. See `?sim_mvgam` for more details +```{r} +data <- sim_mvgam(n_series = 4, T = 24) +head(data$data_train, 12) +``` + +Notice how we have four different time series in these simulated data, but we do not spread the outcome values into different columns. Rather, there is only a single column for the outcome variable, labelled `y` in these simulated data. We also must supply a variable labelled `time` to ensure the modelling software knows how to arrange the time series when building models. This setup still allows us to formulate multivariate time series models, as you can see in the [State-Space vignette](https://nicholasjclark.github.io/mvgam/articles/trend_formulas.html). Below are the steps needed to shape our `portal_data` object into the correct form. First, we create a `time` variable, select the column representing counts of our target species (`PP`), and select appropriate variables that we can use as predictors +```{r Wrangle data for modelling} +portal_data %>% + + # mvgam requires a 'time' variable be present in the data to index + # the temporal observations. This is especially important when tracking + # multiple time series. In the Portal data, the 'moon' variable indexes the + # lunar monthly timestep of the trapping sessions + dplyr::mutate(time = moon - (min(moon)) + 1) %>% + + # We can also provide a more informative name for the outcome variable, which + # is counts of the 'PP' species (Chaetodipus penicillatus) across all control + # plots + dplyr::mutate(count = PP) %>% + + # The other requirement for mvgam is a 'series' variable, which needs to be a + # factor variable to index which time series each row in the data belongs to. 
+ # Again, this is more useful when you have multiple time series in the data + dplyr::mutate(series = as.factor('PP')) %>% + + # Select the variables of interest to keep in the model_data + dplyr::select(series, year, time, count, mintemp, ndvi) -> model_data +``` + +The data now contain six variables: + `series`, a factor indexing which time series each observation belongs to + `year`, the year of sampling + `time`, the indicator of which time step each observation belongs to + `count`, the response variable representing the number of captures of the species `PP` in each sampling observation + `mintemp`, the monthly average minimum temperature at each time step + `ndvi`, the monthly average Normalized Difference Vegetation Index at each time step + +Now check the data structure again +```{r} +head(model_data) +``` + +```{r} +dplyr::glimpse(model_data) +``` + +You can also summarize multiple variables, which is helpful to search for data ranges and identify missing values +```{r Summarise variables} +summary(model_data) +``` + +We have some `NA`s in our response variable `count`. Let's visualize the data as a heatmap to get a sense of where these are distributed (`NA`s are shown as red bars in the below plot) +```{r} +image(is.na(t(model_data %>% + dplyr::arrange(dplyr::desc(time)))), axes = F, + col = c('grey80', 'darkred')) +axis(3, at = seq(0,1, len = NCOL(model_data)), labels = colnames(model_data)) +``` + +These observations will generally be thrown out by most modelling packages in \R. But as you will see when we work through the tutorials, `mvgam` keeps these in the data so that predictions can be automatically returned for the full dataset. The time series and some of its descriptive features can be plotted using `plot_mvgam_series()`: +```{r} +plot_mvgam_series(data = model_data, series = 1, y = 'count') +``` + +## GLMs with temporal random effects +Our first task will be to fit a Generalized Linear Model (GLM) that can adequately capture the features of our `count` observations (integer data, lower bound at zero, missing values) while also attempting to model temporal variation. We are almost ready to fit our first model, which will be a GLM with Poisson observations, a log link function and random (hierarchical) intercepts for `year`. This will allow us to capture our prior belief that, although each year is unique, having been sampled from the same population of effects, all years are connected and thus might contain valuable information about one another. This will be done by capitalizing on the partial pooling properties of hierarchical models. Hierarchical (also known as random) effects offer many advantages when modelling data with grouping structures (i.e. multiple species, locations, years etc...). The ability to incorporate these in time series models is a huge advantage over traditional models such as ARIMA or Exponential Smoothing. But before we fit the model, we will need to convert `year` to a factor so that we can use a random effect basis in `mvgam`. See `?smooth.terms` and +`?smooth.construct.re.smooth.spec` for details about the `re` basis construction that is used by both `mvgam` and `mgcv` +```{r} +model_data %>% + + # Create a 'year_fac' factor version of 'year' + dplyr::mutate(year_fac = factor(year)) -> model_data +``` + +Preview the dataset to ensure year is now a factor with a unique factor level for each year in the data +```{r} +dplyr::glimpse(model_data) +levels(model_data$year_fac) +``` + +We are now ready for our first `mvgam` model. 
The syntax will be familiar to users who have previously built models with `mgcv`. But for a refresher, see `?formula.gam` and the examples in `?gam`. Random effects can be specified using the `s` wrapper with the `re` basis. Note that we can also suppress the primary intercept using the usual `R` formula syntax `- 1`. `mvgam` has a number of possible observation families that can be used, see `?mvgam_families` for more information. We will use `Stan` as the fitting engine, which deploys Hamiltonian Monte Carlo (HMC) for full Bayesian inference. By default, 4 HMC chains will be run using a warmup of 500 iterations and collecting 500 posterior samples from each chain. The package will also aim to use the `Cmdstan` backend when possible, so it is recommended that users have an up-to-date installation of `Cmdstan` and the associated `cmdstanr` interface on their machines (note that you can set the backend yourself using the `backend` argument: see `?mvgam` for details). Interested users should consult the [`Stan` user's guide](https://mc-stan.org/docs/stan-users-guide/index.html){target="_blank"} for more information about the software and the enormous variety of models that can be tackled with HMC. +```{r model1, include=FALSE, results='hide'} +model1 <- mvgam(count ~ s(year_fac, bs = 're') - 1, + family = poisson(), + data = model_data, + parallel = FALSE) +``` + +```{r eval=FALSE} +model1 <- mvgam(count ~ s(year_fac, bs = 're') - 1, + family = poisson(), + data = model_data) +``` + +The model can be described mathematically for each timepoint $t$ as follows: +\begin{align*} +\boldsymbol{count}_t & \sim \text{Poisson}(\lambda_t) \\ +log(\lambda_t) & = \beta_{year[year_t]} \\ +\beta_{year} & \sim \text{Normal}(\mu_{year}, \sigma_{year}) \end{align*} + +Where the $\beta_{year}$ effects are drawn from a *population* distribution that is parameterized by a common mean $(\mu_{year})$ and variance $(\sigma_{year})$. Priors on most of the model parameters can be interrogated and changed using similar functionality to the options available in `brms`. For example, the default priors on $(\mu_{year})$ and $(\sigma_{year})$ can be viewed using the following code: +```{r} +get_mvgam_priors(count ~ s(year_fac, bs = 're') - 1, + family = poisson(), + data = model_data) +``` + +See examples in `?get_mvgam_priors` to find out different ways that priors can be altered. +Once the model has finished, the first step is to inspect the `summary` to ensure no major diagnostic warnings have been produced and to quickly summarise posterior distributions for key parameters +```{r} +summary(model1) +``` + +The diagnostic messages at the bottom of the summary show that the HMC sampler did not encounter any problems or difficult posterior spaces. This is a good sign. Posterior distributions for model parameters can be extracted in any way that an object of class `brmsfit` can (see `?mvgam::mvgam_draws` for details). For example, we can extract the coefficients related to the GAM linear predictor (i.e. the $\beta$'s) into a `data.frame` using: +```{r Extract coefficient posteriors} +beta_post <- as.data.frame(model1, variable = 'betas') +dplyr::glimpse(beta_post) +``` + +With any model fitted in `mvgam`, the underlying `Stan` code can be viewed using the `code` function: +```{r} +code(model1) +``` + +### Plotting effects and residuals + +Now for interrogating the model. We can get some sense of the variation in yearly intercepts from the summary above, but it is easier to understand them using targeted plots. 
Plot posterior distributions of the temporal random effects using `plot.mvgam` with `type = 're'`. See `?plot.mvgam` for more details about the types of plots that can be produced from fitted `mvgam` objects +```{r Plot random effect estimates} +plot(model1, type = 're') +``` + +### `bayesplot` support +We can also capitalize on most of the useful MCMC plotting functions from the `bayesplot` package to visualize posterior distributions and diagnostics (see `?mvgam::mcmc_plot.mvgam` for details): +```{r} +mcmc_plot(object = model1, + variable = 'betas', + type = 'areas') +``` + +We can also use the wide range of posterior checking functions available in `bayesplot` (see `?mvgam::ppc_check.mvgam` for details): +```{r} +pp_check(object = model1) +pp_check(model1, type = "rootogram") +``` + +There is clearly some variation in these yearly intercept estimates. But how do these translate into time-varying predictions? To understand this, we can plot posterior hindcasts from this model for the training period using `plot.mvgam` with `type = 'forecast'` +```{r Plot posterior hindcasts} +plot(model1, type = 'forecast') +``` + +If you wish to extract these hindcasts for other downstream analyses, the `hindcast` function can be used. This will return a list object of class `mvgam_forecast`. In the `hindcasts` slot, a matrix of posterior retrodictions will be returned for each series in the data (only one series in our example): +```{r Extract posterior hindcast} +hc <- hindcast(model1) +str(hc) +``` + +You can also extract these hindcasts on the linear predictor scale, which in this case is the log scale (our Poisson GLM used a log link function). Sometimes this can be useful for asking more targeted questions about drivers of variation: +```{r Extract hindcasts on the linear predictor scale} +hc <- hindcast(model1, type = 'link') +range(hc$hindcasts$PP) +``` + +Objects of class `mvgam_forecast` have an associated plot function as well: +```{r Plot hindcasts on the linear predictor scale} +plot(hc) +``` + +This plot can look a bit confusing as it seems like there is linear interpolation from the end of one year to the start of the next. But this is just due to the way the lines are automatically connected in base \R plots + +In any regression analysis, a key question is whether the residuals show any patterns that can be indicative of un-modelled sources of variation. For GLMs, we can use a modified residual called the [Dunn-Smyth, or randomized quantile, residual](https://www.jstor.org/stable/1390802){target="_blank"}. Inspect Dunn-Smyth residuals from the model using `plot.mvgam` with `type = 'residuals'` +```{r Plot posterior residuals} +plot(model1, type = 'residuals') +``` + +## Automatic forecasting for new data +These temporal random effects do not have a sense of "time". Because of this, each yearly random intercept is not restricted in some way to be similar to the previous yearly intercept. This drawback becomes evident when we predict for a new year. To do this, we can repeat the exercise above but this time will split the data into training and testing sets before re-running the model. We can then supply the test set as `newdata`. 
For splitting, we will make use of the `filter` function from `dplyr` +```{r} +model_data %>% + dplyr::filter(time <= 160) -> data_train +model_data %>% + dplyr::filter(time > 160) -> data_test +``` + +```{r include=FALSE, message=FALSE, warning=FALSE} +model1b <- mvgam(count ~ s(year_fac, bs = 're') - 1, + family = poisson(), + data = data_train, + newdata = data_test, + parallel = FALSE) +``` + +```{r eval=FALSE} +model1b <- mvgam(count ~ s(year_fac, bs = 're') - 1, + family = poisson(), + data = data_train, + newdata = data_test) +``` + +Repeating the plots above gives insight into how the model's hierarchical prior formulation provides all the structure needed to sample values for un-modelled years +```{r} +plot(model1b, type = 're') +``` + + +```{r} +plot(model1b, type = 'forecast') +``` + +We can also view the test data in the forecast plot to see that the predictions do not capture the temporal variation in the test set +```{r Plotting predictions against test data} +plot(model1b, type = 'forecast', newdata = data_test) +``` + +As with the `hindcast` function, we can use the `forecast` function to automatically extract the posterior distributions for these predictions. This also returns an object of class `mvgam_forecast`, but now it will contain both the hindcasts and forecasts for each series in the data: +```{r Extract posterior forecasts} +fc <- forecast(model1b) +str(fc) +``` + +## Adding predictors as "fixed" effects +Any users familiar with GLMs will know that we nearly always wish to include predictor variables that may explain some of the variation in our observations. Predictors are easily incorporated into GLMs / GAMs. Here, we will update the model from above by including a parametric (fixed) effect of `ndvi` as a linear predictor: +```{r model2, include=FALSE, message=FALSE, warning=FALSE} +model2 <- mvgam(count ~ s(year_fac, bs = 're') + + ndvi - 1, + family = poisson(), + data = data_train, + newdata = data_test, + parallel = FALSE) +``` + +```{r eval=FALSE} +model2 <- mvgam(count ~ s(year_fac, bs = 're') + + ndvi - 1, + family = poisson(), + data = data_train, + newdata = data_test) +``` + +The model can be described mathematically as follows: +\begin{align*} +\boldsymbol{count}_t & \sim \text{Poisson}(\lambda_t) \\ +log(\lambda_t) & = \beta_{year[year_t]} + \beta_{ndvi} * \boldsymbol{ndvi}_t \\ +\beta_{year} & \sim \text{Normal}(\mu_{year}, \sigma_{year}) \\ +\beta_{ndvi} & \sim \text{Normal}(0, 1) \end{align*} + +Where the $\beta_{year}$ effects are the same as before but we now have another predictor $(\beta_{ndvi})$ that applies to the `ndvi` value at each timepoint $t$. Inspect the summary of this model + +```{r, class.output="scroll-300"} +summary(model2) +``` + +Rather than printing the summary each time, we can also quickly look at the posterior empirical quantiles for the fixed effect of `ndvi` (and other linear predictor coefficients) using `coef`: +```{r Posterior quantiles of model coefficients} +coef(model2) +``` + +Look at the estimated effect of `ndvi` using `plot.mvgam` with `type = 'pterms'` +```{r Plot NDVI effect} +plot(model2, type = 'pterms') +``` + +This plot indicates a positive linear effect of `ndvi` on `log(counts)`. But it may be easier to visualise using a histogram, especially for parametric (linear) effects. 
This can be done by first extracting the posterior coefficients as we did in the first example: +```{r} +beta_post <- as.data.frame(model2, variable = 'betas') +dplyr::glimpse(beta_post) +``` + +The posterior distribution for the effect of `ndvi` is stored in the `ndvi` column. A quick histogram confirms our inference that `log(counts)` respond positively to increases in `ndvi`: +```{r Histogram of NDVI effects} +hist(beta_post$ndvi, + xlim = c(-1 * max(abs(beta_post$ndvi)), + max(abs(beta_post$ndvi))), + col = 'darkred', + border = 'white', + xlab = expression(beta[NDVI]), + ylab = '', + yaxt = 'n', + main = '', + lwd = 2) +abline(v = 0, lwd = 2.5) +``` + +### `marginaleffects` support +Given our model used a nonlinear link function (log link in this example), it can still be difficult to fully understand what relationship our model is estimating between a predictor and the response. Fortunately, the `marginaleffects` package makes this relatively straightforward. Objects of class `mvgam` can be used with `marginaleffects` to inspect contrasts, scenario-based predictions, conditional and marginal effects, all on the outcome scale. Here we will use the `plot_predictions` function from `marginaleffects` to inspect the conditional effect of `ndvi` (use `?plot_predictions` for guidance on how to modify these plots): +```{r warning=FALSE} +plot_predictions(model2, + condition = "ndvi", + # include the observed count values + # as points, and show rugs for the observed + # ndvi and count values on the axes + points = 0.5, rug = TRUE) +``` + +Now it is easier to get a sense of the nonlinear but positive relationship estimated between `ndvi` and `count`. Like `brms`, `mvgam` has the simple `conditional_effects` function to make quick and informative plots for main effects. This will likely be your go-to function for quickly understanding patterns from fitted `mvgam` models +```{r warning=FALSE} +plot(conditional_effects(model2), ask = FALSE) +``` + +## Adding predictors as smooths + +Smooth functions, using penalized splines, are a major feature of `mvgam`. Nonlinear splines are commonly viewed as variations of random effects in which the coefficients that control the shape of the spline are drawn from a joint, penalized distribution. This strategy is very often used in ecological time series analysis to capture smooth temporal variation in the processes we seek to study. When we construct smoothing splines, the workhorse package `mgcv` will calculate a set of basis functions that will collectively control the shape and complexity of the resulting spline. It is often helpful to visualize these basis functions to get a better sense of how splines work. We'll create a set of 6 basis functions to represent possible variation in the effect of `time` on our outcome.In addition to constructing the basis functions, `mgcv` also creates a penalty matrix $S$, which contains **known** coefficients that work to constrain the wiggliness of the resulting smooth function. When fitting a GAM to data, we must estimate the smoothing parameters ($\lambda$) that will penalize these matrices, resulting in constrained basis coefficients and smoother functions that are less likely to overfit the data. This is the key to fitting GAMs in a Bayesian framework, as we can jointly estimate the $\lambda$'s using informative priors to prevent overfitting and expand the complexity of models we can tackle. To see this in practice, we can now fit a model that replaces the yearly random effects with a smooth function of `time`. 
We will need a reasonably complex function (large `k`) to try and accommodate the temporal variation in our observations. Following some [useful advice by Gavin Simpson](https://fromthebottomoftheheap.net/2020/06/03/extrapolating-with-gams/){target="_blank"}, we will use a b-spline basis for the temporal smooth. Because we no longer have intercepts for each year, we also retain the primary intercept term in this model (there is no `-1` in the formula now): +```{r model3, include=FALSE, message=FALSE, warning=FALSE} +model3 <- mvgam(count ~ s(time, bs = 'bs', k = 15) + + ndvi, + family = poisson(), + data = data_train, + newdata = data_test, + parallel = FALSE) +``` + +```{r eval=FALSE} +model3 <- mvgam(count ~ s(time, bs = 'bs', k = 15) + + ndvi, + family = poisson(), + data = data_train, + newdata = data_test) +``` + +The model can be described mathematically as follows: +\begin{align*} +\boldsymbol{count}_t & \sim \text{Poisson}(\lambda_t) \\ +log(\lambda_t) & = f(\boldsymbol{time})_t + \beta_{ndvi} * \boldsymbol{ndvi}_t \\ +f(\boldsymbol{time}) & = \sum_{k=1}^{K}b * \beta_{smooth} \\ +\beta_{smooth} & \sim \text{MVNormal}(0, (\Omega * \lambda)^{-1}) \\ +\beta_{ndvi} & \sim \text{Normal}(0, 1) \end{align*} + + +Where the smooth function $f_{time}$ is built by summing across a set of weighted basis functions. The basis functions $(b)$ are constructed using a thin plate regression basis in `mgcv`. The weights $(\beta_{smooth})$ are drawn from a penalized multivariate normal distribution where the precision matrix $(\Omega$) is multiplied by a smoothing penalty $(\lambda)$. If $\lambda$ becomes large, this acts to *squeeze* the covariances among the weights $(\beta_{smooth})$, leading to a less wiggly spline. Note that sometimes there are multiple smoothing penalties that contribute to the covariance matrix, but I am only showing one here for simplicity. View the summary as before +```{r} +summary(model3) +``` + +The summary above now contains posterior estimates for the smoothing parameters as well as the basis coefficients for the nonlinear effect of `time`. We can visualize the conditional `time` effect using the `plot` function with `type = 'smooths'`: +```{r} +plot(model3, type = 'smooths') +``` + +By default this plots shows posterior empirical quantiles, but it can also be helpful to view some realizations of the underlying function (here, each line is a different potential curve drawn from the posterior of all possible curves): +```{r} +plot(model3, type = 'smooths', realisations = TRUE, + n_realisations = 30) +``` + +### Derivatives of smooths +A useful question when modelling using GAMs is to identify where the function is changing most rapidly. To address this, we can plot estimated 1st derivatives of the spline: +```{r Plot smooth term derivatives, warning = FALSE, fig.asp = 1} +plot(model3, type = 'smooths', derivatives = TRUE) +``` + +Here, values above `>0` indicate the function was increasing at that time point, while values `<0` indicate the function was declining. The most rapid declines appear to have been happening around timepoints 50 and again toward the end of the training period, for example. 
Use `conditional_effects` again for useful plots on the outcome scale:
```{r warning=FALSE}
plot(conditional_effects(model3), ask = FALSE)
```

Or on the link scale:
```{r warning=FALSE}
plot(conditional_effects(model3, type = 'link'), ask = FALSE)
```

Inspect the underlying `Stan` code to gain some idea of how the spline is being penalized:
```{r, class.output="scroll-300"}
code(model3)
```

The line below `// prior for s(time)...` shows how the spline basis coefficients are drawn from a zero-centred multivariate normal distribution. The penalty matrix $S$ is penalized by two different smoothing parameters (the $\lambda$'s) to enforce smoothness and reduce overfitting.

## Latent dynamics in `mvgam`

Forecasts from the above model are not ideal:
```{r}
plot(model3, type = 'forecast', newdata = data_test)
```

Why is this happening? The forecasts are driven almost entirely by variation in the temporal spline, which is extrapolating linearly *forever* beyond the edge of the training data. Any slight wiggles near the end of the training set will result in wildly different forecasts. To visualize this, we can plot the extrapolated temporal function into the out-of-sample test set. Here is the extrapolated function for this model, which used 15 basis functions:
```{r Plot extrapolated temporal functions using newdata}
plot_mvgam_smooth(model3, smooth = 's(time)',
                  # feed newdata to the plot function to generate
                  # predictions of the temporal smooth to the end of the
                  # testing period
                  newdata = data.frame(time = 1:max(data_test$time),
                                       ndvi = 0))
abline(v = max(data_train$time), lty = 'dashed', lwd = 2)
```

This model is not doing well. Clearly we need to somehow account for the strong temporal autocorrelation when modelling these data without using a smooth function of `time`. Now onto another prominent feature of `mvgam`: the ability to include (possibly latent) autocorrelated residuals in regression models. To do so, we use the `trend_model` argument (see `?mvgam_trends` for details of different dynamic trend models that are supported). This model will use a separate sub-model for latent residuals that evolve as an AR1 process (i.e. the error in the current time point is a function of the error in the previous time point, plus some stochastic noise). We also include a smooth function of `ndvi` in this model, rather than the parametric term that was used above, to showcase that `mvgam` can include combinations of smooths and dynamic components:
```{r model4, include=FALSE}
model4 <- mvgam(count ~ s(ndvi, k = 6),
                family = poisson(),
                data = data_train,
                newdata = data_test,
                trend_model = 'AR1',
                parallel = FALSE)
```

```{r eval=FALSE}
model4 <- mvgam(count ~ s(ndvi, k = 6),
                family = poisson(),
                data = data_train,
                newdata = data_test,
                trend_model = 'AR1')
```

The model can be described mathematically as follows:
\begin{align*}
\boldsymbol{count}_t & \sim \text{Poisson}(\lambda_t) \\
log(\lambda_t) & = f(\boldsymbol{ndvi})_t + z_t \\
z_t & \sim \text{Normal}(ar1 * z_{t-1}, \sigma_{error}) \\
ar1 & \sim \text{Normal}(0, 1)[-1, 1] \\
\sigma_{error} & \sim \text{Exponential}(2) \\
f(\boldsymbol{ndvi}) & = \sum_{k=1}^{K}b * \beta_{smooth} \\
\beta_{smooth} & \sim \text{MVNormal}(0, (\Omega * \lambda)^{-1}) \end{align*}

Here the term $z_t$ captures autocorrelated latent residuals, which are modelled using an AR1 process.
You can also notice that this model is estimating autocorrelated errors for the full time period, even though some of these time points have missing observations. This is useful for getting more realistic estimates of the residual autocorrelation parameters. Summarise the model to see how it now returns posterior summaries for the latent AR1 process:
```{r Summarise the mvgam autocorrelated error model, class.output="scroll-300"}
summary(model4)
```

View conditional smooths for the `ndvi` effect:
```{r warning=FALSE, message=FALSE}
plot_predictions(model4,
                 condition = "ndvi",
                 points = 0.5, rug = TRUE)
```

View posterior hindcasts / forecasts and compare against the out-of-sample test data:
```{r}
plot(model4, type = 'forecast', newdata = data_test)
```

The trend is evolving as an AR1 process, which we can also view:
```{r}
plot(model4, type = 'trend', newdata = data_test)
```

In-sample model performance can be interrogated using leave-one-out cross-validation utilities from the `loo` package (a higher value is preferred for this metric):
```{r}
loo_compare(model3, model4)
```

The higher estimated log predictive density (ELPD) value for the dynamic model suggests it provides a better fit to the in-sample data.

Though it should be obvious that this model provides better forecasts, we can quantify forecast performance for models 3 and 4 using the `forecast` and `score` functions. Here we will compare models based on their Discrete Ranked Probability Scores (a lower value is preferred for this metric):
```{r}
fc_mod3 <- forecast(model3)
fc_mod4 <- forecast(model4)
score_mod3 <- score(fc_mod3, score = 'drps')
score_mod4 <- score(fc_mod4, score = 'drps')
sum(score_mod4$PP$score, na.rm = TRUE) - sum(score_mod3$PP$score, na.rm = TRUE)
```

A strongly negative value here suggests the score for the dynamic model (model 4) is much smaller than the score for the model with a smooth function of time (model 3).

## Interested in contributing?
I'm actively seeking PhD students and other researchers to work in the areas of ecological forecasting, multivariate model evaluation and development of `mvgam`. Please reach out if you are interested (n.clark'at'uq.edu.au)
diff --git a/inst/doc/mvgam_overview.html b/inst/doc/mvgam_overview.html
new file mode 100644
index 00000000..6ebc839b
--- /dev/null
+++ b/inst/doc/mvgam_overview.html
@@ -0,0 +1,1890 @@

Overview of the mvgam package

+

Nicholas J Clark

+

2024-04-16

+ + +
+ +
+ +

The purpose of this vignette is to give a general overview of the +mvgam package and its primary functions.

+
+

Dynamic GAMs

+

mvgam is designed to propagate unobserved temporal processes to capture latent dynamics in the observed time series. This works in a state-space format, with the temporal trend evolving independently of the observation process. An introduction to the package and some worked examples are also shown in this seminar: Ecological Forecasting with Dynamic Generalized Additive Models. Briefly, assume \(\tilde{\boldsymbol{y}}_{i,t}\) is the conditional expectation of response variable \(\boldsymbol{i}\) at time \(\boldsymbol{t}\). Assuming \(\boldsymbol{y_i}\) is drawn from an exponential family distribution with an invertible link function, the linear predictor for a multivariate Dynamic GAM can be written as:

+

\[for~i~in~1:N_{series}~...\] \[for~t~in~1:N_{timepoints}~...\]

+

\[g^{-1}(\tilde{\boldsymbol{y}}_{i,t})=\alpha_{i}+\sum\limits_{j=1}^J\boldsymbol{s}_{i,j,t}\boldsymbol{x}_{j,t}+\boldsymbol{z}_{i,t}\,,\] +Here \(\alpha\) are the unknown +intercepts, the \(\boldsymbol{s}\)’s +are unknown smooth functions of covariates (\(\boldsymbol{x}\)’s), which can potentially +vary among the response series, and \(\boldsymbol{z}\) are dynamic latent +processes. Each smooth function \(\boldsymbol{s_j}\) is composed of basis +expansions whose coefficients, which must be estimated, control the +functional relationship between \(\boldsymbol{x}_{j}\) and \(g^{-1}(\tilde{\boldsymbol{y}})\). The size +of the basis expansion limits the smooth’s potential complexity. A +larger set of basis functions allows greater flexibility. For more +information on GAMs and how they can smooth through data, see this blogpost on how to interpret nonlinear effects from +Generalized Additive Models.

+

Several advantages of GAMs are that they can model a diversity of +response families, including discrete distributions (i.e. Poisson, +Negative Binomial, Gamma) that accommodate common ecological features +such as zero-inflation or overdispersion, and that they can be +formulated to include hierarchical smoothing for multivariate responses. +mvgam supports a number of different observation families, +which are summarized below:

+
+
+

Supported observation families

| Distribution | Function | Support | Extra parameter(s) |
| --- | --- | --- | --- |
| Gaussian (identity link) | `gaussian()` | Real values in \((-\infty, \infty)\) | \(\sigma\) |
| Student's T (identity link) | `student_t()` | Heavy-tailed real values in \((-\infty, \infty)\) | \(\sigma\), \(\nu\) |
| LogNormal (identity link) | `lognormal()` | Positive real values in \([0, \infty)\) | \(\sigma\) |
| Gamma (log link) | `Gamma()` | Positive real values in \([0, \infty)\) | \(\alpha\) |
| Beta (logit link) | `betar()` | Real values (proportional) in \([0,1]\) | \(\phi\) |
| Bernoulli (logit link) | `bernoulli()` | Binary data in \(\{0,1\}\) | - |
| Poisson (log link) | `poisson()` | Non-negative integers in \((0,1,2,...)\) | - |
| Negative Binomial2 (log link) | `nb()` | Non-negative integers in \((0,1,2,...)\) | \(\phi\) |
| Binomial (logit link) | `binomial()` | Non-negative integers in \((0,1,2,...)\) | - |
| Beta-Binomial (logit link) | `beta_binomial()` | Non-negative integers in \((0,1,2,...)\) | \(\phi\) |
| Poisson Binomial N-mixture (log link) | `nmix()` | Non-negative integers in \((0,1,2,...)\) | - |

For all supported observation families, any extra parameters that +need to be estimated (i.e. the \(\sigma\) in a Gaussian model or the \(\phi\) in a Negative Binomial model) are by +default estimated independently for each series. However, users can opt +to force all series to share extra observation parameters using +share_obs_params = TRUE in mvgam(). Note that +default link functions cannot currently be changed.
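As a sketch of how this option might be used (not part of the vignette's worked examples; the simulated data and smooth term here are illustrative only):

```{r eval=FALSE}
# Sketch: estimate a single Negative Binomial overdispersion parameter
# (phi) that is shared by all series, rather than one phi per series
simdat <- sim_mvgam(n_series = 3, family = nb())
mod_shared <- mvgam(y ~ s(season, bs = 'cc'),
                    family = nb(),
                    data = simdat$data_train,
                    share_obs_params = TRUE)
```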

+
+
+

Supported temporal dynamic processes

+

The dynamic processes can take a wide variety of forms, some of which +can be multivariate to allow the different time series to interact or be +correlated. When using the mvgam() function, the user +chooses between different process models with the +trend_model argument. Available process models are +described in detail below.

+
+

Independent Random Walks

+

Use trend_model = 'RW' or +trend_model = RW() to set up a model where each series in +data has independent latent temporal dynamics of the +form:

+

\[\begin{align*} +z_{i,t} & \sim \text{Normal}(z_{i,t-1}, \sigma_i) +\end{align*}\]

+

Process error parameters \(\sigma\) +are modeled independently for each series. If a moving average process +is required, use trend_model = RW(ma = TRUE) to set up the +following:

+

\[\begin{align*} +z_{i,t} & = z_{i,t-1} + \theta_i * error_{i,t-1} + error_{i,t} \\ +error_{i,t} & \sim \text{Normal}(0, \sigma_i) +\end{align*}\]

+

Moving average coefficients \(\theta\) are independently estimated for +each series and will be forced to be stationary by default \((abs(\theta)<1)\). Only moving averages +of order \(q=1\) are currently +allowed.

+
+
+

Multivariate Random Walks

+

If more than one series is included in data \((N_{series} > 1)\), a multivariate +Random Walk can be set up using +trend_model = RW(cor = TRUE), resulting in the +following:

+

\[\begin{align*} +z_{t} & \sim \text{MVNormal}(z_{t-1}, \Sigma) +\end{align*}\]

+

Where the latent process estimate \(z_t\) now takes the form of a vector. The +covariance matrix \(\Sigma\) will +capture contemporaneously correlated process errors. It is parameterised +using a Cholesky factorization, which requires priors on the +series-level variances \(\sigma\) and +on the strength of correlations using Stan’s +lkj_corr_cholesky distribution.

+

Moving average terms can also be included for multivariate random walks, in which case the moving average coefficients \(\theta\) will be parameterised as an \(N_{series} * N_{series}\) matrix.
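For example (a sketch only, assuming multivariate count data such as the simulated `simdat$data_train` from the sketch above):

```{r eval=FALSE}
# Sketch: correlated multivariate random walk with MA(1) terms
mod_mvrw <- mvgam(y ~ 1,
                  trend_model = RW(ma = TRUE, cor = TRUE),
                  data = simdat$data_train)
```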

+
+
+

Autoregressive processes

+

Autoregressive models up to \(p=3\), +in which the autoregressive coefficients are estimated independently for +each series, can be used by specifying trend_model = 'AR1', +trend_model = 'AR2', trend_model = 'AR3', or +trend_model = AR(p = 1, 2, or 3). For example, a univariate +AR(1) model takes the form:

+

\[\begin{align*} +z_{i,t} & \sim \text{Normal}(ar1_i * z_{i,t-1}, \sigma_i) +\end{align*}\]

+

All options are the same as for Random Walks, but additional options +will be available for placing priors on the autoregressive coefficients. +By default, these coefficients will not be forced into stationarity, but +users can impose this restriction by changing the upper and lower bounds +on their priors. See ?get_mvgam_priors for more +details.
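As a sketch of how such a restriction could be imposed (the particular prior used here is illustrative only, and assumes the brms-style prior interface that mvgam accepts via its priors argument):

```{r eval=FALSE}
# Sketch: AR(1) dynamics with the autoregressive coefficient
# restricted to the stationary region via prior bounds
mod_ar <- mvgam(y ~ 1,
                trend_model = AR(p = 1),
                priors = brms::prior(normal(0, 0.5), class = ar1,
                                     lb = -1, ub = 1),
                data = simdat$data_train)
```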

+
+
+

Vector Autoregressive processes

+

A Vector Autoregression of order \(p=1\) can be specified if \(N_{series} > 1\) using +trend_model = 'VAR1' or trend_model = VAR(). A +VAR(1) model takes the form:

+

\[\begin{align*} +z_{t} & \sim \text{Normal}(A * z_{t-1}, \Sigma) +\end{align*}\]

+

Where \(A\) is an \(N_{series} * N_{series}\) matrix of +autoregressive coefficients in which the diagonals capture lagged +self-dependence (i.e. the effect of a process at time \(t\) on its own estimate at time \(t+1\)), while off-diagonals capture lagged +cross-dependence (i.e. the effect of a process at time \(t\) on the process for another series at +time \(t+1\)). By default, the +covariance matrix \(\Sigma\) will +assume no process error covariance by fixing the off-diagonals to \(0\). To allow for correlated errors, use +trend_model = 'VAR1cor' or +trend_model = VAR(cor = TRUE). A moving average of order +\(q=1\) can also be included using +trend_model = VAR(ma = TRUE, cor = TRUE).

+

Note that for all VAR models, stationarity of the process is enforced with a structured prior distribution that is described in detail in Heaps (2023)

+

Heaps, Sarah E. “Enforcing +stationarity through the prior in vector autoregressions.” +Journal of Computational and Graphical Statistics 32.1 (2023): +74-83.

+
+
+

Gaussian Processes

+

The final option for modelling temporal dynamics is to use a Gaussian +Process with squared exponential kernel. These are set up independently +for each series (there is currently no multivariate GP option), using +trend_model = 'GP'. The dynamics for each latent process +are modelled as:

+

\[\begin{align*} +z & \sim \text{MVNormal}(0, \Sigma_{error}) \\ +\Sigma_{error}[t_i, t_j] & = \alpha^2 * exp(-0.5 * ((|t_i - t_j| / +\rho))^2) \end{align*}\]

+

The latent dynamic process evolves from a complex, high-dimensional +Multivariate Normal distribution which depends on \(\rho\) (often called the length scale +parameter) to control how quickly the correlations between the model’s +errors decay as a function of time. For these models, covariance decays +exponentially fast with the squared distance (in time) between the +observations. The functions also depend on a parameter \(\alpha\), which controls the marginal +variability of the temporal function at all points; in other words it +controls how much the GP term contributes to the linear predictor. +mvgam capitalizes on some advances that allow GPs to be +approximated using Hilbert space basis functions, which considerably speed up computation at little cost to +accuracy or prediction performance.
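To make the kernel concrete, here is a small illustrative calculation of the covariance implied by the formula above (the parameter values are invented):

```{r eval=FALSE}
# Sketch: squared exponential covariance between two timepoints,
# with made-up values for alpha (marginal sd) and rho (length scale)
alpha <- 1
rho <- 5
t_i <- 1
t_j <- 4
alpha^2 * exp(-0.5 * (abs(t_i - t_j) / rho)^2)
```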

+
+ +
+

Continuous time AR(1) processes

+

Most trend models in the mvgam() function expect time to be measured in regularly-spaced, discrete intervals (i.e. one measurement per week, or one per year, for example). But some time series are taken at irregular intervals, and we'd like to model their autoregressive properties. Using trend_model = CAR() can be useful for setting up these models, which currently only support autoregressive processes of order 1. The evolution of the latent dynamic process follows the form:

+

\[\begin{align*} +z_{i,t} & \sim \text{Normal}(ar1_i^{distance} * z_{i,t-1}, \sigma_i) +\end{align*}\]

+

Where \(distance\) is a vector of +non-negative measurements of the time differences between successive +observations. See the Examples section in +?CAR for an illustration of how to set these models up.
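A sketch of how irregularly-sampled data might be set up for a CAR(1) model (the data here are invented purely for illustration):

```{r eval=FALSE}
# Sketch: a single series observed at irregular time intervals;
# CAR(1) scales the AR parameter by the distance between observations
irreg_dat <- data.frame(y = rpois(10, lambda = 5),
                        series = factor('series1'),
                        time = c(1, 2, 4, 7, 8, 11, 15, 16, 19, 22))
mod_car <- mvgam(y ~ 1,
                 family = poisson(),
                 trend_model = CAR(p = 1),
                 data = irreg_dat)
```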

+
+
+
+

Regression formulae

+

mvgam supports an observation model regression formula, built off the mgcv package, as well as an optional process model regression formula. The formulae supplied to mvgam() are exactly like those supplied to glm() except that smooth terms, s(), te(), ti() and t2(), time-varying effects using dynamic(), monotonically increasing (using s(x, bs = 'moi')) or decreasing splines (using s(x, bs = 'mod'); see ?smooth.construct.moi.smooth.spec for details), as well as Gaussian Process functions using gp(), can be added to the right hand side (and . is not supported in mvgam formulae). See ?mvgam_formulae for more guidance.
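As a sketch of how several of these term types can be combined in one observation formula (the data frame `dat` and covariates `x1`, `x2` and `x3` are placeholders, not objects from this vignette):

```{r eval=FALSE}
# Sketch: mixing smooth, Gaussian Process and time-varying terms
mvgam(y ~ s(x1) +                  # penalized smooth of x1
        gp(x2) +                   # Gaussian Process effect of x2
        dynamic(x3, rho = 10),     # time-varying effect of x3
      trend_model = AR(),
      data = dat)
```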

+

For setting up State-Space models, the optional process model formula +can be used (see the +State-Space model vignette and the +shared latent states vignette for guidance on using trend +formulae).

+
+
+

Example time series data

+

The ‘portal_data’ object contains time series of rodent captures from the Portal Project, a long-term monitoring study based near the town of Portal, Arizona. Researchers have been operating a standardized set of baited traps within 24 experimental plots at this site since the 1970s. Sampling follows the lunar monthly cycle, with observations occurring on average about 28 days apart. However, missing observations do occur due to difficulties accessing the site (weather events, COVID disruptions etc…). You can read about the full sampling protocol in this preprint by Ernest et al on the Biorxiv.

+
data("portal_data")
+

As the data come pre-loaded with the mvgam package, you +can read a little about it in the help page using +?portal_data. Before working with data, it is important to +inspect how the data are structured, first using head:

+
head(portal_data)
+#>   moon DM DO PP OT year month mintemp precipitation     ndvi
+#> 1  329 10  6  0  2 2004     1  -9.710          37.8 1.465889
+#> 2  330 14  8  1  0 2004     2  -5.924           8.7 1.558507
+#> 3  331  9  1  2  1 2004     3  -0.220          43.5 1.337817
+#> 4  332 NA NA NA NA 2004     4   1.931          23.9 1.658913
+#> 5  333 15  8 10  1 2004     5   6.568           0.9 1.853656
+#> 6  334 NA NA NA NA 2004     6  11.590           1.4 1.761330
+

But the glimpse function in dplyr is also +useful for understanding how variables are structured

+
dplyr::glimpse(portal_data)
+#> Rows: 199
+#> Columns: 10
+#> $ moon          <int> 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 3…
+#> $ DM            <int> 10, 14, 9, NA, 15, NA, NA, 9, 5, 8, NA, 14, 7, NA, NA, 9…
+#> $ DO            <int> 6, 8, 1, NA, 8, NA, NA, 3, 3, 4, NA, 3, 8, NA, NA, 3, NA…
+#> $ PP            <int> 0, 1, 2, NA, 10, NA, NA, 16, 18, 12, NA, 3, 2, NA, NA, 1…
+#> $ OT            <int> 2, 0, 1, NA, 1, NA, NA, 1, 0, 0, NA, 2, 1, NA, NA, 1, NA…
+#> $ year          <int> 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 20…
+#> $ month         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6,…
+#> $ mintemp       <dbl> -9.710, -5.924, -0.220, 1.931, 6.568, 11.590, 14.370, 16…
+#> $ precipitation <dbl> 37.8, 8.7, 43.5, 23.9, 0.9, 1.4, 20.3, 91.0, 60.5, 25.2,…
+#> $ ndvi          <dbl> 1.4658889, 1.5585069, 1.3378172, 1.6589129, 1.8536561, 1…
+

We will focus analyses on the time series of captures for one +specific rodent species, the Desert Pocket Mouse Chaetodipus +penicillatus. This species is interesting in that it goes into a +kind of “hibernation” during the colder months, leading to very low +captures during the winter period

+
+
+

Manipulating data for modelling

+

Manipulating the data into a ‘long’ format is necessary for modelling +in mvgam. By ‘long’ format, we mean that each +series x time observation needs to have its own entry in +the dataframe or list object that we wish to +use as data for modelling. A simple example can be viewed by simulating +data using the sim_mvgam function. See +?sim_mvgam for more details

+
data <- sim_mvgam(n_series = 4, T = 24)
+head(data$data_train, 12)
+#>    y season year   series time
+#> 1  1      1    1 series_1    1
+#> 2  0      1    1 series_2    1
+#> 3  1      1    1 series_3    1
+#> 4  1      1    1 series_4    1
+#> 5  1      2    1 series_1    2
+#> 6  1      2    1 series_2    2
+#> 7  0      2    1 series_3    2
+#> 8  1      2    1 series_4    2
+#> 9  2      3    1 series_1    3
+#> 10 0      3    1 series_2    3
+#> 11 0      3    1 series_3    3
+#> 12 1      3    1 series_4    3
+

Notice how we have four different time series in these simulated +data, but we do not spread the outcome values into different columns. +Rather, there is only a single column for the outcome variable, labelled +y in these simulated data. We also must supply a variable +labelled time to ensure the modelling software knows how to +arrange the time series when building models. This setup still allows us +to formulate multivariate time series models, as you can see in the State-Space +vignette. Below are the steps needed to shape our +portal_data object into the correct form. First, we create +a time variable, select the column representing counts of +our target species (PP), and select appropriate variables +that we can use as predictors

+
portal_data %>%
+  
+  # mvgam requires a 'time' variable be present in the data to index
+  # the temporal observations. This is especially important when tracking 
+  # multiple time series. In the Portal data, the 'moon' variable indexes the
+  # lunar monthly timestep of the trapping sessions
+  dplyr::mutate(time = moon - (min(moon)) + 1) %>%
+  
+  # We can also provide a more informative name for the outcome variable, which 
+  # is counts of the 'PP' species (Chaetodipus penicillatus) across all control
+  # plots
+  dplyr::mutate(count = PP) %>%
+  
+  # The other requirement for mvgam is a 'series' variable, which needs to be a
+  # factor variable to index which time series each row in the data belongs to.
+  # Again, this is more useful when you have multiple time series in the data
+  dplyr::mutate(series = as.factor('PP')) %>%
+  
+  # Select the variables of interest to keep in the model_data
+  dplyr::select(series, year, time, count, mintemp, ndvi) -> model_data
+

The data now contain six variables:

- `series`, a factor indexing which time series each observation belongs to
- `year`, the year of sampling
- `time`, the indicator of which time step each observation belongs to
- `count`, the response variable representing the number of captures of the species `PP` in each sampling observation
- `mintemp`, the monthly average minimum temperature at each time step
- `ndvi`, the monthly average Normalized Difference Vegetation Index at each time step

+

Now check the data structure again

+
head(model_data)
+#>   series year time count mintemp     ndvi
+#> 1     PP 2004    1     0  -9.710 1.465889
+#> 2     PP 2004    2     1  -5.924 1.558507
+#> 3     PP 2004    3     2  -0.220 1.337817
+#> 4     PP 2004    4    NA   1.931 1.658913
+#> 5     PP 2004    5    10   6.568 1.853656
+#> 6     PP 2004    6    NA  11.590 1.761330
+
dplyr::glimpse(model_data)
+#> Rows: 199
+#> Columns: 6
+#> $ series  <fct> PP, PP, PP, PP, PP, PP, PP, PP, PP, PP, PP, PP, PP, PP, PP, PP…
+#> $ year    <int> 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 20…
+#> $ time    <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,…
+#> $ count   <int> 0, 1, 2, NA, 10, NA, NA, 16, 18, 12, NA, 3, 2, NA, NA, 13, NA,…
+#> $ mintemp <dbl> -9.710, -5.924, -0.220, 1.931, 6.568, 11.590, 14.370, 16.520, …
+#> $ ndvi    <dbl> 1.4658889, 1.5585069, 1.3378172, 1.6589129, 1.8536561, 1.76132…
+

You can also summarize multiple variables at once, which is helpful for checking data ranges and identifying missing values

+
summary(model_data)
+#>  series        year           time           count          mintemp       
+#>  PP:199   Min.   :2004   Min.   :  1.0   Min.   : 0.00   Min.   :-24.000  
+#>           1st Qu.:2008   1st Qu.: 50.5   1st Qu.: 2.50   1st Qu.: -3.884  
+#>           Median :2012   Median :100.0   Median :12.00   Median :  2.130  
+#>           Mean   :2012   Mean   :100.0   Mean   :15.14   Mean   :  3.504  
+#>           3rd Qu.:2016   3rd Qu.:149.5   3rd Qu.:24.00   3rd Qu.: 12.310  
+#>           Max.   :2020   Max.   :199.0   Max.   :65.00   Max.   : 18.140  
+#>                                          NA's   :36                       
+#>       ndvi       
+#>  Min.   :0.2817  
+#>  1st Qu.:1.0741  
+#>  Median :1.3501  
+#>  Mean   :1.4709  
+#>  3rd Qu.:1.8178  
+#>  Max.   :3.9126  
+#> 
+

We have some NAs in our response variable +count. Let’s visualize the data as a heatmap to get a sense +of where these are distributed (NAs are shown as red bars +in the below plot)

+
image(is.na(t(model_data %>%
+                dplyr::arrange(dplyr::desc(time)))), axes = F,
+      col = c('grey80', 'darkred'))
+axis(3, at = seq(0,1, len = NCOL(model_data)), labels = colnames(model_data))
+

+

These observations will generally be thrown out by most modelling packages in R. But as you will see when we work through the tutorials, mvgam keeps these in the data so that predictions can be automatically returned for the full dataset. The time series and some of its descriptive features can be plotted using plot_mvgam_series():

+
plot_mvgam_series(data = model_data, series = 1, y = 'count')
+

+
+
+

GLMs with temporal random effects

+

Our first task will be to fit a Generalized Linear Model (GLM) that can adequately capture the features of our count observations (integer data, lower bound at zero, missing values) while also attempting to model temporal variation. We are almost ready to fit our first model, which will be a GLM with Poisson observations, a log link function and random (hierarchical) intercepts for year. This will allow us to capture our prior belief that, although each year is unique, all years have been sampled from the same population of effects and are thus connected, potentially containing valuable information about one another. This is done by capitalizing on the partial pooling properties of hierarchical models. Hierarchical (also known as random) effects offer many advantages when modelling data with grouping structures (i.e. multiple species, locations, years etc…). The ability to incorporate these in time series models is a huge advantage over traditional models such as ARIMA or Exponential Smoothing. But before we fit the model, we will need to convert year to a factor so that we can use a random effect basis in mvgam. See ?smooth.terms and ?smooth.construct.re.smooth.spec for details about the re basis construction that is used by both mvgam and mgcv

+
model_data %>%
+  
+  # Create a 'year_fac' factor version of 'year'
+  dplyr::mutate(year_fac = factor(year)) -> model_data
+

Preview the dataset to ensure year is now a factor with a unique +factor level for each year in the data

+
dplyr::glimpse(model_data)
+#> Rows: 199
+#> Columns: 7
+#> $ series   <fct> PP, PP, PP, PP, PP, PP, PP, PP, PP, PP, PP, PP, PP, PP, PP, P…
+#> $ year     <int> 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2…
+#> $ time     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
+#> $ count    <int> 0, 1, 2, NA, 10, NA, NA, 16, 18, 12, NA, 3, 2, NA, NA, 13, NA…
+#> $ mintemp  <dbl> -9.710, -5.924, -0.220, 1.931, 6.568, 11.590, 14.370, 16.520,…
+#> $ ndvi     <dbl> 1.4658889, 1.5585069, 1.3378172, 1.6589129, 1.8536561, 1.7613…
+#> $ year_fac <fct> 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2…
+levels(model_data$year_fac)
+#>  [1] "2004" "2005" "2006" "2007" "2008" "2009" "2010" "2011" "2012" "2013"
+#> [11] "2014" "2015" "2016" "2017" "2018" "2019" "2020"
+

We are now ready for our first mvgam model. The syntax +will be familiar to users who have previously built models with +mgcv. But for a refresher, see ?formula.gam +and the examples in ?gam. Random effects can be specified +using the s wrapper with the re basis. Note +that we can also suppress the primary intercept using the usual +R formula syntax - 1. mvgam has a +number of possible observation families that can be used, see +?mvgam_families for more information. We will use +Stan as the fitting engine, which deploys Hamiltonian Monte +Carlo (HMC) for full Bayesian inference. By default, 4 HMC chains will +be run using a warmup of 500 iterations and collecting 500 posterior +samples from each chain. The package will also aim to use the +Cmdstan backend when possible, so it is recommended that +users have an up-to-date installation of Cmdstan and the +associated cmdstanr interface on their machines (note that +you can set the backend yourself using the backend +argument: see ?mvgam for details). Interested users should +consult the Stan user’s guide for more information +about the software and the enormous variety of models that can be +tackled with HMC.

+
model1 <- mvgam(count ~ s(year_fac, bs = 're') - 1,
+                family = poisson(),
+                data = model_data)
+

The model can be described mathematically for each timepoint \(t\) as follows: \[\begin{align*} +\boldsymbol{count}_t & \sim \text{Poisson}(\lambda_t) \\ +log(\lambda_t) & = \beta_{year[year_t]} \\ +\beta_{year} & \sim \text{Normal}(\mu_{year}, \sigma_{year}) +\end{align*}\]

+

Where the \(\beta_{year}\) effects +are drawn from a population distribution that is parameterized +by a common mean \((\mu_{year})\) and +variance \((\sigma_{year})\). Priors on +most of the model parameters can be interrogated and changed using +similar functionality to the options available in brms. For +example, the default priors on \((\mu_{year})\) and \((\sigma_{year})\) can be viewed using the +following code:

+
get_mvgam_priors(count ~ s(year_fac, bs = 're') - 1,
+                 family = poisson(),
+                 data = model_data)
+#>                      param_name param_length           param_info
+#> 1             vector[1] mu_raw;            1 s(year_fac) pop mean
+#> 2 vector<lower=0>[1] sigma_raw;            1   s(year_fac) pop sd
+#>                               prior                example_change
+#> 1            mu_raw ~ std_normal();  mu_raw ~ normal(0.17, 0.76);
+#> 2 sigma_raw ~ student_t(3, 0, 2.5); sigma_raw ~ exponential(0.7);
+#>   new_lowerbound new_upperbound
+#> 1             NA             NA
+#> 2             NA             NA
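If we wanted to adopt one of the suggested alternatives, one option (shown here only as a sketch, using the data.frame returned by get_mvgam_priors() and the exponential prior from the example_change column above) is to edit the prior column and pass the result to the priors argument in mvgam():

```{r eval=FALSE}
# Sketch: tighten the prior on the random effect population sd by
# editing the prior column of the get_mvgam_priors() data.frame
priors <- get_mvgam_priors(count ~ s(year_fac, bs = 're') - 1,
                           family = poisson(),
                           data = model_data)
priors$prior[2] <- 'sigma_raw ~ exponential(0.7);'
model1_newprior <- mvgam(count ~ s(year_fac, bs = 're') - 1,
                         family = poisson(),
                         data = model_data,
                         priors = priors)
```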
+

See examples in ?get_mvgam_priors to find out different +ways that priors can be altered. Once the model has finished, the first +step is to inspect the summary to ensure no major +diagnostic warnings have been produced and to quickly summarise +posterior distributions for key parameters

+
summary(model1)
+#> GAM formula:
+#> count ~ s(year_fac, bs = "re") - 1
+#> 
+#> Family:
+#> poisson
+#> 
+#> Link function:
+#> log
+#> 
+#> Trend model:
+#> None
+#> 
+#> N series:
+#> 1 
+#> 
+#> N timepoints:
+#> 199 
+#> 
+#> Status:
+#> Fitted using Stan 
+#> 4 chains, each with iter = 1000; warmup = 500; thin = 1 
+#> Total post-warmup draws = 2000
+#> 
+#> 
+#> GAM coefficient (beta) estimates:
+#>                 2.5% 50% 97.5% Rhat n_eff
+#> s(year_fac).1   1.80 2.1   2.3 1.00  2663
+#> s(year_fac).2   2.50 2.7   2.8 1.00  2468
+#> s(year_fac).3   3.00 3.1   3.2 1.00  3105
+#> s(year_fac).4   3.10 3.3   3.4 1.00  2822
+#> s(year_fac).5   1.90 2.1   2.3 1.00  3348
+#> s(year_fac).6   1.50 1.8   2.0 1.00  2859
+#> s(year_fac).7   1.80 2.0   2.3 1.00  2995
+#> s(year_fac).8   2.80 3.0   3.1 1.00  3126
+#> s(year_fac).9   3.10 3.3   3.4 1.00  2816
+#> s(year_fac).10  2.60 2.8   2.9 1.00  2289
+#> s(year_fac).11  3.00 3.1   3.2 1.00  2725
+#> s(year_fac).12  3.10 3.2   3.3 1.00  2581
+#> s(year_fac).13  2.00 2.2   2.5 1.00  2885
+#> s(year_fac).14  2.50 2.6   2.8 1.00  2749
+#> s(year_fac).15  1.90 2.2   2.4 1.00  2943
+#> s(year_fac).16  1.90 2.1   2.3 1.00  2991
+#> s(year_fac).17 -0.33 1.1   1.9 1.01   356
+#> 
+#> GAM group-level estimates:
+#>                   2.5%  50% 97.5% Rhat n_eff
+#> mean(s(year_fac)) 2.00 2.40   2.7 1.01   193
+#> sd(s(year_fac))   0.44 0.67   1.1 1.02   172
+#> 
+#> Approximate significance of GAM smooths:
+#>              edf Ref.df Chi.sq p-value    
+#> s(year_fac) 13.8     17  23477  <2e-16 ***
+#> ---
+#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+#> 
+#> Stan MCMC diagnostics:
+#> n_eff / iter looks reasonable for all parameters
+#> Rhat looks reasonable for all parameters
+#> 0 of 2000 iterations ended with a divergence (0%)
+#> 0 of 2000 iterations saturated the maximum tree depth of 12 (0%)
+#> E-FMI indicated no pathological behavior
+#> 
+#> Samples were drawn using NUTS(diag_e) at Tue Apr 16 12:59:57 PM 2024.
+#> For each parameter, n_eff is a crude measure of effective sample size,
+#> and Rhat is the potential scale reduction factor on split MCMC chains
+#> (at convergence, Rhat = 1)
+

The diagnostic messages at the bottom of the summary show that the +HMC sampler did not encounter any problems or difficult posterior +spaces. This is a good sign. Posterior distributions for model +parameters can be extracted in any way that an object of class +brmsfit can (see ?mvgam::mvgam_draws for +details). For example, we can extract the coefficients related to the +GAM linear predictor (i.e. the \(\beta\)’s) into a data.frame +using:

+
beta_post <- as.data.frame(model1, variable = 'betas')
+dplyr::glimpse(beta_post)
+#> Rows: 2,000
+#> Columns: 17
+#> $ `s(year_fac).1`  <dbl> 2.17023, 2.08413, 1.99815, 2.17572, 2.11308, 2.03050,…
+#> $ `s(year_fac).2`  <dbl> 2.70488, 2.69887, 2.65551, 2.79651, 2.76044, 2.75108,…
+#> $ `s(year_fac).3`  <dbl> 3.08617, 3.13429, 3.04575, 3.14824, 3.10917, 3.09809,…
+#> $ `s(year_fac).4`  <dbl> 3.29529, 3.21044, 3.22018, 3.26644, 3.29880, 3.25638,…
+#> $ `s(year_fac).5`  <dbl> 2.11053, 2.14516, 2.13959, 2.05244, 2.26847, 2.20820,…
+#> $ `s(year_fac).6`  <dbl> 1.80418, 1.83343, 1.75987, 1.76972, 1.64782, 1.70765,…
+#> $ `s(year_fac).7`  <dbl> 1.99033, 1.95772, 1.98093, 2.01777, 2.04849, 1.97815,…
+#> $ `s(year_fac).8`  <dbl> 3.01204, 2.91291, 3.14762, 2.83082, 2.90250, 3.04050,…
+#> $ `s(year_fac).9`  <dbl> 3.22248, 3.20205, 3.30373, 3.23181, 3.24927, 3.25232,…
+#> $ `s(year_fac).10` <dbl> 2.71922, 2.62225, 2.82574, 2.65027, 2.69077, 2.75249,…
+#> $ `s(year_fac).11` <dbl> 3.10525, 3.03951, 3.12914, 3.03849, 3.01198, 3.14391,…
+#> $ `s(year_fac).12` <dbl> 3.20887, 3.23337, 3.24350, 3.16821, 3.23516, 3.18216,…
+#> $ `s(year_fac).13` <dbl> 2.18530, 2.15358, 2.39908, 2.21862, 2.14648, 2.17067,…
+#> $ `s(year_fac).14` <dbl> 2.66153, 2.67202, 2.64594, 2.57457, 2.38109, 2.44175,…
+#> $ `s(year_fac).15` <dbl> 2.24898, 2.24912, 2.03587, 2.33842, 2.27868, 2.24643,…
+#> $ `s(year_fac).16` <dbl> 2.20947, 2.21717, 2.03610, 2.17374, 2.16442, 2.14900,…
+#> $ `s(year_fac).17` <dbl> 0.1428430, 0.8005170, -0.0136294, 0.6880930, 0.192034…
+

With any model fitted in mvgam, the underlying +Stan code can be viewed using the code +function:

+
code(model1)
+#> // Stan model code generated by package mvgam
+#> data {
+#>   int<lower=0> total_obs; // total number of observations
+#>   int<lower=0> n; // number of timepoints per series
+#>   int<lower=0> n_series; // number of series
+#>   int<lower=0> num_basis; // total number of basis coefficients
+#>   matrix[total_obs, num_basis] X; // mgcv GAM design matrix
+#>   array[n, n_series] int<lower=0> ytimes; // time-ordered matrix (which col in X belongs to each [time, series] observation?)
+#>   int<lower=0> n_nonmissing; // number of nonmissing observations
+#>   array[n_nonmissing] int<lower=0> flat_ys; // flattened nonmissing observations
+#>   matrix[n_nonmissing, num_basis] flat_xs; // X values for nonmissing observations
+#>   array[n_nonmissing] int<lower=0> obs_ind; // indices of nonmissing observations
+#> }
+#> parameters {
+#>   // raw basis coefficients
+#>   vector[num_basis] b_raw;
+#>   
+#>   // random effect variances
+#>   vector<lower=0>[1] sigma_raw;
+#>   
+#>   // random effect means
+#>   vector[1] mu_raw;
+#> }
+#> transformed parameters {
+#>   // basis coefficients
+#>   vector[num_basis] b;
+#>   b[1 : 17] = mu_raw[1] + b_raw[1 : 17] * sigma_raw[1];
+#> }
+#> model {
+#>   // prior for random effect population variances
+#>   sigma_raw ~ student_t(3, 0, 2.5);
+#>   
+#>   // prior for random effect population means
+#>   mu_raw ~ std_normal();
+#>   
+#>   // prior (non-centred) for s(year_fac)...
+#>   b_raw[1 : 17] ~ std_normal();
+#>   {
+#>     // likelihood functions
+#>     flat_ys ~ poisson_log_glm(flat_xs, 0.0, b);
+#>   }
+#> }
+#> generated quantities {
+#>   vector[total_obs] eta;
+#>   matrix[n, n_series] mus;
+#>   array[n, n_series] int ypred;
+#>   
+#>   // posterior predictions
+#>   eta = X * b;
+#>   for (s in 1 : n_series) {
+#>     mus[1 : n, s] = eta[ytimes[1 : n, s]];
+#>     ypred[1 : n, s] = poisson_log_rng(mus[1 : n, s]);
+#>   }
+#> }
+
+

Plotting effects and residuals

+

Now for interrogating the model. We can get some sense of the +variation in yearly intercepts from the summary above, but it is easier +to understand them using targeted plots. Plot posterior distributions of +the temporal random effects using plot.mvgam with +type = 're'. See ?plot.mvgam for more details +about the types of plots that can be produced from fitted +mvgam objects

+
plot(model1, type = 're')
+

+
+
+

bayesplot support

+

We can also capitalize on most of the useful MCMC plotting functions +from the bayesplot package to visualize posterior +distributions and diagnostics (see ?mvgam::mcmc_plot.mvgam +for details):

+
mcmc_plot(object = model1,
+          variable = 'betas',
+          type = 'areas')
+

+

We can also use the wide range of posterior checking functions available in bayesplot (see ?mvgam::pp_check.mvgam for details):

+
pp_check(object = model1)
+#> Using 10 posterior draws for ppc type 'dens_overlay' by default.
+#> Warning in pp_check.mvgam(object = model1): NA responses are not shown in
+#> 'pp_check'.
+

+
pp_check(model1, type = "rootogram")
+#> Using all posterior draws for ppc type 'rootogram' by default.
+#> Warning in pp_check.mvgam(model1, type = "rootogram"): NA responses are not
+#> shown in 'pp_check'.
+

+

There is clearly some variation in these yearly intercept estimates. +But how do these translate into time-varying predictions? To understand +this, we can plot posterior hindcasts from this model for the training +period using plot.mvgam with +type = 'forecast'

+
plot(model1, type = 'forecast')
+

+

If you wish to extract these hindcasts for other downstream analyses, +the hindcast function can be used. This will return a list +object of class mvgam_forecast. In the +hindcasts slot, a matrix of posterior retrodictions will be +returned for each series in the data (only one series in our +example):

+
hc <- hindcast(model1)
+str(hc)
+#> List of 15
+#>  $ call              :Class 'formula'  language count ~ s(year_fac, bs = "re") - 1
+#>   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
+#>  $ trend_call        : NULL
+#>  $ family            : chr "poisson"
+#>  $ trend_model       : chr "None"
+#>  $ drift             : logi FALSE
+#>  $ use_lv            : logi FALSE
+#>  $ fit_engine        : chr "stan"
+#>  $ type              : chr "response"
+#>  $ series_names      : chr "PP"
+#>  $ train_observations:List of 1
+#>   ..$ PP: int [1:199] 0 1 2 NA 10 NA NA 16 18 12 ...
+#>  $ train_times       : num [1:199] 1 2 3 4 5 6 7 8 9 10 ...
+#>  $ test_observations : NULL
+#>  $ test_times        : NULL
+#>  $ hindcasts         :List of 1
+#>   ..$ PP: num [1:2000, 1:199] 9 6 10 6 12 8 10 5 8 6 ...
+#>   .. ..- attr(*, "dimnames")=List of 2
+#>   .. .. ..$ : NULL
+#>   .. .. ..$ : chr [1:199] "ypred[1,1]" "ypred[2,1]" "ypred[3,1]" "ypred[4,1]" ...
+#>  $ forecasts         : NULL
+#>  - attr(*, "class")= chr "mvgam_forecast"
+

You can also extract these hindcasts on the linear predictor scale, +which in this case is the log scale (our Poisson GLM used a log link +function). Sometimes this can be useful for asking more targeted +questions about drivers of variation:

+
hc <- hindcast(model1, type = 'link')
+range(hc$hindcasts$PP)
+#> [1] -1.51216  3.46162
+

Objects of class mvgam_forecast have an associated plot +function as well:

+
plot(hc)
+

+

This plot can look a bit confusing as it seems like there is linear +interpolation from the end of one year to the start of the next. But +this is just due to the way the lines are automatically connected in +base plots
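If this bothers you, one workaround (a sketch, not part of the vignette's workflow) is to summarise the posterior hindcasts yourself and plot the medians as points:

```{r eval=FALSE}
# Sketch: posterior median hindcasts plotted as points, avoiding the
# automatic line connections of the base plot method
med_hc <- apply(hc$hindcasts$PP, 2, median)
plot(hc$train_times, med_hc, pch = 16,
     xlab = 'Time', ylab = 'Median hindcast (link scale)')
```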

+

In any regression analysis, a key question is whether the residuals +show any patterns that can be indicative of un-modelled sources of +variation. For GLMs, we can use a modified residual called the Dunn-Smyth, +or randomized quantile, residual. Inspect Dunn-Smyth residuals from +the model using plot.mvgam with +type = 'residuals'

+
plot(model1, type = 'residuals')
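For intuition, the Dunn-Smyth residual for a single discrete observation can be sketched as follows (the observed value and fitted mean here are invented for illustration):

```{r eval=FALSE}
# Sketch of the Dunn-Smyth (randomized quantile) idea for one Poisson
# observation: randomise within the discrete step of the CDF, then map
# the result through the standard normal quantile function
y_obs <- 4
mu_hat <- 6
a <- ppois(y_obs - 1, lambda = mu_hat)  # CDF just below the observation
b <- ppois(y_obs, lambda = mu_hat)      # CDF at the observation
qnorm(runif(1, min = a, max = b))       # ~ N(0, 1) if the model fits well
```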
+

+
+
+
+

Automatic forecasting for new data

+

These temporal random effects do not have a sense of “time”. Because of this, each yearly random intercept is not restricted in some way to be similar to the previous yearly intercept. This drawback becomes evident when we predict for a new year. To do this, we can repeat the exercise above, but this time we will split the data into training and testing sets before re-running the model. We can then supply the test set as newdata. For splitting, we will make use of the filter function from dplyr

+
model_data %>% 
+  dplyr::filter(time <= 160) -> data_train 
+model_data %>% 
+  dplyr::filter(time > 160) -> data_test
+
model1b <- mvgam(count ~ s(year_fac, bs = 're') - 1,
+                family = poisson(),
+                data = data_train,
+                newdata = data_test)
+

Repeating the plots above gives insight into how the model’s +hierarchical prior formulation provides all the structure needed to +sample values for un-modelled years

+
plot(model1b, type = 're')
+

+
plot(model1b, type = 'forecast')
+

+
#> Out of sample DRPS:
+#> [1] 182.6177
+

We can also view the test data in the forecast plot to see that the +predictions do not capture the temporal variation in the test set

+
plot(model1b, type = 'forecast', newdata = data_test)
+

+
#> Out of sample DRPS:
+#> [1] 182.6177
+

As with the hindcast function, we can use the +forecast function to automatically extract the posterior +distributions for these predictions. This also returns an object of +class mvgam_forecast, but now it will contain both the +hindcasts and forecasts for each series in the data:

+
fc <- forecast(model1b)
+str(fc)
+#> List of 16
+#>  $ call              :Class 'formula'  language count ~ s(year_fac, bs = "re") - 1
+#>   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
+#>  $ trend_call        : NULL
+#>  $ family            : chr "poisson"
+#>  $ family_pars       : NULL
+#>  $ trend_model       : chr "None"
+#>  $ drift             : logi FALSE
+#>  $ use_lv            : logi FALSE
+#>  $ fit_engine        : chr "stan"
+#>  $ type              : chr "response"
+#>  $ series_names      : Factor w/ 1 level "PP": 1
+#>  $ train_observations:List of 1
+#>   ..$ PP: int [1:160] 0 1 2 NA 10 NA NA 16 18 12 ...
+#>  $ train_times       : num [1:160] 1 2 3 4 5 6 7 8 9 10 ...
+#>  $ test_observations :List of 1
+#>   ..$ PP: int [1:39] NA 0 0 10 3 14 18 NA 28 46 ...
+#>  $ test_times        : num [1:39] 161 162 163 164 165 166 167 168 169 170 ...
+#>  $ hindcasts         :List of 1
+#>   ..$ PP: num [1:2000, 1:160] 10 7 8 11 6 11 9 11 5 2 ...
+#>   .. ..- attr(*, "dimnames")=List of 2
+#>   .. .. ..$ : NULL
+#>   .. .. ..$ : chr [1:160] "ypred[1,1]" "ypred[2,1]" "ypred[3,1]" "ypred[4,1]" ...
+#>  $ forecasts         :List of 1
+#>   ..$ PP: num [1:2000, 1:39] 5 12 10 7 7 8 11 14 8 12 ...
+#>   .. ..- attr(*, "dimnames")=List of 2
+#>   .. .. ..$ : NULL
+#>   .. .. ..$ : chr [1:39] "ypred[161,1]" "ypred[162,1]" "ypred[163,1]" "ypred[164,1]" ...
+#>  - attr(*, "class")= chr "mvgam_forecast"
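The forecast distribution can then be compared against the held-out test observations using a proper scoring rule; as a sketch (the same score function is used for model comparison later in this vignette):

```{r eval=FALSE}
# Sketch: compute the Discrete Rank Probability Score (DRPS) for each
# out-of-sample timepoint in the forecast object
fc_scores <- score(fc, score = 'drps')
str(fc_scores$PP)
```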
+
+
+

Adding predictors as “fixed” effects

+

Any users familiar with GLMs will know that we nearly always wish to +include predictor variables that may explain some of the variation in +our observations. Predictors are easily incorporated into GLMs / GAMs. +Here, we will update the model from above by including a parametric +(fixed) effect of ndvi as a linear predictor:

+
model2 <- mvgam(count ~ s(year_fac, bs = 're') + 
+                  ndvi - 1,
+                family = poisson(),
+                data = data_train,
+                newdata = data_test)
+

The model can be described mathematically as follows: \[\begin{align*} +\boldsymbol{count}_t & \sim \text{Poisson}(\lambda_t) \\ +log(\lambda_t) & = \beta_{year[year_t]} + \beta_{ndvi} * +\boldsymbol{ndvi}_t \\ +\beta_{year} & \sim \text{Normal}(\mu_{year}, \sigma_{year}) \\ +\beta_{ndvi} & \sim \text{Normal}(0, 1) \end{align*}\]

+

Where the \(\beta_{year}\) effects +are the same as before but we now have another predictor \((\beta_{ndvi})\) that applies to the +ndvi value at each timepoint \(t\). Inspect the summary of this model

+
summary(model2)
+#> GAM formula:
+#> count ~ ndvi + s(year_fac, bs = "re") - 1
+#> 
+#> Family:
+#> poisson
+#> 
+#> Link function:
+#> log
+#> 
+#> Trend model:
+#> None
+#> 
+#> N series:
+#> 1 
+#> 
+#> N timepoints:
+#> 160 
+#> 
+#> Status:
+#> Fitted using Stan 
+#> 4 chains, each with iter = 1000; warmup = 500; thin = 1 
+#> Total post-warmup draws = 2000
+#> 
+#> 
+#> GAM coefficient (beta) estimates:
+#>                2.5%  50% 97.5% Rhat n_eff
+#> ndvi           0.32 0.39  0.46    1  1696
+#> s(year_fac).1  1.10 1.40  1.70    1  2512
+#> s(year_fac).2  1.80 2.00  2.20    1  2210
+#> s(year_fac).3  2.20 2.40  2.60    1  2109
+#> s(year_fac).4  2.30 2.50  2.70    1  1780
+#> s(year_fac).5  1.20 1.40  1.60    1  2257
+#> s(year_fac).6  1.00 1.30  1.50    1  2827
+#> s(year_fac).7  1.10 1.40  1.70    1  2492
+#> s(year_fac).8  2.10 2.30  2.50    1  2188
+#> s(year_fac).9  2.70 2.90  3.00    1  2014
+#> s(year_fac).10 2.00 2.20  2.40    1  2090
+#> s(year_fac).11 2.30 2.40  2.60    1  1675
+#> s(year_fac).12 2.50 2.70  2.80    1  2108
+#> s(year_fac).13 1.40 1.60  1.80    1  2161
+#> s(year_fac).14 0.46 2.00  3.20    1  1849
+#> s(year_fac).15 0.53 2.00  3.30    1  1731
+#> s(year_fac).16 0.53 2.00  3.30    1  1859
+#> s(year_fac).17 0.59 1.90  3.20    1  1761
+#> 
+#> GAM group-level estimates:
+#>                   2.5%  50% 97.5% Rhat n_eff
+#> mean(s(year_fac))  1.6 2.00   2.3 1.01   397
+#> sd(s(year_fac))    0.4 0.59   1.0 1.01   395
+#> 
+#> Approximate significance of GAM smooths:
+#>              edf Ref.df Chi.sq p-value    
+#> s(year_fac) 11.2     17   3096  <2e-16 ***
+#> ---
+#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+#> 
+#> Stan MCMC diagnostics:
+#> n_eff / iter looks reasonable for all parameters
+#> Rhat looks reasonable for all parameters
+#> 0 of 2000 iterations ended with a divergence (0%)
+#> 0 of 2000 iterations saturated the maximum tree depth of 12 (0%)
+#> E-FMI indicated no pathological behavior
+#> 
+#> Samples were drawn using NUTS(diag_e) at Tue Apr 16 1:00:50 PM 2024.
+#> For each parameter, n_eff is a crude measure of effective sample size,
+#> and Rhat is the potential scale reduction factor on split MCMC chains
+#> (at convergence, Rhat = 1)
+

Rather than printing the summary each time, we can also quickly look +at the posterior empirical quantiles for the fixed effect of +ndvi (and other linear predictor coefficients) using +coef:

+
coef(model2)
+#>                     2.5%       50%     97.5% Rhat n_eff
+#> ndvi           0.3198694 0.3899835 0.4571083    1  1696
+#> s(year_fac).1  1.1176373 1.4085900 1.6603838    1  2512
+#> s(year_fac).2  1.8008470 2.0005000 2.2003670    1  2210
+#> s(year_fac).3  2.1842727 2.3822950 2.5699363    1  2109
+#> s(year_fac).4  2.3267037 2.5022700 2.6847912    1  1780
+#> s(year_fac).5  1.1945853 1.4215950 1.6492038    1  2257
+#> s(year_fac).6  1.0332160 1.2743050 1.5091052    1  2827
+#> s(year_fac).7  1.1467567 1.4119100 1.6751850    1  2492
+#> s(year_fac).8  2.0710285 2.2713050 2.4596285    1  2188
+#> s(year_fac).9  2.7198967 2.8557300 2.9874662    1  2014
+#> s(year_fac).10 1.9798730 2.1799600 2.3932595    1  2090
+#> s(year_fac).11 2.2734940 2.4374700 2.6130482    1  1675
+#> s(year_fac).12 2.5421157 2.6935350 2.8431822    1  2108
+#> s(year_fac).13 1.3786087 1.6177850 1.8495872    1  2161
+#> s(year_fac).14 0.4621041 1.9744700 3.2480377    1  1849
+#> s(year_fac).15 0.5293684 2.0014200 3.2766722    1  1731
+#> s(year_fac).16 0.5285142 1.9786450 3.2859085    1  1859
+#> s(year_fac).17 0.5909969 1.9462850 3.2306940    1  1761
+

Look at the estimated effect of ndvi using +plot.mvgam with type = 'pterms'

+
plot(model2, type = 'pterms')
+

+

This plot indicates a positive linear effect of ndvi on +log(counts). But it may be easier to visualise using a +histogram, especially for parametric (linear) effects. This can be done +by first extracting the posterior coefficients as we did in the first +example:

+
beta_post <- as.data.frame(model2, variable = 'betas')
+dplyr::glimpse(beta_post)
+#> Rows: 2,000
+#> Columns: 18
+#> $ ndvi             <dbl> 0.330568, 0.398734, 0.357498, 0.484288, 0.380087, 0.3…
+#> $ `s(year_fac).1`  <dbl> 1.55868, 1.27949, 1.24414, 1.02997, 1.64712, 1.07519,…
+#> $ `s(year_fac).2`  <dbl> 1.98967, 2.00846, 2.07493, 1.84431, 2.01590, 2.16466,…
+#> $ `s(year_fac).3`  <dbl> 2.41434, 2.16020, 2.67324, 2.33332, 2.32415, 2.45516,…
+#> $ `s(year_fac).4`  <dbl> 2.62215, 2.53992, 2.50659, 2.23671, 2.56663, 2.40054,…
+#> $ `s(year_fac).5`  <dbl> 1.37221, 1.44795, 1.53019, 1.27623, 1.50771, 1.49515,…
+#> $ `s(year_fac).6`  <dbl> 1.323980, 1.220200, 1.165610, 1.271620, 1.193820, 1.3…
+#> $ `s(year_fac).7`  <dbl> 1.52005, 1.30735, 1.42566, 1.13335, 1.61581, 1.31853,…
+#> $ `s(year_fac).8`  <dbl> 2.40223, 2.20021, 2.44366, 2.17192, 2.20837, 2.33066,…
+#> $ `s(year_fac).9`  <dbl> 2.91580, 2.90942, 2.87679, 2.64941, 2.85401, 2.78744,…
+#> $ `s(year_fac).10` <dbl> 2.46559, 2.01466, 2.08319, 2.01400, 2.22965, 2.26523,…
+#> $ `s(year_fac).11` <dbl> 2.52118, 2.45406, 2.46667, 2.20664, 2.42495, 2.46256,…
+#> $ `s(year_fac).12` <dbl> 2.72360, 2.63546, 2.86718, 2.59035, 2.76576, 2.56130,…
+#> $ `s(year_fac).13` <dbl> 1.67388, 1.50790, 1.52463, 1.39004, 1.72927, 1.61023,…
+#> $ `s(year_fac).14` <dbl> 2.583650, 2.034240, 1.819820, 1.579280, 2.426880, 1.8…
+#> $ `s(year_fac).15` <dbl> 2.57365, 2.28723, 1.67404, 1.46796, 2.49512, 2.71230,…
+#> $ `s(year_fac).16` <dbl> 1.801660, 2.185540, 1.756500, 2.098760, 2.270640, 1.8…
+#> $ `s(year_fac).17` <dbl> 0.886081, 3.409300, -0.371795, 2.494990, 1.822150, 2.…
+

The posterior distribution for the effect of ndvi is +stored in the ndvi column. A quick histogram confirms our +inference that log(counts) respond positively to increases +in ndvi:

+
hist(beta_post$ndvi,
+     xlim = c(-1 * max(abs(beta_post$ndvi)),
+              max(abs(beta_post$ndvi))),
+     col = 'darkred',
+     border = 'white',
+     xlab = expression(beta[NDVI]),
+     ylab = '',
+     yaxt = 'n',
+     main = '',
+     lwd = 2)
+abline(v = 0, lwd = 2.5)
+

+
+

marginaleffects support

+

Given our model used a nonlinear link function (log link in this +example), it can still be difficult to fully understand what +relationship our model is estimating between a predictor and the +response. Fortunately, the marginaleffects package makes +this relatively straightforward. Objects of class mvgam can +be used with marginaleffects to inspect contrasts, +scenario-based predictions, conditional and marginal effects, all on the +outcome scale. Here we will use the plot_predictions +function from marginaleffects to inspect the conditional +effect of ndvi (use ?plot_predictions for +guidance on how to modify these plots):

+
plot_predictions(model2, 
+                 condition = "ndvi",
+                 # include the observed count values
+                 # as points, and show rugs for the observed
+                 # ndvi and count values on the axes
+                 points = 0.5, rug = TRUE)
+

+

Now it is easier to get a sense of the nonlinear but positive +relationship estimated between ndvi and count. +Like brms, mvgam has the simple +conditional_effects function to make quick and informative +plots for main effects. This will likely be your go-to function for +quickly understanding patterns from fitted mvgam models

+
plot(conditional_effects(model2), ask = FALSE)
+

+
+
+
+

Adding predictors as smooths

+

Smooth functions, using penalized splines, are a major feature of mvgam. Nonlinear splines are commonly viewed as variations of random effects in which the coefficients that control the shape of the spline are drawn from a joint, penalized distribution. This strategy is very often used in ecological time series analysis to capture smooth temporal variation in the processes we seek to study. When we construct smoothing splines, the workhorse package mgcv will calculate a set of basis functions that will collectively control the shape and complexity of the resulting spline. It is often helpful to visualize these basis functions to get a better sense of how splines work. We’ll create a set of 6 basis functions to represent possible variation in the effect of time on our outcome. In addition to constructing the basis functions, mgcv also creates a penalty matrix \(S\), which contains known coefficients that work to constrain the wiggliness of the resulting smooth function. When fitting a GAM to data, we must estimate the smoothing parameters (\(\lambda\)) that will penalize these matrices, resulting in constrained basis coefficients and smoother functions that are less likely to overfit the data. This is the key to fitting GAMs in a Bayesian framework, as we can jointly estimate the \(\lambda\)’s using informative priors to prevent overfitting and expand the complexity of models we can tackle. To see this in practice, we can now fit a model that replaces the yearly random effects with a smooth function of time. We will need a reasonably complex function (large k) to try and accommodate the temporal variation in our observations. Following some useful advice by Gavin Simpson, we will use a b-spline basis for the temporal smooth. Because we no longer have intercepts for each year, we also retain the primary intercept term in this model (there is no -1 in the formula now):

+
model3 <- mvgam(count ~ s(time, bs = 'bs', k = 15) + 
+                  ndvi,
+                family = poisson(),
+                data = data_train,
+                newdata = data_test)
+

The model can be described mathematically as follows: \[\begin{align*} +\boldsymbol{count}_t & \sim \text{Poisson}(\lambda_t) \\ +log(\lambda_t) & = f(\boldsymbol{time})_t + \beta_{ndvi} * +\boldsymbol{ndvi}_t \\ +f(\boldsymbol{time}) & = \sum_{k=1}^{K}b * \beta_{smooth} \\ +\beta_{smooth} & \sim \text{MVNormal}(0, (\Omega * \lambda)^{-1}) \\ +\beta_{ndvi} & \sim \text{Normal}(0, 1) \end{align*}\]

+

Where the smooth function \(f_{time}\) is built by summing across a set of weighted basis functions. The basis functions \((b)\) are constructed using a b-spline basis in mgcv (as specified with bs = 'bs' in the formula). The weights \((\beta_{smooth})\) are drawn from a penalized multivariate normal distribution where the precision matrix \((\Omega)\) is multiplied by a smoothing penalty \((\lambda)\). If \(\lambda\) becomes large, this acts to squeeze the covariances among the weights \((\beta_{smooth})\), leading to a less wiggly spline. Note that sometimes there are multiple smoothing penalties that contribute to the covariance matrix, but I am only showing one here for simplicity. View the summary as before:

+
summary(model3)
+#> GAM formula:
+#> count ~ s(time, bs = "bs", k = 15) + ndvi
+#> 
+#> Family:
+#> poisson
+#> 
+#> Link function:
+#> log
+#> 
+#> Trend model:
+#> None
+#> 
+#> N series:
+#> 1 
+#> 
+#> N timepoints:
+#> 160 
+#> 
+#> Status:
+#> Fitted using Stan 
+#> 4 chains, each with iter = 1000; warmup = 500; thin = 1 
+#> Total post-warmup draws = 2000
+#> 
+#> 
+#> GAM coefficient (beta) estimates:
+#>              2.5%   50%  97.5% Rhat n_eff
+#> (Intercept)  2.00  2.10  2.200 1.00   903
+#> ndvi         0.26  0.33  0.390 1.00   942
+#> s(time).1   -2.10 -1.10  0.029 1.01   484
+#> s(time).2    0.45  1.30  2.400 1.01   411
+#> s(time).3   -0.43  0.45  1.500 1.02   347
+#> s(time).4    1.60  2.50  3.600 1.02   342
+#> s(time).5   -1.10 -0.22  0.880 1.02   375
+#> s(time).6   -0.53  0.36  1.600 1.01   352
+#> s(time).7   -1.50 -0.51  0.560 1.01   406
+#> s(time).8    0.63  1.50  2.600 1.02   340
+#> s(time).9    1.20  2.10  3.200 1.02   346
+#> s(time).10  -0.34  0.54  1.600 1.01   364
+#> s(time).11   0.92  1.80  2.900 1.02   332
+#> s(time).12   0.67  1.50  2.500 1.01   398
+#> s(time).13  -1.20 -0.32  0.700 1.01   420
+#> s(time).14  -7.90 -4.20 -1.200 1.01   414
+#> 
+#> Approximate significance of GAM smooths:
+#>          edf Ref.df Chi.sq p-value    
+#> s(time) 9.41     14    790  <2e-16 ***
+#> ---
+#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+#> 
+#> Stan MCMC diagnostics:
+#> n_eff / iter looks reasonable for all parameters
+#> Rhat looks reasonable for all parameters
+#> 0 of 2000 iterations ended with a divergence (0%)
+#> 0 of 2000 iterations saturated the maximum tree depth of 12 (0%)
+#> E-FMI indicated no pathological behavior
+#> 
+#> Samples were drawn using NUTS(diag_e) at Tue Apr 16 1:01:29 PM 2024.
+#> For each parameter, n_eff is a crude measure of effective sample size,
+#> and Rhat is the potential scale reduction factor on split MCMC chains
+#> (at convergence, Rhat = 1)
+

The summary above now contains posterior estimates for the smoothing +parameters as well as the basis coefficients for the nonlinear effect of +time. We can visualize the conditional time +effect using the plot function with +type = 'smooths':

+
plot(model3, type = 'smooths')
+

+

By default this plot shows posterior empirical quantiles, but it can also be helpful to view some realisations of the underlying function (here, each line is a different potential curve drawn from the posterior of all possible curves):

+
plot(model3, type = 'smooths', realisations = TRUE,
+     n_realisations = 30)
+

+
+

Derivatives of smooths

+

A useful question when modelling with GAMs is where the function is changing most rapidly. To address this, we can plot the estimated first derivatives of the spline:

+
plot(model3, type = 'smooths', derivatives = TRUE)
+

+

Here, values >0 indicate the function was increasing at that timepoint, while values <0 indicate the function was declining. The most rapid declines appear to have happened around timepoint 50 and again toward the end of the training period, for example.
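
First derivatives can also be approximated manually by differencing posterior draws of the linear predictor over the training timepoints. The following is a minimal sketch, not part of the vignette workflow, assuming posterior_linpred() accepts newdata and returns a draws x timepoints matrix for mvgam objects:

+
# hold ndvi at zero so only the temporal smooth (plus intercept) varies
+newd <- data.frame(time = 1:max(data_train$time), ndvi = 0)
+linpreds <- posterior_linpred(model3, newdata = newd)
+# difference successive timepoints within each posterior draw
+derivs <- t(apply(linpreds, 1, diff))
+# posterior median and 90% interval of the slope at the first timepoints
+round(apply(derivs, 2, quantile, probs = c(0.05, 0.5, 0.95))[, 1:5], 2)
+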

+

Use conditional_effects again for useful plots on the +outcome scale:

+
plot(conditional_effects(model3), ask = FALSE)
+

+

Or on the link scale:

+
plot(conditional_effects(model3, type = 'link'), ask = FALSE)
+

+

Inspect the underlying Stan code to gain some idea of +how the spline is being penalized:

+
code(model3)
+#> // Stan model code generated by package mvgam
+#> data {
+#>   int<lower=0> total_obs; // total number of observations
+#>   int<lower=0> n; // number of timepoints per series
+#>   int<lower=0> n_sp; // number of smoothing parameters
+#>   int<lower=0> n_series; // number of series
+#>   int<lower=0> num_basis; // total number of basis coefficients
+#>   vector[num_basis] zero; // prior locations for basis coefficients
+#>   matrix[total_obs, num_basis] X; // mgcv GAM design matrix
+#>   array[n, n_series] int<lower=0> ytimes; // time-ordered matrix (which col in X belongs to each [time, series] observation?)
+#>   matrix[14, 28] S1; // mgcv smooth penalty matrix S1
+#>   int<lower=0> n_nonmissing; // number of nonmissing observations
+#>   array[n_nonmissing] int<lower=0> flat_ys; // flattened nonmissing observations
+#>   matrix[n_nonmissing, num_basis] flat_xs; // X values for nonmissing observations
+#>   array[n_nonmissing] int<lower=0> obs_ind; // indices of nonmissing observations
+#> }
+#> parameters {
+#>   // raw basis coefficients
+#>   vector[num_basis] b_raw;
+#>   
+#>   // smoothing parameters
+#>   vector<lower=0>[n_sp] lambda;
+#> }
+#> transformed parameters {
+#>   // basis coefficients
+#>   vector[num_basis] b;
+#>   b[1 : num_basis] = b_raw[1 : num_basis];
+#> }
+#> model {
+#>   // prior for (Intercept)...
+#>   b_raw[1] ~ student_t(3, 2.6, 2.5);
+#>   
+#>   // prior for ndvi...
+#>   b_raw[2] ~ student_t(3, 0, 2);
+#>   
+#>   // prior for s(time)...
+#>   b_raw[3 : 16] ~ multi_normal_prec(zero[3 : 16],
+#>                                     S1[1 : 14, 1 : 14] * lambda[1]
+#>                                     + S1[1 : 14, 15 : 28] * lambda[2]);
+#>   
+#>   // priors for smoothing parameters
+#>   lambda ~ normal(5, 30);
+#>   {
+#>     // likelihood functions
+#>     flat_ys ~ poisson_log_glm(flat_xs, 0.0, b);
+#>   }
+#> }
+#> generated quantities {
+#>   vector[total_obs] eta;
+#>   matrix[n, n_series] mus;
+#>   vector[n_sp] rho;
+#>   array[n, n_series] int ypred;
+#>   rho = log(lambda);
+#>   
+#>   // posterior predictions
+#>   eta = X * b;
+#>   for (s in 1 : n_series) {
+#>     mus[1 : n, s] = eta[ytimes[1 : n, s]];
+#>     ypred[1 : n, s] = poisson_log_rng(mus[1 : n, s]);
+#>   }
+#> }
+

The line below // prior for s(time)... shows how the spline basis coefficients are drawn from a zero-centred multivariate normal distribution. The penalty matrix \(S1\) (stored here as two adjacent 14 x 14 blocks) is multiplied by two different smoothing parameters (the \(\lambda\)'s) to form the prior precision matrix, which enforces smoothness and reduces overfitting.
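
The basis functions and penalty matrices described above can also be inspected directly with mgcv, outside of any model fitting. This is a hedged sketch, not part of the vignette; m = c(3, 2, 1) is an assumption chosen to reproduce the two penalty components (and hence the two \(\lambda\)'s) seen in the Stan code:

+
library(mgcv)
+# construct the b-spline basis over the training timepoints
+sm <- smoothCon(s(time, bs = 'bs', k = 15, m = c(3, 2, 1)),
+                data = data.frame(time = 1:160),
+                absorb.cons = TRUE)[[1]]
+matplot(sm$X, type = 'l')  # the basis functions (14 after constraints)
+length(sm$S)               # two penalty matrices, one per lambda
+# the implied prior precision of the basis coefficients
+lambda <- c(5, 5)          # example smoothing parameter values
+Omega <- sm$S[[1]] * lambda[1] + sm$S[[2]] * lambda[2]
+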

+
+
+
+

Latent dynamics in mvgam

+

Forecasts from the above model are not ideal:

+
plot(model3, type = 'forecast', newdata = data_test)
+

+
#> Out of sample DRPS:
+#> [1] 288.3844
+

Why is this happening? The forecasts are driven almost entirely by variation in the temporal spline, which is extrapolating linearly forever beyond the edge of the training data. Any slight wiggles near the end of the training set will result in wildly different forecasts. To visualize this, we can plot the extrapolated temporal function into the out-of-sample test set. Here is the extrapolated function for model 3, which used 15 basis functions:

+
plot_mvgam_smooth(model3, smooth = 's(time)',
+                  # feed newdata to the plot function to generate
+                  # predictions of the temporal smooth to the end of the 
+                  # testing period
+                  newdata = data.frame(time = 1:max(data_test$time),
+                                       ndvi = 0))
+abline(v = max(data_train$time), lty = 'dashed', lwd = 2)
+

+

This model is not doing well. Clearly we need to somehow account for +the strong temporal autocorrelation when modelling these data without +using a smooth function of time. Now onto another prominent +feature of mvgam: the ability to include (possibly latent) +autocorrelated residuals in regression models. To do so, we use the +trend_model argument (see ?mvgam_trends for +details of different dynamic trend models that are supported). This +model will use a separate sub-model for latent residuals that evolve as +an AR1 process (i.e. the error in the current time point is a function +of the error in the previous time point, plus some stochastic noise). We +also include a smooth function of ndvi in this model, +rather than the parametric term that was used above, to showcase that +mvgam can include combinations of smooths and dynamic +components:

+
model4 <- mvgam(count ~ s(ndvi, k = 6),
+                family = poisson(),
+                data = data_train,
+                newdata = data_test,
+                trend_model = 'AR1')
+

The model can be described mathematically as follows: \[\begin{align*} \boldsymbol{count}_t & \sim \text{Poisson}(\lambda_t) \\ log(\lambda_t) & = f(\boldsymbol{ndvi})_t + z_t \\ z_t & \sim \text{Normal}(ar1 * z_{t-1}, \sigma_{error}) \\ ar1 & \sim \text{Normal}(0, 1)[-1, 1] \\ \sigma_{error} & \sim \text{Exponential}(2) \\ f(\boldsymbol{ndvi}) & = \sum_{k=1}^{K}b_k * \beta_{smooth,k} \\ \beta_{smooth} & \sim \text{MVNormal}(0, (\Omega * \lambda)^{-1}) \end{align*}\]

+

Here the term \(z_t\) captures autocorrelated latent residuals, which are modelled using an AR1 process. Notice that this model estimates autocorrelated errors for the full time period, even though some of these timepoints have missing observations. This is useful for getting more realistic estimates of the residual autocorrelation parameters. Summarise the model to see how it now returns posterior summaries for the latent AR1 process:

+
summary(model4)
+#> GAM formula:
+#> count ~ s(ndvi, k = 6)
+#> 
+#> Family:
+#> poisson
+#> 
+#> Link function:
+#> log
+#> 
+#> Trend model:
+#> AR1
+#> 
+#> N series:
+#> 1 
+#> 
+#> N timepoints:
+#> 160 
+#> 
+#> Status:
+#> Fitted using Stan 
+#> 4 chains, each with iter = 1000; warmup = 500; thin = 1 
+#> Total post-warmup draws = 2000
+#> 
+#> 
+#> GAM coefficient (beta) estimates:
+#>               2.5%     50% 97.5% Rhat n_eff
+#> (Intercept)  1.200  1.9000 2.500 1.03    63
+#> s(ndvi).1   -0.066  0.0100 0.160 1.01   318
+#> s(ndvi).2   -0.110  0.0190 0.340 1.00   286
+#> s(ndvi).3   -0.048 -0.0019 0.051 1.00   560
+#> s(ndvi).4   -0.210  0.1200 1.500 1.01   198
+#> s(ndvi).5   -0.079  0.1500 0.360 1.01   350
+#> 
+#> Approximate significance of GAM smooths:
+#>          edf Ref.df Chi.sq p-value
+#> s(ndvi) 2.26      5   87.8     0.1
+#> 
+#> Latent trend parameter AR estimates:
+#>          2.5%  50% 97.5% Rhat n_eff
+#> ar1[1]   0.70 0.81  0.92 1.01   234
+#> sigma[1] 0.68 0.80  0.96 1.00   488
+#> 
+#> Stan MCMC diagnostics:
+#> n_eff / iter looks reasonable for all parameters
+#> Rhat looks reasonable for all parameters
+#> 0 of 2000 iterations ended with a divergence (0%)
+#> 0 of 2000 iterations saturated the maximum tree depth of 12 (0%)
+#> E-FMI indicated no pathological behavior
+#> 
+#> Samples were drawn using NUTS(diag_e) at Tue Apr 16 1:02:26 PM 2024.
+#> For each parameter, n_eff is a crude measure of effective sample size,
+#> and Rhat is the potential scale reduction factor on split MCMC chains
+#> (at convergence, Rhat = 1)
+

View conditional smooths for the ndvi effect:

+
plot_predictions(model4, 
+                 condition = "ndvi",
+                 points = 0.5, rug = TRUE)
+

+

View posterior hindcasts / forecasts and compare against the out-of-sample test data:

+
plot(model4, type = 'forecast', newdata = data_test)
+

+
#> Out of sample DRPS:
+#> [1] 150.5241
+

The trend is evolving as an AR1 process, which we can also view:

+
plot(model4, type = 'trend', newdata = data_test)
+

+
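
To build intuition for why these forecasts behave so much better, the latent AR1 process can be simulated forward by hand. A minimal illustrative sketch using the posterior medians reported above (ar1 ~ 0.81, sigma ~ 0.8); the fitted model propagates full posterior uncertainty rather than point estimates:

+
set.seed(1)
+h <- 25                # forecast horizon
+z <- numeric(h)
+z_last <- 0            # assumed final in-sample trend value
+for (t in seq_len(h)) {
+  z[t] <- rnorm(1, mean = 0.81 * z_last, sd = 0.8)
+  z_last <- z[t]
+}
+# one stochastic realization of the stationary, mean-reverting trend
+plot(z, type = 'l', xlab = 'Forecast horizon', ylab = 'Latent trend z')
+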

In-sample model performance can be interrogated using leave-one-out +cross-validation utilities from the loo package (a higher +value is preferred for this metric):

+
loo_compare(model3, model4)
+#> Warning: Some Pareto k diagnostic values are too high. See help('pareto-k-diagnostic') for details.
+
+#> Warning: Some Pareto k diagnostic values are too high. See help('pareto-k-diagnostic') for details.
+#>        elpd_diff se_diff
+#> model4    0.0       0.0 
+#> model3 -558.9      66.4
+

The higher expected log predictive density (ELPD) value for the dynamic model suggests it provides a better fit to the in-sample data.
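
A quick way to gauge whether this difference is meaningful is to compare it to its standard error. A small hedged sketch reusing the comparison object above:

+
comp <- loo_compare(model3, model4)
+# a rough z-score for the ELPD difference; magnitudes well beyond ~2
+# suggest the difference is unlikely to be noise
+comp[2, 'elpd_diff'] / comp[2, 'se_diff']
+#> roughly -8.4 given the values shown above
+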

+

Though it should be obvious that this model provides better forecasts, we can quantify forecast performance for models 3 and 4 using the forecast and score functions. Here we will compare the models based on their Discrete Ranked Probability Scores (a lower value is preferred for this metric):

+
fc_mod3 <- forecast(model3)
+fc_mod4 <- forecast(model4)
+score_mod3 <- score(fc_mod3, score = 'drps')
+score_mod4 <- score(fc_mod4, score = 'drps')
+sum(score_mod4$PP$score, na.rm = TRUE) - sum(score_mod3$PP$score, na.rm = TRUE)
+#> [1] -137.8603
+

A strongly negative value here indicates that the score for the dynamic model (model 4) is much smaller than the score for the model with a smooth function of time (model 3).
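
Because the forecasts cover multiple out-of-sample timepoints, the scores can also be compared horizon by horizon. A minimal sketch reusing the score objects above (each element of the score column corresponds to one out-of-sample timepoint):

+
plot(score_mod3$PP$score, type = 'b', col = 'darkblue',
+     xlab = 'Forecast horizon', ylab = 'DRPS')
+lines(score_mod4$PP$score, type = 'b', col = 'darkred')
+# the dynamic model (red) should score consistently lower,
+# particularly at short horizons
+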

+
+
+

Interested in contributing?

+

I’m actively seeking PhD students and other researchers to work in +the areas of ecological forecasting, multivariate model evaluation and +development of mvgam. Please reach out if you are +interested (n.clark’at’uq.edu.au)

+
+ + + + + + + + + + + diff --git a/inst/doc/nmixtures.R b/inst/doc/nmixtures.R new file mode 100644 index 00000000..a4e7a766 --- /dev/null +++ b/inst/doc/nmixtures.R @@ -0,0 +1,396 @@ +## ----echo = FALSE------------------------------------------------------------- +NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true") +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + purl = NOT_CRAN, + eval = NOT_CRAN +) + +## ----setup, include=FALSE----------------------------------------------------- +knitr::opts_chunk$set( + echo = TRUE, + dpi = 150, + fig.asp = 0.8, + fig.width = 6, + out.width = "60%", + fig.align = "center") +library(mvgam) +library(ggplot2) +library(dplyr) +# A custom ggplot2 theme +theme_set(theme_classic(base_size = 12, base_family = 'serif') + + theme(axis.line.x.bottom = element_line(colour = "black", + size = 1), + axis.line.y.left = element_line(colour = "black", + size = 1))) +options(ggplot2.discrete.colour = c("#A25050", + "#00008b", + 'darkred', + "#010048"), + ggplot2.discrete.fill = c("#A25050", + "#00008b", + 'darkred', + "#010048")) + +## ----------------------------------------------------------------------------- +set.seed(999) +# Simulate observations for species 1, which shows a declining trend and 0.7 detection probability +data.frame(site = 1, + # five replicates per year; six years + replicate = rep(1:5, 6), + time = sort(rep(1:6, 5)), + species = 'sp_1', + # true abundance declines nonlinearly + truth = c(rep(28, 5), + rep(26, 5), + rep(23, 5), + rep(16, 5), + rep(14, 5), + rep(14, 5)), + # observations are taken with detection prob = 0.7 + obs = c(rbinom(5, 28, 0.7), + rbinom(5, 26, 0.7), + rbinom(5, 23, 0.7), + rbinom(5, 15, 0.7), + rbinom(5, 14, 0.7), + rbinom(5, 14, 0.7))) %>% + # add 'series' information, which is an identifier of site, replicate and species + dplyr::mutate(series = paste0('site_', site, + '_', species, + '_rep_', replicate), + time = as.numeric(time), + # add a 'cap' variable that defines the maximum latent N to + # marginalize over when estimating latent abundance; in other words + # how large do we realistically think the true abundance could be? 
+ cap = 100) %>% + dplyr::select(- replicate) -> testdat + +# Now add another species that has a different temporal trend and a smaller +# detection probability (0.45 for this species) +testdat = testdat %>% + dplyr::bind_rows(data.frame(site = 1, + replicate = rep(1:5, 6), + time = sort(rep(1:6, 5)), + species = 'sp_2', + truth = c(rep(4, 5), + rep(7, 5), + rep(15, 5), + rep(16, 5), + rep(19, 5), + rep(18, 5)), + obs = c(rbinom(5, 4, 0.45), + rbinom(5, 7, 0.45), + rbinom(5, 15, 0.45), + rbinom(5, 16, 0.45), + rbinom(5, 19, 0.45), + rbinom(5, 18, 0.45))) %>% + dplyr::mutate(series = paste0('site_', site, + '_', species, + '_rep_', replicate), + time = as.numeric(time), + cap = 50) %>% + dplyr::select(-replicate)) + +## ----------------------------------------------------------------------------- +testdat$species <- factor(testdat$species, + levels = unique(testdat$species)) +testdat$series <- factor(testdat$series, + levels = unique(testdat$series)) + +## ----------------------------------------------------------------------------- +dplyr::glimpse(testdat) +head(testdat, 12) + +## ----------------------------------------------------------------------------- +testdat %>% + # each unique combination of site*species is a separate process + dplyr::mutate(trend = as.numeric(factor(paste0(site, species)))) %>% + dplyr::select(trend, series) %>% + dplyr::distinct() -> trend_map +trend_map + +## ----include = FALSE, results='hide'------------------------------------------ +mod <- mvgam( + # the observation formula sets up linear predictors for + # detection probability on the logit scale + formula = obs ~ species - 1, + + # the trend_formula sets up the linear predictors for + # the latent abundance processes on the log scale + trend_formula = ~ s(time, by = trend, k = 4) + species, + + # the trend_map takes care of the mapping + trend_map = trend_map, + + # nmix() family and data + family = nmix(), + data = testdat, + + # priors can be set in the usual way + priors = c(prior(std_normal(), class = b), + prior(normal(1, 1.5), class = Intercept_trend)), + samples = 1000) + +## ----eval = FALSE------------------------------------------------------------- +# mod <- mvgam( +# # the observation formula sets up linear predictors for +# # detection probability on the logit scale +# formula = obs ~ species - 1, +# +# # the trend_formula sets up the linear predictors for +# # the latent abundance processes on the log scale +# trend_formula = ~ s(time, by = trend, k = 4) + species, +# +# # the trend_map takes care of the mapping +# trend_map = trend_map, +# +# # nmix() family and data +# family = nmix(), +# data = testdat, +# +# # priors can be set in the usual way +# priors = c(prior(std_normal(), class = b), +# prior(normal(1, 1.5), class = Intercept_trend)), +# samples = 1000) + +## ----------------------------------------------------------------------------- +code(mod) + +## ----------------------------------------------------------------------------- +summary(mod) + +## ----------------------------------------------------------------------------- +loo(mod) + +## ----------------------------------------------------------------------------- +plot(mod, type = 'smooths', trend_effects = TRUE) + +## ----------------------------------------------------------------------------- +plot_predictions(mod, condition = 'species', + type = 'detection') + + ylab('Pr(detection)') + + ylim(c(0, 1)) + + theme_classic() + + theme(legend.position = 'none') + +## 
----------------------------------------------------------------------------- +hc <- hindcast(mod, type = 'latent_N') + +# Function to plot latent abundance estimates vs truth +plot_latentN = function(hindcasts, data, species = 'sp_1'){ + all_series <- unique(data %>% + dplyr::filter(species == !!species) %>% + dplyr::pull(series)) + + # Grab the first replicate that represents this series + # so we can get the true simulated values + series <- as.numeric(all_series[1]) + truths <- data %>% + dplyr::arrange(time, series) %>% + dplyr::filter(series == !!levels(data$series)[series]) %>% + dplyr::pull(truth) + + # In case some replicates have missing observations, + # pull out predictions for ALL replicates and average over them + hcs <- do.call(rbind, lapply(all_series, function(x){ + ind <- which(names(hindcasts$hindcasts) %in% as.character(x)) + hindcasts$hindcasts[[ind]] + })) + + # Calculate posterior empirical quantiles of predictions + pred_quantiles <- data.frame(t(apply(hcs, 2, function(x) + quantile(x, probs = c(0.05, 0.2, 0.3, 0.4, + 0.5, 0.6, 0.7, 0.8, 0.95))))) + pred_quantiles$time <- 1:NROW(pred_quantiles) + pred_quantiles$truth <- truths + + # Grab observations + data %>% + dplyr::filter(series %in% all_series) %>% + dplyr::select(time, obs) -> observations + + # Plot + ggplot(pred_quantiles, aes(x = time, group = 1)) + + geom_ribbon(aes(ymin = X5., ymax = X95.), fill = "#DCBCBC") + + geom_ribbon(aes(ymin = X30., ymax = X70.), fill = "#B97C7C") + + geom_line(aes(x = time, y = truth), + colour = 'black', linewidth = 1) + + geom_point(aes(x = time, y = truth), + shape = 21, colour = 'white', fill = 'black', + size = 2.5) + + geom_jitter(data = observations, aes(x = time, y = obs), + width = 0.06, + shape = 21, fill = 'darkred', colour = 'white', size = 2.5) + + labs(y = 'Latent abundance (N)', + x = 'Time', + title = species) +} + +## ----------------------------------------------------------------------------- +plot_latentN(hc, testdat, species = 'sp_1') +plot_latentN(hc, testdat, species = 'sp_2') + +## ----------------------------------------------------------------------------- +# Date link +load(url('https://github.com/doserjef/spAbundance/raw/main/data/dataNMixSim.rda')) +data.one.sp <- dataNMixSim + +# Pull out observations for one species +data.one.sp$y <- data.one.sp$y[1, , ] + +# Abundance covariates that don't change across repeat sampling observations +abund.cov <- dataNMixSim$abund.covs[, 1] +abund.factor <- as.factor(dataNMixSim$abund.covs[, 2]) + +# Detection covariates that can change across repeat sampling observations +# Note that `NA`s are not allowed for covariates in mvgam, so we randomly +# impute them here +det.cov <- dataNMixSim$det.covs$det.cov.1[,] +det.cov[is.na(det.cov)] <- rnorm(length(which(is.na(det.cov)))) +det.cov2 <- dataNMixSim$det.covs$det.cov.2 +det.cov2[is.na(det.cov2)] <- rnorm(length(which(is.na(det.cov2)))) + +## ----------------------------------------------------------------------------- +mod_data <- do.call(rbind, + lapply(1:NROW(data.one.sp$y), function(x){ + data.frame(y = data.one.sp$y[x,], + abund_cov = abund.cov[x], + abund_fac = abund.factor[x], + det_cov = det.cov[x,], + det_cov2 = det.cov2[x,], + replicate = 1:NCOL(data.one.sp$y), + site = paste0('site', x)) + })) %>% + dplyr::mutate(species = 'sp_1', + series = as.factor(paste0(site, '_', species, '_', replicate))) %>% + dplyr::mutate(site = factor(site, levels = unique(site)), + species = factor(species, levels = unique(species)), + time = 1, + cap = max(data.one.sp$y, na.rm 
= TRUE) + 20) + +## ----------------------------------------------------------------------------- +NROW(mod_data) +dplyr::glimpse(mod_data) +head(mod_data) + +## ----------------------------------------------------------------------------- +mod_data %>% + # each unique combination of site*species is a separate process + dplyr::mutate(trend = as.numeric(factor(paste0(site, species)))) %>% + dplyr::select(trend, series) %>% + dplyr::distinct() -> trend_map + +trend_map %>% + dplyr::arrange(trend) %>% + head(12) + +## ----include = FALSE, results='hide'------------------------------------------ +mod <- mvgam( + # effects of covariates on detection probability; + # here we use penalized splines for both continuous covariates + formula = y ~ s(det_cov, k = 3) + s(det_cov2, k = 3), + + # effects of the covariates on latent abundance; + # here we use a penalized spline for the continuous covariate and + # hierarchical intercepts for the factor covariate + trend_formula = ~ s(abund_cov, k = 3) + + s(abund_fac, bs = 're'), + + # link multiple observations to each site + trend_map = trend_map, + + # nmix() family and supplied data + family = nmix(), + data = mod_data, + + # standard normal priors on key regression parameters + priors = c(prior(std_normal(), class = 'b'), + prior(std_normal(), class = 'Intercept'), + prior(std_normal(), class = 'Intercept_trend'), + prior(std_normal(), class = 'sigma_raw_trend')), + + # use Stan's variational inference for quicker results + algorithm = 'meanfield', + + # no need to compute "series-level" residuals + residuals = FALSE, + samples = 1000) + +## ----eval=FALSE--------------------------------------------------------------- +# mod <- mvgam( +# # effects of covariates on detection probability; +# # here we use penalized splines for both continuous covariates +# formula = y ~ s(det_cov, k = 4) + s(det_cov2, k = 4), +# +# # effects of the covariates on latent abundance; +# # here we use a penalized spline for the continuous covariate and +# # hierarchical intercepts for the factor covariate +# trend_formula = ~ s(abund_cov, k = 4) + +# s(abund_fac, bs = 're'), +# +# # link multiple observations to each site +# trend_map = trend_map, +# +# # nmix() family and supplied data +# family = nmix(), +# data = mod_data, +# +# # standard normal priors on key regression parameters +# priors = c(prior(std_normal(), class = 'b'), +# prior(std_normal(), class = 'Intercept'), +# prior(std_normal(), class = 'Intercept_trend'), +# prior(std_normal(), class = 'sigma_raw_trend')), +# +# # use Stan's variational inference for quicker results +# algorithm = 'meanfield', +# +# # no need to compute "series-level" residuals +# residuals = FALSE, +# samples = 1000) + +## ----------------------------------------------------------------------------- +summary(mod, include_betas = FALSE) + +## ----------------------------------------------------------------------------- +avg_predictions(mod, type = 'detection') + +## ----------------------------------------------------------------------------- +abund_plots <- plot(conditional_effects(mod, + type = 'link', + effects = c('abund_cov', + 'abund_fac')), + plot = FALSE) + +## ----------------------------------------------------------------------------- +abund_plots[[1]] + + ylab('Expected latent abundance') + +## ----------------------------------------------------------------------------- +abund_plots[[2]] + + ylab('Expected latent abundance') + +## ----------------------------------------------------------------------------- +det_plots <- 
plot(conditional_effects(mod, + type = 'detection', + effects = c('det_cov', + 'det_cov2')), + plot = FALSE) + +## ----------------------------------------------------------------------------- +det_plots[[1]] + + ylab('Pr(detection)') +det_plots[[2]] + + ylab('Pr(detection)') + +## ----------------------------------------------------------------------------- +fivenum_round = function(x)round(fivenum(x, na.rm = TRUE), 2) + +plot_predictions(mod, + newdata = datagrid(det_cov = unique, + det_cov2 = fivenum_round), + by = c('det_cov', 'det_cov2'), + type = 'detection') + + theme_classic() + + ylab('Pr(detection)') + diff --git a/inst/doc/nmixtures.Rmd b/inst/doc/nmixtures.Rmd new file mode 100644 index 00000000..62d4a08e --- /dev/null +++ b/inst/doc/nmixtures.Rmd @@ -0,0 +1,509 @@ +--- +title: "N-mixtures in mvgam" +author: "Nicholas J Clark" +date: "`r Sys.Date()`" +output: + rmarkdown::html_vignette: + toc: yes +vignette: > + %\VignetteIndexEntry{N-mixtures in mvgam} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- +```{r, echo = FALSE} +NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true") +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + purl = NOT_CRAN, + eval = NOT_CRAN +) +``` + +```{r setup, include=FALSE} +knitr::opts_chunk$set( + echo = TRUE, + dpi = 150, + fig.asp = 0.8, + fig.width = 6, + out.width = "60%", + fig.align = "center") +library(mvgam) +library(ggplot2) +library(dplyr) +# A custom ggplot2 theme +theme_set(theme_classic(base_size = 12, base_family = 'serif') + + theme(axis.line.x.bottom = element_line(colour = "black", + size = 1), + axis.line.y.left = element_line(colour = "black", + size = 1))) +options(ggplot2.discrete.colour = c("#A25050", + "#00008b", + 'darkred', + "#010048"), + ggplot2.discrete.fill = c("#A25050", + "#00008b", + 'darkred', + "#010048")) +``` + +The purpose of this vignette is to show how the `mvgam` package can be used to fit and interrogate N-mixture models for population abundance counts made with imperfect detection. + +## N-mixture models +An N-mixture model is a fairly recent addition to the ecological modeller's toolkit that is designed to make inferences about variation in the abundance of species when observations are imperfect ([Royle 2004](https://onlinelibrary.wiley.com/doi/10.1111/j.0006-341X.2004.00142.x){target="_blank"}). Briefly, assume $\boldsymbol{Y_{i,r}}$ is the number of individuals recorded at site $i$ during replicate sampling observation $r$ (recorded as a non-negative integer). If multiple replicate surveys are done within a short enough period to satisfy the assumption that the population remained closed (i.e. there was no substantial change in true population size between replicate surveys), we can account for the fact that observations aren't perfect. 
This is done by assuming that these replicate observations are Binomial random variables that are parameterized by the true "latent" abundance $N$ and a detection probability $p$: + +\begin{align*} +\boldsymbol{Y_{i,r}} & \sim \text{Binomial}(N_i, p_r) \\ +N_{i} & \sim \text{Poisson}(\lambda_i) \end{align*} + +Using a set of linear predictors, we can estimate effects of covariates $\boldsymbol{X}$ on the expected latent abundance (with a log link for $\lambda$) and, jointly, effects of possibly different covariates (call them $\boldsymbol{Q}$) on detection probability (with a logit link for $p$): + +\begin{align*} +log(\lambda) & = \beta \boldsymbol{X} \\ +logit(p) & = \gamma \boldsymbol{Q}\end{align*} + +`mvgam` can handle this type of model because it is designed to propagate unobserved temporal processes that evolve independently of the observation process in a State-space format. This setup adapts well to N-mixture models because they can be thought of as State-space models in which the latent state is a discrete variable representing the "true" but unknown population size. This is very convenient because we can incorporate any of the package's diverse effect types (i.e. multidimensional splines, time-varying effects, monotonic effects, random effects etc...) into the linear predictors. All that is required for this to work is a marginalization trick that allows `Stan`'s sampling algorithms to handle discrete parameters (see more about how this method of "integrating out" discrete parameters works in [this nice blog post by Maxwell Joseph](https://mbjoseph.github.io/posts/2020-04-28-a-step-by-step-guide-to-marginalizing-over-discrete-parameters-for-ecologists-using-stan/){target="_blank"}). + +The family `nmix()` is used to set up N-mixture models in `mvgam`, but we still need to do a little bit of data wrangling to ensure the data are set up in the correct format (this is especially true when we have more than one replicate survey per time period). The most important aspects are: (1) how we set up the observation `series` and `trend_map` arguments to ensure replicate surveys are mapped to the correct latent abundance model and (2) the inclusion of a `cap` variable that defines the maximum possible integer value to use for each observation when estimating latent abundance. The two examples below give a reasonable overview of how this can be done. + +## Example 1: a two-species system with nonlinear trends +First we will use a simple simulation in which multiple replicate observations are taken at each timepoint for two different species. The simulation produces observations at a single site over six years, with five replicate surveys per year. Each species is simulated to have different nonlinear temporal trends and different detection probabilities. For now, detection probability is fixed (i.e. it does not change over time or in association with any covariates). Notice that we add the `cap` variable, which does not need to be static, to define the maximum possible value that we think the latent abundance could be for each timepoint. 
This simply needs to be large enough that we get a reasonable idea of which latent N values are most likely, without adding too much computational cost: + +```{r} +set.seed(999) +# Simulate observations for species 1, which shows a declining trend and 0.7 detection probability +data.frame(site = 1, + # five replicates per year; six years + replicate = rep(1:5, 6), + time = sort(rep(1:6, 5)), + species = 'sp_1', + # true abundance declines nonlinearly + truth = c(rep(28, 5), + rep(26, 5), + rep(23, 5), + rep(16, 5), + rep(14, 5), + rep(14, 5)), + # observations are taken with detection prob = 0.7 + obs = c(rbinom(5, 28, 0.7), + rbinom(5, 26, 0.7), + rbinom(5, 23, 0.7), + rbinom(5, 15, 0.7), + rbinom(5, 14, 0.7), + rbinom(5, 14, 0.7))) %>% + # add 'series' information, which is an identifier of site, replicate and species + dplyr::mutate(series = paste0('site_', site, + '_', species, + '_rep_', replicate), + time = as.numeric(time), + # add a 'cap' variable that defines the maximum latent N to + # marginalize over when estimating latent abundance; in other words + # how large do we realistically think the true abundance could be? + cap = 100) %>% + dplyr::select(- replicate) -> testdat + +# Now add another species that has a different temporal trend and a smaller +# detection probability (0.45 for this species) +testdat = testdat %>% + dplyr::bind_rows(data.frame(site = 1, + replicate = rep(1:5, 6), + time = sort(rep(1:6, 5)), + species = 'sp_2', + truth = c(rep(4, 5), + rep(7, 5), + rep(15, 5), + rep(16, 5), + rep(19, 5), + rep(18, 5)), + obs = c(rbinom(5, 4, 0.45), + rbinom(5, 7, 0.45), + rbinom(5, 15, 0.45), + rbinom(5, 16, 0.45), + rbinom(5, 19, 0.45), + rbinom(5, 18, 0.45))) %>% + dplyr::mutate(series = paste0('site_', site, + '_', species, + '_rep_', replicate), + time = as.numeric(time), + cap = 50) %>% + dplyr::select(-replicate)) +``` + +This data format isn't too difficult to set up, but it does differ from the traditional multidimensional array setup that is commonly used for fitting N-mixture models in other software packages. Next we ensure that species and series IDs are included as factor variables, in case we'd like to allow certain effects to vary by species +```{r} +testdat$species <- factor(testdat$species, + levels = unique(testdat$species)) +testdat$series <- factor(testdat$series, + levels = unique(testdat$series)) +``` + +Preview the dataset to get an idea of how it is structured: +```{r} +dplyr::glimpse(testdat) +head(testdat, 12) +``` + +### Setting up the `trend_map` + +Finally, we need to set up the `trend_map` object. This is crucial for allowing multiple observations to be linked to the same latent process model (see more information about this argument in the [Shared latent states vignette](https://nicholasjclark.github.io/mvgam/articles/shared_states.html){target="_blank"}). In this case, the mapping operates by species and site to state that each set of replicate observations from the same time point should all share the exact same latent abundance model: +```{r} +testdat %>% + # each unique combination of site*species is a separate process + dplyr::mutate(trend = as.numeric(factor(paste0(site, species)))) %>% + dplyr::select(trend, series) %>% + dplyr::distinct() -> trend_map +trend_map +``` + +Notice how all of the replicates for species 1 in site 1 share the same process (i.e. the same `trend`). This will ensure that all replicates are Binomial draws of the same latent N. 
+ +### Modelling with the `nmix()` family + +Now we are ready to fit a model using `mvgam()`. This model will allow each species to have different detection probabilities and different temporal trends. We will use `Cmdstan` as the backend, which by default will use Hamiltonian Monte Carlo for full Bayesian inference + +```{r include = FALSE, results='hide'} +mod <- mvgam( + # the observation formula sets up linear predictors for + # detection probability on the logit scale + formula = obs ~ species - 1, + + # the trend_formula sets up the linear predictors for + # the latent abundance processes on the log scale + trend_formula = ~ s(time, by = trend, k = 4) + species, + + # the trend_map takes care of the mapping + trend_map = trend_map, + + # nmix() family and data + family = nmix(), + data = testdat, + + # priors can be set in the usual way + priors = c(prior(std_normal(), class = b), + prior(normal(1, 1.5), class = Intercept_trend)), + samples = 1000) +``` + +```{r eval = FALSE} +mod <- mvgam( + # the observation formula sets up linear predictors for + # detection probability on the logit scale + formula = obs ~ species - 1, + + # the trend_formula sets up the linear predictors for + # the latent abundance processes on the log scale + trend_formula = ~ s(time, by = trend, k = 4) + species, + + # the trend_map takes care of the mapping + trend_map = trend_map, + + # nmix() family and data + family = nmix(), + data = testdat, + + # priors can be set in the usual way + priors = c(prior(std_normal(), class = b), + prior(normal(1, 1.5), class = Intercept_trend)), + samples = 1000) +``` + +View the automatically-generated `Stan` code to get a sense of how the marginalization over latent N works +```{r} +code(mod) +``` + +The posterior summary of this model shows that it has converged nicely +```{r} +summary(mod) +``` + +`loo()` functionality works just as it does for all `mvgam` models to aid in model comparison / selection (though note that Pareto K values often give warnings for mixture models so these may not be too helpful) +```{r} +loo(mod) +``` + +Plot the estimated smooths of time from each species' latent abundance process (on the log scale) +```{r} +plot(mod, type = 'smooths', trend_effects = TRUE) +``` + +`marginaleffects` support allows for more useful prediction-based interrogations on different scales (though note that at the time of writing this Vignette, you must have the development version of `marginaleffects` installed for `nmix()` models to be supported; use `remotes::install_github('vincentarelbundock/marginaleffects')` to install). Objects that use family `nmix()` have a few additional prediction scales that can be used (i.e. `link`, `response`, `detection` or `latent_N`). For example, here are the estimated detection probabilities per species, which show that the model has done a nice job of estimating these parameters: +```{r} +plot_predictions(mod, condition = 'species', + type = 'detection') + + ylab('Pr(detection)') + + ylim(c(0, 1)) + + theme_classic() + + theme(legend.position = 'none') +``` + +A common goal in N-mixture modelling is to estimate the true latent abundance. The model has automatically generated predictions for the unknown latent abundance that are conditional on the observations. 
We can extract these and produce decent plots using a small function +```{r} +hc <- hindcast(mod, type = 'latent_N') + +# Function to plot latent abundance estimates vs truth +plot_latentN = function(hindcasts, data, species = 'sp_1'){ + all_series <- unique(data %>% + dplyr::filter(species == !!species) %>% + dplyr::pull(series)) + + # Grab the first replicate that represents this series + # so we can get the true simulated values + series <- as.numeric(all_series[1]) + truths <- data %>% + dplyr::arrange(time, series) %>% + dplyr::filter(series == !!levels(data$series)[series]) %>% + dplyr::pull(truth) + + # In case some replicates have missing observations, + # pull out predictions for ALL replicates and average over them + hcs <- do.call(rbind, lapply(all_series, function(x){ + ind <- which(names(hindcasts$hindcasts) %in% as.character(x)) + hindcasts$hindcasts[[ind]] + })) + + # Calculate posterior empirical quantiles of predictions + pred_quantiles <- data.frame(t(apply(hcs, 2, function(x) + quantile(x, probs = c(0.05, 0.2, 0.3, 0.4, + 0.5, 0.6, 0.7, 0.8, 0.95))))) + pred_quantiles$time <- 1:NROW(pred_quantiles) + pred_quantiles$truth <- truths + + # Grab observations + data %>% + dplyr::filter(series %in% all_series) %>% + dplyr::select(time, obs) -> observations + + # Plot + ggplot(pred_quantiles, aes(x = time, group = 1)) + + geom_ribbon(aes(ymin = X5., ymax = X95.), fill = "#DCBCBC") + + geom_ribbon(aes(ymin = X30., ymax = X70.), fill = "#B97C7C") + + geom_line(aes(x = time, y = truth), + colour = 'black', linewidth = 1) + + geom_point(aes(x = time, y = truth), + shape = 21, colour = 'white', fill = 'black', + size = 2.5) + + geom_jitter(data = observations, aes(x = time, y = obs), + width = 0.06, + shape = 21, fill = 'darkred', colour = 'white', size = 2.5) + + labs(y = 'Latent abundance (N)', + x = 'Time', + title = species) +} +``` + +Latent abundance plots vs the simulated truths for each species are shown below. Here, the red points show the imperfect observations, the black line shows the true latent abundance, and the ribbons show credible intervals of our estimates: +```{r} +plot_latentN(hc, testdat, species = 'sp_1') +plot_latentN(hc, testdat, species = 'sp_2') +``` + +We can see that estimates for both species have correctly captured the true temporal variation and magnitudes in abundance + +## Example 2: a larger survey with possible nonlinear effects + +Now for another example with a larger dataset. We will use data from [Jeff Doser's simulation example from the wonderful `spAbundance` package](https://www.jeffdoser.com/files/spabundance-web/articles/nmixturemodels){target="_blank"}. The simulated data include one continuous site-level covariate, one factor site-level covariate and two continuous sample-level covariates. This example will allow us to examine how we can include possibly nonlinear effects in the latent process and detection probability models. 
+ +Download the data and grab observations / covariate measurements for one species +```{r} +# Date link +load(url('https://github.com/doserjef/spAbundance/raw/main/data/dataNMixSim.rda')) +data.one.sp <- dataNMixSim + +# Pull out observations for one species +data.one.sp$y <- data.one.sp$y[1, , ] + +# Abundance covariates that don't change across repeat sampling observations +abund.cov <- dataNMixSim$abund.covs[, 1] +abund.factor <- as.factor(dataNMixSim$abund.covs[, 2]) + +# Detection covariates that can change across repeat sampling observations +# Note that `NA`s are not allowed for covariates in mvgam, so we randomly +# impute them here +det.cov <- dataNMixSim$det.covs$det.cov.1[,] +det.cov[is.na(det.cov)] <- rnorm(length(which(is.na(det.cov)))) +det.cov2 <- dataNMixSim$det.covs$det.cov.2 +det.cov2[is.na(det.cov2)] <- rnorm(length(which(is.na(det.cov2)))) +``` + +Next we wrangle into the appropriate 'long' data format, adding indicators of `time` and `series` for working in `mvgam`. We also add the `cap` variable to represent the maximum latent N to marginalize over for each observation +```{r} +mod_data <- do.call(rbind, + lapply(1:NROW(data.one.sp$y), function(x){ + data.frame(y = data.one.sp$y[x,], + abund_cov = abund.cov[x], + abund_fac = abund.factor[x], + det_cov = det.cov[x,], + det_cov2 = det.cov2[x,], + replicate = 1:NCOL(data.one.sp$y), + site = paste0('site', x)) + })) %>% + dplyr::mutate(species = 'sp_1', + series = as.factor(paste0(site, '_', species, '_', replicate))) %>% + dplyr::mutate(site = factor(site, levels = unique(site)), + species = factor(species, levels = unique(species)), + time = 1, + cap = max(data.one.sp$y, na.rm = TRUE) + 20) +``` + +The data include observations for 225 sites with three replicates per site, though some observations are missing +```{r} +NROW(mod_data) +dplyr::glimpse(mod_data) +head(mod_data) +``` + +The final step for data preparation is of course the `trend_map`, which sets up the mapping between observation replicates and the latent abundance models. This is done in the same way as in the example above +```{r} +mod_data %>% + # each unique combination of site*species is a separate process + dplyr::mutate(trend = as.numeric(factor(paste0(site, species)))) %>% + dplyr::select(trend, series) %>% + dplyr::distinct() -> trend_map + +trend_map %>% + dplyr::arrange(trend) %>% + head(12) +``` + +Now we are ready to fit a model using `mvgam()`. Here we will use penalized splines for each of the continuous covariate effects to detect possible nonlinear associations. 
We also showcase how `mvgam` can make use of the different approximation algorithms available in `Stan` by using the meanfield variational Bayes approximator (this reduces computation time to around 12 seconds for this example) +```{r include = FALSE, results='hide'} +mod <- mvgam( + # effects of covariates on detection probability; + # here we use penalized splines for both continuous covariates + formula = y ~ s(det_cov, k = 3) + s(det_cov2, k = 3), + + # effects of the covariates on latent abundance; + # here we use a penalized spline for the continuous covariate and + # hierarchical intercepts for the factor covariate + trend_formula = ~ s(abund_cov, k = 3) + + s(abund_fac, bs = 're'), + + # link multiple observations to each site + trend_map = trend_map, + + # nmix() family and supplied data + family = nmix(), + data = mod_data, + + # standard normal priors on key regression parameters + priors = c(prior(std_normal(), class = 'b'), + prior(std_normal(), class = 'Intercept'), + prior(std_normal(), class = 'Intercept_trend'), + prior(std_normal(), class = 'sigma_raw_trend')), + + # use Stan's variational inference for quicker results + algorithm = 'meanfield', + + # no need to compute "series-level" residuals + residuals = FALSE, + samples = 1000) +``` + +```{r eval=FALSE} +mod <- mvgam( + # effects of covariates on detection probability; + # here we use penalized splines for both continuous covariates + formula = y ~ s(det_cov, k = 4) + s(det_cov2, k = 4), + + # effects of the covariates on latent abundance; + # here we use a penalized spline for the continuous covariate and + # hierarchical intercepts for the factor covariate + trend_formula = ~ s(abund_cov, k = 4) + + s(abund_fac, bs = 're'), + + # link multiple observations to each site + trend_map = trend_map, + + # nmix() family and supplied data + family = nmix(), + data = mod_data, + + # standard normal priors on key regression parameters + priors = c(prior(std_normal(), class = 'b'), + prior(std_normal(), class = 'Intercept'), + prior(std_normal(), class = 'Intercept_trend'), + prior(std_normal(), class = 'sigma_raw_trend')), + + # use Stan's variational inference for quicker results + algorithm = 'meanfield', + + # no need to compute "series-level" residuals + residuals = FALSE, + samples = 1000) +``` + +Inspect the model summary but don't bother looking at estimates for all individual spline coefficients. Notice how we no longer receive information on convergence because we did not use MCMC sampling for this model +```{r} +summary(mod, include_betas = FALSE) +``` + +Again we can make use of `marginaleffects` support for interrogating the model through targeted predictions. 
First, we can inspect the estimated average detection probability +```{r} +avg_predictions(mod, type = 'detection') +``` + +Next investigate estimated effects of covariates on latent abundance using the `conditional_effects()` function and specifying `type = 'link'`; this will return plots on the expectation scale +```{r} +abund_plots <- plot(conditional_effects(mod, + type = 'link', + effects = c('abund_cov', + 'abund_fac')), + plot = FALSE) +``` + +The effect of the continuous covariate on expected latent abundance +```{r} +abund_plots[[1]] + + ylab('Expected latent abundance') +``` + +The effect of the factor covariate on expected latent abundance, estimated as a hierarchical random effect +```{r} +abund_plots[[2]] + + ylab('Expected latent abundance') +``` + +Now we can investigate estimated effects of covariates on detection probability using `type = 'detection'` +```{r} +det_plots <- plot(conditional_effects(mod, + type = 'detection', + effects = c('det_cov', + 'det_cov2')), + plot = FALSE) +``` + +The covariate smooths were estimated to be somewhat nonlinear on the logit scale according to the model summary (based on their approximate significances). But inspecting conditional effects of each covariate on the probability scale is more intuitive and useful +```{r} +det_plots[[1]] + + ylab('Pr(detection)') +det_plots[[2]] + + ylab('Pr(detection)') +``` + +More targeted predictions are also easy with `marginaleffects` support. For example, we can ask: How does detection probability change as we change *both* detection covariates? +```{r} +fivenum_round = function(x)round(fivenum(x, na.rm = TRUE), 2) + +plot_predictions(mod, + newdata = datagrid(det_cov = unique, + det_cov2 = fivenum_round), + by = c('det_cov', 'det_cov2'), + type = 'detection') + + theme_classic() + + ylab('Pr(detection)') +``` + +The model has found support for some important covariate effects, but of course we'd want to interrogate how well the model predicts and think about possible spatial effects to capture unmodelled variation in latent abundance (which can easily be incorporated into both linear predictors using spatial smooths). + +## Further reading +The following papers and resources offer useful material about N-mixture models for ecological population dynamics investigations: + +Guélat, Jérôme, and Kéry, Marc. “[Effects of Spatial Autocorrelation and Imperfect Detection on Species Distribution Models.](https://besjournals.onlinelibrary.wiley.com/doi/full/10.1111/2041-210X.12983)” *Methods in Ecology and Evolution* 9 (2018): 1614–25. + +Kéry, Marc, and Royle Andrew J. "[Applied hierarchical modeling in ecology: Analysis of distribution, abundance and species richness in R and BUGS: Volume 2: Dynamic and advanced models](https://www.sciencedirect.com/book/9780128237687/applied-hierarchical-modeling-in-ecology-analysis-of-distribution-abundance-and-species-richness-in-r-and-bugs)". London, UK: Academic Press (2020). + +Royle, Andrew J. "[N‐mixture models for estimating population size from spatially replicated counts.](https://onlinelibrary.wiley.com/doi/full/10.1111/j.0006-341X.2004.00142.x)" *Biometrics* 60.1 (2004): 108-115. + +## Interested in contributing? +I'm actively seeking PhD students and other researchers to work in the areas of ecological forecasting, multivariate model evaluation and development of `mvgam`. 
Please reach out if you are interested (n.clark'at'uq.edu.au) diff --git a/inst/doc/nmixtures.html b/inst/doc/nmixtures.html new file mode 100644 index 00000000..f92147dd --- /dev/null +++ b/inst/doc/nmixtures.html @@ -0,0 +1,1202 @@ + + + + + + + + + + + + + + + + +N-mixtures in mvgam + + + + + + + + + + + + + + + + + + + + + + + + + + +

N-mixtures in mvgam

+

Nicholas J Clark

+

2024-04-16

+ + + +

The purpose of this vignette is to show how the mvgam +package can be used to fit and interrogate N-mixture models for +population abundance counts made with imperfect detection.

+
+

N-mixture models

+

An N-mixture model is a fairly recent addition to the ecological +modeller’s toolkit that is designed to make inferences about variation +in the abundance of species when observations are imperfect (Royle 2004). Briefly, assume \(\boldsymbol{Y_{i,r}}\) is the number of +individuals recorded at site \(i\) +during replicate sampling observation \(r\) (recorded as a non-negative integer). +If multiple replicate surveys are done within a short enough period to +satisfy the assumption that the population remained closed (i.e. there +was no substantial change in true population size between replicate +surveys), we can account for the fact that observations aren’t perfect. +This is done by assuming that these replicate observations are Binomial +random variables that are parameterized by the true “latent” abundance +\(N\) and a detection probability \(p\):

+

\[\begin{align*} +\boldsymbol{Y_{i,r}} & \sim \text{Binomial}(N_i, p_r) \\ +N_{i} & \sim \text{Poisson}(\lambda_i) \end{align*}\]

+

Using a set of linear predictors, we can estimate effects of +covariates \(\boldsymbol{X}\) on the +expected latent abundance (with a log link for \(\lambda\)) and, jointly, effects of +possibly different covariates (call them \(\boldsymbol{Q}\)) on detection probability +(with a logit link for \(p\)):

+

\[\begin{align*} +log(\lambda) & = \beta \boldsymbol{X} \\ +logit(p) & = \gamma \boldsymbol{Q}\end{align*}\]

+
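
As a small numeric illustration of these two link functions (all values here are made up), note that the expected observed count is the product of the expected latent abundance and the detection probability:

+
beta <- c(2, 0.5); x <- c(1, -0.2)   # abundance effects and covariates
+gamma <- c(0.8, -1); q <- c(1, 0.3)  # detection effects and covariates
+lambda <- exp(sum(beta * x))         # log link: expected latent abundance
+p <- plogis(sum(gamma * q))          # logit link: detection probability
+lambda * p                           # expected observed count E[Y]
+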

mvgam can handle this type of model because it is +designed to propagate unobserved temporal processes that evolve +independently of the observation process in a State-space format. This +setup adapts well to N-mixture models because they can be thought of as +State-space models in which the latent state is a discrete variable +representing the “true” but unknown population size. This is very +convenient because we can incorporate any of the package’s diverse +effect types (i.e. multidimensional splines, time-varying effects, +monotonic effects, random effects etc…) into the linear predictors. All +that is required for this to work is a marginalization trick that allows +Stan’s sampling algorithms to handle discrete parameters +(see more about how this method of “integrating out” discrete parameters +works in this nice blog post by Maxwell Joseph).
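
The marginalization itself is straightforward to write down. Below is a self-contained, hedged sketch (the function and argument names are illustrative, not mvgam's internals): the marginal likelihood of a set of replicate counts sums the Poisson prior on \(N\) against the Binomial likelihoods, over every feasible \(N\) up to cap:

+
nmix_loglik <- function(y, lambda, p, cap){
+  # N can be no smaller than the largest observed count
+  lls <- sapply(max(y):cap, function(N){
+    dpois(N, lambda, log = TRUE) + sum(dbinom(y, N, p, log = TRUE))
+  })
+  # log-sum-exp over the discrete latent variable
+  max(lls) + log(sum(exp(lls - max(lls))))
+}
+nmix_loglik(y = c(18, 21, 19), lambda = 25, p = 0.7, cap = 100)
+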

+

The family nmix() is used to set up N-mixture models in +mvgam, but we still need to do a little bit of data +wrangling to ensure the data are set up in the correct format (this is +especially true when we have more than one replicate survey per time +period). The most important aspects are: (1) how we set up the +observation series and trend_map arguments to +ensure replicate surveys are mapped to the correct latent abundance +model and (2) the inclusion of a cap variable that defines +the maximum possible integer value to use for each observation when +estimating latent abundance. The two examples below give a reasonable +overview of how this can be done.
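
Before turning to the worked examples, here is a minimal sketch of the data layout these two requirements imply (all names illustrative): replicate surveys of the same closed period share a time value and a cap, and the trend_map links their series IDs to a single latent abundance process:

+
dat <- data.frame(y = c(10, 12, 9),  # three replicate counts
+                  series = factor(paste0('site_1_rep_', 1:3)),
+                  time = 1,          # same closed sampling period
+                  cap = 100)         # maximum latent N to marginalize over
+trend_map <- data.frame(series = levels(dat$series),
+                        trend = 1)   # all replicates share one latent N
+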

+
+ +
+

Example 2: a larger survey with possible nonlinear effects

+

Now for another example with a larger dataset. We will use data from +Jeff Doser’s simulation example from the wonderful +spAbundance package. The simulated data include one +continuous site-level covariate, one factor site-level covariate and two +continuous sample-level covariates. This example will allow us to +examine how we can include possibly nonlinear effects in the latent +process and detection probability models.

+

Download the data and grab observations / covariate measurements for +one species

+
# Data link
+load(url('https://github.com/doserjef/spAbundance/raw/main/data/dataNMixSim.rda'))
+data.one.sp <- dataNMixSim
+
+# Pull out observations for one species
+data.one.sp$y <- data.one.sp$y[1, , ]
+
+# Abundance covariates that don't change across repeat sampling observations
+abund.cov <- dataNMixSim$abund.covs[, 1]
+abund.factor <- as.factor(dataNMixSim$abund.covs[, 2])
+
+# Detection covariates that can change across repeat sampling observations
+# Note that `NA`s are not allowed for covariates in mvgam, so we randomly
+# impute them here
+det.cov <- dataNMixSim$det.covs$det.cov.1[,]
+det.cov[is.na(det.cov)] <- rnorm(length(which(is.na(det.cov))))
+det.cov2 <- dataNMixSim$det.covs$det.cov.2
+det.cov2[is.na(det.cov2)] <- rnorm(length(which(is.na(det.cov2))))
+

Next we wrangle into the appropriate ‘long’ data format, adding +indicators of time and series for working in +mvgam. We also add the cap variable to +represent the maximum latent N to marginalize over for each +observation

+
mod_data <- do.call(rbind,
+                    lapply(1:NROW(data.one.sp$y), function(x){
+                      data.frame(y = data.one.sp$y[x,],
+                                 abund_cov = abund.cov[x],
+                                 abund_fac = abund.factor[x],
+                                 det_cov = det.cov[x,],
+                                 det_cov2 = det.cov2[x,],
+                                 replicate = 1:NCOL(data.one.sp$y),
+                                 site = paste0('site', x))
+                    })) %>%
+  dplyr::mutate(species = 'sp_1',
+                series = as.factor(paste0(site, '_', species, '_', replicate))) %>%
+  dplyr::mutate(site = factor(site, levels = unique(site)),
+                species = factor(species, levels = unique(species)),
+                time = 1,
+                cap = max(data.one.sp$y, na.rm = TRUE) + 20)
+

The data include observations for 225 sites with three replicates per +site, though some observations are missing

+
NROW(mod_data)
+#> [1] 675
+dplyr::glimpse(mod_data)
+#> Rows: 675
+#> Columns: 11
+#> $ y         <int> 1, NA, NA, NA, 2, 2, NA, 1, NA, NA, 0, 1, 0, 0, 0, 0, NA, NA…
+#> $ abund_cov <dbl> -0.3734384, -0.3734384, -0.3734384, 0.7064305, 0.7064305, 0.…
+#> $ abund_fac <fct> 3, 3, 3, 4, 4, 4, 9, 9, 9, 2, 2, 2, 3, 3, 3, 2, 2, 2, 1, 1, …
+#> $ det_cov   <dbl> -1.28279990, -0.08474811, 0.44789392, 1.71731815, 0.19548086…
+#> $ det_cov2  <dbl> 2.03047314, -1.42459158, 1.68497337, 0.75026787, 1.04555361,…
+#> $ replicate <int> 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, …
+#> $ site      <fct> site1, site1, site1, site2, site2, site2, site3, site3, site…
+#> $ species   <fct> sp_1, sp_1, sp_1, sp_1, sp_1, sp_1, sp_1, sp_1, sp_1, sp_1, …
+#> $ series    <fct> site1_sp_1_1, site1_sp_1_2, site1_sp_1_3, site2_sp_1_1, site…
+#> $ time      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
+#> $ cap       <dbl> 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, …
+head(mod_data)
+#>    y  abund_cov abund_fac     det_cov   det_cov2 replicate  site species
+#> 1  1 -0.3734384         3 -1.28279990  2.0304731         1 site1    sp_1
+#> 2 NA -0.3734384         3 -0.08474811 -1.4245916         2 site1    sp_1
+#> 3 NA -0.3734384         3  0.44789392  1.6849734         3 site1    sp_1
+#> 4 NA  0.7064305         4  1.71731815  0.7502679         1 site2    sp_1
+#> 5  2  0.7064305         4  0.19548086  1.0455536         2 site2    sp_1
+#> 6  2  0.7064305         4  0.96730338  1.9197118         3 site2    sp_1
+#>         series time cap
+#> 1 site1_sp_1_1    1  33
+#> 2 site1_sp_1_2    1  33
+#> 3 site1_sp_1_3    1  33
+#> 4 site2_sp_1_1    1  33
+#> 5 site2_sp_1_2    1  33
+#> 6 site2_sp_1_3    1  33
+

The final step for data preparation is of course the +trend_map, which sets up the mapping between observation +replicates and the latent abundance models. This is done in the same way +as in the example above

+
mod_data %>%
+  # each unique combination of site*species is a separate process
+  dplyr::mutate(trend = as.numeric(factor(paste0(site, species)))) %>%
+  dplyr::select(trend, series) %>%
+  dplyr::distinct() -> trend_map
+
+trend_map %>%
+  dplyr::arrange(trend) %>%
+  head(12)
+#>    trend         series
+#> 1      1 site100_sp_1_1
+#> 2      1 site100_sp_1_2
+#> 3      1 site100_sp_1_3
+#> 4      2 site101_sp_1_1
+#> 5      2 site101_sp_1_2
+#> 6      2 site101_sp_1_3
+#> 7      3 site102_sp_1_1
+#> 8      3 site102_sp_1_2
+#> 9      3 site102_sp_1_3
+#> 10     4 site103_sp_1_1
+#> 11     4 site103_sp_1_2
+#> 12     4 site103_sp_1_3
+
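
Before fitting, it is worth verifying (an extra check, not in the original text) that every series maps to exactly one latent process and that the factor levels in trend_map match those in the data:

+
all(table(trend_map$series) == 1)
+all.equal(levels(trend_map$series), levels(mod_data$series))
+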

Now we are ready to fit a model using mvgam(). Here we will use penalized splines for each of the continuous covariate effects to detect possible nonlinear associations. We also showcase how mvgam can make use of the different approximation algorithms available in Stan by using the meanfield variational Bayes approximator (this reduces computation time to around 12 seconds for this example)

+
mod <- mvgam(
+  # effects of covariates on detection probability;
+  # here we use penalized splines for both continuous covariates
+  formula = y ~ s(det_cov, k = 4) + s(det_cov2, k = 4),
+  
+  # effects of the covariates on latent abundance;
+  # here we use a penalized spline for the continuous covariate and
+  # hierarchical intercepts for the factor covariate
+  trend_formula = ~ s(abund_cov, k = 4) +
+    s(abund_fac, bs = 're'),
+  
+  # link multiple observations to each site
+  trend_map = trend_map,
+  
+  # nmix() family and supplied data
+  family = nmix(),
+  data = mod_data,
+  
+  # standard normal priors on key regression parameters
+  priors = c(prior(std_normal(), class = 'b'),
+             prior(std_normal(), class = 'Intercept'),
+             prior(std_normal(), class = 'Intercept_trend'),
+             prior(std_normal(), class = 'sigma_raw_trend')),
+  
+  # use Stan's variational inference for quicker results
+  algorithm = 'meanfield',
+  
+  # no need to compute "series-level" residuals
+  residuals = FALSE,
+  samples = 1000)
+
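
Variational approximations such as meanfield are fast but can underestimate posterior uncertainty, so for final inference it may be worth refitting with full MCMC. A sketch only (not run here), assuming update() forwards the algorithm argument on to mvgam():

+
# a hypothetical refit with Stan's default MCMC sampler; slower,
# but it provides proper convergence diagnostics
+# mod_mcmc <- update(mod, algorithm = 'sampling')
+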

Inspect the model summary but don’t bother looking at estimates for all individual spline coefficients. Notice how we no longer receive information on convergence because we did not use MCMC sampling for this model

+
summary(mod, include_betas = FALSE)
+#> GAM observation formula:
+#> y ~ s(det_cov, k = 3) + s(det_cov2, k = 3)
+#> 
+#> GAM process formula:
+#> ~s(abund_cov, k = 3) + s(abund_fac, bs = "re")
+#> 
+#> Family:
+#> nmix
+#> 
+#> Link function:
+#> log
+#> 
+#> Trend model:
+#> None
+#> 
+#> N process models:
+#> 225 
+#> 
+#> N series:
+#> 675 
+#> 
+#> N timepoints:
+#> 1 
+#> 
+#> Status:
+#> Fitted using Stan 
+#> 1 chains, each with iter = 1000; warmup = ; thin = 1 
+#> Total post-warmup draws = 1000
+#> 
+#> 
+#> GAM observation model coefficient (beta) estimates:
+#>              2.5% 50% 97.5% Rhat n.eff
+#> (Intercept) 0.052 0.4  0.71  NaN   NaN
+#> 
+#> Approximate significance of GAM observation smooths:
+#>              edf Ref.df Chi.sq p-value    
+#> s(det_cov)  1.22      2   52.3  0.0011 ** 
+#> s(det_cov2) 1.07      2  307.1  <2e-16 ***
+#> ---
+#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+#> 
+#> GAM process model coefficient (beta) estimates:
+#>                    2.5%    50% 97.5% Rhat n.eff
+#> (Intercept)_trend -0.25 -0.081 0.079  NaN   NaN
+#> 
+#> GAM process model group-level estimates:
+#>                           2.5%    50% 97.5% Rhat n.eff
+#> mean(s(abund_fac))_trend -0.18 0.0038  0.19  NaN   NaN
+#> sd(s(abund_fac))_trend    0.26 0.3900  0.56  NaN   NaN
+#> 
+#> Approximate significance of GAM process smooths:
+#>               edf Ref.df    F p-value  
+#> s(abund_cov) 1.19      2 2.38   0.299  
+#> s(abund_fac) 8.82     10 2.79   0.025 *
+#> ---
+#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+#> 
+#> Posterior approximation used: no diagnostics to compute
+

Again we can make use of marginaleffects support for interrogating the model through targeted predictions. First, we can inspect the estimated average detection probability

+
avg_predictions(mod, type = 'detection')
+#> 
+#>  Estimate 2.5 % 97.5 %
+#>     0.579  0.51  0.644
+#> 
+#> Columns: estimate, conf.low, conf.high 
+#> Type:  detection
+
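
The same machinery can summarise detection probability at chosen covariate values. A small illustration (our addition, reusing the marginaleffects functions already shown in this vignette):

+
avg_predictions(mod, type = 'detection',
+                by = 'det_cov',
+                newdata = datagrid(det_cov = c(-1, 1)))
+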

Next investigate estimated effects of covariates on latent abundance using the conditional_effects() function and specifying type = 'link'; this will return plots on the expectation scale

+
abund_plots <- plot(conditional_effects(mod,
+                                        type = 'link',
+                                        effects = c('abund_cov',
+                                                    'abund_fac')),
+                    plot = FALSE)
+

The effect of the continuous covariate on expected latent abundance

+
abund_plots[[1]] +
+  ylab('Expected latent abundance')
+

+

The effect of the factor covariate on expected latent abundance, estimated as a hierarchical random effect

+
abund_plots[[2]] +
+  ylab('Expected latent abundance')
+

+

Now we can investigate estimated effects of covariates on detection probability using type = 'detection'

+
det_plots <- plot(conditional_effects(mod,
+                                      type = 'detection',
+                                      effects = c('det_cov',
+                                                  'det_cov2')),
+                  plot = FALSE)
+

The covariate smooths were estimated to be somewhat nonlinear on the logit scale according to the model summary (based on their approximate significances). But inspecting conditional effects of each covariate on the probability scale is more intuitive and useful

+
det_plots[[1]] +
+  ylab('Pr(detection)')
+

+
det_plots[[2]] +
+  ylab('Pr(detection)')
+

+
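
Recall that detection is modelled on the logit scale, so these plots simply map the linear predictor through the inverse-logit:

+
# the inverse-logit (logistic) link used for the detection submodel
+plogis(0)  # a linear predictor of 0 corresponds to Pr(detection) = 0.5
+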

More targeted predictions are also easy with marginaleffects support. For example, we can ask: How does detection probability change as we change both detection covariates?

+
fivenum_round = function(x) round(fivenum(x, na.rm = TRUE), 2)
+
+plot_predictions(mod, 
+                 newdata = datagrid(det_cov = unique,
+                                    det_cov2 = fivenum_round),
+                 by = c('det_cov', 'det_cov2'),
+                 type = 'detection') +
+  theme_classic() +
+  ylab('Pr(detection)')
+

+

The model has found support for some important covariate effects, but of course we’d want to interrogate how well the model predicts and think about possible spatial effects to capture unmodelled variation in latent abundance (which can easily be incorporated into both linear predictors using spatial smooths).
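
For example, if site coordinates were available, a spatial smooth could enter both formulas. A sketch only (not run; the lat and lon columns are hypothetical and would need to be added to mod_data):

+
# mod_sp <- mvgam(y ~ s(det_cov, k = 4) + s(det_cov2, k = 4) +
+#                   s(lat, lon, bs = 'gp'),
+#                 trend_formula = ~ s(abund_cov, k = 4) +
+#                   s(abund_fac, bs = 're') +
+#                   s(lat, lon, bs = 'gp'),
+#                 trend_map = trend_map,
+#                 family = nmix(),
+#                 data = mod_data)
+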

+
+
+

Further reading

+

The following papers and resources offer useful material about N-mixture models for ecological population dynamics investigations:

+

Guélat, Jérôme, and Kéry, Marc. “Effects of Spatial Autocorrelation and Imperfect Detection on Species Distribution Models.” Methods in Ecology and Evolution 9 (2018): 1614–25.

+

Kéry, Marc, and Royle, Andrew J. “Applied hierarchical modeling in ecology: Analysis of distribution, abundance and species richness in R and BUGS: Volume 2: Dynamic and advanced models”. London, UK: Academic Press (2020).

+

Royle, Andrew J. “N‐mixture models for estimating population size from spatially replicated counts.” Biometrics 60.1 (2004): 108-115.

+
+
+

Interested in contributing?

+

I’m actively seeking PhD students and other researchers to work in the areas of ecological forecasting, multivariate model evaluation and development of mvgam. Please reach out if you are interested (n.clark’at’uq.edu.au)

+
+ + + + + + + + + + + diff --git a/inst/doc/shared_states.R b/inst/doc/shared_states.R new file mode 100644 index 00000000..4eabc053 --- /dev/null +++ b/inst/doc/shared_states.R @@ -0,0 +1,260 @@ +## ----echo = FALSE------------------------------------------------------------- +NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true") +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + purl = NOT_CRAN, + eval = NOT_CRAN +) + +## ----setup, include=FALSE----------------------------------------------------- +knitr::opts_chunk$set( + echo = TRUE, + dpi = 150, + fig.asp = 0.8, + fig.width = 6, + out.width = "60%", + fig.align = "center") +library(mvgam) +library(ggplot2) +theme_set(theme_bw(base_size = 12, base_family = 'serif')) + +## ----------------------------------------------------------------------------- +set.seed(122) +simdat <- sim_mvgam(trend_model = 'AR1', + prop_trend = 0.6, + mu = c(0, 1, 2), + family = poisson()) +trend_map <- data.frame(series = unique(simdat$data_train$series), + trend = c(1, 1, 2)) +trend_map + +## ----------------------------------------------------------------------------- +all.equal(levels(trend_map$series), levels(simdat$data_train$series)) + +## ----------------------------------------------------------------------------- +fake_mod <- mvgam(y ~ + # observation model formula, which has a + # different intercept per series + series - 1, + + # process model formula, which has a shared seasonal smooth + # (each latent process model shares the SAME smooth) + trend_formula = ~ s(season, bs = 'cc', k = 6), + + # AR1 dynamics (each latent process model has DIFFERENT) + # dynamics + trend_model = 'AR1', + + # supplied trend_map + trend_map = trend_map, + + # data and observation family + family = poisson(), + data = simdat$data_train, + run_model = FALSE) + +## ----------------------------------------------------------------------------- +code(fake_mod) + +## ----------------------------------------------------------------------------- +fake_mod$model_data$Z + +## ----full_mod, include = FALSE, results='hide'-------------------------------- +full_mod <- mvgam(y ~ series - 1, + trend_formula = ~ s(season, bs = 'cc', k = 6), + trend_model = 'AR1', + trend_map = trend_map, + family = poisson(), + data = simdat$data_train) + +## ----eval=FALSE--------------------------------------------------------------- +# full_mod <- mvgam(y ~ series - 1, +# trend_formula = ~ s(season, bs = 'cc', k = 6), +# trend_model = 'AR1', +# trend_map = trend_map, +# family = poisson(), +# data = simdat$data_train) + +## ----------------------------------------------------------------------------- +summary(full_mod) + +## ----------------------------------------------------------------------------- +plot(conditional_effects(full_mod, type = 'link'), ask = FALSE) + +## ----------------------------------------------------------------------------- +plot(full_mod, type = 'trend', series = 1) +plot(full_mod, type = 'trend', series = 2) +plot(full_mod, type = 'trend', series = 3) + +## ----------------------------------------------------------------------------- +plot(full_mod, type = 'forecast', series = 1) +plot(full_mod, type = 'forecast', series = 2) +plot(full_mod, type = 'forecast', series = 3) + +## ----------------------------------------------------------------------------- +set.seed(543210) +# simulate a nonlinear relationship using the mgcv function gamSim +signal_dat <- gamSim(n = 100, eg = 1, scale = 1) + +# productivity is one of the variables in the simulated data 
+productivity <- signal_dat$x2 + +# simulate the true signal, which already has a nonlinear relationship +# with productivity; we will add in a fairly strong AR1 process to +# contribute to the signal +true_signal <- as.vector(scale(signal_dat$y) + + arima.sim(100, model = list(ar = 0.8, sd = 0.1))) + +## ----------------------------------------------------------------------------- +plot(true_signal, type = 'l', + bty = 'l', lwd = 2, + ylab = 'True signal', + xlab = 'Time') + +## ----------------------------------------------------------------------------- +plot(true_signal ~ productivity, + pch = 16, bty = 'l', + ylab = 'True signal', + xlab = 'Productivity') + +## ----------------------------------------------------------------------------- +set.seed(543210) +sim_series = function(n_series = 3, true_signal){ + temp_effects <- gamSim(n = 100, eg = 7, scale = 0.1) + temperature <- temp_effects$y + alphas <- rnorm(n_series, sd = 2) + + do.call(rbind, lapply(seq_len(n_series), function(series){ + data.frame(observed = rnorm(length(true_signal), + mean = alphas[series] + + 1.5*as.vector(scale(temp_effects[, series + 1])) + + true_signal, + sd = runif(1, 1, 2)), + series = paste0('sensor_', series), + time = 1:length(true_signal), + temperature = temperature, + productivity = productivity, + true_signal = true_signal) + })) + } +model_dat <- sim_series(true_signal = true_signal) %>% + dplyr::mutate(series = factor(series)) + +## ----------------------------------------------------------------------------- +plot_mvgam_series(data = model_dat, y = 'observed', + series = 'all') + +## ----------------------------------------------------------------------------- + plot(observed ~ temperature, data = model_dat %>% + dplyr::filter(series == 'sensor_1'), + pch = 16, bty = 'l', + ylab = 'Sensor 1', + xlab = 'Temperature') + plot(observed ~ temperature, data = model_dat %>% + dplyr::filter(series == 'sensor_2'), + pch = 16, bty = 'l', + ylab = 'Sensor 2', + xlab = 'Temperature') + plot(observed ~ temperature, data = model_dat %>% + dplyr::filter(series == 'sensor_3'), + pch = 16, bty = 'l', + ylab = 'Sensor 3', + xlab = 'Temperature') + +## ----sensor_mod, include = FALSE, results='hide'------------------------------ +mod <- mvgam(formula = + # formula for observations, allowing for different + # intercepts and smooth effects of temperature + observed ~ series + + s(temperature, k = 10) + + s(series, temperature, bs = 'sz', k = 8), + + trend_formula = + # formula for the latent signal, which can depend + # nonlinearly on productivity + ~ s(productivity, k = 8), + + trend_model = + # in addition to productivity effects, the signal is + # assumed to exhibit temporal autocorrelation + 'AR1', + + trend_map = + # trend_map forces all sensors to track the same + # latent signal + data.frame(series = unique(model_dat$series), + trend = c(1, 1, 1)), + + # informative priors on process error + # and observation error will help with convergence + priors = c(prior(normal(2, 0.5), class = sigma), + prior(normal(1, 0.5), class = sigma_obs)), + + # Gaussian observations + family = gaussian(), + burnin = 600, + adapt_delta = 0.95, + data = model_dat) + +## ----eval=FALSE--------------------------------------------------------------- +# mod <- mvgam(formula = +# # formula for observations, allowing for different +# # intercepts and hierarchical smooth effects of temperature +# observed ~ series + +# s(temperature, k = 10) + +# s(series, temperature, bs = 'sz', k = 8), +# +# trend_formula = +# # formula for the latent 
signal, which can depend +# # nonlinearly on productivity +# ~ s(productivity, k = 8), +# +# trend_model = +# # in addition to productivity effects, the signal is +# # assumed to exhibit temporal autocorrelation +# 'AR1', +# +# trend_map = +# # trend_map forces all sensors to track the same +# # latent signal +# data.frame(series = unique(model_dat$series), +# trend = c(1, 1, 1)), +# +# # informative priors on process error +# # and observation error will help with convergence +# priors = c(prior(normal(2, 0.5), class = sigma), +# prior(normal(1, 0.5), class = sigma_obs)), +# +# # Gaussian observations +# family = gaussian(), +# data = model_dat) + +## ----------------------------------------------------------------------------- +summary(mod, include_betas = FALSE) + +## ----------------------------------------------------------------------------- +plot(mod, type = 'smooths', trend_effects = TRUE) + +## ----------------------------------------------------------------------------- +plot(mod, type = 'smooths') + +## ----------------------------------------------------------------------------- +plot(conditional_effects(mod, type = 'link'), ask = FALSE) + +## ----------------------------------------------------------------------------- +plot_predictions(mod, + condition = c('temperature', 'series', 'series'), + points = 0.5) + + theme(legend.position = 'none') + +## ----------------------------------------------------------------------------- +pairs(mod, variable = c('sigma[1]', 'sigma_obs[1]')) + +## ----------------------------------------------------------------------------- +plot(mod, type = 'trend') + +# Overlay the true simulated signal +points(true_signal, pch = 16, cex = 1, col = 'white') +points(true_signal, pch = 16, cex = 0.8) + diff --git a/inst/doc/shared_states.Rmd b/inst/doc/shared_states.Rmd new file mode 100644 index 00000000..fcdb58be --- /dev/null +++ b/inst/doc/shared_states.Rmd @@ -0,0 +1,346 @@ +--- +title: "Shared latent states in mvgam" +author: "Nicholas J Clark" +date: "`r Sys.Date()`" +output: + rmarkdown::html_vignette: + toc: yes +vignette: > + %\VignetteIndexEntry{Shared latent states in mvgam} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- +```{r, echo = FALSE} +NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true") +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + purl = NOT_CRAN, + eval = NOT_CRAN +) +``` + +```{r setup, include=FALSE} +knitr::opts_chunk$set( + echo = TRUE, + dpi = 150, + fig.asp = 0.8, + fig.width = 6, + out.width = "60%", + fig.align = "center") +library(mvgam) +library(ggplot2) +theme_set(theme_bw(base_size = 12, base_family = 'serif')) +``` + +This vignette gives an example of how `mvgam` can be used to estimate models where multiple observed time series share the same latent process model. For full details on the basic `mvgam` functionality, please see [the introductory vignette](https://nicholasjclark.github.io/mvgam/articles/mvgam_overview.html). + +## The `trend_map` argument +The `trend_map` argument in the `mvgam()` function is an optional `data.frame` that can be used to specify which series should depend on which latent process models (called "trends" in `mvgam`). This can be particularly useful if we wish to force multiple observed time series to depend on the same latent trend process, but with different observation processes. If this argument is supplied, a latent factor model is set up by setting `use_lv = TRUE` and using the supplied `trend_map` to set up the shared trends. 
Users familiar with the `MARSS` family of packages will recognize this as a way of specifying the $Z$ matrix. This `data.frame` needs to have column names `series` and `trend`, with integer values in the `trend` column to state which trend each series should depend on. The `series` column should have a single unique entry for each time series in the data, with names that perfectly match the factor levels of the `series` variable in `data`). For example, if we were to simulate a collection of three integer-valued time series (using `sim_mvgam`), the following `trend_map` would force the first two series to share the same latent trend process: +```{r} +set.seed(122) +simdat <- sim_mvgam(trend_model = 'AR1', + prop_trend = 0.6, + mu = c(0, 1, 2), + family = poisson()) +trend_map <- data.frame(series = unique(simdat$data_train$series), + trend = c(1, 1, 2)) +trend_map +``` + +We can see that the factor levels in `trend_map` match those in the data: +```{r} +all.equal(levels(trend_map$series), levels(simdat$data_train$series)) +``` + +### Checking `trend_map` with `run_model = FALSE` +Supplying this `trend_map` to the `mvgam` function for a simple model, but setting `run_model = FALSE`, allows us to inspect the constructed `Stan` code and the data objects that would be used to condition the model. Here we will set up a model in which each series has a different observation process (with only a different intercept per series in this case), and the two latent dynamic process models evolve as independent AR1 processes that also contain a shared nonlinear smooth function to capture repeated seasonality. This model is not too complicated but it does show how we can learn shared and independent effects for collections of time series in the `mvgam` framework: +```{r} +fake_mod <- mvgam(y ~ + # observation model formula, which has a + # different intercept per series + series - 1, + + # process model formula, which has a shared seasonal smooth + # (each latent process model shares the SAME smooth) + trend_formula = ~ s(season, bs = 'cc', k = 6), + + # AR1 dynamics (each latent process model has DIFFERENT) + # dynamics + trend_model = 'AR1', + + # supplied trend_map + trend_map = trend_map, + + # data and observation family + family = poisson(), + data = simdat$data_train, + run_model = FALSE) +``` + +Inspecting the `Stan` code shows how this model is a dynamic factor model in which the loadings are constructed to reflect the supplied `trend_map`: +```{r} +code(fake_mod) +``` + +Notice the line that states "lv_coefs = Z;". This uses the supplied $Z$ matrix to construct the loading coefficients. 
The supplied matrix now looks exactly like what you'd use if you were to create a similar model in the `MARSS` package: +```{r} +fake_mod$model_data$Z +``` + +### Fitting and inspecting the model +Though this model doesn't perfectly match the data-generating process (which allowed each series to have different underlying dynamics), we can still fit it to show what the resulting inferences look like: +```{r full_mod, include = FALSE, results='hide'} +full_mod <- mvgam(y ~ series - 1, + trend_formula = ~ s(season, bs = 'cc', k = 6), + trend_model = 'AR1', + trend_map = trend_map, + family = poisson(), + data = simdat$data_train) +``` + +```{r eval=FALSE} +full_mod <- mvgam(y ~ series - 1, + trend_formula = ~ s(season, bs = 'cc', k = 6), + trend_model = 'AR1', + trend_map = trend_map, + family = poisson(), + data = simdat$data_train) +``` + +The summary of this model is informative as it shows that only two latent process models have been estimated, even though we have three observed time series. The model converges well +```{r} +summary(full_mod) +``` + +Quick plots of all main effects can be made using `conditional_effects()`: +```{r} +plot(conditional_effects(full_mod, type = 'link'), ask = FALSE) +``` + +Even more informative are the plots of the latent processes. Both series 1 and 2 share the exact same estimates, while the estimates for series 3 are different: +```{r} +plot(full_mod, type = 'trend', series = 1) +plot(full_mod, type = 'trend', series = 2) +plot(full_mod, type = 'trend', series = 3) +``` + +However, the forecasts for series' 1 and 2 differ because they have different intercepts in the observation model +```{r} +plot(full_mod, type = 'forecast', series = 1) +plot(full_mod, type = 'forecast', series = 2) +plot(full_mod, type = 'forecast', series = 3) +``` + +## Example: signal detection +Now we will explore a more complicated example. Here we simulate a true hidden signal that we are trying to track. This signal depends nonlinearly on some covariate (called `productivity`, which represents a measure of how productive the landscape is). The signal also demonstrates a fairly large amount of temporal autocorrelation: +```{r} +set.seed(543210) +# simulate a nonlinear relationship using the mgcv function gamSim +signal_dat <- gamSim(n = 100, eg = 1, scale = 1) + +# productivity is one of the variables in the simulated data +productivity <- signal_dat$x2 + +# simulate the true signal, which already has a nonlinear relationship +# with productivity; we will add in a fairly strong AR1 process to +# contribute to the signal +true_signal <- as.vector(scale(signal_dat$y) + + arima.sim(100, model = list(ar = 0.8, sd = 0.1))) +``` + +Plot the signal to inspect it's evolution over time +```{r} +plot(true_signal, type = 'l', + bty = 'l', lwd = 2, + ylab = 'True signal', + xlab = 'Time') +``` + +And plot the relationship between the signal and the `productivity` covariate: +```{r} +plot(true_signal ~ productivity, + pch = 16, bty = 'l', + ylab = 'True signal', + xlab = 'Productivity') +``` + +Next we simulate three sensors that are trying to track the same hidden signal. All of these sensors have different observation errors that can depend nonlinearly on a second external covariate, called `temperature` in this example. 
Again this makes use of `gamSim` +```{r} +set.seed(543210) +sim_series = function(n_series = 3, true_signal){ + temp_effects <- gamSim(n = 100, eg = 7, scale = 0.1) + temperature <- temp_effects$y + alphas <- rnorm(n_series, sd = 2) + + do.call(rbind, lapply(seq_len(n_series), function(series){ + data.frame(observed = rnorm(length(true_signal), + mean = alphas[series] + + 1.5*as.vector(scale(temp_effects[, series + 1])) + + true_signal, + sd = runif(1, 1, 2)), + series = paste0('sensor_', series), + time = 1:length(true_signal), + temperature = temperature, + productivity = productivity, + true_signal = true_signal) + })) + } +model_dat <- sim_series(true_signal = true_signal) %>% + dplyr::mutate(series = factor(series)) +``` + +Plot the sensor observations +```{r} +plot_mvgam_series(data = model_dat, y = 'observed', + series = 'all') +``` + +And now plot the observed relationships between the three sensors and the `temperature` covariate +```{r} + plot(observed ~ temperature, data = model_dat %>% + dplyr::filter(series == 'sensor_1'), + pch = 16, bty = 'l', + ylab = 'Sensor 1', + xlab = 'Temperature') + plot(observed ~ temperature, data = model_dat %>% + dplyr::filter(series == 'sensor_2'), + pch = 16, bty = 'l', + ylab = 'Sensor 2', + xlab = 'Temperature') + plot(observed ~ temperature, data = model_dat %>% + dplyr::filter(series == 'sensor_3'), + pch = 16, bty = 'l', + ylab = 'Sensor 3', + xlab = 'Temperature') +``` + +### The shared signal model +Now we can formulate and fit a model that allows each sensor's observation error to depend nonlinearly on `temperature` while allowing the true signal to depend nonlinearly on `productivity`. By fixing all of the values in the `trend` column to `1` in the `trend_map`, we are assuming that all observation sensors are tracking the same latent signal. 
We use informative priors on the two variance components (process error and observation error), which reflect our prior belief that the observation error is smaller overall than the true process error +```{r sensor_mod, include = FALSE, results='hide'} +mod <- mvgam(formula = + # formula for observations, allowing for different + # intercepts and smooth effects of temperature + observed ~ series + + s(temperature, k = 10) + + s(series, temperature, bs = 'sz', k = 8), + + trend_formula = + # formula for the latent signal, which can depend + # nonlinearly on productivity + ~ s(productivity, k = 8), + + trend_model = + # in addition to productivity effects, the signal is + # assumed to exhibit temporal autocorrelation + 'AR1', + + trend_map = + # trend_map forces all sensors to track the same + # latent signal + data.frame(series = unique(model_dat$series), + trend = c(1, 1, 1)), + + # informative priors on process error + # and observation error will help with convergence + priors = c(prior(normal(2, 0.5), class = sigma), + prior(normal(1, 0.5), class = sigma_obs)), + + # Gaussian observations + family = gaussian(), + burnin = 600, + adapt_delta = 0.95, + data = model_dat) +``` + +```{r eval=FALSE} +mod <- mvgam(formula = + # formula for observations, allowing for different + # intercepts and hierarchical smooth effects of temperature + observed ~ series + + s(temperature, k = 10) + + s(series, temperature, bs = 'sz', k = 8), + + trend_formula = + # formula for the latent signal, which can depend + # nonlinearly on productivity + ~ s(productivity, k = 8), + + trend_model = + # in addition to productivity effects, the signal is + # assumed to exhibit temporal autocorrelation + 'AR1', + + trend_map = + # trend_map forces all sensors to track the same + # latent signal + data.frame(series = unique(model_dat$series), + trend = c(1, 1, 1)), + + # informative priors on process error + # and observation error will help with convergence + priors = c(prior(normal(2, 0.5), class = sigma), + prior(normal(1, 0.5), class = sigma_obs)), + + # Gaussian observations + family = gaussian(), + data = model_dat) +``` + +View a reduced version of the model summary because there will be many spline coefficients in this model +```{r} +summary(mod, include_betas = FALSE) +``` + +### Inspecting effects on both process and observation models +Don't pay much attention to the approximate *p*-values of the smooth terms. The calculation for these values is incredibly sensitive to the estimates for the smoothing parameters so I don't tend to find them to be very meaningful. What are meaningful, however, are prediction-based plots of the smooth functions. For example, here is the estimated response of the underlying signal to `productivity`: +```{r} +plot(mod, type = 'smooths', trend_effects = TRUE) +``` + +And here are the estimated relationships between the sensor observations and the `temperature` covariate: +```{r} +plot(mod, type = 'smooths') +``` + +All main effects can be quickly plotted with `conditional_effects`: +```{r} +plot(conditional_effects(mod, type = 'link'), ask = FALSE) +``` + +`conditional_effects` is simply a wrapper to the more flexible `plot_predictions` function from the `marginaleffects` package. 
We can get more useful plots of these effects using this function for further customisation: +```{r} +plot_predictions(mod, + condition = c('temperature', 'series', 'series'), + points = 0.5) + + theme(legend.position = 'none') +``` + +We have successfully estimated effects, some of them nonlinear, that impact the hidden process AND the observations. All in a single joint model. But there can always be challenges with these models, particularly when estimating both process and observation error at the same time. For example, a `pairs` plot for the observation error for sensor 1 and the hidden process error shows some strong correlations that we might want to deal with by using a more structured prior: +```{r} +pairs(mod, variable = c('sigma[1]', 'sigma_obs[1]')) +``` + +But we will leave the model as-is for this example + +### Recovering the hidden signal +A final but very key question is whether we can successfully recover the true hidden signal. The `trend` slot in the returned model parameters has the estimates for this signal, which we can easily plot using the `mvgam` S3 method for `plot`. We can also overlay the true values for the hidden signal, which shows that our model has done a good job of recovering it: +```{r} +plot(mod, type = 'trend') + +# Overlay the true simulated signal +points(true_signal, pch = 16, cex = 1, col = 'white') +points(true_signal, pch = 16, cex = 0.8) +``` + +## Further reading +The following papers and resources offer a lot of useful material about other types of State-Space models and how they can be applied in practice: + +Holmes, Elizabeth E., Eric J. Ward, and Wills Kellie. "[MARSS: multivariate autoregressive state-space models for analyzing time-series data.](https://journal.r-project.org/archive/2012/RJ-2012-002/index.html)" *R Journal*. 4.1 (2012): 11. + +Ward, Eric J., et al. "[Inferring spatial structure from time‐series data: using multivariate state‐space models to detect metapopulation structure of California sea lions in the Gulf of California, Mexico.](https://besjournals.onlinelibrary.wiley.com/doi/full/10.1111/j.1365-2664.2009.01745.x)" *Journal of Applied Ecology* 47.1 (2010): 47-56. + +Auger‐Méthé, Marie, et al. ["A guide to state–space modeling of ecological time series.](https://esajournals.onlinelibrary.wiley.com/doi/full/10.1002/ecm.1470)" *Ecological Monographs* 91.4 (2021): e01470. + +## Interested in contributing? +I'm actively seeking PhD students and other researchers to work in the areas of ecological forecasting, multivariate model evaluation and development of `mvgam`. Please reach out if you are interested (n.clark'at'uq.edu.au) diff --git a/inst/doc/shared_states.html b/inst/doc/shared_states.html new file mode 100644 index 00000000..f302f7c2 --- /dev/null +++ b/inst/doc/shared_states.html @@ -0,0 +1,997 @@ + + + + + + + + + + + + + + + + +Shared latent states in mvgam + + + + + + + + + + + + + + + + + + + + + + + + + + +

Shared latent states in mvgam

+

Nicholas J Clark

+

2024-04-19

+ + +
+ +
+ +

This vignette gives an example of how mvgam can be used to estimate models where multiple observed time series share the same latent process model. For full details on the basic mvgam functionality, please see the introductory vignette.

+
+

The trend_map argument

+

The trend_map argument in the mvgam() function is an optional data.frame that can be used to specify which series should depend on which latent process models (called “trends” in mvgam). This can be particularly useful if we wish to force multiple observed time series to depend on the same latent trend process, but with different observation processes. If this argument is supplied, a latent factor model is set up by setting use_lv = TRUE and using the supplied trend_map to set up the shared trends. Users familiar with the MARSS family of packages will recognize this as a way of specifying the \(Z\) matrix. This data.frame needs to have column names series and trend, with integer values in the trend column to state which trend each series should depend on. The series column should have a single unique entry for each time series in the data, with names that perfectly match the factor levels of the series variable in data. For example, if we were to simulate a collection of three integer-valued time series (using sim_mvgam), the following trend_map would force the first two series to share the same latent trend process:

+
set.seed(122)
+simdat <- sim_mvgam(trend_model = 'AR1',
+                    prop_trend = 0.6,
+                    mu = c(0, 1, 2),
+                    family = poisson())
+trend_map <- data.frame(series = unique(simdat$data_train$series),
+                        trend = c(1, 1, 2))
+trend_map
+#>     series trend
+#> 1 series_1     1
+#> 2 series_2     1
+#> 3 series_3     2
+

We can see that the factor levels in trend_map match those in the data:

+
all.equal(levels(trend_map$series), levels(simdat$data_train$series))
+#> [1] TRUE
+
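
If the levels did not match, the simplest fix would be to rebuild the series column of trend_map using the data’s levels (a small sketch of one way to do this):

+
# trend_map$series <- factor(trend_map$series,
+#                            levels = levels(simdat$data_train$series))
+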
+

Checking trend_map with run_model = FALSE

+

Supplying this trend_map to the mvgam function for a simple model, but setting run_model = FALSE, allows us to inspect the constructed Stan code and the data objects that would be used to condition the model. Here we will set up a model in which each series has a different observation process (with only a different intercept per series in this case), and the two latent dynamic process models evolve as independent AR1 processes that also contain a shared nonlinear smooth function to capture repeated seasonality. This model is not too complicated but it does show how we can learn shared and independent effects for collections of time series in the mvgam framework:

+
fake_mod <- mvgam(y ~ 
+                    # observation model formula, which has a 
+                    # different intercept per series
+                    series - 1,
+                  
+                  # process model formula, which has a shared seasonal smooth
+                  # (each latent process model shares the SAME smooth)
+                  trend_formula = ~ s(season, bs = 'cc', k = 6),
+                  
+                  # AR1 dynamics (each latent process model has DIFFERENT)
+                  # dynamics
+                  trend_model = 'AR1',
+                  
+                  # supplied trend_map
+                  trend_map = trend_map,
+                  
+                  # data and observation family
+                  family = poisson(),
+                  data = simdat$data_train,
+                  run_model = FALSE)
+

Inspecting the Stan code shows how this model is a dynamic factor model in which the loadings are constructed to reflect the supplied trend_map:

+
code(fake_mod)
+#> // Stan model code generated by package mvgam
+#> data {
+#>   int<lower=0> total_obs; // total number of observations
+#>   int<lower=0> n; // number of timepoints per series
+#>   int<lower=0> n_sp_trend; // number of trend smoothing parameters
+#>   int<lower=0> n_lv; // number of dynamic factors
+#>   int<lower=0> n_series; // number of series
+#>   matrix[n_series, n_lv] Z; // matrix mapping series to latent states
+#>   int<lower=0> num_basis; // total number of basis coefficients
+#>   int<lower=0> num_basis_trend; // number of trend basis coefficients
+#>   vector[num_basis_trend] zero_trend; // prior locations for trend basis coefficients
+#>   matrix[total_obs, num_basis] X; // mgcv GAM design matrix
+#>   matrix[n * n_lv, num_basis_trend] X_trend; // trend model design matrix
+#>   array[n, n_series] int<lower=0> ytimes; // time-ordered matrix (which col in X belongs to each [time, series] observation?)
+#>   array[n, n_lv] int ytimes_trend;
+#>   int<lower=0> n_nonmissing; // number of nonmissing observations
+#>   matrix[4, 4] S_trend1; // mgcv smooth penalty matrix S_trend1
+#>   array[n_nonmissing] int<lower=0> flat_ys; // flattened nonmissing observations
+#>   matrix[n_nonmissing, num_basis] flat_xs; // X values for nonmissing observations
+#>   array[n_nonmissing] int<lower=0> obs_ind; // indices of nonmissing observations
+#> }
+#> transformed data {
+#>   
+#> }
+#> parameters {
+#>   // raw basis coefficients
+#>   vector[num_basis] b_raw;
+#>   vector[num_basis_trend] b_raw_trend;
+#>   
+#>   // latent state SD terms
+#>   vector<lower=0>[n_lv] sigma;
+#>   
+#>   // latent state AR1 terms
+#>   vector<lower=-1.5, upper=1.5>[n_lv] ar1;
+#>   
+#>   // latent states
+#>   matrix[n, n_lv] LV;
+#>   
+#>   // smoothing parameters
+#>   vector<lower=0>[n_sp_trend] lambda_trend;
+#> }
+#> transformed parameters {
+#>   // latent states and loading matrix
+#>   vector[n * n_lv] trend_mus;
+#>   matrix[n, n_series] trend;
+#>   
+#>   // basis coefficients
+#>   vector[num_basis] b;
+#>   vector[num_basis_trend] b_trend;
+#>   
+#>   // observation model basis coefficients
+#>   b[1 : num_basis] = b_raw[1 : num_basis];
+#>   
+#>   // process model basis coefficients
+#>   b_trend[1 : num_basis_trend] = b_raw_trend[1 : num_basis_trend];
+#>   
+#>   // latent process linear predictors
+#>   trend_mus = X_trend * b_trend;
+#>   
+#>   // derived latent states
+#>   for (i in 1 : n) {
+#>     for (s in 1 : n_series) {
+#>       trend[i, s] = dot_product(Z[s,  : ], LV[i,  : ]);
+#>     }
+#>   }
+#> }
+#> model {
+#>   // prior for seriesseries_1...
+#>   b_raw[1] ~ student_t(3, 0, 2);
+#>   
+#>   // prior for seriesseries_2...
+#>   b_raw[2] ~ student_t(3, 0, 2);
+#>   
+#>   // prior for seriesseries_3...
+#>   b_raw[3] ~ student_t(3, 0, 2);
+#>   
+#>   // priors for AR parameters
+#>   ar1 ~ std_normal();
+#>   
+#>   // priors for latent state SD parameters
+#>   sigma ~ student_t(3, 0, 2.5);
+#>   
+#>   // dynamic process models
+#>   
+#>   // prior for s(season)_trend...
+#>   b_raw_trend[1 : 4] ~ multi_normal_prec(zero_trend[1 : 4],
+#>                                          S_trend1[1 : 4, 1 : 4]
+#>                                          * lambda_trend[1]);
+#>   lambda_trend ~ normal(5, 30);
+#>   for (j in 1 : n_lv) {
+#>     LV[1, j] ~ normal(trend_mus[ytimes_trend[1, j]], sigma[j]);
+#>     for (i in 2 : n) {
+#>       LV[i, j] ~ normal(trend_mus[ytimes_trend[i, j]]
+#>                         + ar1[j]
+#>                           * (LV[i - 1, j] - trend_mus[ytimes_trend[i - 1, j]]),
+#>                         sigma[j]);
+#>     }
+#>   }
+#>   {
+#>     // likelihood functions
+#>     vector[n_nonmissing] flat_trends;
+#>     flat_trends = to_vector(trend)[obs_ind];
+#>     flat_ys ~ poisson_log_glm(append_col(flat_xs, flat_trends), 0.0,
+#>                               append_row(b, 1.0));
+#>   }
+#> }
+#> generated quantities {
+#>   vector[total_obs] eta;
+#>   matrix[n, n_series] mus;
+#>   vector[n_sp_trend] rho_trend;
+#>   vector[n_lv] penalty;
+#>   array[n, n_series] int ypred;
+#>   penalty = 1.0 / (sigma .* sigma);
+#>   rho_trend = log(lambda_trend);
+#>   
+#>   matrix[n_series, n_lv] lv_coefs = Z;
+#>   // posterior predictions
+#>   eta = X * b;
+#>   for (s in 1 : n_series) {
+#>     mus[1 : n, s] = eta[ytimes[1 : n, s]] + trend[1 : n, s];
+#>     ypred[1 : n, s] = poisson_log_rng(mus[1 : n, s]);
+#>   }
+#> }
+

Notice the line that states “lv_coefs = Z;”. This uses the supplied \(Z\) matrix to construct the loading coefficients. The supplied matrix now looks exactly like what you’d use if you were to create a similar model in the MARSS package:

+
fake_mod$model_data$Z
+#>      [,1] [,2]
+#> [1,]    1    0
+#> [2,]    1    0
+#> [3,]    0    1
+
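
For comparison, the same loading matrix could be written down by hand, which makes explicit what the trend_map encodes:

+
# series 1 and 2 load on trend 1; series 3 loads on trend 2
+Z <- matrix(c(1, 0,
+              1, 0,
+              0, 1),
+            nrow = 3, byrow = TRUE)
+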
+
+

Fitting and inspecting the model

+

Though this model doesn’t perfectly match the data-generating process (which allowed each series to have different underlying dynamics), we can still fit it to show what the resulting inferences look like:

+
full_mod <- mvgam(y ~ series - 1,
+                  trend_formula = ~ s(season, bs = 'cc', k = 6),
+                  trend_model = 'AR1',
+                  trend_map = trend_map,
+                  family = poisson(),
+                  data = simdat$data_train)
+

The summary of this model is informative as it shows that only two latent process models have been estimated, even though we have three observed time series. The model converges well

+
summary(full_mod)
+#> GAM observation formula:
+#> y ~ series - 1
+#> <environment: 0x0000025404046520>
+#> 
+#> GAM process formula:
+#> ~s(season, bs = "cc", k = 6)
+#> <environment: 0x0000025404046520>
+#> 
+#> Family:
+#> poisson
+#> 
+#> Link function:
+#> log
+#> 
+#> Trend model:
+#> AR1
+#> 
+#> N process models:
+#> 2 
+#> 
+#> N series:
+#> 3 
+#> 
+#> N timepoints:
+#> 75 
+#> 
+#> Status:
+#> Fitted using Stan 
+#> 4 chains, each with iter = 1000; warmup = 500; thin = 1 
+#> Total post-warmup draws = 2000
+#> 
+#> 
+#> GAM observation model coefficient (beta) estimates:
+#>                 2.5%   50% 97.5% Rhat n_eff
+#> seriesseries_1 -0.15 0.088  0.31 1.00  1895
+#> seriesseries_2  0.92 1.100  1.20 1.00  1267
+#> seriesseries_3  1.90 2.100  2.30 1.02   256
+#> 
+#> Process model AR parameter estimates:
+#>         2.5%    50%  97.5% Rhat n_eff
+#> ar1[1] -0.72 -0.420 -0.056    1   676
+#> ar1[2] -0.28 -0.011  0.280    1  1433
+#> 
+#> Process error parameter estimates:
+#>          2.5%  50% 97.5% Rhat n_eff
+#> sigma[1] 0.33 0.49  0.67    1   487
+#> sigma[2] 0.59 0.73  0.91    1   948
+#> 
+#> GAM process model coefficient (beta) estimates:
+#>                    2.5%    50% 97.5% Rhat n_eff
+#> s(season).1_trend -0.22 -0.011  0.20    1  1612
+#> s(season).2_trend -0.27 -0.045  0.18    1  1745
+#> s(season).3_trend -0.15  0.074  0.29    1  1347
+#> s(season).4_trend -0.15  0.067  0.28    1  1561
+#> 
+#> Approximate significance of GAM process smooths:
+#>            edf Ref.df    F p-value
+#> s(season) 1.52      4 0.21    0.92
+#> 
+#> Stan MCMC diagnostics:
+#> n_eff / iter looks reasonable for all parameters
+#> Rhat looks reasonable for all parameters
+#> 0 of 2000 iterations ended with a divergence (0%)
+#> 0 of 2000 iterations saturated the maximum tree depth of 12 (0%)
+#> E-FMI indicated no pathological behavior
+#> 
+#> Samples were drawn using NUTS(diag_e) at Fri Apr 19 8:11:33 AM 2024.
+#> For each parameter, n_eff is a crude measure of effective sample size,
+#> and Rhat is the potential scale reduction factor on split MCMC chains
+#> (at convergence, Rhat = 1)
+

Quick plots of all main effects can be made using conditional_effects():

+
plot(conditional_effects(full_mod, type = 'link'), ask = FALSE)
+

+

Even more informative are the plots of the latent processes. Both series 1 and 2 share the exact same estimates, while the estimates for series 3 are different:

+
plot(full_mod, type = 'trend', series = 1)
+

+
plot(full_mod, type = 'trend', series = 2)
+

+
plot(full_mod, type = 'trend', series = 3)
+

+

However, the forecasts for series 1 and 2 differ because they have different intercepts in the observation model

+
plot(full_mod, type = 'forecast', series = 1)
+

+
plot(full_mod, type = 'forecast', series = 2)
+

+
plot(full_mod, type = 'forecast', series = 3)
+

+
+
+
+

Example: signal detection

+

Now we will explore a more complicated example. Here we simulate a true hidden signal that we are trying to track. This signal depends nonlinearly on some covariate (called productivity, which represents a measure of how productive the landscape is). The signal also demonstrates a fairly large amount of temporal autocorrelation:

+
set.seed(543210)
+# simulate a nonlinear relationship using the mgcv function gamSim
+signal_dat <- gamSim(n = 100, eg = 1, scale = 1)
+#> Gu & Wahba 4 term additive model
+
+# productivity is one of the variables in the simulated data
+productivity <- signal_dat$x2
+
+# simulate the true signal, which already has a nonlinear relationship
+# with productivity; we will add in a fairly strong AR1 process to 
+# contribute to the signal
+true_signal <- as.vector(scale(signal_dat$y) +
+                         arima.sim(100, model = list(ar = 0.8, sd = 0.1)))
+

Plot the signal to inspect its evolution over time

+
plot(true_signal, type = 'l',
+     bty = 'l', lwd = 2,
+     ylab = 'True signal',
+     xlab = 'Time')
+

+

And plot the relationship between the signal and the productivity covariate:

+
plot(true_signal ~ productivity,
+     pch = 16, bty = 'l',
+     ylab = 'True signal',
+     xlab = 'Productivity')
+

+

Next we simulate three sensors that are trying to track the same hidden signal. All of these sensors have different observation errors that can depend nonlinearly on a second external covariate, called temperature in this example. Again this makes use of gamSim

+
set.seed(543210)
+sim_series = function(n_series = 3, true_signal){
+  temp_effects <- gamSim(n = 100, eg = 7, scale = 0.1)
+  temperature <- temp_effects$y
+  alphas <- rnorm(n_series, sd = 2)
+
+  do.call(rbind, lapply(seq_len(n_series), function(series){
+    data.frame(observed = rnorm(length(true_signal),
+                                mean = alphas[series] +
+                                       1.5*as.vector(scale(temp_effects[, series + 1])) +
+                                       true_signal,
+                                sd = runif(1, 1, 2)),
+               series = paste0('sensor_', series),
+               time = 1:length(true_signal),
+               temperature = temperature,
+               productivity = productivity,
+               true_signal = true_signal)
+   }))
+  }
+model_dat <- sim_series(true_signal = true_signal) %>%
+  dplyr::mutate(series = factor(series))
+#> Gu & Wahba 4 term additive model, correlated predictors
+

Plot the sensor observations

+
plot_mvgam_series(data = model_dat, y = 'observed',
+                  series = 'all')
+

+

And now plot the observed relationships between the three sensors and the temperature covariate

+
 plot(observed ~ temperature, data = model_dat %>%
+   dplyr::filter(series == 'sensor_1'),
+   pch = 16, bty = 'l',
+   ylab = 'Sensor 1',
+   xlab = 'Temperature')
+

+
 plot(observed ~ temperature, data = model_dat %>%
+   dplyr::filter(series == 'sensor_2'),
+   pch = 16, bty = 'l',
+   ylab = 'Sensor 2',
+   xlab = 'Temperature')
+

+
 plot(observed ~ temperature, data = model_dat %>%
+   dplyr::filter(series == 'sensor_3'),
+   pch = 16, bty = 'l',
+   ylab = 'Sensor 3',
+   xlab = 'Temperature')
+

+
+

The shared signal model

+

Now we can formulate and fit a model that allows each sensor’s observation error to depend nonlinearly on temperature while allowing the true signal to depend nonlinearly on productivity. By fixing all of the values in the trend column to 1 in the trend_map, we are assuming that all observation sensors are tracking the same latent signal. We use informative priors on the two variance components (process error and observation error), which reflect our prior belief that the observation error is smaller overall than the true process error
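
Before fitting, the full set of modifiable priors (and their defaults) could be inspected by passing the same arguments to get_mvgam_priors(); a sketch only, with output not shown:

+
# get_mvgam_priors(formula = observed ~ series +
+#                    s(temperature, k = 10) +
+#                    s(series, temperature, bs = 'sz', k = 8),
+#                  trend_formula = ~ s(productivity, k = 8),
+#                  trend_model = 'AR1',
+#                  trend_map = data.frame(series = unique(model_dat$series),
+#                                         trend = c(1, 1, 1)),
+#                  family = gaussian(),
+#                  data = model_dat)
+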

+
mod <- mvgam(formula =
+               # formula for observations, allowing for different
+               # intercepts and hierarchical smooth effects of temperature
+               observed ~ series + 
+               s(temperature, k = 10) +
+               s(series, temperature, bs = 'sz', k = 8),
+             
+             trend_formula =
+               # formula for the latent signal, which can depend
+               # nonlinearly on productivity
+               ~ s(productivity, k = 8),
+             
+             trend_model =
+               # in addition to productivity effects, the signal is
+               # assumed to exhibit temporal autocorrelation
+               'AR1',
+             
+             trend_map =
+               # trend_map forces all sensors to track the same
+               # latent signal
+               data.frame(series = unique(model_dat$series),
+                          trend = c(1, 1, 1)),
+             
+             # informative priors on process error
+             # and observation error will help with convergence
+             priors = c(prior(normal(2, 0.5), class = sigma),
+                        prior(normal(1, 0.5), class = sigma_obs)),
+             
+             # Gaussian observations
+             family = gaussian(),
+             data = model_dat)
+

View a reduced version of the model summary because there will be many spline coefficients in this model

+
summary(mod, include_betas = FALSE)
+#> GAM observation formula:
+#> observed ~ series + s(temperature, k = 10) + s(series, temperature, 
+#>     bs = "sz", k = 8)
+#> <environment: 0x0000025404046520>
+#> 
+#> GAM process formula:
+#> ~s(productivity, k = 8)
+#> <environment: 0x0000025404046520>
+#> 
+#> Family:
+#> gaussian
+#> 
+#> Link function:
+#> identity
+#> 
+#> Trend model:
+#> AR1
+#> 
+#> N process models:
+#> 1 
+#> 
+#> N series:
+#> 3 
+#> 
+#> N timepoints:
+#> 100 
+#> 
+#> Status:
+#> Fitted using Stan 
+#> 4 chains, each with iter = 1100; warmup = 600; thin = 1 
+#> Total post-warmup draws = 2000
+#> 
+#> 
+#> Observation error parameter estimates:
+#>              2.5% 50% 97.5% Rhat n_eff
+#> sigma_obs[1]  1.6 1.9   2.2    1  1757
+#> sigma_obs[2]  1.4 1.7   2.0    1  1090
+#> sigma_obs[3]  1.3 1.5   1.8    1  1339
+#> 
+#> GAM observation model coefficient (beta) estimates:
+#>                 2.5%   50% 97.5% Rhat n_eff
+#> (Intercept)     0.72  1.70  2.50 1.01   360
+#> seriessensor_2 -2.10 -0.96  0.32 1.00  1068
+#> seriessensor_3 -3.40 -2.00 -0.39 1.00  1154
+#> 
+#> Approximate significance of GAM observation smooths:
+#>                        edf Ref.df    F p-value    
+#> s(temperature)        1.22      9 12.7  <2e-16 ***
+#> s(series,temperature) 1.92     16  1.0   0.011 *  
+#> ---
+#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+#> 
+#> Process model AR parameter estimates:
+#>        2.5%  50% 97.5% Rhat n_eff
+#> ar1[1] 0.33 0.59  0.83    1   492
+#> 
+#> Process error parameter estimates:
+#>          2.5% 50% 97.5% Rhat n_eff
+#> sigma[1] 0.72   1   1.3 1.01   392
+#> 
+#> Approximate significance of GAM process smooths:
+#>                 edf Ref.df    F p-value    
+#> s(productivity) 3.6      7 9.34 0.00036 ***
+#> ---
+#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+#> 
+#> Stan MCMC diagnostics:
+#> n_eff / iter looks reasonable for all parameters
+#> Rhats above 1.05 found for 28 parameters
+#>  *Diagnose further to investigate why the chains have not mixed
+#> 0 of 2000 iterations ended with a divergence (0%)
+#> 0 of 2000 iterations saturated the maximum tree depth of 12 (0%)
+#> E-FMI indicated no pathological behavior
+#> 
+#> Samples were drawn using NUTS(diag_e) at Fri Apr 19 8:13:22 AM 2024.
+#> For each parameter, n_eff is a crude measure of effective sample size,
+#> and Rhat is the potential scale reduction factor on split MCMC chains
+#> (at convergence, Rhat = 1)
+
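
Because the summary flags some large Rhat values, it would be sensible to inspect traceplots for the variance and autoregressive parameters before trusting the fit; a sketch using the bayesplot-backed mcmc_plot() method:

+
mcmc_plot(mod, variable = c('sigma[1]', 'sigma_obs[1]', 'ar1[1]'),
+          type = 'trace')
+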
+
+

Inspecting effects on both process and observation models

+

Don’t pay much attention to the approximate p-values of the smooth terms. The calculation for these values is incredibly sensitive to the estimates for the smoothing parameters so I don’t tend to find them to be very meaningful. What are meaningful, however, are prediction-based plots of the smooth functions. For example, here is the estimated response of the underlying signal to productivity:

+
plot(mod, type = 'smooths', trend_effects = TRUE)
+

+

And here are the estimated relationships between the sensor observations and the temperature covariate:

+
plot(mod, type = 'smooths')
+

+

All main effects can be quickly plotted with conditional_effects:

+
plot(conditional_effects(mod, type = 'link'), ask = FALSE)
+

+

conditional_effects is simply a wrapper to the more flexible plot_predictions function from the marginaleffects package. We can get more useful plots of these effects using this function for further customisation:

+
plot_predictions(mod, 
+                 condition = c('temperature', 'series', 'series'),
+                 points = 0.5) +
+  theme(legend.position = 'none')
+

+

We have successfully estimated effects, some of them nonlinear, that impact the hidden process AND the observations, all in a single joint model. But there can always be challenges with these models, particularly when estimating both process and observation error at the same time. For example, a pairs plot for the observation error for sensor 1 and the hidden process error shows some strong correlations that we might want to deal with by using a more structured prior:

+
pairs(mod, variable = c('sigma[1]', 'sigma_obs[1]'))
+

+

But we will leave the model as-is for this example

+
+
+

Recovering the hidden signal

+

A final but key question is whether we can successfully recover the true hidden signal. The trend slot in the returned model parameters has the estimates for this signal, which we can easily plot using the mvgam S3 method for plot. We can also overlay the true values for the hidden signal, which shows that our model has done a good job of recovering it:

+
plot(mod, type = 'trend')
+
+# Overlay the true simulated signal
+points(true_signal, pch = 16, cex = 1, col = 'white')
+points(true_signal, pch = 16, cex = 0.8)
+

+
+
+
+

Further reading

+

The following papers and resources offer a lot of useful material about other types of State-Space models and how they can be applied in practice:

+

Holmes, Elizabeth E., Eric J. Ward, and Kellie Wills. “MARSS: multivariate autoregressive state-space models for analyzing time-series data.” R Journal 4.1 (2012): 11.

+

Ward, Eric J., et al. “Inferring spatial structure from time‐series data: using multivariate state‐space models to detect metapopulation structure of California sea lions in the Gulf of California, Mexico.” Journal of Applied Ecology 47.1 (2010): 47-56.

+

Auger‐Méthé, Marie, et al. “A guide to state–space modeling of ecological time series.” Ecological Monographs 91.4 (2021): e01470.

+
+
+

Interested in contributing?

+

I’m actively seeking PhD students and other researchers to work in the areas of ecological forecasting, multivariate model evaluation and development of mvgam. Please reach out if you are interested (n.clark’at’uq.edu.au)

+
+ + + + + + + + + + + diff --git a/inst/doc/time_varying_effects.R b/inst/doc/time_varying_effects.R new file mode 100644 index 00000000..f5415f2b --- /dev/null +++ b/inst/doc/time_varying_effects.R @@ -0,0 +1,237 @@ +## ----echo = FALSE------------------------------------------------------------- +NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true") +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + purl = NOT_CRAN, + eval = NOT_CRAN +) + +## ----setup, include=FALSE----------------------------------------------------- +knitr::opts_chunk$set( + echo = TRUE, + dpi = 150, + fig.asp = 0.8, + fig.width = 6, + out.width = "60%", + fig.align = "center") +library(mvgam) +library(ggplot2) +theme_set(theme_bw(base_size = 12, base_family = 'serif')) + +## ----------------------------------------------------------------------------- +set.seed(1111) +N <- 200 +beta_temp <- mvgam:::sim_gp(rnorm(1), + alpha_gp = 0.75, + rho_gp = 10, + h = N) + 0.5 + +## ----fig.alt = "Simulating time-varying effects in mvgam and R"--------------- +plot(beta_temp, type = 'l', lwd = 3, + bty = 'l', xlab = 'Time', ylab = 'Coefficient', + col = 'darkred') +box(bty = 'l', lwd = 2) + +## ----------------------------------------------------------------------------- +temp <- rnorm(N, sd = 1) + +## ----fig.alt = "Simulating time-varying effects in mvgam and R"--------------- +out <- rnorm(N, mean = 4 + beta_temp * temp, + sd = 0.25) +time <- seq_along(temp) +plot(out, type = 'l', lwd = 3, + bty = 'l', xlab = 'Time', ylab = 'Outcome', + col = 'darkred') +box(bty = 'l', lwd = 2) + +## ----------------------------------------------------------------------------- +data <- data.frame(out, temp, time) +data_train <- data[1:190,] +data_test <- data[191:200,] + +## ----------------------------------------------------------------------------- +plot_mvgam_series(data = data_train, newdata = data_test, y = 'out') + +## ----include=FALSE------------------------------------------------------------ +mod <- mvgam(out ~ dynamic(temp, rho = 8, stationary = TRUE, k = 40), + family = gaussian(), + data = data_train) + +## ----eval=FALSE--------------------------------------------------------------- +# mod <- mvgam(out ~ dynamic(temp, rho = 8, stationary = TRUE, k = 40), +# family = gaussian(), +# data = data_train) + +## ----------------------------------------------------------------------------- +summary(mod, include_betas = FALSE) + +## ----------------------------------------------------------------------------- +plot(mod, type = 'smooths') + +## ----------------------------------------------------------------------------- +plot_mvgam_smooth(mod, smooth = 1, newdata = data) +abline(v = 190, lty = 'dashed', lwd = 2) +lines(beta_temp, lwd = 2.5, col = 'white') +lines(beta_temp, lwd = 2) + +## ----------------------------------------------------------------------------- +range_round = function(x){ + round(range(x, na.rm = TRUE), 2) +} +plot_predictions(mod, + newdata = datagrid(time = unique, + temp = range_round), + by = c('time', 'temp', 'temp'), + type = 'link') + +## ----------------------------------------------------------------------------- +fc <- forecast(mod, newdata = data_test) +plot(fc) + +## ----include=FALSE------------------------------------------------------------ +mod <- mvgam(out ~ dynamic(temp, k = 40), + family = gaussian(), + data = data_train) + +## ----eval=FALSE--------------------------------------------------------------- +# mod <- mvgam(out ~ dynamic(temp, k = 40), +# family = gaussian(), +# data 
+
+## -----------------------------------------------------------------------------
+summary(mod, include_betas = FALSE)
+
+## -----------------------------------------------------------------------------
+plot_mvgam_smooth(mod, smooth = 1, newdata = data)
+abline(v = 190, lty = 'dashed', lwd = 2)
+lines(beta_temp, lwd = 2.5, col = 'white')
+lines(beta_temp, lwd = 2)
+
+## -----------------------------------------------------------------------------
+plot_predictions(mod,
+                 newdata = datagrid(time = unique,
+                                    temp = range_round),
+                 by = c('time', 'temp', 'temp'),
+                 type = 'link')
+
+## -----------------------------------------------------------------------------
+fc <- forecast(mod, newdata = data_test)
+plot(fc)
+
+## -----------------------------------------------------------------------------
+load(url('https://github.com/atsa-es/MARSS/raw/master/data/SalmonSurvCUI.rda'))
+dplyr::glimpse(SalmonSurvCUI)
+
+## -----------------------------------------------------------------------------
+SalmonSurvCUI %>%
+  # create a time variable
+  dplyr::mutate(time = dplyr::row_number()) %>%
+
+  # create a series variable
+  dplyr::mutate(series = as.factor('salmon')) %>%
+
+  # z-score the covariate CUI.apr
+  dplyr::mutate(CUI.apr = as.vector(scale(CUI.apr))) %>%
+
+  # convert logit-transformed survival back to a proportion
+  dplyr::mutate(survival = plogis(logit.s)) -> model_data
+
+## -----------------------------------------------------------------------------
+dplyr::glimpse(model_data)
+
+## -----------------------------------------------------------------------------
+plot_mvgam_series(data = model_data, y = 'survival')
+
+## ----include = FALSE----------------------------------------------------------
+mod0 <- mvgam(formula = survival ~ 1,
+              trend_model = 'RW',
+              family = betar(),
+              data = model_data)
+
+## ----eval = FALSE-------------------------------------------------------------
+# mod0 <- mvgam(formula = survival ~ 1,
+#               trend_model = 'RW',
+#               family = betar(),
+#               data = model_data)
+
+## -----------------------------------------------------------------------------
+summary(mod0)
+
+## -----------------------------------------------------------------------------
+plot(mod0, type = 'trend')
+
+## -----------------------------------------------------------------------------
+plot(mod0, type = 'forecast')
+
+## ----include=FALSE------------------------------------------------------------
+mod1 <- mvgam(formula = survival ~ 1,
+              trend_formula = ~ dynamic(CUI.apr, k = 25, scale = FALSE),
+              trend_model = 'RW',
+              family = betar(),
+              data = model_data,
+              adapt_delta = 0.99)
+
+## ----eval=FALSE---------------------------------------------------------------
+# mod1 <- mvgam(formula = survival ~ 1,
+#               trend_formula = ~ dynamic(CUI.apr, k = 25, scale = FALSE),
+#               trend_model = 'RW',
+#               family = betar(),
+#               data = model_data,
+#               adapt_delta = 0.99)
+
+## -----------------------------------------------------------------------------
+summary(mod1, include_betas = FALSE)
+
+## -----------------------------------------------------------------------------
+plot(mod1, type = 'trend')
+
+## -----------------------------------------------------------------------------
+plot(mod1, type = 'forecast')
+
+## -----------------------------------------------------------------------------
+# Extract estimates of the process error 'sigma' for each model
+mod0_sigma <- as.data.frame(mod0, variable = 'sigma', regex = TRUE) %>%
+  dplyr::mutate(model = 'Mod0')
+mod1_sigma <- as.data.frame(mod1, variable = 'sigma', regex = TRUE) %>%
+  dplyr::mutate(model = 'Mod1')
+sigmas <- rbind(mod0_sigma, mod1_sigma)
+
+# Plot using ggplot2
+library(ggplot2)
+ggplot(sigmas, aes(y = `sigma[1]`, fill = model)) +
+  geom_density(alpha = 0.3, colour = NA) +
+  coord_flip()
+
+## -----------------------------------------------------------------------------
+plot(mod1, type = 'smooth', trend_effects = TRUE)
+
+## -----------------------------------------------------------------------------
+plot_predictions(mod1, newdata = datagrid(CUI.apr = range_round,
+                                          time = unique),
+                 by = c('time', 'CUI.apr', 'CUI.apr'))
+
+## -----------------------------------------------------------------------------
+loo_compare(mod0, mod1)
+
+## ----include=FALSE------------------------------------------------------------
+lfo_mod0 <- lfo_cv(mod0, min_t = 30)
+lfo_mod1 <- lfo_cv(mod1, min_t = 30)
+
+## ----eval=FALSE---------------------------------------------------------------
+# lfo_mod0 <- lfo_cv(mod0, min_t = 30)
+# lfo_mod1 <- lfo_cv(mod1, min_t = 30)
+
+## -----------------------------------------------------------------------------
+sum(lfo_mod0$elpds)
+sum(lfo_mod1$elpds)
+
+## ----fig.alt = "Comparing forecast skill for dynamic beta regression models in mvgam and R"----
+plot(x = 1:length(lfo_mod0$elpds) + 30,
+     y = lfo_mod0$elpds - lfo_mod1$elpds,
+     ylab = 'ELPDmod0 - ELPDmod1',
+     xlab = 'Evaluation time point',
+     pch = 16,
+     col = 'darkred',
+     bty = 'l')
+abline(h = 0, lty = 'dashed')
+
diff --git a/inst/doc/time_varying_effects.Rmd b/inst/doc/time_varying_effects.Rmd new file mode 100644 index 00000000..271a907d --- /dev/null +++ b/inst/doc/time_varying_effects.Rmd @@ -0,0 +1,357 @@
+---
+title: "Time-varying effects in mvgam"
+author: "Nicholas J Clark"
+date: "`r Sys.Date()`"
+output:
+  rmarkdown::html_vignette:
+    toc: yes
+vignette: >
+  %\VignetteIndexEntry{Time-varying effects in mvgam}
+  %\VignetteEngine{knitr::rmarkdown}
+  \usepackage[utf8]{inputenc}
+---
+```{r, echo = FALSE}
+NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true")
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>",
+  purl = NOT_CRAN,
+  eval = NOT_CRAN
+)
+```
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(
+  echo = TRUE,
+  dpi = 150,
+  fig.asp = 0.8,
+  fig.width = 6,
+  out.width = "60%",
+  fig.align = "center")
+library(mvgam)
+library(ggplot2)
+theme_set(theme_bw(base_size = 12, base_family = 'serif'))
+```
+
+The purpose of this vignette is to show how the `mvgam` package can be used to estimate and forecast regression coefficients that vary through time.
+
+## Time-varying effects
+Dynamic fixed-effect coefficients (often referred to as dynamic linear models) can be readily incorporated into GAMs / DGAMs. In `mvgam`, the `dynamic()` formula wrapper offers a convenient interface to set these up. The plan is to incorporate a range of dynamic options (such as random walk, AR1 etc.), but for the moment only low-rank Gaussian Process (GP) smooths are allowed (making use either of the `gp` basis in `mgcv` or of Hilbert space approximate GPs). These are advantageous over splines or random walk effects for several reasons. First, GPs will force the time-varying effect to be smooth. This often makes sense in reality, where we would not expect a regression coefficient to change rapidly from one time point to the next. Second, GPs provide information on the 'global' dynamics of a time-varying effect through their length-scale parameters.
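+
+For reference, the squared exponential kernel that underlies these GP effects takes the standard textbook form (shown here for orientation; this notation is not taken from the package internals):
+
+$$k(t, t') = \alpha^2 \exp\left(-\frac{(t - t')^2}{2\rho^2}\right)$$
+
+where $\alpha$ is the marginal deviation and $\rho$ is the length scale; larger values of $\rho$ yield smoother, more slowly changing coefficients.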
+
+These length scales mean we can use GPs to provide accurate forecasts of how an effect is expected to change in the future, something that we couldn't do well if we used splines to estimate the effect. An example below illustrates.
+
+### Simulating time-varying effects
+Simulate a time-varying coefficient using a squared exponential Gaussian Process function with length scale $\rho$=10. We will do this using an internal function from `mvgam` (the `sim_gp` function):
+```{r}
+set.seed(1111)
+N <- 200
+beta_temp <- mvgam:::sim_gp(rnorm(1),
+                            alpha_gp = 0.75,
+                            rho_gp = 10,
+                            h = N) + 0.5
+```
+
+A plot of the time-varying coefficient shows that it changes smoothly through time:
+```{r, fig.alt = "Simulating time-varying effects in mvgam and R"}
+plot(beta_temp, type = 'l', lwd = 3,
+     bty = 'l', xlab = 'Time', ylab = 'Coefficient',
+     col = 'darkred')
+box(bty = 'l', lwd = 2)
+```
+
+Next we need to simulate the values of the covariate, which we will call `temp` (to represent $temperature$). In this case we just use a standard normal distribution to simulate this covariate:
+```{r}
+temp <- rnorm(N, sd = 1)
+```
+
+Finally, simulate the outcome variable, which is a Gaussian observation process (with observation error) over the time-varying effect of $temperature$:
+```{r, fig.alt = "Simulating time-varying effects in mvgam and R"}
+out <- rnorm(N, mean = 4 + beta_temp * temp,
+             sd = 0.25)
+time <- seq_along(temp)
+plot(out, type = 'l', lwd = 3,
+     bty = 'l', xlab = 'Time', ylab = 'Outcome',
+     col = 'darkred')
+box(bty = 'l', lwd = 2)
+```
+
+Gather the data into a `data.frame` for fitting models, and split the data into training and testing folds.
+```{r}
+data <- data.frame(out, temp, time)
+data_train <- data[1:190,]
+data_test <- data[191:200,]
+```
+
+Plot the series:
+```{r}
+plot_mvgam_series(data = data_train, newdata = data_test, y = 'out')
+```
+
+
+### The `dynamic()` function
+Time-varying coefficients can be fairly easily set up using the `s()` or `gp()` wrapper functions in `mvgam` formulae by fitting a nonlinear effect of `time` and using the covariate of interest as the numeric `by` variable (see `?mgcv::s` or `?brms::gp` for more details). The `dynamic()` formula wrapper offers a way to automate this process, and will eventually allow for a broader variety of time-varying effects (such as random walk or AR processes). Depending on the arguments that are specified to `dynamic`, it will either set up a low-rank GP smooth function using `s()` with `bs = 'gp'` and a fixed value of the length scale parameter $\rho$, or it will set up a Hilbert space approximate GP using the `gp()` function with `c=5/4` so that $\rho$ is estimated (see `?dynamic` for more details). In this first example we will use the `s()` option, and will deliberately mis-specify the $\rho$ parameter, since in practice it is never known. This call to `dynamic()` will set up the following smooth: `s(time, by = temp, bs = "gp", m = c(-2, 8, 2), k = 40)`
+```{r, include=FALSE}
+mod <- mvgam(out ~ dynamic(temp, rho = 8, stationary = TRUE, k = 40),
+             family = gaussian(),
+             data = data_train)
+```
+
+```{r, eval=FALSE}
+mod <- mvgam(out ~ dynamic(temp, rho = 8, stationary = TRUE, k = 40),
+             family = gaussian(),
+             data = data_train)
+```
+
+Inspect the model summary, which shows how the `dynamic()` wrapper was used to construct a low-rank Gaussian Process smooth function:
+```{r}
+summary(mod, include_betas = FALSE)
+```
+
+Because this model used a spline with a `gp` basis, its smooths can be visualised just like any other `gam`.
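+
+Since `dynamic()` is simply constructing a standard `mgcv` smooth behind the scenes, the same model could also be specified with the smooth written out in full (a sketch only, not evaluated here; `mod_manual` is a hypothetical object name):
+```{r, eval=FALSE}
+# Hypothetical sketch: the manual analogue of the dynamic() call above
+mod_manual <- mvgam(out ~ s(time, by = temp, bs = "gp",
+                            m = c(-2, 8, 2), k = 40),
+                    family = gaussian(),
+                    data = data_train)
+```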
+
+Plot the estimated time-varying coefficient for the in-sample training period:
+```{r}
+plot(mod, type = 'smooths')
+```
+
+We can also plot the estimates for the in-sample and out-of-sample periods to see how the Gaussian Process function produces sensible smooth forecasts. Here we supply the full dataset to the `newdata` argument in `plot_mvgam_smooth` to inspect posterior forecasts of the time-varying smooth function. Overlay the true simulated function to see that the model has adequately estimated its dynamics in both the training and testing data partitions:
+```{r}
+plot_mvgam_smooth(mod, smooth = 1, newdata = data)
+abline(v = 190, lty = 'dashed', lwd = 2)
+lines(beta_temp, lwd = 2.5, col = 'white')
+lines(beta_temp, lwd = 2)
+```
+
+We can also use `plot_predictions` from the `marginaleffects` package to visualise how the time-varying effect would be estimated at different values of $temperature$:
+```{r}
+range_round = function(x){
+  round(range(x, na.rm = TRUE), 2)
+}
+plot_predictions(mod,
+                 newdata = datagrid(time = unique,
+                                    temp = range_round),
+                 by = c('time', 'temp', 'temp'),
+                 type = 'link')
+```
+
+This results in sensible forecasts of the observations as well:
+```{r}
+fc <- forecast(mod, newdata = data_test)
+plot(fc)
+```
+
+The syntax is very similar if we wish to estimate the parameters of the underlying Gaussian Process, this time using a Hilbert space approximation. We simply omit the `rho` argument in `dynamic` to make this happen. This will set up a call similar to `gp(time, by = 'temp', c = 5/4, k = 40)`.
+```{r include=FALSE}
+mod <- mvgam(out ~ dynamic(temp, k = 40),
+             family = gaussian(),
+             data = data_train)
+```
+
+```{r eval=FALSE}
+mod <- mvgam(out ~ dynamic(temp, k = 40),
+             family = gaussian(),
+             data = data_train)
+```
+
+This model summary now contains estimates for the marginal deviation and length scale parameters of the underlying Gaussian Process function:
+```{r}
+summary(mod, include_betas = FALSE)
+```
+
+Effects for `gp()` terms can also be plotted as smooths:
+```{r}
+plot_mvgam_smooth(mod, smooth = 1, newdata = data)
+abline(v = 190, lty = 'dashed', lwd = 2)
+lines(beta_temp, lwd = 2.5, col = 'white')
+lines(beta_temp, lwd = 2)
+```
+
+Both the above plot and the `plot_predictions()` call below show that the effect in this case is similar to what we estimated in the approximate GP smooth model above:
+```{r}
+plot_predictions(mod,
+                 newdata = datagrid(time = unique,
+                                    temp = range_round),
+                 by = c('time', 'temp', 'temp'),
+                 type = 'link')
+```
+
+Forecasts are also similar:
+```{r}
+fc <- forecast(mod, newdata = data_test)
+plot(fc)
+```
+
+## Salmon survival example
+Here we will use openly available data on marine survival of Chinook salmon to illustrate how time-varying effects can be used to improve ecological time series models. [Scheuerell and Williams (2005)](https://onlinelibrary.wiley.com/doi/abs/10.1111/j.1365-2419.2005.00346.x) used a dynamic linear model to examine the relationship between marine survival of Chinook salmon and an index of ocean upwelling strength along the west coast of the USA. The authors hypothesized that stronger upwelling in April should create better growing conditions for phytoplankton, which would then translate into more zooplankton and provide better foraging opportunities for juvenile salmon entering the ocean.
+The survival data are measured as proportions over 42 years (1964–2005) and are available in the `MARSS` package:
+```{r}
+load(url('https://github.com/atsa-es/MARSS/raw/master/data/SalmonSurvCUI.rda'))
+dplyr::glimpse(SalmonSurvCUI)
+```
+
+First we need to prepare the data for modelling. The variable `CUI.apr` will be standardized to make it easier for the sampler to estimate underlying GP parameters for the time-varying effect. We also need to convert the survival back to a proportion, as in its current form it has been logit-transformed (this is because most time series packages cannot handle proportional data). As usual, we also need to create a `time` indicator and a `series` indicator for working in `mvgam`:
+```{r}
+SalmonSurvCUI %>%
+  # create a time variable
+  dplyr::mutate(time = dplyr::row_number()) %>%
+
+  # create a series variable
+  dplyr::mutate(series = as.factor('salmon')) %>%
+
+  # z-score the covariate CUI.apr
+  dplyr::mutate(CUI.apr = as.vector(scale(CUI.apr))) %>%
+
+  # convert logit-transformed survival back to a proportion
+  dplyr::mutate(survival = plogis(logit.s)) -> model_data
+```
+
+Inspect the data:
+```{r}
+dplyr::glimpse(model_data)
+```
+
+Plot features of the outcome variable; these show that it is a proportion with particular restrictions (bounded between 0 and 1) that we want to model:
+```{r}
+plot_mvgam_series(data = model_data, y = 'survival')
+```
+
+
+### A State-Space Beta regression
+`mvgam` can easily handle data that are bounded at 0 and 1 with a Beta observation model (using the `mgcv` function `betar()`, see `?mgcv::betar` for details). First we will fit a simple State-Space model that uses a Random Walk dynamic process model with no predictors and a Beta observation model:
+```{r include = FALSE}
+mod0 <- mvgam(formula = survival ~ 1,
+              trend_model = 'RW',
+              family = betar(),
+              data = model_data)
+```
+
+```{r eval = FALSE}
+mod0 <- mvgam(formula = survival ~ 1,
+              trend_model = 'RW',
+              family = betar(),
+              data = model_data)
+```
+
+The summary of this model shows good behaviour of the Hamiltonian Monte Carlo sampler and provides useful summaries of the Beta observation model parameters:
+```{r}
+summary(mod0)
+```
+
+A plot of the underlying dynamic component shows how it has easily handled the temporal evolution of the time series:
+```{r}
+plot(mod0, type = 'trend')
+```
+
+Posterior hindcasts are also good and will automatically respect the observational data bounding at 0 and 1:
+```{r}
+plot(mod0, type = 'forecast')
+```
+
+
+### Including time-varying upwelling effects
+Now we can increase the complexity of our model by constructing and fitting a State-Space model with a time-varying effect of the coastal upwelling index in addition to the autoregressive dynamics. We again use a Beta observation model to capture the restrictions of our proportional observations, but this time will include a `dynamic()` effect of `CUI.apr` in the latent process model.
+We do not specify the $\rho$ parameter, instead opting to estimate it using a Hilbert space approximate GP:
+```{r include=FALSE}
+mod1 <- mvgam(formula = survival ~ 1,
+              trend_formula = ~ dynamic(CUI.apr, k = 25, scale = FALSE),
+              trend_model = 'RW',
+              family = betar(),
+              data = model_data,
+              adapt_delta = 0.99)
+```
+
+```{r eval=FALSE}
+mod1 <- mvgam(formula = survival ~ 1,
+              trend_formula = ~ dynamic(CUI.apr, k = 25, scale = FALSE),
+              trend_model = 'RW',
+              family = betar(),
+              data = model_data,
+              adapt_delta = 0.99)
+```
+
+The summary for this model now includes estimates for the time-varying GP parameters:
+```{r}
+summary(mod1, include_betas = FALSE)
+```
+
+The estimates for the underlying dynamic process, and for the hindcasts, haven't changed much:
+```{r}
+plot(mod1, type = 'trend')
+```
+
+```{r}
+plot(mod1, type = 'forecast')
+```
+
+But the process error parameter $\sigma$ is slightly smaller for this model than for the first model:
+```{r}
+# Extract estimates of the process error 'sigma' for each model
+mod0_sigma <- as.data.frame(mod0, variable = 'sigma', regex = TRUE) %>%
+  dplyr::mutate(model = 'Mod0')
+mod1_sigma <- as.data.frame(mod1, variable = 'sigma', regex = TRUE) %>%
+  dplyr::mutate(model = 'Mod1')
+sigmas <- rbind(mod0_sigma, mod1_sigma)
+
+# Plot using ggplot2
+library(ggplot2)
+ggplot(sigmas, aes(y = `sigma[1]`, fill = model)) +
+  geom_density(alpha = 0.3, colour = NA) +
+  coord_flip()
+```
+
+Why does the process error not need to be as flexible in the second model? Because the estimates of this dynamic process are now informed partly by the time-varying effect of upwelling, which we can visualise on the link scale using `plot()` with `trend_effects = TRUE`:
+```{r}
+plot(mod1, type = 'smooth', trend_effects = TRUE)
+```
+
+Or on the outcome scale, at a range of possible `CUI.apr` values, using `plot_predictions()`:
+```{r}
+plot_predictions(mod1, newdata = datagrid(CUI.apr = range_round,
+                                          time = unique),
+                 by = c('time', 'CUI.apr', 'CUI.apr'))
+```
+
+
+### Comparing model predictive performances
+A key question when fitting multiple time series models is whether one of them provides better predictions than the other. There are several options in `mvgam` for exploring this quantitatively. First, we can compare models based on in-sample approximate leave-one-out cross-validation as implemented in the popular `loo` package:
+```{r}
+loo_compare(mod0, mod1)
+```
+
+The second model has the larger Expected Log Predictive Density (ELPD), meaning that it is slightly favoured over the simpler model that did not include the time-varying upwelling effect, though the two models certainly do not differ by much. More importantly, this metric only compares in-sample performance, while we are hoping to use our models to produce reasonable forecasts. Luckily, `mvgam` also has routines for comparing models using approximate leave-future-out cross-validation. Here we refit both models to a reduced training set (starting at time point 30) and produce approximate 1-step ahead forecasts. These forecasts are used to estimate forecast ELPD before expanding the training set one time point at a time. We use Pareto-smoothed importance sampling to reweight posterior predictions, acting as a kind of particle filter so that we don't need to refit the model too often (you can read more about how this process works in Bürkner et al. 2020).
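+
+To make the approximation concrete, the exact (and far more expensive) version of this procedure would refit the model at every expanding training window. A rough sketch of that exact routine is below; it is illustrative only, assumes `score()` accepts `score = 'elpd'`, and uses hypothetical object names:
+```{r, eval=FALSE}
+# Illustrative sketch of exact leave-future-out CV (slow; not run).
+# lfo_cv() approximates this via PSIS so most refits can be skipped.
+exact_elpds <- sapply(30:(max(model_data$time) - 1), function(t) {
+  # refit using only the observations up to time t
+  fit_t <- mvgam(formula = survival ~ 1,
+                 trend_model = 'RW',
+                 family = betar(),
+                 data = dplyr::filter(model_data, time <= t))
+
+  # forecast the single next timepoint and score its ELPD
+  fc_t <- forecast(fit_t, newdata = dplyr::filter(model_data, time == t + 1))
+  score(fc_t, score = 'elpd')$all_series$score
+})
+```
+
+In practice we simply call `lfo_cv()`: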
+```{r include=FALSE}
+lfo_mod0 <- lfo_cv(mod0, min_t = 30)
+lfo_mod1 <- lfo_cv(mod1, min_t = 30)
+```
+
+```{r eval=FALSE}
+lfo_mod0 <- lfo_cv(mod0, min_t = 30)
+lfo_mod1 <- lfo_cv(mod1, min_t = 30)
+```
+
+The model with the time-varying upwelling effect tends to provide better 1-step ahead forecasts, with a higher total forecast ELPD:
+```{r}
+sum(lfo_mod0$elpds)
+sum(lfo_mod1$elpds)
+```
+
+We can also plot the ELPDs for each model as a contrast. Here, values less than zero suggest the time-varying predictor model (Mod1) gives better 1-step ahead forecasts:
+```{r, fig.alt = "Comparing forecast skill for dynamic beta regression models in mvgam and R"}
+plot(x = 1:length(lfo_mod0$elpds) + 30,
+     y = lfo_mod0$elpds - lfo_mod1$elpds,
+     ylab = 'ELPDmod0 - ELPDmod1',
+     xlab = 'Evaluation time point',
+     pch = 16,
+     col = 'darkred',
+     bty = 'l')
+abline(h = 0, lty = 'dashed')
+```
+
+A useful exercise to further expand this model would be to think about what kinds of predictors might impact measurement error, which could easily be implemented into the observation formula in `mvgam`. But for now, we will leave the model as-is.
+
+## Further reading
+The following papers and resources offer a lot of useful material about dynamic linear models and how they can be applied and evaluated in practice:
+
+Bürkner, P. C., Gabry, J., and Vehtari, A. [Approximate leave-future-out cross-validation for Bayesian time series models](https://www.tandfonline.com/doi/full/10.1080/00949655.2020.1783262). *Journal of Statistical Computation and Simulation* 90.14 (2020): 2499-2523.
+
+Herrero, Asier, et al. [From the individual to the landscape and back: time‐varying effects of climate and herbivory on tree sapling growth at distribution limits](https://besjournals.onlinelibrary.wiley.com/doi/full/10.1111/1365-2745.12527). *Journal of Ecology* 104.2 (2016): 430-442.
+
+Holmes, Elizabeth E., Eric J. Ward, and Kellie Wills. "[MARSS: multivariate autoregressive state-space models for analyzing time-series data.](https://journal.r-project.org/archive/2012/RJ-2012-002/index.html)" *R Journal* 4.1 (2012): 11.
+
+Scheuerell, Mark D., and John G. Williams. [Forecasting climate-induced changes in the survival of Snake River Spring/Summer Chinook Salmon (*Oncorhynchus tshawytscha*)](https://onlinelibrary.wiley.com/doi/10.1111/j.1365-2419.2005.00346.x). *Fisheries Oceanography* 14 (2005): 448–57.
+
+## Interested in contributing?
+I'm actively seeking PhD students and other researchers to work in the areas of ecological forecasting, multivariate model evaluation and development of `mvgam`. Please reach out if you are interested (n.clark'at'uq.edu.au)
diff --git a/inst/doc/trend_formulas.R b/inst/doc/trend_formulas.R new file mode 100644 index 00000000..4739e2e9 --- /dev/null +++ b/inst/doc/trend_formulas.R @@ -0,0 +1,403 @@
+## ----echo = FALSE-------------------------------------------------------------
+NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true")
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>",
+  purl = NOT_CRAN,
+  eval = NOT_CRAN
+)
+
+## ----setup, include=FALSE-----------------------------------------------------
+knitr::opts_chunk$set(
+  echo = TRUE,
+  dpi = 150,
+  fig.asp = 0.8,
+  fig.width = 6,
+  out.width = "60%",
+  fig.align = "center")
+library(mvgam)
+library(ggplot2)
+theme_set(theme_bw(base_size = 12, base_family = 'serif'))
+
+## -----------------------------------------------------------------------------
+load(url('https://github.com/atsa-es/MARSS/raw/master/data/lakeWAplankton.rda'))
+
+## -----------------------------------------------------------------------------
+outcomes <- c('Greens', 'Bluegreens', 'Diatoms', 'Unicells', 'Other.algae')
+
+## -----------------------------------------------------------------------------
+# loop across each plankton group to create the long dataframe
+plankton_data <- do.call(rbind, lapply(outcomes, function(x){
+
+  # create a group-specific dataframe with counts labelled 'y'
+  # and the group name in the 'series' variable
+  data.frame(year = lakeWAplanktonTrans[, 'Year'],
+             month = lakeWAplanktonTrans[, 'Month'],
+             y = lakeWAplanktonTrans[, x],
+             series = x,
+             temp = lakeWAplanktonTrans[, 'Temp'])})) %>%
+
+  # change the 'series' label to a factor
+  dplyr::mutate(series = factor(series)) %>%
+
+  # filter to only include some years in the data
+  dplyr::filter(year >= 1965 & year < 1975) %>%
+  dplyr::arrange(year, month) %>%
+  dplyr::group_by(series) %>%
+
+  # z-score the counts so they are approximately standard normal
+  dplyr::mutate(y = as.vector(scale(y))) %>%
+
+  # add the time indicator
+  dplyr::mutate(time = dplyr::row_number()) %>%
+  dplyr::ungroup()
+
+## -----------------------------------------------------------------------------
+head(plankton_data)
+
+## -----------------------------------------------------------------------------
+dplyr::glimpse(plankton_data)
+
+## -----------------------------------------------------------------------------
+plot_mvgam_series(data = plankton_data, series = 'all')
+
+## -----------------------------------------------------------------------------
+image(is.na(t(plankton_data)), axes = F,
+      col = c('grey80', 'darkred'))
+axis(3, at = seq(0,1, len = NCOL(plankton_data)),
+     labels = colnames(plankton_data))
+
+## -----------------------------------------------------------------------------
+plankton_data %>%
+  dplyr::filter(series == 'Other.algae') %>%
+  ggplot(aes(x = time, y = temp)) +
+  geom_line(size = 1.1) +
+  geom_line(aes(y = y), col = 'white',
+            size = 1.3) +
+  geom_line(aes(y = y), col = 'darkred',
+            size = 1.1) +
+  ylab('z-score') +
+  xlab('Time') +
+  ggtitle('Temperature (black) vs Other algae (red)')
+
+## -----------------------------------------------------------------------------
+plankton_data %>%
+  dplyr::filter(series == 'Diatoms') %>%
+  ggplot(aes(x = time, y = temp)) +
+  geom_line(size = 1.1) +
+  geom_line(aes(y = y), col = 'white',
+            size = 1.3) +
+  geom_line(aes(y = y), col = 'darkred',
+            size = 1.1) +
+  ylab('z-score') +
+  xlab('Time') +
+  ggtitle('Temperature (black) vs Diatoms (red)')
+
+## 
----------------------------------------------------------------------------- +plankton_data %>% + dplyr::filter(series == 'Greens') %>% + ggplot(aes(x = time, y = temp)) + + geom_line(size = 1.1) + + geom_line(aes(y = y), col = 'white', + size = 1.3) + + geom_line(aes(y = y), col = 'darkred', + size = 1.1) + + ylab('z-score') + + xlab('Time') + + ggtitle('Temperature (black) vs Greens (red)') + +## ----------------------------------------------------------------------------- +plankton_train <- plankton_data %>% + dplyr::filter(time <= 112) +plankton_test <- plankton_data %>% + dplyr::filter(time > 112) + +## ----notrend_mod, include = FALSE, results='hide'----------------------------- +notrend_mod <- mvgam(y ~ + te(temp, month, k = c(4, 4)) + + te(temp, month, k = c(4, 4), by = series), + family = gaussian(), + data = plankton_train, + newdata = plankton_test, + trend_model = 'None') + +## ----eval=FALSE--------------------------------------------------------------- +# notrend_mod <- mvgam(y ~ +# # tensor of temp and month to capture +# # "global" seasonality +# te(temp, month, k = c(4, 4)) + +# +# # series-specific deviation tensor products +# te(temp, month, k = c(4, 4), by = series), +# family = gaussian(), +# data = plankton_train, +# newdata = plankton_test, +# trend_model = 'None') +# + +## ----------------------------------------------------------------------------- +plot_mvgam_smooth(notrend_mod, smooth = 1) + +## ----------------------------------------------------------------------------- +plot_mvgam_smooth(notrend_mod, smooth = 2) + +## ----------------------------------------------------------------------------- +plot_mvgam_smooth(notrend_mod, smooth = 3) + +## ----------------------------------------------------------------------------- +plot_mvgam_smooth(notrend_mod, smooth = 4) + +## ----------------------------------------------------------------------------- +plot_mvgam_smooth(notrend_mod, smooth = 5) + +## ----------------------------------------------------------------------------- +plot_mvgam_smooth(notrend_mod, smooth = 6) + +## ----------------------------------------------------------------------------- +plot(notrend_mod, type = 'forecast', series = 1) + +## ----------------------------------------------------------------------------- +plot(notrend_mod, type = 'forecast', series = 2) + +## ----------------------------------------------------------------------------- +plot(notrend_mod, type = 'forecast', series = 3) + +## ----------------------------------------------------------------------------- +plot(notrend_mod, type = 'forecast', series = 4) + +## ----------------------------------------------------------------------------- +plot(notrend_mod, type = 'forecast', series = 5) + +## ----------------------------------------------------------------------------- +plot(notrend_mod, type = 'residuals', series = 1) + +## ----------------------------------------------------------------------------- +plot(notrend_mod, type = 'residuals', series = 2) + +## ----------------------------------------------------------------------------- +plot(notrend_mod, type = 'residuals', series = 3) + +## ----------------------------------------------------------------------------- +plot(notrend_mod, type = 'residuals', series = 4) + +## ----------------------------------------------------------------------------- +plot(notrend_mod, type = 'residuals', series = 5) + +## ----------------------------------------------------------------------------- +priors <- get_mvgam_priors( + # observation 
formula, which has no terms in it + y ~ -1, + + # process model formula, which includes the smooth functions + trend_formula = ~ te(temp, month, k = c(4, 4)) + + te(temp, month, k = c(4, 4), by = trend), + + # VAR1 model with uncorrelated process errors + trend_model = 'VAR1', + family = gaussian(), + data = plankton_train) + +## ----------------------------------------------------------------------------- +priors[, 3] + +## ----------------------------------------------------------------------------- +priors[, 4] + +## ----------------------------------------------------------------------------- +priors <- c(prior(normal(0.5, 0.1), class = sigma_obs, lb = 0.2), + prior(normal(0.5, 0.25), class = sigma)) + +## ----var_mod, include = FALSE, results='hide'--------------------------------- +var_mod <- mvgam(y ~ -1, + trend_formula = ~ + # tensor of temp and month should capture + # seasonality + te(temp, month, k = c(4, 4)) + + # need to use 'trend' rather than series + # here + te(temp, month, k = c(4, 4), by = trend), + family = gaussian(), + data = plankton_train, + newdata = plankton_test, + trend_model = 'VAR1', + priors = priors, + burnin = 1000) + +## ----eval=FALSE--------------------------------------------------------------- +# var_mod <- mvgam( +# # observation formula, which is empty +# y ~ -1, +# +# # process model formula, which includes the smooth functions +# trend_formula = ~ te(temp, month, k = c(4, 4)) + +# te(temp, month, k = c(4, 4), by = trend), +# +# # VAR1 model with uncorrelated process errors +# trend_model = 'VAR1', +# family = gaussian(), +# data = plankton_train, +# newdata = plankton_test, +# +# # include the updated priors +# priors = priors) + +## ----------------------------------------------------------------------------- +summary(var_mod, include_betas = FALSE) + +## ----------------------------------------------------------------------------- +plot(var_mod, 'smooths', trend_effects = TRUE) + +## ----warning=FALSE, message=FALSE--------------------------------------------- +mcmc_plot(var_mod, variable = 'A', regex = TRUE, type = 'hist') + +## ----warning=FALSE, message=FALSE--------------------------------------------- +A_pars <- matrix(NA, nrow = 5, ncol = 5) +for(i in 1:5){ + for(j in 1:5){ + A_pars[i, j] <- paste0('A[', i, ',', j, ']') + } +} +mcmc_plot(var_mod, + variable = as.vector(t(A_pars)), + type = 'hist') + +## ----------------------------------------------------------------------------- +plot(var_mod, type = 'trend', series = 1) + +## ----------------------------------------------------------------------------- +plot(var_mod, type = 'trend', series = 3) + +## ----warning=FALSE, message=FALSE--------------------------------------------- +Sigma_pars <- matrix(NA, nrow = 5, ncol = 5) +for(i in 1:5){ + for(j in 1:5){ + Sigma_pars[i, j] <- paste0('Sigma[', i, ',', j, ']') + } +} +mcmc_plot(var_mod, + variable = as.vector(t(Sigma_pars)), + type = 'hist') + +## ----warning=FALSE, message=FALSE--------------------------------------------- +mcmc_plot(var_mod, variable = 'sigma_obs', regex = TRUE, type = 'hist') + +## ----------------------------------------------------------------------------- +priors <- c(prior(normal(0.5, 0.1), class = sigma_obs, lb = 0.2), + prior(normal(0.5, 0.25), class = sigma)) + +## ----varcor_mod, include = FALSE, results='hide'------------------------------ +varcor_mod <- mvgam(y ~ -1, + trend_formula = ~ + # tensor of temp and month should capture + # seasonality + te(temp, month, k = c(4, 4)) + + # need to use 'trend' rather 
than series + # here + te(temp, month, k = c(4, 4), by = trend), + family = gaussian(), + data = plankton_train, + newdata = plankton_test, + trend_model = 'VAR1cor', + burnin = 1000, + priors = priors) + +## ----eval=FALSE--------------------------------------------------------------- +# varcor_mod <- mvgam( +# # observation formula, which remains empty +# y ~ -1, +# +# # process model formula, which includes the smooth functions +# trend_formula = ~ te(temp, month, k = c(4, 4)) + +# te(temp, month, k = c(4, 4), by = trend), +# +# # VAR1 model with correlated process errors +# trend_model = 'VAR1cor', +# family = gaussian(), +# data = plankton_train, +# newdata = plankton_test, +# +# # include the updated priors +# priors = priors) + +## ----warning=FALSE, message=FALSE--------------------------------------------- +mcmc_plot(varcor_mod, type = 'rhat') + + labs(title = 'VAR1cor') +mcmc_plot(var_mod, type = 'rhat') + + labs(title = 'VAR1') + +## ----warning=FALSE, message=FALSE--------------------------------------------- +Sigma_pars <- matrix(NA, nrow = 5, ncol = 5) +for(i in 1:5){ + for(j in 1:5){ + Sigma_pars[i, j] <- paste0('Sigma[', i, ',', j, ']') + } +} +mcmc_plot(varcor_mod, + variable = as.vector(t(Sigma_pars)), + type = 'hist') + +## ----------------------------------------------------------------------------- +Sigma_post <- as.matrix(varcor_mod, variable = 'Sigma', regex = TRUE) +median_correlations <- cov2cor(matrix(apply(Sigma_post, 2, median), + nrow = 5, ncol = 5)) +rownames(median_correlations) <- colnames(median_correlations) <- levels(plankton_train$series) + +round(median_correlations, 2) + +## ----warning=FALSE, message=FALSE--------------------------------------------- +A_pars <- matrix(NA, nrow = 5, ncol = 5) +for(i in 1:5){ + for(j in 1:5){ + A_pars[i, j] <- paste0('A[', i, ',', j, ']') + } +} +mcmc_plot(varcor_mod, + variable = as.vector(t(A_pars)), + type = 'hist') + +## ----------------------------------------------------------------------------- +plot(var_mod, type = 'forecast', series = 1, newdata = plankton_test) + +## ----------------------------------------------------------------------------- +plot(varcor_mod, type = 'forecast', series = 1, newdata = plankton_test) + +## ----------------------------------------------------------------------------- +plot(var_mod, type = 'forecast', series = 2, newdata = plankton_test) + +## ----------------------------------------------------------------------------- +plot(varcor_mod, type = 'forecast', series = 2, newdata = plankton_test) + +## ----------------------------------------------------------------------------- +plot(var_mod, type = 'forecast', series = 3, newdata = plankton_test) + +## ----------------------------------------------------------------------------- +plot(varcor_mod, type = 'forecast', series = 3, newdata = plankton_test) + +## ----------------------------------------------------------------------------- +# create forecast objects for each model +fcvar <- forecast(var_mod) +fcvarcor <- forecast(varcor_mod) + +# plot the difference in variogram scores; a negative value means the VAR1cor model is better, while a positive value means the VAR1 model is better +diff_scores <- score(fcvarcor, score = 'variogram')$all_series$score - + score(fcvar, score = 'variogram')$all_series$score +plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred', + ylim = c(-1*max(abs(diff_scores), na.rm = TRUE), + max(abs(diff_scores), na.rm = TRUE)), + bty = 'l', + xlab = 'Forecast horizon', + ylab = 
expression(variogram[VAR1cor]~-~variogram[VAR1])) +abline(h = 0, lty = 'dashed') + +## ----------------------------------------------------------------------------- +# plot the difference in energy scores; a negative value means the VAR1cor model is better, while a positive value means the VAR1 model is better +diff_scores <- score(fcvarcor, score = 'energy')$all_series$score - + score(fcvar, score = 'energy')$all_series$score +plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred', + ylim = c(-1*max(abs(diff_scores), na.rm = TRUE), + max(abs(diff_scores), na.rm = TRUE)), + bty = 'l', + xlab = 'Forecast horizon', + ylab = expression(energy[VAR1cor]~-~energy[VAR1])) +abline(h = 0, lty = 'dashed') + diff --git a/inst/doc/trend_formulas.Rmd b/inst/doc/trend_formulas.Rmd new file mode 100644 index 00000000..0d95ab8a --- /dev/null +++ b/inst/doc/trend_formulas.Rmd @@ -0,0 +1,561 @@ +--- +title: "State-Space models in mvgam" +author: "Nicholas J Clark" +date: "`r Sys.Date()`" +output: + rmarkdown::html_vignette: + toc: yes +vignette: > + %\VignetteIndexEntry{State-Space models in mvgam} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- +```{r, echo = FALSE} +NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true") +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + purl = NOT_CRAN, + eval = NOT_CRAN +) +``` + +```{r setup, include=FALSE} +knitr::opts_chunk$set( + echo = TRUE, + dpi = 150, + fig.asp = 0.8, + fig.width = 6, + out.width = "60%", + fig.align = "center") +library(mvgam) +library(ggplot2) +theme_set(theme_bw(base_size = 12, base_family = 'serif')) +``` + +The purpose of this vignette is to show how the `mvgam` package can be used to fit and interrogate State-Space models with nonlinear effects. + +## State-Space Models + +![Illustration of a basic State-Space model, which assumes that a latent dynamic *process* (X) can evolve independently from the way we take *observations* (Y) of that process](SS_model.svg){width=85%} + +
+
+State-Space models allow us to separately make inferences about the underlying dynamic *process model* that we are interested in (i.e. the evolution of a time series or a collection of time series) and the *observation model* (i.e. the way that we survey / measure this underlying process). This is extremely useful in ecology because our observations are always imperfect / noisy measurements of the thing we are interested in measuring. It is also helpful because we often know that some covariates will impact our ability to measure accurately (i.e. we cannot take accurate counts of rodents if there is a thunderstorm happening) while other covariates impact the underlying process (it is highly unlikely that rodent abundance responds to one storm, but instead probably responds to longer-term weather and climate variation). A State-Space model allows us to model both components in a single unified modelling framework. A major advantage of `mvgam` is that it can include nonlinear effects and random effects in BOTH model components while also capturing dynamic processes.
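+
+To make this separation concrete, here is a minimal simulation sketch (not part of the original analysis; all values are invented for illustration) of a latent AR(1) process that we only ever see through noisy measurements:
+```{r eval=FALSE}
+# simulate a latent AR(1) process: the 'true' dynamics we care about
+set.seed(100)
+true_process <- as.numeric(arima.sim(model = list(ar = 0.7), n = 50))
+
+# our imperfect survey adds observation error on top of the process
+observations <- rnorm(50, mean = true_process, sd = 0.5)
+
+# the noisy observations scatter around the latent process
+plot(true_process, type = 'l', lwd = 2, xlab = 'Time', ylab = 'Value')
+points(observations, pch = 16, col = 'darkred')
+```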
+
+### Lake Washington plankton data
+The data we will use to illustrate how we can fit State-Space models in `mvgam` are from a long-term monitoring study of plankton counts (cells per mL) taken from Lake Washington in Washington, USA. The data are available as part of the `MARSS` package and can be downloaded using the following:
+```{r}
+load(url('https://github.com/atsa-es/MARSS/raw/master/data/lakeWAplankton.rda'))
+```
+
+We will work with five different groups of plankton:
+```{r}
+outcomes <- c('Greens', 'Bluegreens', 'Diatoms', 'Unicells', 'Other.algae')
+```
+
+As usual, preparing the data into the correct format for `mvgam` modelling takes a little bit of wrangling in `dplyr`:
+```{r}
+# loop across each plankton group to create the long dataframe
+plankton_data <- do.call(rbind, lapply(outcomes, function(x){
+
+  # create a group-specific dataframe with counts labelled 'y'
+  # and the group name in the 'series' variable
+  data.frame(year = lakeWAplanktonTrans[, 'Year'],
+             month = lakeWAplanktonTrans[, 'Month'],
+             y = lakeWAplanktonTrans[, x],
+             series = x,
+             temp = lakeWAplanktonTrans[, 'Temp'])})) %>%
+
+  # change the 'series' label to a factor
+  dplyr::mutate(series = factor(series)) %>%
+
+  # filter to only include some years in the data
+  dplyr::filter(year >= 1965 & year < 1975) %>%
+  dplyr::arrange(year, month) %>%
+  dplyr::group_by(series) %>%
+
+  # z-score the counts so they are approximately standard normal
+  dplyr::mutate(y = as.vector(scale(y))) %>%
+
+  # add the time indicator
+  dplyr::mutate(time = dplyr::row_number()) %>%
+  dplyr::ungroup()
+```
+
+Inspect the data structure:
+```{r}
+head(plankton_data)
+```
+
+```{r}
+dplyr::glimpse(plankton_data)
+```
+
+Note that we have z-scored the counts in this example as that will make it easier to specify priors (though this is not completely necessary; it is often better to build a model that respects the properties of the actual outcome variables).
+```{r}
+plot_mvgam_series(data = plankton_data, series = 'all')
+```
+
+It is always helpful to check the data for `NA`s before attempting any models:
+```{r}
+image(is.na(t(plankton_data)), axes = F,
+      col = c('grey80', 'darkred'))
+axis(3, at = seq(0,1, len = NCOL(plankton_data)),
+     labels = colnames(plankton_data))
+```
+
+We have some missing observations, but this isn't an issue for modelling in `mvgam`. A useful property to understand about these counts is that they tend to be highly seasonal. Below are some plots of z-scored counts against the z-scored temperature measurements in the lake for each month:
+```{r}
+plankton_data %>%
+  dplyr::filter(series == 'Other.algae') %>%
+  ggplot(aes(x = time, y = temp)) +
+  geom_line(linewidth = 1.1) +
+  geom_line(aes(y = y), col = 'white',
+            linewidth = 1.3) +
+  geom_line(aes(y = y), col = 'darkred',
+            linewidth = 1.1) +
+  ylab('z-score') +
+  xlab('Time') +
+  ggtitle('Temperature (black) vs Other algae (red)')
+```
+
+```{r}
+plankton_data %>%
+  dplyr::filter(series == 'Diatoms') %>%
+  ggplot(aes(x = time, y = temp)) +
+  geom_line(linewidth = 1.1) +
+  geom_line(aes(y = y), col = 'white',
+            linewidth = 1.3) +
+  geom_line(aes(y = y), col = 'darkred',
+            linewidth = 1.1) +
+  ylab('z-score') +
+  xlab('Time') +
+  ggtitle('Temperature (black) vs Diatoms (red)')
+```
+
+```{r}
+plankton_data %>%
+  dplyr::filter(series == 'Greens') %>%
+  ggplot(aes(x = time, y = temp)) +
+  geom_line(linewidth = 1.1) +
+  geom_line(aes(y = y), col = 'white',
+            linewidth = 1.3) +
+  geom_line(aes(y = y), col = 'darkred',
+            linewidth = 1.1) +
+  ylab('z-score') +
+  xlab('Time') +
+  ggtitle('Temperature (black) vs Greens (red)')
+```
+
+We will have to try and capture this seasonality in our process model, which should be easy to do given the flexibility of GAMs. Next we will split the data into training and testing sets:
+```{r}
+plankton_train <- plankton_data %>%
+  dplyr::filter(time <= 112)
+plankton_test <- plankton_data %>%
+  dplyr::filter(time > 112)
+```
+
+Now time to fit some models. This requires a bit of thinking about how we can best tackle the seasonal variation and the likely dependence structure in the data. These algae are interacting as part of a complex system within the same lake, so we certainly expect there to be some lagged cross-dependencies underlying their dynamics. But if we do not capture the seasonal variation, our multivariate dynamic model will be forced to try and capture it, which could lead to poor convergence and unstable results (we could feasibly capture cyclic dynamics with a more complex multi-species Lotka-Volterra model, but ordinary differential equation approaches are beyond the scope of `mvgam`).
+
+### Capturing seasonality
+
+First we will fit a model that does not include a dynamic component, just to see if it can reproduce the seasonal variation in the observations. This model introduces hierarchical multidimensional smooths, where all time series share a "global" tensor product of the `month` and `temp` variables, capturing our expectation that algal seasonality responds to temperature variation. But this response should depend on when in the year these temperatures are recorded (i.e. a response to warm temperatures in Spring should be different to a response to warm temperatures in Autumn). The model also fits series-specific deviation smooths (i.e. one tensor product per series) to capture how each algal group's seasonality differs from the overall "global" seasonality. Note that we do not include series-specific intercepts in this model because each series was z-scored to have a mean of 0.
+```{r notrend_mod, include = FALSE, results='hide'}
+notrend_mod <- mvgam(y ~
+                       te(temp, month, k = c(4, 4)) +
+                       te(temp, month, k = c(4, 4), by = series),
+                     family = gaussian(),
+                     data = plankton_train,
+                     newdata = plankton_test,
+                     trend_model = 'None')
+```
+
+```{r eval=FALSE}
+notrend_mod <- mvgam(y ~
+                       # tensor of temp and month to capture
+                       # "global" seasonality
+                       te(temp, month, k = c(4, 4)) +
+
+                       # series-specific deviation tensor products
+                       te(temp, month, k = c(4, 4), by = series),
+                     family = gaussian(),
+                     data = plankton_train,
+                     newdata = plankton_test,
+                     trend_model = 'None')
+```
+
+The "global" tensor product smooth function can be quickly visualized:
+```{r}
+plot_mvgam_smooth(notrend_mod, smooth = 1)
+```
+
+On this plot, red indicates below-average linear predictors and white indicates above-average. We can then plot the deviation smooths for each algal group to see how they vary from the "global" pattern:
+```{r}
+plot_mvgam_smooth(notrend_mod, smooth = 2)
+```
+
+```{r}
+plot_mvgam_smooth(notrend_mod, smooth = 3)
+```
+
+```{r}
+plot_mvgam_smooth(notrend_mod, smooth = 4)
+```
+
+```{r}
+plot_mvgam_smooth(notrend_mod, smooth = 5)
+```
+
+```{r}
+plot_mvgam_smooth(notrend_mod, smooth = 6)
+```
+
+These multidimensional smooths have done a good job of capturing the seasonal variation in our observations:
+```{r}
+plot(notrend_mod, type = 'forecast', series = 1)
+```
+
+```{r}
+plot(notrend_mod, type = 'forecast', series = 2)
+```
+
+```{r}
+plot(notrend_mod, type = 'forecast', series = 3)
+```
+
+```{r}
+plot(notrend_mod, type = 'forecast', series = 4)
+```
+
+```{r}
+plot(notrend_mod, type = 'forecast', series = 5)
+```
+
+This basic model gives us confidence that we can capture the seasonal variation in the observations. But the model has not captured the remaining temporal dynamics, which is obvious when we inspect Dunn-Smyth residuals for each series:
+```{r}
+plot(notrend_mod, type = 'residuals', series = 1)
+```
+
+```{r}
+plot(notrend_mod, type = 'residuals', series = 2)
+```
+
+```{r}
+plot(notrend_mod, type = 'residuals', series = 3)
+```
+
+```{r}
+plot(notrend_mod, type = 'residuals', series = 4)
+```
+
+```{r}
+plot(notrend_mod, type = 'residuals', series = 5)
+```
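+
+If you prefer a numerical check over visual inspection, one option is to look at the autocorrelation left in these residuals (a sketch only; it assumes `residuals()` returns one posterior summary row per observation, in the same order as the training data -- see `?residuals.mvgam` for the exact behaviour in your installed version):
+```{r eval=FALSE}
+# median Dunn-Smyth residuals for the training observations
+resids <- residuals(notrend_mod)[, 'Estimate']
+
+# autocorrelation function for one series' residuals;
+# na.pass keeps the missing observations from halting the calculation
+acf(resids[plankton_train$series == 'Greens'], na.action = na.pass)
+```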
+
+### Multiseries dynamics
+Now it is time to get into multivariate State-Space models. We will fit two models that can both incorporate lagged cross-dependencies in the latent process models. The first model assumes that the process errors operate independently from one another, while the second assumes that there may be contemporaneous correlations in the process errors. Both models include a Vector Autoregressive component for the process means, and so both can model complex community dynamics. The models can be described mathematically as follows:
+
+\begin{align*}
+\boldsymbol{count}_t & \sim \text{Normal}(\mu_{obs[t]}, \sigma_{obs}) \\
+\mu_{obs[t]} & = process_t \\
+process_t & \sim \text{MVNormal}(\mu_{process[t]}, \Sigma_{process}) \\
+\mu_{process[t]} & = VAR * process_{t-1} + f_{global}(\boldsymbol{month},\boldsymbol{temp})_t + f_{series}(\boldsymbol{month},\boldsymbol{temp})_t \\
+f_{global}(\boldsymbol{month},\boldsymbol{temp}) & = \sum_{k=1}^{K}b_{global,k} * \beta_{global,k} \\
+f_{series}(\boldsymbol{month},\boldsymbol{temp}) & = \sum_{k=1}^{K}b_{series,k} * \beta_{series,k} \end{align*}
+
+Here you can see that there are no terms in the observation model apart from the underlying process model. But we could easily add covariates into the observation model if we felt that they could explain some of the systematic observation errors. We also assume independent observation processes (there is no covariance structure in the observation errors $\sigma_{obs}$). At present, `mvgam` does not support multivariate observation models, but this feature will be added in future versions. The underlying process model, however, is multivariate, and there is a lot going on here. This component has a Vector Autoregressive part, where the process mean at time $t$ $(\mu_{process[t]})$ is a vector that evolves as a function of where the vector-valued process model was at time $t-1$. The $VAR$ matrix captures these dynamics with self-dependencies on the diagonal and possibly asymmetric cross-dependencies on the off-diagonals, while also incorporating the nonlinear smooth functions that capture seasonality for each series. The contemporaneous process errors are modeled by $\Sigma_{process}$, which can be constrained so that process errors are independent (i.e. setting the off-diagonals to 0) or can be fully parameterized using a Cholesky decomposition (using `Stan`'s $LKJcorr$ distribution to place a prior on the strength of inter-species correlations). For those who are interested in the inner workings, `mvgam` makes use of a recent breakthrough by [Sarah Heaps to enforce stationarity of Bayesian VAR processes](https://www.tandfonline.com/doi/full/10.1080/10618600.2022.2079648). This is advantageous as we often don't expect forecast variance to increase without bound forever into the future, but many estimated VARs tend to behave this way.
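+
+To make the notation concrete, here is a minimal simulation sketch of a two-series VAR(1) with correlated process errors (all parameter values are invented for illustration and are not estimates from the plankton data):
+```{r eval=FALSE}
+library(MASS) # for mvrnorm()
+set.seed(1)
+
+# hypothetical VAR coefficients: diagonals are self-dependencies,
+# off-diagonals are (possibly asymmetric) cross-dependencies
+A <- matrix(c(0.7, -0.2,
+              0.1,  0.5), nrow = 2, byrow = TRUE)
+
+# hypothetical process error covariance with a positive correlation
+Sigma <- matrix(c(0.25, 0.10,
+                  0.10, 0.25), nrow = 2)
+
+# iterate x_t = A %*% x_{t-1} + e_t, with e_t ~ MVNormal(0, Sigma)
+x <- matrix(0, nrow = 100, ncol = 2)
+for(t in 2:100){
+  x[t, ] <- as.vector(A %*% x[t - 1, ]) +
+    mvrnorm(1, mu = c(0, 0), Sigma = Sigma)
+}
+matplot(x, type = 'l', lty = 1, xlab = 'Time', ylab = 'Process')
+```
+The eigenvalues of this particular `A` lie inside the unit circle, so the simulated process is stationary; the Heaps approach mentioned above enforces the analogous property on the estimated VAR rather than leaving it to chance.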
+
+Ok that was a lot to take in. Let's fit some models to try and inspect what is going on and what they assume. But first, we need to update `mvgam`'s default priors for the observation and process errors. By default, `mvgam` uses a fairly wide Student-T prior on these parameters to avoid being overly informative. But our observations are z-scored and so we do not expect very large process or observation errors. However, we also do not expect very small observation errors either as we know these measurements are not perfect. So let's update the priors for these parameters. In doing so, you will get to see how the formula for the latent process (i.e. trend) model is used in `mvgam`:
+```{r}
+priors <- get_mvgam_priors(
+  # observation formula, which has no terms in it
+  y ~ -1,
+
+  # process model formula, which includes the smooth functions
+  trend_formula = ~ te(temp, month, k = c(4, 4)) +
+    te(temp, month, k = c(4, 4), by = trend),
+
+  # VAR1 model with uncorrelated process errors
+  trend_model = 'VAR1',
+  family = gaussian(),
+  data = plankton_train)
+```
+
+Get names of all parameters whose priors can be modified:
+```{r}
+priors[, 3]
+```
+
+And their default prior distributions:
+```{r}
+priors[, 4]
+```
+
+Setting priors is easy in `mvgam` as you can use `brms` routines. Here we use more informative Normal priors for both error components, but we impose a lower bound of 0.2 for the observation errors:
+```{r}
+priors <- c(prior(normal(0.5, 0.1), class = sigma_obs, lb = 0.2),
+            prior(normal(0.5, 0.25), class = sigma))
+```
+
+You may have noticed something else unique about this model: there is no intercept term in the observation formula. This is because a shared intercept parameter can sometimes be unidentifiable with respect to the latent VAR process, particularly if our series have similar long-run averages (which they do in this case because they were z-scored). We will often get better convergence in these State-Space models if we drop this parameter. `mvgam` accomplishes this by fixing the coefficient for the intercept to zero. Now we can fit the first model, which assumes that process errors are contemporaneously uncorrelated:
+```{r var_mod, include = FALSE, results='hide'}
+var_mod <- mvgam(y ~ -1,
+                 trend_formula = ~
+                   # tensor of temp and month should capture
+                   # seasonality
+                   te(temp, month, k = c(4, 4)) +
+                   # need to use 'trend' rather than series
+                   # here
+                   te(temp, month, k = c(4, 4), by = trend),
+                 family = gaussian(),
+                 data = plankton_train,
+                 newdata = plankton_test,
+                 trend_model = 'VAR1',
+                 priors = priors,
+                 burnin = 1000)
+```
+
+```{r eval=FALSE}
+var_mod <- mvgam(
+  # observation formula, which is empty
+  y ~ -1,
+
+  # process model formula, which includes the smooth functions
+  trend_formula = ~ te(temp, month, k = c(4, 4)) +
+    te(temp, month, k = c(4, 4), by = trend),
+
+  # VAR1 model with uncorrelated process errors
+  trend_model = 'VAR1',
+  family = gaussian(),
+  data = plankton_train,
+  newdata = plankton_test,
+
+  # include the updated priors
+  priors = priors)
+```
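+
+If you would like to verify how the intercept is handled, one option is to inspect the model's underlying `Stan` program (a quick sketch; we are assuming here that your installed `mvgam` version exports the `code()` utility for viewing a fitted model's Stan code):
+```{r eval=FALSE}
+# view the generated Stan code; the observation-model intercept should
+# appear as a fixed quantity rather than an estimated parameter
+code(var_mod)
+```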
+
+### Inspecting SS models
+This model's summary is a bit different to other `mvgam` summaries. It separates parameters based on whether they belong to the observation model or to the latent process model. This is because we may often have covariates that impact the observations but not the latent process, so we can have fairly complex models for each component. You will notice that some parameters have not fully converged, particularly for the VAR coefficients (called `A` in the output) and for the process errors (`Sigma`). Note that we set `include_betas = FALSE` to stop the summary from printing output for all of the spline coefficients, which can be dense and hard to interpret:
+```{r}
+summary(var_mod, include_betas = FALSE)
+```
+
+The convergence of this model isn't fabulous (more on this in a moment). But we can again plot the smooth functions, which this time operate on the process model. We can see the same plot using `trend_effects = TRUE` in the plotting functions:
+```{r}
+plot(var_mod, 'smooths', trend_effects = TRUE)
+```
+
+The VAR matrix is of particular interest here, as it captures lagged dependencies and cross-dependencies in the latent process model:
+```{r warning=FALSE, message=FALSE}
+mcmc_plot(var_mod, variable = 'A', regex = TRUE, type = 'hist')
+```
+
+Unfortunately `bayesplot` doesn't know this is a matrix of parameters, so what we see is actually the transpose of the VAR matrix. A little bit of wrangling gives us these histograms in the correct order:
+```{r warning=FALSE, message=FALSE}
+A_pars <- matrix(NA, nrow = 5, ncol = 5)
+for(i in 1:5){
+  for(j in 1:5){
+    A_pars[i, j] <- paste0('A[', i, ',', j, ']')
+  }
+}
+mcmc_plot(var_mod,
+          variable = as.vector(t(A_pars)),
+          type = 'hist')
+```
+
+There is a lot happening in this matrix. Each cell captures the lagged effect of the process in the column on the process in the row in the next timestep. So for example, the effect in cell [1,3], which is quite strongly negative, means that an *increase* in the process for series 3 (Greens) at time $t$ is expected to lead to a subsequent *decrease* in the process for series 1 (Bluegreens) at time $t+1$. The latent process model is now capturing these effects and the smooth seasonal effects, so the trend plot shows our best estimate of what the *true* count should have been at each time point:
+```{r}
+plot(var_mod, type = 'trend', series = 1)
+```
+
+```{r}
+plot(var_mod, type = 'trend', series = 3)
+```
+
+The process error $(\Sigma)$ captures unmodelled variation in the process models. Again, we fixed the off-diagonals to 0, so the histograms for these will look like flat boxes:
+```{r warning=FALSE, message=FALSE}
+Sigma_pars <- matrix(NA, nrow = 5, ncol = 5)
+for(i in 1:5){
+  for(j in 1:5){
+    Sigma_pars[i, j] <- paste0('Sigma[', i, ',', j, ']')
+  }
+}
+mcmc_plot(var_mod,
+          variable = as.vector(t(Sigma_pars)),
+          type = 'hist')
+```
+
+The observation error estimates $(\sigma_{obs})$ represent how much the model thinks we might miss the true count when we take our imperfect measurements:
+```{r warning=FALSE, message=FALSE}
+mcmc_plot(var_mod, variable = 'sigma_obs', regex = TRUE, type = 'hist')
+```
+
+These error components are still a bit hard to identify overall, especially when trying to estimate both process and observation error. Often we need to make some strong assumptions about which of these is more important for determining unexplained variation in our observations.
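+
+One rough way to see this trade-off is to compare the posterior process and observation error magnitudes directly. This is a sketch only: it assumes `as.matrix()` returns the `Sigma` draws in column-major order (the same convention used for the wrangling above) and that both parameter sets share the series ordering:
+```{r eval=FALSE}
+# posterior draws of the process error covariance and observation SDs
+Sigma_post <- as.matrix(var_mod, variable = 'Sigma', regex = TRUE)
+sigma_obs_post <- as.matrix(var_mod, variable = 'sigma_obs', regex = TRUE)
+
+# process error SDs are square roots of Sigma's diagonal
+# (diagonal entries sit at columns 1, 7, 13, 19, 25 of the 5 x 5 matrix)
+proc_sd <- sqrt(Sigma_post[, seq(1, 25, by = 6)])
+
+# posterior median ratio of process SD to observation SD, per series
+round(apply(proc_sd / sigma_obs_post, 2, median), 2)
+```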
+
+### Correlated process errors
+
+Let's see if these estimates improve when we allow the process errors to be correlated. Once again, we need to first update the priors for the observation errors:
+```{r}
+priors <- c(prior(normal(0.5, 0.1), class = sigma_obs, lb = 0.2),
+            prior(normal(0.5, 0.25), class = sigma))
+```
+
+And now we can fit the correlated process error model:
+```{r varcor_mod, include = FALSE, results='hide'}
+varcor_mod <- mvgam(y ~ -1,
+                    trend_formula = ~
+                      # tensor of temp and month should capture
+                      # seasonality
+                      te(temp, month, k = c(4, 4)) +
+                      # need to use 'trend' rather than series
+                      # here
+                      te(temp, month, k = c(4, 4), by = trend),
+                    family = gaussian(),
+                    data = plankton_train,
+                    newdata = plankton_test,
+                    trend_model = 'VAR1cor',
+                    burnin = 1000,
+                    priors = priors)
+```
+
+```{r eval=FALSE}
+varcor_mod <- mvgam(
+  # observation formula, which remains empty
+  y ~ -1,
+
+  # process model formula, which includes the smooth functions
+  trend_formula = ~ te(temp, month, k = c(4, 4)) +
+    te(temp, month, k = c(4, 4), by = trend),
+
+  # VAR1 model with correlated process errors
+  trend_model = 'VAR1cor',
+  family = gaussian(),
+  data = plankton_train,
+  newdata = plankton_test,
+
+  # include the updated priors
+  priors = priors)
+```
+
+Plot convergence diagnostics for the two models, which shows that both models display similar levels of convergence:
+```{r warning=FALSE, message=FALSE}
+mcmc_plot(varcor_mod, type = 'rhat') +
+  labs(title = 'VAR1cor')
+mcmc_plot(var_mod, type = 'rhat') +
+  labs(title = 'VAR1')
+```
+
+The $(\Sigma)$ matrix now captures any evidence of contemporaneously correlated process error:
+```{r warning=FALSE, message=FALSE}
+Sigma_pars <- matrix(NA, nrow = 5, ncol = 5)
+for(i in 1:5){
+  for(j in 1:5){
+    Sigma_pars[i, j] <- paste0('Sigma[', i, ',', j, ']')
+  }
+}
+mcmc_plot(varcor_mod,
+          variable = as.vector(t(Sigma_pars)),
+          type = 'hist')
+```
+
+This symmetric matrix tells us there is support for correlated process errors. For example, series 1 and 3 (Bluegreens and Greens) show negatively correlated process errors, while series 1 and 4 (Bluegreens and Other.algae) show positively correlated errors. But it is easier to interpret these estimates if we convert the covariance matrix to a correlation matrix. Here we compute correlations from the posterior median covariance matrix:
+```{r}
+Sigma_post <- as.matrix(varcor_mod, variable = 'Sigma', regex = TRUE)
+median_correlations <- cov2cor(matrix(apply(Sigma_post, 2, median),
+                                      nrow = 5, ncol = 5))
+rownames(median_correlations) <- colnames(median_correlations) <- levels(plankton_train$series)
+
+round(median_correlations, 2)
+```
+
+Because this model is able to capture correlated errors, the VAR matrix has changed slightly:
+```{r warning=FALSE, message=FALSE}
+A_pars <- matrix(NA, nrow = 5, ncol = 5)
+for(i in 1:5){
+  for(j in 1:5){
+    A_pars[i, j] <- paste0('A[', i, ',', j, ']')
+  }
+}
+mcmc_plot(varcor_mod,
+          variable = as.vector(t(A_pars)),
+          type = 'hist')
+```
+
+We still have some evidence of lagged cross-dependence, but some of these interactions have now been pulled more toward zero.
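+
+A quick numerical check of that impression (a sketch only; it reuses the column-major reshaping convention from the correlation calculation above):
+```{r eval=FALSE}
+# posterior median VAR matrices from each model
+A_var <- matrix(apply(as.matrix(var_mod, variable = 'A', regex = TRUE),
+                      2, median), nrow = 5, ncol = 5)
+A_varcor <- matrix(apply(as.matrix(varcor_mod, variable = 'A', regex = TRUE),
+                         2, median), nrow = 5, ncol = 5)
+
+# cell-by-cell shifts between the two models' median estimates
+round(A_varcor - A_var, 2)
+```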
+
+But which model is better? Forecasts don't appear to differ very much, at least qualitatively (here are forecasts for three of the series, for each model):
+```{r}
+plot(var_mod, type = 'forecast', series = 1, newdata = plankton_test)
+```
+
+```{r}
+plot(varcor_mod, type = 'forecast', series = 1, newdata = plankton_test)
+```
+
+```{r}
+plot(var_mod, type = 'forecast', series = 2, newdata = plankton_test)
+```
+
+```{r}
+plot(varcor_mod, type = 'forecast', series = 2, newdata = plankton_test)
+```
+
+```{r}
+plot(var_mod, type = 'forecast', series = 3, newdata = plankton_test)
+```
+
+```{r}
+plot(varcor_mod, type = 'forecast', series = 3, newdata = plankton_test)
+```
+
+We can compute the variogram score for out-of-sample forecasts to get a sense of which model does a better job of capturing the dependence structure in the true evaluation set:
+```{r}
+# create forecast objects for each model
+fcvar <- forecast(var_mod)
+fcvarcor <- forecast(varcor_mod)
+
+# plot the difference in variogram scores; a negative value means the VAR1cor
+# model is better, while a positive value means the VAR1 model is better
+diff_scores <- score(fcvarcor, score = 'variogram')$all_series$score -
+  score(fcvar, score = 'variogram')$all_series$score
+plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred',
+     ylim = c(-1*max(abs(diff_scores), na.rm = TRUE),
+              max(abs(diff_scores), na.rm = TRUE)),
+     bty = 'l',
+     xlab = 'Forecast horizon',
+     ylab = expression(variogram[VAR1cor]~-~variogram[VAR1]))
+abline(h = 0, lty = 'dashed')
+```
+
+And we can also compute the energy score for out-of-sample forecasts to get a sense of which model provides forecasts that are better calibrated:
+```{r}
+# plot the difference in energy scores; a negative value means the VAR1cor
+# model is better, while a positive value means the VAR1 model is better
+diff_scores <- score(fcvarcor, score = 'energy')$all_series$score -
+  score(fcvar, score = 'energy')$all_series$score
+plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred',
+     ylim = c(-1*max(abs(diff_scores), na.rm = TRUE),
+              max(abs(diff_scores), na.rm = TRUE)),
+     bty = 'l',
+     xlab = 'Forecast horizon',
+     ylab = expression(energy[VAR1cor]~-~energy[VAR1]))
+abline(h = 0, lty = 'dashed')
+```
+
+The models tend to provide similar forecasts, though the correlated error model does slightly better overall. We would probably need to use a more extensive rolling forecast evaluation exercise if we needed to choose just one model for production. `mvgam` offers some utilities for doing this (i.e. see `?lfo_cv` for guidance).
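+
+As a pointer only, such an exercise might look something like the sketch below; the `min_t` and `fc_horizon` arguments are assumptions on our part, so check `?lfo_cv` for the exact interface in your installed version:
+```{r eval=FALSE}
+# approximate leave-future-out cross-validation, expanding the training
+# window forward through time (argument names assumed; see ?lfo_cv)
+lfo_comparison <- lfo_cv(var_mod, min_t = 60, fc_horizon = 12)
+```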
"[Inferring spatial structure from time‐series data: using multivariate state‐space models to detect metapopulation structure of California sea lions in the Gulf of California, Mexico.](https://besjournals.onlinelibrary.wiley.com/doi/full/10.1111/j.1365-2664.2009.01745.x)" *Journal of Applied Ecology* 47.1 (2010): 47-56. + +Auger‐Méthé, Marie, et al. ["A guide to state–space modeling of ecological time series.](https://esajournals.onlinelibrary.wiley.com/doi/full/10.1002/ecm.1470)" *Ecological Monographs* 91.4 (2021): e01470. + +## Interested in contributing? +I'm actively seeking PhD students and other researchers to work in the areas of ecological forecasting, multivariate model evaluation and development of `mvgam`. Please reach out if you are interested (n.clark'at'uq.edu.au) diff --git a/inst/doc/trend_formulas.html b/inst/doc/trend_formulas.html new file mode 100644 index 00000000..b92d263e --- /dev/null +++ b/inst/doc/trend_formulas.html @@ -0,0 +1,1137 @@ + + + + + + + + + + + + + + + + +State-Space models in mvgam + + + + + + + + + + + + + + + + + + + + + + + + + + +

State-Space models in mvgam

+

Nicholas J Clark

+

2024-04-19

+ + +
+ +
+ +

The purpose of this vignette is to show how the mvgam +package can be used to fit and interrogate State-Space models with +nonlinear effects.

+
+

State-Space Models

+
+Illustration of a basic State-Space model, which assumes that a latent dynamic process (X) can evolve independently from the way we take observations (Y) of that process +
Illustration of a basic State-Space model, which +assumes that a latent dynamic process (X) can evolve +independently from the way we take observations (Y) of that +process
+
+


+

State-Space models allow us to separately make inferences about the
+underlying dynamic process model that we are interested in
+(i.e. the evolution of a time series or a collection of time series) and
+the observation model (i.e. the way that we survey / measure
+this underlying process). This is extremely useful in ecology because
+our observations are always imperfect / noisy measurements of the thing
+we are interested in measuring. It is also helpful because we often know
+that some covariates will impact our ability to measure accurately
+(i.e. we cannot take accurate counts of rodents if there is a
+thunderstorm happening) while other covariates impact the underlying
+process (it is highly unlikely that rodent abundance responds to one
+storm, but instead probably responds to longer-term weather and climate
+variation). A State-Space model allows us to model both components in a
+single unified modelling framework. A major advantage of
+mvgam is that it can include nonlinear effects and random
+effects in BOTH model components while also capturing dynamic
+processes.

+
+

Lake Washington plankton data

+

The data we will use to illustrate how we can fit State-Space models +in mvgam are from a long-term monitoring study of plankton +counts (cells per mL) taken from Lake Washington in Washington, USA. The +data are available as part of the MARSS package and can be +downloaded using the following:

+
load(url('https://github.com/atsa-es/MARSS/raw/master/data/lakeWAplankton.rda'))
+

We will work with five different groups of plankton:

+
outcomes <- c('Greens', 'Bluegreens', 'Diatoms', 'Unicells', 'Other.algae')
+

As usual, preparing the data into the correct format for +mvgam modelling takes a little bit of wrangling in +dplyr:

+
# loop across each plankton group to create the long dataframe
+plankton_data <- do.call(rbind, lapply(outcomes, function(x){
+  
+  # create a group-specific dataframe with counts labelled 'y'
+  # and the group name in the 'series' variable
+  data.frame(year = lakeWAplanktonTrans[, 'Year'],
+             month = lakeWAplanktonTrans[, 'Month'],
+             y = lakeWAplanktonTrans[, x],
+             series = x,
+             temp = lakeWAplanktonTrans[, 'Temp'])})) %>%
+  
+  # change the 'series' label to a factor
+  dplyr::mutate(series = factor(series)) %>%
+  
+  # filter to only include some years in the data
+  dplyr::filter(year >= 1965 & year < 1975) %>%
+  dplyr::arrange(year, month) %>%
+  dplyr::group_by(series) %>%
+  
+  # z-score the counts so they are approximately standard normal
+  dplyr::mutate(y = as.vector(scale(y))) %>%
+  
+  # add the time indicator
+  dplyr::mutate(time = dplyr::row_number()) %>%
+  dplyr::ungroup()
+

Inspect the data structure:

+
head(plankton_data)
+#> # A tibble: 6 × 6
+#>    year month       y series       temp  time
+#>   <dbl> <dbl>   <dbl> <fct>       <dbl> <int>
+#> 1  1965     1 -0.542  Greens      -1.23     1
+#> 2  1965     1 -0.344  Bluegreens  -1.23     1
+#> 3  1965     1 -0.0768 Diatoms     -1.23     1
+#> 4  1965     1 -1.52   Unicells    -1.23     1
+#> 5  1965     1 -0.491  Other.algae -1.23     1
+#> 6  1965     2 NA      Greens      -1.32     2
+
dplyr::glimpse(plankton_data)
+#> Rows: 600
+#> Columns: 6
+#> $ year   <dbl> 1965, 1965, 1965, 1965, 1965, 1965, 1965, 1965, 1965, 1965, 196…
+#> $ month  <dbl> 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, …
+#> $ y      <dbl> -0.54241769, -0.34410776, -0.07684901, -1.52243490, -0.49055442…
+#> $ series <fct> Greens, Bluegreens, Diatoms, Unicells, Other.algae, Greens, Blu…
+#> $ temp   <dbl> -1.2306562, -1.2306562, -1.2306562, -1.2306562, -1.2306562, -1.…
+#> $ time   <int> 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, …
+

Note that we have z-scored the counts in this example as that will
+make it easier to specify priors (though this is not completely
+necessary; it is often better to build a model that respects the
+properties of the actual outcome variables).

+
plot_mvgam_series(data = plankton_data, series = 'all')
+

+

It is always helpful to check the data for NAs before +attempting any models:

+
image(is.na(t(plankton_data)), axes = F,
+      col = c('grey80', 'darkred'))
+axis(3, at = seq(0,1, len = NCOL(plankton_data)), 
+     labels = colnames(plankton_data))
+

+

We have some missing observations, but this isn’t an issue for +modelling in mvgam. A useful property to understand about +these counts is that they tend to be highly seasonal. Below are some +plots of z-scored counts against the z-scored temperature measurements +in the lake for each month:

+
plankton_data %>%
+  dplyr::filter(series == 'Other.algae') %>%
+  ggplot(aes(x = time, y = temp)) +
+  geom_line(linewidth = 1.1) +
+  geom_line(aes(y = y), col = 'white',
+            linewidth = 1.3) +
+  geom_line(aes(y = y), col = 'darkred',
+            linewidth = 1.1) +
+  ylab('z-score') +
+  xlab('Time') +
+  ggtitle('Temperature (black) vs Other algae (red)')
+

+
plankton_data %>%
+  dplyr::filter(series == 'Diatoms') %>%
+  ggplot(aes(x = time, y = temp)) +
+  geom_line(linewidth = 1.1) +
+  geom_line(aes(y = y), col = 'white',
+            linewidth = 1.3) +
+  geom_line(aes(y = y), col = 'darkred',
+            linewidth = 1.1) +
+  ylab('z-score') +
+  xlab('Time') +
+  ggtitle('Temperature (black) vs Diatoms (red)')
+

+
plankton_data %>%
+  dplyr::filter(series == 'Greens') %>%
+  ggplot(aes(x = time, y = temp)) +
+  geom_line(linewidth = 1.1) +
+  geom_line(aes(y = y), col = 'white',
+            linewidth = 1.3) +
+  geom_line(aes(y = y), col = 'darkred',
+            linewidth = 1.1) +
+  ylab('z-score') +
+  xlab('Time') +
+  ggtitle('Temperature (black) vs Greens (red)')
+

+

We will have to try and capture this seasonality in our process
+model, which should be easy to do given the flexibility of GAMs. Next we
+will split the data into training and testing sets:

+
plankton_train <- plankton_data %>%
+  dplyr::filter(time <= 112)
+plankton_test <- plankton_data %>%
+  dplyr::filter(time > 112)
+

Now time to fit some models. This requires a bit of thinking about
+how we can best tackle the seasonal variation and the likely dependence
+structure in the data. These algae are interacting as part of a complex
+system within the same lake, so we certainly expect there to be some
+lagged cross-dependencies underlying their dynamics. But if we do not
+capture the seasonal variation, our multivariate dynamic model will be
+forced to try and capture it, which could lead to poor convergence and
+unstable results (we could feasibly capture cyclic dynamics with a more
+complex multi-species Lotka-Volterra model, but ordinary differential
+equation approaches are beyond the scope of mvgam).

+
+
+

Capturing seasonality

+

First we will fit a model that does not include a dynamic component, +just to see if it can reproduce the seasonal variation in the +observations. This model introduces hierarchical multidimensional +smooths, where all time series share a “global” tensor product of the +month and temp variables, capturing our +expectation that algal seasonality responds to temperature variation. +But this response should depend on when in the year these temperatures +are recorded (i.e. a response to warm temperatures in Spring should be +different to a response to warm temperatures in Autumn). The model also +fits series-specific deviation smooths (i.e. one tensor product per +series) to capture how each algal group’s seasonality differs from the +overall “global” seasonality. Note that we do not include +series-specific intercepts in this model because each series was +z-scored to have a mean of 0.

+
notrend_mod <- mvgam(y ~ 
+                       # tensor of temp and month to capture
+                       # "global" seasonality
+                       te(temp, month, k = c(4, 4)) +
+                       
+                       # series-specific deviation tensor products
+                       te(temp, month, k = c(4, 4), by = series),
+                     family = gaussian(),
+                     data = plankton_train,
+                     newdata = plankton_test,
+                     trend_model = 'None')
+

The “global” tensor product smooth function can be quickly +visualized:

+
plot_mvgam_smooth(notrend_mod, smooth = 1)
+

+

On this plot, red indicates below-average linear predictors and white +indicates above-average. We can then plot the deviation smooths for each +algal group to see how they vary from the “global” pattern:

+
plot_mvgam_smooth(notrend_mod, smooth = 2)
+

+
plot_mvgam_smooth(notrend_mod, smooth = 3)
+

+
plot_mvgam_smooth(notrend_mod, smooth = 4)
+

+
plot_mvgam_smooth(notrend_mod, smooth = 5)
+

+
plot_mvgam_smooth(notrend_mod, smooth = 6)
+

+

These multidimensional smooths have done a good job of capturing the +seasonal variation in our observations:

+
plot(notrend_mod, type = 'forecast', series = 1)
+

+
#> Out of sample CRPS:
+#> [1] 6.882638
+
plot(notrend_mod, type = 'forecast', series = 2)
+

+
#> Out of sample CRPS:
+#> [1] 6.761039
+
plot(notrend_mod, type = 'forecast', series = 3)
+

+
#> Out of sample CRPS:
+#> [1] 4.129973
+
plot(notrend_mod, type = 'forecast', series = 4)
+

+
#> Out of sample CRPS:
+#> [1] 3.527261
+
plot(notrend_mod, type = 'forecast', series = 5)
+

+
#> Out of sample CRPS:
+#> [1] 2.805047
+

This basic model gives us confidence that we can capture the seasonal +variation in the observations. But the model has not captured the +remaining temporal dynamics, which is obvious when we inspect Dunn-Smyth +residuals for each series:

+
plot(notrend_mod, type = 'residuals', series = 1)
+

+
plot(notrend_mod, type = 'residuals', series = 2)
+

+
plot(notrend_mod, type = 'residuals', series = 3)
+

+
plot(notrend_mod, type = 'residuals', series = 4)
+

+
plot(notrend_mod, type = 'residuals', series = 5)
+

+
+
+

Multiseries dynamics

+

Now it is time to get into multivariate State-Space models. We will +fit two models that can both incorporate lagged cross-dependencies in +the latent process models. The first model assumes that the process +errors operate independently from one another, while the second assumes +that there may be contemporaneous correlations in the process errors. +Both models include a Vector Autoregressive component for the process +means, and so both can model complex community dynamics. The models can +be described mathematically as follows:

+

\[\begin{align*}
+\boldsymbol{count}_t & \sim \text{Normal}(\mu_{obs[t]},
+\sigma_{obs}) \\
+\mu_{obs[t]} & = process_t \\
+process_t & \sim \text{MVNormal}(\mu_{process[t]}, \Sigma_{process})
+\\
+\mu_{process[t]} & = VAR * process_{t-1} +
+f_{global}(\boldsymbol{month},\boldsymbol{temp})_t +
+f_{series}(\boldsymbol{month},\boldsymbol{temp})_t \\
+f_{global}(\boldsymbol{month},\boldsymbol{temp}) & =
+\sum_{k=1}^{K}b_{global,k} * \beta_{global,k} \\
+f_{series}(\boldsymbol{month},\boldsymbol{temp}) & =
+\sum_{k=1}^{K}b_{series,k} * \beta_{series,k} \end{align*}\]

+

Here you can see that there are no terms in the observation model
+apart from the underlying process model. But we could easily add
+covariates into the observation model if we felt that they could explain
+some of the systematic observation errors. We also assume independent
+observation processes (there is no covariance structure in the
+observation errors \(\sigma_{obs}\)).
+At present, mvgam does not support multivariate observation
+models, but this feature will be added in future versions. The
+underlying process model, however, is multivariate, and there is a lot
+going on here. This component has a Vector Autoregressive part, where
+the process mean at time \(t\) \((\mu_{process[t]})\) is a vector that
+evolves as a function of where the vector-valued process model was at
+time \(t-1\). The \(VAR\) matrix captures these dynamics with
+self-dependencies on the diagonal and possibly asymmetric
+cross-dependencies on the off-diagonals, while also incorporating the
+nonlinear smooth functions that capture seasonality for each series. The
+contemporaneous process errors are modeled by \(\Sigma_{process}\), which can be
+constrained so that process errors are independent (i.e. setting the
+off-diagonals to 0) or can be fully parameterized using a Cholesky
+decomposition (using Stan’s \(LKJcorr\) distribution to place a prior on
+the strength of inter-species correlations). For those who are
+interested in the inner workings, mvgam makes use of a
+recent breakthrough by Sarah
+Heaps to enforce stationarity of Bayesian VAR processes. This is
+advantageous as we often don’t expect forecast variance to increase
+without bound forever into the future, but many estimated VARs tend to
+behave this way.

+


Ok that was a lot to take in. Let’s fit some models to try and +inspect what is going on and what they assume. But first, we need to +update mvgam’s default priors for the observation and +process errors. By default, mvgam uses a fairly wide +Student-T prior on these parameters to avoid being overly informative. +But our observations are z-scored and so we do not expect very large +process or observation errors. However, we also do not expect very small +observation errors either as we know these measurements are not perfect. +So let’s update the priors for these parameters. In doing so, you will +get to see how the formula for the latent process (i.e. trend) model is +used in mvgam:

+
priors <- get_mvgam_priors(
+  # observation formula, which has no terms in it
+  y ~ -1,
+  
+  # process model formula, which includes the smooth functions
+  trend_formula = ~ te(temp, month, k = c(4, 4)) +
+    te(temp, month, k = c(4, 4), by = trend),
+  
+  # VAR1 model with uncorrelated process errors
+  trend_model = 'VAR1',
+  family = gaussian(),
+  data = plankton_train)
+

Get names of all parameters whose priors can be modified:

+
priors[, 3]
+#>  [1] "(Intercept)"                                                                                                                                                                                                                                                           
+#>  [2] "process error sd"                                                                                                                                                                                                                                                      
+#>  [3] "diagonal autocorrelation population mean"                                                                                                                                                                                                                              
+#>  [4] "off-diagonal autocorrelation population mean"                                                                                                                                                                                                                          
+#>  [5] "diagonal autocorrelation population variance"                                                                                                                                                                                                                          
+#>  [6] "off-diagonal autocorrelation population variance"                                                                                                                                                                                                                      
+#>  [7] "shape1 for diagonal autocorrelation precision"                                                                                                                                                                                                                         
+#>  [8] "shape1 for off-diagonal autocorrelation precision"                                                                                                                                                                                                                     
+#>  [9] "shape2 for diagonal autocorrelation precision"                                                                                                                                                                                                                         
+#> [10] "shape2 for off-diagonal autocorrelation precision"                                                                                                                                                                                                                     
+#> [11] "observation error sd"                                                                                                                                                                                                                                                  
+#> [12] "te(temp,month) smooth parameters, te(temp,month):trendtrend1 smooth parameters, te(temp,month):trendtrend2 smooth parameters, te(temp,month):trendtrend3 smooth parameters, te(temp,month):trendtrend4 smooth parameters, te(temp,month):trendtrend5 smooth parameters"
+

And their default prior distributions:

+
priors[, 4]
+#>  [1] "(Intercept) ~ student_t(3, -0.1, 2.5);"
+#>  [2] "sigma ~ student_t(3, 0, 2.5);"         
+#>  [3] "es[1] = 0;"                            
+#>  [4] "es[2] = 0;"                            
+#>  [5] "fs[1] = sqrt(0.455);"                  
+#>  [6] "fs[2] = sqrt(0.455);"                  
+#>  [7] "gs[1] = 1.365;"                        
+#>  [8] "gs[2] = 1.365;"                        
+#>  [9] "hs[1] = 0.071175;"                     
+#> [10] "hs[2] = 0.071175;"                     
+#> [11] "sigma_obs ~ student_t(3, 0, 2.5);"     
+#> [12] "lambda_trend ~ normal(5, 30);"
+

Setting priors is easy in mvgam as you can use +brms routines. Here we use more informative Normal priors +for both error components, but we impose a lower bound of 0.2 for the +observation errors:

+
priors <- c(prior(normal(0.5, 0.1), class = sigma_obs, lb = 0.2),
+            prior(normal(0.5, 0.25), class = sigma))
+

You may have noticed something else unique about this model: there is
+no intercept term in the observation formula. This is because a shared
+intercept parameter can sometimes be unidentifiable with respect to the
+latent VAR process, particularly if our series have similar long-run
+averages (which they do in this case because they were z-scored). We
+will often get better convergence in these State-Space models if we drop
+this parameter. mvgam accomplishes this by fixing the
+coefficient for the intercept to zero. Now we can fit the first model,
+which assumes that process errors are contemporaneously uncorrelated:

+
var_mod <- mvgam(  
+  # observation formula, which is empty
+  y ~ -1,
+  
+  # process model formula, which includes the smooth functions
+  trend_formula = ~ te(temp, month, k = c(4, 4)) +
+    te(temp, month, k = c(4, 4), by = trend),
+  
+  # VAR1 model with uncorrelated process errors
+  trend_model = 'VAR1',
+  family = gaussian(),
+  data = plankton_train,
+  newdata = plankton_test,
+  
+  # include the updated priors
+  priors = priors)
+
+
+

Inspecting SS models

+

This model’s summary is a bit different to other mvgam +summaries. It separates parameters based on whether they belong to the +observation model or to the latent process model. This is because we may +often have covariates that impact the observations but not the latent +process, so we can have fairly complex models for each component. You +will notice that some parameters have not fully converged, particularly +for the VAR coefficients (called A in the output) and for +the process errors (Sigma). Note that we set +include_betas = FALSE to stop the summary from printing +output for all of the spline coefficients, which can be dense and hard +to interpret:

+
summary(var_mod, include_betas = FALSE)
+#> GAM observation formula:
+#> y ~ 1
+#> <environment: 0x0000012cc9a7bc50>
+#> 
+#> GAM process formula:
+#> ~te(temp, month, k = c(4, 4)) + te(temp, month, k = c(4, 4), 
+#>     by = trend)
+#> <environment: 0x0000012cc9a7bc50>
+#> 
+#> Family:
+#> gaussian
+#> 
+#> Link function:
+#> identity
+#> 
+#> Trend model:
+#> VAR1
+#> 
+#> N process models:
+#> 5 
+#> 
+#> N series:
+#> 5 
+#> 
+#> N timepoints:
+#> 112 
+#> 
+#> Status:
+#> Fitted using Stan 
+#> 4 chains, each with iter = 1500; warmup = 1000; thin = 1 
+#> Total post-warmup draws = 2000
+#> 
+#> 
+#> Observation error parameter estimates:
+#>              2.5%  50% 97.5% Rhat n_eff
+#> sigma_obs[1] 0.20 0.26  0.34 1.01   444
+#> sigma_obs[2] 0.26 0.40  0.53 1.02   227
+#> sigma_obs[3] 0.41 0.64  0.82 1.02   121
+#> sigma_obs[4] 0.25 0.37  0.50 1.01   218
+#> sigma_obs[5] 0.32 0.44  0.54 1.02   235
+#> 
+#> GAM observation model coefficient (beta) estimates:
+#>             2.5% 50% 97.5% Rhat n_eff
+#> (Intercept)    0   0     0  NaN   NaN
+#> 
+#> Process model VAR parameter estimates:
+#>           2.5%    50% 97.5% Rhat n_eff
+#> A[1,1]  0.0052  0.510 0.850 1.02   136
+#> A[1,2] -0.3400 -0.040 0.190 1.00   480
+#> A[1,3] -0.5300 -0.055 0.320 1.01   328
+#> A[1,4] -0.2700  0.036 0.410 1.00   622
+#> A[1,5] -0.0780  0.140 0.530 1.02   206
+#> A[2,1] -0.1600  0.010 0.210 1.01   595
+#> A[2,2]  0.6100  0.790 0.910 1.00   547
+#> A[2,3] -0.3900 -0.130 0.033 1.01   369
+#> A[2,4] -0.0430  0.110 0.360 1.01   404
+#> A[2,5] -0.0440  0.064 0.210 1.00   678
+#> A[3,1] -0.3200  0.012 0.460 1.06    58
+#> A[3,2] -0.5300 -0.190 0.024 1.03   137
+#> A[3,3]  0.0780  0.430 0.710 1.01   281
+#> A[3,4] -0.0300  0.230 0.660 1.04   113
+#> A[3,5] -0.0730  0.130 0.410 1.02   168
+#> A[4,1] -0.1800  0.050 0.350 1.05    74
+#> A[4,2] -0.1100  0.052 0.240 1.02   339
+#> A[4,3] -0.4300 -0.110 0.100 1.03   152
+#> A[4,4]  0.5000  0.750 0.960 1.02   216
+#> A[4,5] -0.1900 -0.034 0.130 1.01   691
+#> A[5,1] -0.2400  0.070 0.550 1.05    77
+#> A[5,2] -0.4500 -0.130 0.057 1.03   137
+#> A[5,3] -0.6200 -0.190 0.110 1.03   145
+#> A[5,4] -0.0560  0.200 0.630 1.02   156
+#> A[5,5]  0.5300  0.740 0.960 1.03   160
+#> 
+#> Process error parameter estimates:
+#>             2.5%  50% 97.5% Rhat n_eff
+#> Sigma[1,1] 0.029 0.27  0.67 1.02    96
+#> Sigma[1,2] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[1,3] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[1,4] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[1,5] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[2,1] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[2,2] 0.063 0.11  0.18 1.02   318
+#> Sigma[2,3] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[2,4] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[2,5] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[3,1] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[3,2] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[3,3] 0.052 0.16  0.30 1.04   111
+#> Sigma[3,4] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[3,5] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[4,1] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[4,2] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[4,3] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[4,4] 0.051 0.12  0.25 1.03   111
+#> Sigma[4,5] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[5,1] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[5,2] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[5,3] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[5,4] 0.000 0.00  0.00  NaN   NaN
+#> Sigma[5,5] 0.097 0.21  0.36 1.02   145
+#> 
+#> Approximate significance of GAM process smooths:
+#>                              edf Ref.df    F p-value    
+#> te(temp,month)              5.04     15 2.00 0.04308 *  
+#> te(temp,month):seriestrend1 1.11     15 0.12 1.00000    
+#> te(temp,month):seriestrend2 1.81     15 0.28 0.99754    
+#> te(temp,month):seriestrend3 5.39     15 2.74 0.00072 ***
+#> te(temp,month):seriestrend4 3.19     15 0.56 0.91066    
+#> te(temp,month):seriestrend5 1.55     15 0.52 0.99801    
+#> ---
+#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+#> 
+#> Stan MCMC diagnostics:
+#> n_eff / iter looks reasonable for all parameters
+#> Rhats above 1.05 found for 4 parameters
+#>  *Diagnose further to investigate why the chains have not mixed
+#> 0 of 2000 iterations ended with a divergence (0%)
+#> 0 of 2000 iterations saturated the maximum tree depth of 12 (0%)
+#> Chain 2: E-FMI = 0.1696
+#> Chain 4: E-FMI = 0.1882
+#>  *E-FMI below 0.2 indicates you may need to reparameterize your model
+#> 
+#> Samples were drawn using NUTS(diag_e) at Fri Apr 19 8:19:31 AM 2024.
+#> For each parameter, n_eff is a crude measure of effective sample size,
+#> and Rhat is the potential scale reduction factor on split MCMC chains
+#> (at convergence, Rhat = 1)
+

The convergence of this model isn’t fabulous (more on this in a moment). But we can again plot the smooth functions, which this time operate on the process model. We can see the same plot using trend_effects = TRUE in the plotting functions:

+
plot(var_mod, 'smooths', trend_effects = TRUE)
+

+

The VAR matrix is of particular interest here, as it captures lagged dependencies and cross-dependencies in the latent process model:

+
mcmc_plot(var_mod, variable = 'A', regex = TRUE, type = 'hist')
+

+

Unfortunately, bayesplot doesn’t know this is a matrix of parameters, so what we see is actually the transpose of the VAR matrix. A little bit of wrangling gives us these histograms in the correct order:

+
# build the parameter names in row-major order so the
+# histograms are displayed in the same layout as the A matrix itself
+A_pars <- matrix(NA, nrow = 5, ncol = 5)
+for(i in 1:5){
+  for(j in 1:5){
+    A_pars[i, j] <- paste0('A[', i, ',', j, ']')
+  }
+}
+mcmc_plot(var_mod, 
+          variable = as.vector(t(A_pars)), 
+          type = 'hist')
+

+
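
Before scanning 25 histograms, it can also help to pull out the posterior median of each VAR coefficient directly. A minimal sketch, mirroring the covariance wrangling used later in this vignette and assuming draws are returned in Stan’s column-major parameter order:

+
# posterior median VAR coefficients, arranged as the A matrix
+A_post <- as.matrix(var_mod, variable = 'A', regex = TRUE)
+A_median <- matrix(apply(A_post, 2, median), nrow = 5, ncol = 5)
+rownames(A_median) <- colnames(A_median) <- levels(plankton_train$series)
+round(A_median, 2)
+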

There is a lot happening in this matrix. Each cell captures the lagged effect of the process in the column on the process in the row in the next timestep. So for example, the effect in cell [1,3], which tends to be negative, means that an increase in the process for series 3 (Greens) at time \(t\) is expected to lead to a subsequent decrease in the process for series 1 (Bluegreens) at time \(t+1\). The latent process model is now capturing these effects and the smooth seasonal effects, so the trend plot shows our best estimate of what the true count should have been at each time point:

+
plot(var_mod, type = 'trend', series = 1)
+

+
plot(var_mod, type = 'trend', series = 3)
+

+

The process error \(\Sigma\) captures unmodelled variation in the process models. Again, we fixed the off-diagonals to 0, so the histograms for these will look like flat boxes:

+
Sigma_pars <- matrix(NA, nrow = 5, ncol = 5)
+for(i in 1:5){
+  for(j in 1:5){
+    Sigma_pars[i, j] <- paste0('Sigma[', i, ',', j, ']')
+  }
+}
+mcmc_plot(var_mod, 
+          variable = as.vector(t(Sigma_pars)), 
+          type = 'hist')
+

+

The observation error estimates \(\sigma_{obs}\) represent how much the model thinks we might miss the true count when we take our imperfect measurements:

+
mcmc_plot(var_mod, variable = 'sigma_obs', regex = TRUE, type = 'hist')
+

+

These are still a bit hard to identify overall, especially when trying to estimate both process and observation error. Often we need to make some strong assumptions about which of these is more important for determining unexplained variation in our observations. Comparing the two error sources directly (see the sketch below) can make this trade-off easier to see.
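
For instance, we can put both error sources on the same standard deviation scale and compare their posterior medians. This is only a sketch; it assumes the draws are labelled sigma_obs[1] to sigma_obs[5] and Sigma[1,1] to Sigma[5,5], as in the summary above:

+
# compare process vs observation error on the SD scale
+sigma_obs_post <- as.matrix(var_mod, variable = 'sigma_obs', regex = TRUE)
+Sigma_var_post <- as.matrix(var_mod, variable = 'Sigma', regex = TRUE)
+diag_names <- paste0('Sigma[', 1:5, ',', 1:5, ']')
+data.frame(series = levels(plankton_train$series),
+           sigma_obs = round(apply(sigma_obs_post, 2, median), 2),
+           sigma_process = round(sqrt(apply(Sigma_var_post[, diag_names],
+                                            2, median)), 2))
+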

+
+
+

Correlated process errors

+

Let’s see if these estimates improve when we allow the process errors to be correlated. Once again, we first need to update the priors for the observation errors:

+
priors <- c(prior(normal(0.5, 0.1), class = sigma_obs, lb = 0.2),
+            prior(normal(0.5, 0.25), class = sigma))
+

And now we can fit the correlated process error model:

+
varcor_mod <- mvgam(  
+  # observation formula, which remains empty
+  y ~ -1,
+  
+  # process model formula, which includes the smooth functions
+  trend_formula = ~ te(temp, month, k = c(4, 4)) +
+    te(temp, month, k = c(4, 4), by = trend),
+  
+  # VAR1 model with correlated process errors
+  trend_model = 'VAR1cor',
+  family = gaussian(),
+  data = plankton_train,
+  newdata = plankton_test,
+  
+  # include the updated priors
+  priors = priors)
+

Plotting convergence diagnostics for the two models shows that both display similar levels of convergence:

+
mcmc_plot(varcor_mod, type = 'rhat') +
+  labs(title = 'VAR1cor')
+

+
mcmc_plot(var_mod, type = 'rhat') +
+  labs(title = 'VAR1')
+

+
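
If we would rather see the offending parameters by name than as a plot, one option is to interrogate the underlying Stan fit directly (a sketch, assuming the stanfit object is stored in the model’s $model_output slot, as in current mvgam versions):

+
# show the five parameters with the largest Rhat for each model
+sort(rstan::summary(var_mod$model_output)$summary[, 'Rhat'],
+     decreasing = TRUE)[1:5]
+sort(rstan::summary(varcor_mod$model_output)$summary[, 'Rhat'],
+     decreasing = TRUE)[1:5]
+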

The \(\Sigma\) matrix now captures any evidence of contemporaneously correlated process error:

+
Sigma_pars <- matrix(NA, nrow = 5, ncol = 5)
+for(i in 1:5){
+  for(j in 1:5){
+    Sigma_pars[i, j] <- paste0('Sigma[', i, ',', j, ']')
+  }
+}
+mcmc_plot(varcor_mod, 
+          variable = as.vector(t(Sigma_pars)), 
+          type = 'hist')
+

+

This symmetric matrix tells us there is support for correlated process errors. For example, series 2 and 4 (Diatoms and Other.algae) show positively correlated process errors, while series 2 and 3 (Diatoms and Greens) show negatively correlated errors. But it is easier to interpret these estimates if we convert the covariance matrix to a correlation matrix. Here we compute the posterior median process error correlations:

+
Sigma_post <- as.matrix(varcor_mod, variable = 'Sigma', regex = TRUE)
+median_correlations <- cov2cor(matrix(apply(Sigma_post, 2, median),
+                                      nrow = 5, ncol = 5))
+rownames(median_correlations) <- colnames(median_correlations) <- levels(plankton_train$series)
+
+round(median_correlations, 2)
+#>             Bluegreens Diatoms Greens Other.algae Unicells
+#> Bluegreens        1.00   -0.04   0.16       -0.03     0.34
+#> Diatoms          -0.04    1.00  -0.22        0.50     0.17
+#> Greens            0.16   -0.22   1.00        0.18     0.47
+#> Other.algae      -0.03    0.50   0.18        1.00     0.28
+#> Unicells          0.34    0.17   0.47        0.28     1.00
+
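
The medians alone hide posterior uncertainty, so it is worth summarising at least one correlation of interest across all draws. A quick sketch, reusing Sigma_post from the chunk above (here for the Diatoms vs Other.algae pair):

+
# posterior quantiles for the Diatoms vs Other.algae error correlation
+cor_draws <- apply(Sigma_post, 1, function(x){
+  cov2cor(matrix(x, nrow = 5, ncol = 5))[2, 4]
+})
+quantile(cor_draws, probs = c(0.025, 0.5, 0.975))
+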

Because this model is able to capture correlated errors, the VAR matrix has changed slightly:

+
A_pars <- matrix(NA, nrow = 5, ncol = 5)
+for(i in 1:5){
+  for(j in 1:5){
+    A_pars[i, j] <- paste0('A[', i, ',', j, ']')
+  }
+}
+mcmc_plot(varcor_mod, 
+          variable = as.vector(t(A_pars)), 
+          type = 'hist')
+

+

We still have some evidence of lagged cross-dependence, but some of these interactions have now been pulled more toward zero. Which model is better? The forecasts don’t appear to differ very much, at least qualitatively (here are forecasts for three of the series from each model):

+
plot(var_mod, type = 'forecast', series = 1, newdata = plankton_test)
+

+
#> Out of sample CRPS:
+#> [1] 3.050376
+
plot(varcor_mod, type = 'forecast', series = 1, newdata = plankton_test)
+

+
#> Out of sample CRPS:
+#> [1] 2.901518
+
plot(var_mod, type = 'forecast', series = 2, newdata = plankton_test)
+

+
#> Out of sample CRPS:
+#> [1] 5.979108
+
plot(varcor_mod, type = 'forecast', series = 2, newdata = plankton_test)
+

+
#> Out of sample CRPS:
+#> [1] 5.763997
+
plot(var_mod, type = 'forecast', series = 3, newdata = plankton_test)
+

+
#> Out of sample CRPS:
+#> [1] 4.027927
+
plot(varcor_mod, type = 'forecast', series = 3, newdata = plankton_test)
+

+
#> Out of sample CRPS:
+#> [1] 4.013858
+

We can compute the variogram score for out-of-sample forecasts to get a sense of which model does a better job of capturing the dependence structure in the true evaluation set:

+
# create forecast objects for each model
+fcvar <- forecast(var_mod)
+fcvarcor <- forecast(varcor_mod)
+
+# plot the difference in variogram scores; a negative value means
+# the VAR1cor model is better, a positive value favours the VAR1 model
+diff_scores <- score(fcvarcor, score = 'variogram')$all_series$score -
+  score(fcvar, score = 'variogram')$all_series$score
+plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred', 
+     ylim = c(-1*max(abs(diff_scores), na.rm = TRUE),
+              max(abs(diff_scores), na.rm = TRUE)),
+     bty = 'l',
+     xlab = 'Forecast horizon',
+     ylab = expression(variogram[VAR1cor]~-~variogram[VAR1]))
+abline(h = 0, lty = 'dashed')
+

+

And we can also compute the energy score for out-of-sample forecasts to get a sense of which model provides better-calibrated forecasts:

+
+# plot the difference in energy scores; a negative value means
+# the VAR1cor model is better, a positive value favours the VAR1 model
+diff_scores <- score(fcvarcor, score = 'energy')$all_series$score -
+  score(fcvar, score = 'energy')$all_series$score
+plot(diff_scores, pch = 16, cex = 1.25, col = 'darkred', 
+     ylim = c(-1*max(abs(diff_scores), na.rm = TRUE),
+              max(abs(diff_scores), na.rm = TRUE)),
+     bty = 'l',
+     xlab = 'Forecast horizon',
+     ylab = expression(energy[VAR1cor]~-~energy[VAR1]))
+abline(h = 0, lty = 'dashed')
+

+
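
To put a rough number on the overall difference, we can average each score difference across the forecast horizon, reusing the forecast objects created above (negative values favour the VAR1cor model):

+
# mean score differences across the horizon (negative favours VAR1cor)
+c(variogram = mean(score(fcvarcor, score = 'variogram')$all_series$score -
+                     score(fcvar, score = 'variogram')$all_series$score,
+                   na.rm = TRUE),
+  energy = mean(score(fcvarcor, score = 'energy')$all_series$score -
+                  score(fcvar, score = 'energy')$all_series$score,
+                na.rm = TRUE))
+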

The models tend to provide similar forecasts, though the correlated error model does slightly better overall. We would probably need a more extensive rolling forecast evaluation exercise if we had to choose just one model for production. mvgam offers some utilities for doing this (see ?lfo_cv for guidance and the sketch below).
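
As a hedged sketch of what that might look like (the min_t and fc_horizon values here are illustrative, and we assume the returned object stores per-timepoint ELPDs in $elpds, as in recent mvgam versions):

+
# approximate leave-future-out CV for each model; a larger summed
+# ELPD indicates better rolling forecast performance
+# (min_t and fc_horizon values are illustrative only)
+lfo_var <- lfo_cv(var_mod, min_t = 60, fc_horizon = 12)
+lfo_varcor <- lfo_cv(varcor_mod, min_t = 60, fc_horizon = 12)
+sum(lfo_var$elpds)
+sum(lfo_varcor$elpds)
+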

+
+
+

Further reading

+

The following papers and resources offer a lot of useful material about multivariate State-Space models and how they can be applied in practice:

+

Heaps, Sarah E. “Enforcing stationarity through the prior in vector autoregressions.” Journal of Computational and Graphical Statistics 32.1 (2023): 74-83.

+

Hannaford, Naomi E., et al. “A sparse Bayesian hierarchical vector autoregressive model for microbial dynamics in a wastewater treatment plant.” Computational Statistics & Data Analysis 179 (2023): 107659.

+

Holmes, Elizabeth E., Eric J. Ward, and Kellie Wills. “MARSS: multivariate autoregressive state-space models for analyzing time-series data.” R Journal 4.1 (2012): 11.

+

Ward, Eric J., et al. “Inferring spatial structure from time-series data: using multivariate state-space models to detect metapopulation structure of California sea lions in the Gulf of California, Mexico.” Journal of Applied Ecology 47.1 (2010): 47-56.

+

Auger-Méthé, Marie, et al. “A guide to state-space modeling of ecological time series.” Ecological Monographs 91.4 (2021): e01470.

+
+
+
+

Interested in contributing?

+

I’m actively seeking PhD students and other researchers to work in the areas of ecological forecasting, multivariate model evaluation and development of mvgam. Please reach out if you are interested (n.clark’at’uq.edu.au).

+
diff --git a/inst/doc/vignette.rds b/inst/doc/vignette.rds
new file mode 100644
index 00000000..32d8463e
Binary files /dev/null and b/inst/doc/vignette.rds differ