diff --git a/.Rbuildignore b/.Rbuildignore index 206a9a08..114f0403 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,2 +1,3 @@ ^mvgam\.Rproj$ ^\.Rproj\.user$ +^LICENSE\.md$ diff --git a/DESCRIPTION b/DESCRIPTION index 24cf163a..f2213eca 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,15 +1,21 @@ Package: mvgam -Title: What the Package Does (One Line, Title Case) +Title: Multivariate Bayesian Generalised Additive Models for discrete time series Version: 0.0.0.9000 Authors@R: - person(given = "First", - family = "Last", + person(given = "Nicholas", + family = "Clark", role = c("aut", "cre"), - email = "first.last@example.com", - comment = c(ORCID = "YOUR-ORCID-ID")) -Description: What the package does (one paragraph). -License: `use_mit_license()`, `use_gpl3_license()` or friends to - pick a license + email = "nicholas.j.clark1214@.com") +Description: Multivariate Bayesian Generalised Additive Models for discrete time series +Maintainer: Nicholas J Clark +License: MIT + file LICENSE +Depends: R (>= 4.0.0), mgcv, rjags +Imports: parallel, + coda, + MCMCpack, + runjags, + MCMCvis, + dplyr Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..10a9fe51 --- /dev/null +++ b/LICENSE @@ -0,0 +1,2 @@ +YEAR: 2021 +COPYRIGHT HOLDER: Nicholas Clark diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 00000000..5608c0db --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,21 @@ +# MIT License + +Copyright (c) 2021 Nicholas Clark + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/NAMESPACE b/NAMESPACE index 6ae92683..f19d6267 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,2 +1,6 @@ # Generated by roxygen2: do not edit by hand +export(hpd) +export(mvjagam) +export(plot_mvgam_fc) +export(plot_mvgam_trend) diff --git a/R/all_neon_tick_data.R b/R/all_neon_tick_data.R new file mode 100644 index 00000000..61d020a7 --- /dev/null +++ b/R/all_neon_tick_data.R @@ -0,0 +1,15 @@ +#' NEON Amblyomma and Ixodes tick abundance survey data +#' +#' A dataset containing timeseries of Amblyomma americanum and Ixodes scapularis nymph abundances at NEON sites +#' +#' @format A tibble/dataframe containing covariate information alongside the main fields of: +#' \describe{ +#' \item{Year}{Year of sampling} +#' \item{epiWeek}{Epidemiological week of sampling} +#' \item{plot_ID}{NEON plot ID for survey location} +#' \item{siteID}{NEON site ID for survey location} +#' \item{amblyomma_americanum}{Counts of A. americanum nymphs} +#' \item{ixodes_scapularis}{Counts of I. scapularis nymphs} +#' } +#' @source \url{https://www.neonscience.org/data} +"all_neon_tick_data" diff --git a/R/hpd.R b/R/hpd.R new file mode 100644 index 00000000..526b79c1 --- /dev/null +++ b/R/hpd.R @@ -0,0 +1,58 @@ +# Calculate the highest posterior density interval +#' +#'This function uses estimated densities to calculate HPD intevals. Code originally supplied by Martyn Plummer +#' +#'@param x \code{vector} of values representing the distribution to be summarised +#'@param coverage \code{numeric} value specifying the width of the HPD interval. Default is 0.95 +#'@return A \code{list} containing the reconciled forecast distributions for each series in \code{y}. Each element in +#'the \code{vector} with three values: lower estimate, median estimate and upper estimate of the HPD interval +#' +#'@export +#' +hpd <- function(x, coverage = 0.95) +{ + x <- as.matrix(x) + out <- matrix(NA, nrow = ncol(x), ncol = 3) + rownames(out) <- dimnames(x)[[2]] + colnames(out) <- c("mode", "lower", "upper") + + f <- function(p) { + if (p == density.range[2]) { + set.coverage <- 0 + } + else { + p.upper <- min(y.density$y[y.density$y > p]) + p.lower <- max(y.density$y[y.density$y <= p]) + cov.upper <- sum(y.counts[y.density$y >= p.upper]) / sum(y.counts) + cov.lower <- sum(y.counts[y.density$y >= p.lower]) / sum(y.counts) + c <- (p.upper - p)/(p.upper - p.lower) + set.coverage <- c * cov.upper + (1 - c) * cov.lower + } + return(set.coverage - coverage) + } + + for (i in 1:ncol(x)) { + y <- unclass(x[,i]) + y.density <- stats::density(y, n=1024) + m <- length(y.density$x) + + ## Find the midpoint + out[i,1] <- y.density$x[which.max(y.density$y)] + dx <- diff(range(y.density$x)) / m + breaks <- c(y.density$x[1] - dx / 2, y.density$x + dx /2) + y.counts <- hist(y, breaks = breaks, plot = FALSE)$counts + density.range <- range(y.density$y) + uniroot.out <- stats::uniroot(f, density.range) + + ## Assuming that we have a single interval, find the limits + out[i,2:3] <- range(y.density$x[y.density$y > uniroot.out$root]) + + ## Check! + if (sum(abs(diff(y.density$y > uniroot.out$root))) != 2) { + warning("HPD set is not a closed interval") + } + } + out <- c(out[2], out[1], out[3]) + names(out) <- c('lower', 'median', 'upper') + return(out) +} diff --git a/R/mvjagam.R b/R/mvjagam.R new file mode 100644 index 00000000..4be75ff5 --- /dev/null +++ b/R/mvjagam.R @@ -0,0 +1,493 @@ +#'Fit a Bayesian multivariate GAM to a set of discrete time series +#' +#'This function estimates the posterior distribution for a multivariate GAM that includes +#'smooth seasonality and possible other smooth functions specified in the GAM formula. State-space latent trends +#'are estimated for each series, with a number of options for specifying the structures of the trends +#' +#' +#'@param formula A \code{character} string specifying the GAM formula. These are exactly like the formula +#'for a GLM except that smooth terms, s, te, ti and t2, can be added to the right hand side +#'to specify that the linear predictor depends on smooth functions of predictors (or linear functionals of these). +#'@param data_train A\code{dataframe} containing the model response variable and covariates +#'required by the GAM \code{formula}. Should include columns: +#''y' (the discrete outcomes; NAs allowed) +#''series' (character or factor index of the series IDs) +#''season' (numeric index of the seasonal time point for each observation; should not have any missing) +#''year' the numeric index for year +#''in_season' indicator for whether the observation is in season or not. If the counts tend to go to zero +#'during the off season (as in tick counts for example), setting this to zero can be useful as trends won't contribute during +#'during this time but they continue to evolve, allowing the trend from the past season to continue evolving rather than forcing +#'it to zero +#'Any other variables to be included in the linear predictor of \code{formula} must also be present +#'@param data_test \code{dataframe} containing at least 'series', 'season', 'year' and 'in_season' for the forecast horizon, in +#'addition to any other variables included in the linear predictor of \code{formula} +#'@param use_nb \code{logical} If \code{TRUR}, use a Negative Binomial likelihood with estimated +#'overdispersion parameter r; +#'if \code{FALSE}, set r to \code{10,000} to approximate the Poisson distribution +#'@param use_mv \code{logical} If \code{TRUE} and the total number of series is \code{<=5}, +#'a multivariate gaussian distribution is used for the state space trend errors. If \code{TRUE} and the total number +#'of series is \code{>5}, a set of latent dynamic factors is estimated to capture possible dependencies in the state +#'space trends. If \code{FALSE}, independent gaussian distributions are used for the trends +#'@param n.chains \code{integer} the number of parallel chains for the model +#'@param n.adapt \code{integer} the number of iterations for adaptation. See adapt for details. +#'If \code{n.adapt} = 0 then no adaptation takes place +#'@param n.iter \code{integer} the number of iterations of the Markov chain to run +#'@param auto_update \code{logical} If \code{TRUE}, the model is run for up to \code{3} additional sets of +#'\code{n.iter} iterations, or until the lower 15% of effective sample sizes reaches \code{100} +#'@param phi_prior \code{character} specifying (in JAGS syntax) the prior distributions for the AR1 coefficients in the +#'latent trends +#'@param rho_prior \code{character} specifying (in JAGS syntax) the prior distributions for the smooth penalty parameters +#'@param r_prior \code{character} specifying (in JAGS syntax) the prior distribution for the Negative Binomial +#'overdispersion parameter +#'@param tau_prior \code{character} specifying (in JAGS syntax) the prior distributions for the independent gaussian +#'precisions used for the latent trends (ignored if \code{use_mv == TRUE}) +#'@details A \code{\link[mcgv]{jagam}} model file is generated from \code{formula} and modified to include the latent +#'AR1 state space trends. The model parameters are esimated in a Bayesian framework using Gibbs sampling +#'in \code{\link[rjags]{jags.model}}. +#' +#'@return A \code{list} object containing JAGS model output, the text representation of the model file, +#'the mgcv model output (for easily generating simulations at +#'unsampled covariate values), the names of the smooth parameters and the matrix of indices for each series +#' +#'@export + +mvjagam = function(formula, + data_train, + data_test, + use_nb = TRUE, + use_mv = FALSE, + n.adapt = 1000, + n.chains = 2, + n.burnin = 5000, + n.iter = 2000, + thin = 2, + auto_update = TRUE, + phi_prior = 'dbeta(1, 1)', + rho_prior = 'dunif(-12, 12)', + r_prior = 'dexp(0.001)', + tau_prior = 'dunif(1, 100)'){ + + # Fill in any NA's with arbitrary values so the observations aren't dropped when jags data is created + orig_y <- data_train$y + data_train$y[which(is.na(data_train$y))] <- floor(sample(seq(0,50), + length(which(is.na(data_train$y))), + T)) + + # Estimate the GAM model using mgcv so that the linear predictor matrix can be easily calculated + # when simulating from the JAGS model later on + ss_gam <- mgcv::gam(formula(formula), + data = data_train, + method = "REML", + family = poisson(), + drop.unused.levels = FALSE) + + # Make jags file and appropriate data structures + ss_jagam <- mgcv::jagam(formula, + data = data_train, + family = poisson(), + drop.unused.levels = FALSE, + file = 'base_gam.txt', + sp.prior = 'log.uniform') + + # Read in the base (unmodified) jags model file + base_model <- suppressWarnings(readLines('base_gam.txt')) + + # Need to remove lines from the linear predictor section and replace with + # the modified state space representation + lines_remove <- c(1:grep('## response', base_model)) + base_model <- base_model[-lines_remove] + base_model[grep('rho\\[i\\] ~', base_model)] <- paste0(' rho[i] ~ ', rho_prior) + + # Add replacement lines for trends and linear predictor + if(!use_mv){ + modification <- c("model { + + ## GAM linear predictor + eta <- X %*% b + + ## Mean expectations + for (i in 1:n) { + for (s in 1:n_series) { + mu[i, s] <- exp(eta[ytimes[i, s]] + trend[i, s] * in_season[i]) + } + } + + ## AR1 state space trends + for(s in 1:n_series) { + trend[1, s] ~ dnorm(0, tau[s]) + } + + for (i in 2:n){ + for (s in 1:n_series){ + trend[i, s] ~ dnorm(trend[i - 1, s] * phi[s], tau[s]) + } + } + + for (s in 1:n_series){ + phi[s] ~ dunif(-0.9, 0.9) + tau[s] ~ dexp(0.5) + } + + ## Negative binomial likelihood functions + for (i in 1:n) { + for (s in 1:n_series) { + y[i, s] ~ dnegbin(rate[i, s], r); + rate[i, s] <- ifelse((r / (r + mu[i, s])) < min_eps, min_eps, + (r / (r + mu[i, s]))) + } + } + + ## Set overdispersion parameter to 10000 if not using nb; + ## this approximates the Poisson distribution + r1 ~ dgamma(0.01, 0.01) + r2 <- 10000 + r_indicator <- ifelse(use_nb > 0, 1, 0) + r <- (r1 * r_indicator) + (r2 * (1 - r_indicator)) + + ## Posterior predictions + for (i in 1:n) { + for (s in 1:n_series) { + ypred[i, s] ~ dnegbin(rate[i, s], r) + } + } + ") + + # Create the joined model file + fil <- tempfile(fileext = ".xt") + cat(c(readLines(textConnection(modification)), base_model), file = fil, + sep = "\n") + model_file <- readLines(fil, n = -1) + + # Update and prior distribution choices + model_file[grep('phi\\[s\\] ~', model_file)] <- paste0(' phi[s] ~ ', phi_prior) + model_file[grep('tau\\[s\\] ~', model_file)] <- paste0(' tau[s] ~ ', tau_prior) + model_file[grep('r1 ~', model_file)] <- paste0(' r1 ~ ', r_prior) + + model_file <- textConnection(model_file) + } + + if(use_mv){ + if(length(unique(data_train$series)) <= 5){ + modification <- c("model { + + ## GAM linear predictor + eta <- X %*% b + + ## Mean expectations + for (i in 1:n) { + for (s in 1:n_series) { + mu[i, s] <- exp(eta[ytimes[i, s]] + trend[i, s] * in_season[i]) + } + } + + ## AR1 state space trends with possible covarying errors + trend[1, 1:n_series] ~ dmnorm(zeros, tau[ , ]) + + for (i in 2:n){ + for (s in 1:n_series){ + trend_mus[i, s] <- trend[i - 1, s] * phi[s] + } + } + + for (i in 2:n){ + trend[i, 1:n_series] ~ dmnorm(trend_mus[i, ], tau[ , ]); + } + + # AR1 persistence coefficients + for (s in 1:n_series){ + phi[s] ~ dunif(-0.9, 0.9) + } + + # Wishart is appropriate conjugate prior for dmnorm precision + # df = n_series + 1 sets uniform distribution on correlation parameters + tau[1:n_series, 1:n_series] ~ dwish(diag_mat[ , ], n_series + 1) + + # Scaled residual correlation matrix (between -1 and 1; Gelman & Hill 2007) + Covar.raw[1:n_series, 1:n_series] <- inverse(tau[ , ]) + for(i in 1:n_series){ + for(j in 1:n_series){ + cor[i, j] <- Covar.raw[i, j] / sqrt(Covar.raw[i, i] * Covar.raw[j, j]) + } + sigma[i] <- abs(xi[i]) * sqrt(Covar.raw[i, i]) + # Redundant scaling factor improves sampling efficiency + xi[i] ~ dunif(0, 100) + } + + ## Negative binomial likelihood functions + for (i in 1:n) { + for (s in 1:n_series) { + y[i, s] ~ dnegbin(rate[i, s], r); + rate[i, s] <- ifelse((r / (r + mu[i, s])) < min_eps, min_eps, + (r / (r + mu[i, s]))) + } + } + + ## Set overdispersion parameter to 10000 if not using nb; + ## this approximates the Poisson distribution + r1 ~ dgamma(0.01, 0.01) + r2 <- 10000 + r_indicator <- ifelse(use_nb > 0, 1, 0) + r <- (r1 * r_indicator) + (r2 * (1 - r_indicator)) + + ## Posterior predictions + for (i in 1:n) { + for (s in 1:n_series) { + ypred[i, s] ~ dnegbin(rate[i, s], r) + } + } + ") + + # Create the joined model file + fil <- tempfile(fileext = ".xt") + cat(c(readLines(textConnection(modification)), base_model), file = fil, + sep = "\n") + model_file <- readLines(fil, n = -1) + + # Update and prior distribution choices + model_file[grep('phi\\[s\\] ~', model_file)] <- paste0(' phi[s] ~ ', phi_prior) + model_file[grep('tau\\[s\\] ~', model_file)] <- paste0(' tau[s] ~ ', tau_prior) + model_file[grep('r1 ~', model_file)] <- paste0(' r1 ~ ', r_prior) + + model_file <- textConnection(model_file) + } else { + #### Use the latent variable approach to estimate dependencies when n_series is large + modification <- c("model { + + ## GAM linear predictor + eta <- X %*% b + + ## Mean expectations + for (i in 1:n) { + for (s in 1:n_series) { + mu[i, s] <- exp(eta[ytimes[i, s]] + trend[i, s] * in_season[i]) + } + } + + ## Latent factors evolve as AR1 time series + tau_fac ~ dnorm(50, 0.05)T(0.01,) + for(j in 1:n_lv){ + LV[1, j] ~ dnorm(0, tau_fac) + } + + for(i in 2:n){ + for(j in 1:n_lv){ + LV[i, j] ~ dnorm(LV[i - 1, j] * phi[j], tau_fac) + } + } + + # AR1 persistence coefficients for latent factors + for (s in 1:n_lv){ + phi[s] ~ dunif(0, 1) + } + + ## Latent factor loadings, with identifiability constraints (Hui et al 2015) + ## Upper diagonal 0 + for(i in 1:(n_lv - 1)){ + for(j in (i + 1):n_lv){ + lv_coefs[i, j] <- 0 + } + } + + ## Positive sign constraints on diagonal elements + for(i in 1:n_lv) { + lv_coefs[i, i] ~ dunif(0, 1) + } + + ## Lower diagonal free + for(i in 2:n_lv){ + for(j in 1:(i - 1)){ + lv_coefs[i, j] ~ dunif(-1, 1) + } + } + + ## Other elements also free + for(i in (n_lv + 1):n_series) { + for(j in 1:n_lv){ + lv_coefs[i, j] ~ dunif(-1, 1) + } + } + + ## Trend evolution for the series depends on latent factors; + ## fix the precision here as it may become unidentifiable when estimating + ## precision for the latent factors + for (i in 1:n){ + for (s in 1:n_series){ + trend[i, s] ~ dnorm(inprod(lv_coefs[s,], LV[i,]), 100) + } + } + + ## Negative binomial likelihood functions + for (i in 1:n) { + for (s in 1:n_series) { + y[i, s] ~ dnegbin(rate[i, s], r); + rate[i, s] <- ifelse((r / (r + mu[i, s])) < min_eps, min_eps, + (r / (r + mu[i, s]))) + } + } + + ## Set overdispersion parameter to 10000 if not using nb; + ## this approximates the Poisson distribution + r1 ~ dgamma(0.01, 0.01) + r2 <- 10000 + r_indicator <- ifelse(use_nb > 0, 1, 0) + r <- (r1 * r_indicator) + (r2 * (1 - r_indicator)) + + ## Posterior predictions + for (i in 1:n) { + for (s in 1:n_series) { + ypred[i, s] ~ dnegbin(rate[i, s], r) + } + } + ") + + # Create the joined model file + fil <- tempfile(fileext = ".xt") + cat(c(readLines(textConnection(modification)), base_model), file = fil, + sep = "\n") + model_file <- readLines(fil, n = -1) + + # Update and prior distribution choices + model_file[grep('phi\\[s\\] ~', model_file)] <- paste0(' phi[s] ~ ', phi_prior) + model_file[grep('tau\\[s\\] ~', model_file)] <- paste0(' tau[s] ~ ', tau_prior) + model_file[grep('r1 ~', model_file)] <- paste0(' r1 ~ ', r_prior) + + model_file <- textConnection(model_file) + } + + } + + # Covariate datafrme including training and testing observations + X <- data.frame(rbind(ss_jagam$jags.data$X, + predict(ss_gam, newdata = data_test, type = 'lpmatrix'))) + + # Add a time variable + X$time <- rbind(data_train, data_test[,1:ncol(data_train)]) %>% + dplyr::left_join(rbind(data_train, data_test[,1:ncol(data_train)]) %>% + dplyr::select(year, season) %>% + dplyr::distinct() %>% + dplyr::arrange(year, season) %>% + dplyr::mutate(time = dplyr::row_number())) %>% + dplyr::pull(time) + + # Add an outcome variable + X$outcome <- c(orig_y, rep(NA, NROW(data_test))) + + # Add a series identifier variable + X$series <- as.numeric(rbind(data_train, data_test)$series) + + # Arrange by time then by series + X %>% + dplyr::arrange(time, series) -> X + + # Matrix of indices in X that correspond to timepoints for each series + ytimes <- matrix(NA, nrow = max(X$time), ncol = length(unique(X$series))) + for(i in 1:length(unique(X$series))){ + ytimes[,i] <- which(X$series == i) + } + ss_jagam$jags.data$ytimes <- ytimes + + # Matrix of outcomes in X that correspond to each series at each timepoint + ys_mat <- matrix(NA, nrow = NROW(ytimes), ncol = NCOL(ytimes)) + for(i in 1:length(unique(X$series))){ + ys_mat[,i] <- X$outcome[which(X$series == i)] + } + ss_jagam$jags.data$y <- ys_mat + + # Other necessary variables for JAGS + ss_jagam$jags.data$n <- NROW(ytimes) + ss_jagam$jags.data$n_series <- NCOL(ytimes) + ss_jagam$jags.data$X = as.matrix(X %>% + dplyr::select(-time, -series, -outcome)) + + if(use_mv){ + # We use a Wishart prior for the state space multivariate precision matrix + ss_jagam$jags.data$diag_mat <- diag(ss_jagam$jags.data$n_series) + ss_jagam$jags.data$zeros <- rep(0, ss_jagam$jags.data$n_series) + } + + + # Set use_nb to 0 for approximate Poisson; otherwise to 1 for Negative Binomial + if(use_nb){ + ss_jagam$jags.data$use_nb <- 1 + } else { + ss_jagam$jags.data$use_nb <- 0 + } + + # Machine epsilon for minimum allowable non-zero rate + ss_jagam$jags.data$min_eps <- .Machine$double.eps + + # Ensure inits fall within prior bounds for rho + ss_jagam$jags.ini$rho[ss_jagam$jags.ini$rho > 12] <- 11.99 + ss_jagam$jags.ini$rho[ss_jagam$jags.ini$rho < 12] <- -11.99 + + # Number of latent variables to use + ss_jagam$jags.data$n_lv <- floor(ss_jagam$jags.data$n_series / 2) + + # Binary indicator of in_season + ss_jagam$jags.data$in_season <- rbind(data_train, data_test) %>% + dplyr::select(year, season, in_season) %>% + dplyr::distinct() %>% + dplyr::arrange(year, season) %>% + dplyr::pull(in_season) + + # Initiate adaptation of the model + load.module("glm") + gam_mod <- jags.model(model_file, + data = ss_jagam$jags.data, + inits = ss_jagam$jags.ini, + n.chains = n.chains, + n.adapt = n.adapt) + unlink('base_gam.txt') + + # Update the model for the burnin period + update(gam_mod, n.burnin) + + # Gather posterior samples for the specified parameters + if(!use_mv){ + param <- c('rho', 'b', 'mu', 'ypred', 'r', 'phi', + 'tau', 'trend', 'p_resid') + } else { + if(length(unique(data_train$series)) <= 5){ + param <- c('rho', 'b', 'mu', 'ypred', 'r', 'phi', + 'tau', 'trend', 'p_resid', 'cor') + } else { + param <- c('rho', 'b', 'mu', 'ypred', 'r', 'phi', + 'trend', 'p_resid', 'lv_coefs', 'tau_fac') + } + + } + + out_gam_mod <- coda.samples(gam_mod, + variable.names = param, + n.iter = n.iter, + thin = thin) + if(auto_update){ + # Update until reasonable convergence (exclude 'cor' as it will have some 0 neffs) + mod_summary <- MCMCvis::MCMCsummary(out_gam_mod, c('rho', 'b', 'phi', 'tau')) + for(i in 1:3){ + if(quantile(mod_summary$Rhat, 0.85, na.rm = T) < 1 & + quantile(mod_summary$n.eff, 0.15) > 100){ + break; + } + update(gam_mod, 5000) + out_gam_mod <- rjags::coda.samples(gam_mod, + variable.names = param, + n.iter = n.iter, + thin = thin) + mod_summary <- MCMCvis::MCMCsummary(out_gam_mod, c('rho', 'b', 'phi', 'tau')) + } + } + + model_file <- readLines(fil, n = -1) + unlink(fil) + + return(list(jags_output = out_gam_mod, + model_file = model_file, + mgcv_model = ss_gam, + jags_model = gam_mod, + smooth_param_details = base_model[sort(c(grep('## prior for', base_model), + grep('## prior for', base_model)+1))], + ytimes = ytimes)) + + +} diff --git a/R/plot_mvgam_fc.R b/R/plot_mvgam_fc.R new file mode 100644 index 00000000..c30e51a2 --- /dev/null +++ b/R/plot_mvgam_fc.R @@ -0,0 +1,51 @@ +#'Plot mvjagam posterior predictions for a specified series +#'@param out_gam_mod \code{list} object returned from \code{mvjagam} +#'@param series \code{integer} specifying which series in the set is to be plotted +#'@param data_train A\code{dataframe} containing the model response variable and covariates +#'required by the GAM \code{formula}. Should include columns: +#''y' (the discrete outcomes; NAs allowed) +#''series' (character or factor index of the series IDs) +#''season' (numeric index of the seasonal time point for each observation; should not have any missing) +#''year' the numeric index for year +#''in_season' indicator for whether the observation is in season or not. If the counts tend to go to zero +#'during the off season (as in tick counts for example), setting this to zero can be useful as trends won't contribute during +#'during this time but they continue to evolve, allowing the trend from the past season to continue evolving rather than forcing +#'it to zero +#'Any other variables to be included in the linear predictor of \code{formula} must also be present +#'@param data_test \code{dataframe} containing at least 'series', 'season', 'year' and 'in_season' for the forecast horizon, in +#'addition to any other variables included in the linear predictor of \code{formula} +#'@export +plot_mvgam_fc = function(out_gam_mod, series, data_test, data_train){ + pred_indices <- seq(0, length(out_gam_mod$ytimes), + length.out = NCOL(out_gam_mod$ytimes) + 1) + preds <- MCMCvis::MCMCchains(out_gam_mod$jags_output, 'ypred')[,pred_indices[series]:pred_indices[series + 1]] + preds_last <- preds[1,] + int <- apply(preds, + 2, hpd, 0.95) + plot(preds_last, + type = 'l', ylim = c(0, max(int) + 2), + col = rgb(1,0,0, alpha = 0), + ylab = paste0('Estimated counts for ', levels(data_train$series)[series]), + xlab = 'Time') + int[int<0] <- 0 + polygon(c(seq(1:(NCOL(int))), rev(seq(1:NCOL(int)))), + c(int[1,],rev(int[3,])), + col = rgb(150, 0, 0, max = 255, alpha = 100), border = NA) + int <- apply(preds, + 2, hpd, 0.68) + int[int<0] <- 0 + polygon(c(seq(1:(NCOL(int))), rev(seq(1:NCOL(int)))), + c(int[1,],rev(int[3,])), + col = rgb(150, 0, 0, max = 255, alpha = 180), border = NA) + lines(int[2,], col = rgb(150, 0, 0, max = 255), lwd = 2, lty = 'dashed') + + s_name <- levels(data_train$series)[series] + points(rbind(data_train, data_test) %>% + dplyr::filter(series == s_name) %>% + dplyr::select(year, season, y) %>% + dplyr::distinct() %>% + dplyr::arrange(year, season) %>% + dplyr::pull(y), pch = 16) + abline(v = NROW(data_train) / NCOL(out_gam_mod$ytimes)) + +} diff --git a/R/plot_mvgam_trend.R b/R/plot_mvgam_trend.R new file mode 100644 index 00000000..9ef2cb3f --- /dev/null +++ b/R/plot_mvgam_trend.R @@ -0,0 +1,41 @@ +#'Plot mvjagam latent trend for a specified series +#'@param out_gam_mod \code{list} object returned from \code{mvjagam} +#'@param series \code{integer} specifying which series in the set is to be plotted +#'@param data_train A\code{dataframe} containing the model response variable and covariates +#'required by the GAM \code{formula}. Should include columns: +#''y' (the discrete outcomes; NAs allowed) +#''series' (character or factor index of the series IDs) +#''season' (numeric index of the seasonal time point for each observation; should not have any missing) +#''year' the numeric index for year +#''in_season' indicator for whether the observation is in season or not. If the counts tend to go to zero +#'during the off season (as in tick counts for example), setting this to zero can be useful as trends won't contribute during +#'during this time but they continue to evolve, allowing the trend from the past season to continue evolving rather than forcing +#'it to zero +#'Any other variables to be included in the linear predictor of \code{formula} must also be present +#'@param data_test \code{dataframe} containing at least 'series', 'season', 'year' and 'in_season' for the forecast horizon, in +#'addition to any other variables included in the linear predictor of \code{formula} +#'@export +plot_mvgam_trend = function(out_gam_mod, series, data_test, data_train){ + pred_indices <- seq(0, length(out_gam_mod$ytimes), + length.out = NCOL(out_gam_mod$ytimes) + 1) + preds <- MCMCvis::MCMCchains(out_gam_mod$jags_output, 'trend')[,pred_indices[series]:pred_indices[series + 1]] + preds_last <- preds[1,] + plot(preds_last, + type = 'l', ylim = range(preds), + col = rgb(1,0,0, alpha = 0), + ylab = paste0('Estimated trend for ', levels(data_train$series)[series]), + xlab = 'Time') + int <- apply(preds, + 2, hpd, 0.95) + polygon(c(seq(1:(NCOL(int))), rev(seq(1:NCOL(int)))), + c(int[1,],rev(int[3,])), + col = rgb(150, 0, 0, max = 255, alpha = 100), border = NA) + int <- apply(preds, + 2, hpd, 0.68) + polygon(c(seq(1:(NCOL(int))), rev(seq(1:NCOL(int)))), + c(int[1,],rev(int[3,])), + col = rgb(150, 0, 0, max = 255, alpha = 180), border = NA) + lines(int[2,], col = rgb(150, 0, 0, max = 255), lwd = 2, lty = 'dashed') + abline(v = NROW(data_train) / NCOL(out_gam_mod$ytimes)) + +} diff --git a/data/all_neon_tick_data.rda b/data/all_neon_tick_data.rda new file mode 100644 index 00000000..736490b1 Binary files /dev/null and b/data/all_neon_tick_data.rda differ diff --git a/man/all_neon_tick_data.Rd b/man/all_neon_tick_data.Rd new file mode 100644 index 00000000..be7ed812 --- /dev/null +++ b/man/all_neon_tick_data.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/all_neon_tick_data.R +\docType{data} +\name{all_neon_tick_data} +\alias{all_neon_tick_data} +\title{NEON Amblyomma and Ixodes tick abundance survey data} +\format{ +A tibble/dataframe containing covariate information alongside the main fields of: +\describe{ +\item{Year}{Year of sampling} +\item{epiWeek}{Epidemiological week of sampling} +\item{plot_ID}{NEON plot ID for survey location} +\item{siteID}{NEON site ID for survey location} +\item{amblyomma_americanum}{Counts of A. americanum nymphs} +\item{ixodes_scapularis}{Counts of I. scapularis nymphs} +} +} +\source{ +\url{https://www.neonscience.org/data} +} +\usage{ +all_neon_tick_data +} +\description{ +A dataset containing timeseries of Amblyomma americanum and Ixodes scapularis nymph abundances at NEON sites +} +\keyword{datasets} diff --git a/man/hpd.Rd b/man/hpd.Rd new file mode 100644 index 00000000..2d7c6840 --- /dev/null +++ b/man/hpd.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/hpd.R +\name{hpd} +\alias{hpd} +\title{This function uses estimated densities to calculate HPD intevals. Code originally supplied by Martyn Plummer} +\usage{ +hpd(x, coverage = 0.95) +} +\arguments{ +\item{x}{\code{vector} of values representing the distribution to be summarised} + +\item{coverage}{\code{numeric} value specifying the width of the HPD interval. Default is 0.95} +} +\value{ +A \code{list} containing the reconciled forecast distributions for each series in \code{y}. Each element in +the \code{vector} with three values: lower estimate, median estimate and upper estimate of the HPD interval +} +\description{ +This function uses estimated densities to calculate HPD intevals. Code originally supplied by Martyn Plummer +} diff --git a/man/mvjagam.Rd b/man/mvjagam.Rd new file mode 100644 index 00000000..dbb6b598 --- /dev/null +++ b/man/mvjagam.Rd @@ -0,0 +1,89 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mvjagam.R +\name{mvjagam} +\alias{mvjagam} +\title{Fit a Bayesian multivariate GAM to a set of discrete time series} +\usage{ +mvjagam( + formula, + data_train, + data_test, + use_nb = TRUE, + use_mv = FALSE, + n.adapt = 1000, + n.chains = 2, + n.burnin = 5000, + n.iter = 2000, + thin = 2, + auto_update = TRUE, + phi_prior = "dbeta(1, 1)", + rho_prior = "dunif(-12, 12)", + r_prior = "dexp(0.001)", + tau_prior = "dunif(1, 100)" +) +} +\arguments{ +\item{formula}{A \code{character} string specifying the GAM formula. These are exactly like the formula +for a GLM except that smooth terms, s, te, ti and t2, can be added to the right hand side +to specify that the linear predictor depends on smooth functions of predictors (or linear functionals of these).} + +\item{data_train}{A\code{dataframe} containing the model response variable and covariates +required by the GAM \code{formula}. Should include columns: +'y' (the discrete outcomes; NAs allowed) +'series' (character or factor index of the series IDs) +'season' (numeric index of the seasonal time point for each observation; should not have any missing) +'year' the numeric index for year +'in_season' indicator for whether the observation is in season or not. If the counts tend to go to zero +during the off season (as in tick counts for example), setting this to zero can be useful as trends won't contribute during +during this time but they continue to evolve, allowing the trend from the past season to continue evolving rather than forcing +it to zero +Any other variables to be included in the linear predictor of \code{formula} must also be present} + +\item{data_test}{\code{dataframe} containing at least 'series', 'season', 'year' and 'in_season' for the forecast horizon, in +addition to any other variables included in the linear predictor of \code{formula}} + +\item{use_nb}{\code{logical} If \code{TRUR}, use a Negative Binomial likelihood with estimated +overdispersion parameter r; +if \code{FALSE}, set r to \code{10,000} to approximate the Poisson distribution} + +\item{use_mv}{\code{logical} If \code{TRUE} and the total number of series is \code{<=5}, +a multivariate gaussian distribution is used for the state space trend errors. If \code{TRUE} and the total number +of series is \code{>5}, a set of latent dynamic factors is estimated to capture possible dependencies in the state +space trends. If \code{FALSE}, independent gaussian distributions are used for the trends} + +\item{n.adapt}{\code{integer} the number of iterations for adaptation. See adapt for details. +If \code{n.adapt} = 0 then no adaptation takes place} + +\item{n.chains}{\code{integer} the number of parallel chains for the model} + +\item{n.iter}{\code{integer} the number of iterations of the Markov chain to run} + +\item{auto_update}{\code{logical} If \code{TRUE}, the model is run for up to \code{3} additional sets of +\code{n.iter} iterations, or until the lower 15\% of effective sample sizes reaches \code{100}} + +\item{phi_prior}{\code{character} specifying (in JAGS syntax) the prior distributions for the AR1 coefficients in the +latent trends} + +\item{rho_prior}{\code{character} specifying (in JAGS syntax) the prior distributions for the smooth penalty parameters} + +\item{r_prior}{\code{character} specifying (in JAGS syntax) the prior distribution for the Negative Binomial +overdispersion parameter} + +\item{tau_prior}{\code{character} specifying (in JAGS syntax) the prior distributions for the independent gaussian +precisions used for the latent trends (ignored if \code{use_mv == TRUE})} +} +\value{ +A \code{list} object containing JAGS model output, the text representation of the model file, +the mgcv model output (for easily generating simulations at +unsampled covariate values), the names of the smooth parameters and the matrix of indices for each series +} +\description{ +This function estimates the posterior distribution for a multivariate GAM that includes +smooth seasonality and possible other smooth functions specified in the GAM formula. State-space latent trends +are estimated for each series, with a number of options for specifying the structures of the trends +} +\details{ +A \code{\link[mcgv]{jagam}} model file is generated from \code{formula} and modified to include the latent +AR1 state space trends. The model parameters are esimated in a Bayesian framework using Gibbs sampling +in \code{\link[rjags]{jags.model}}. +} diff --git a/man/plot_mvgam_fc.Rd b/man/plot_mvgam_fc.Rd new file mode 100644 index 00000000..abf5c726 --- /dev/null +++ b/man/plot_mvgam_fc.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot_mvgam_fc.R +\name{plot_mvgam_fc} +\alias{plot_mvgam_fc} +\title{Plot mvjagam posterior predictions for a specified series} +\usage{ +plot_mvgam_fc(out_gam_mod, series, data_test, data_train) +} +\arguments{ +\item{out_gam_mod}{\code{list} object returned from \code{mvjagam}} + +\item{series}{\code{integer} specifying which series in the set is to be plotted} + +\item{data_test}{\code{dataframe} containing at least 'series', 'season', 'year' and 'in_season' for the forecast horizon, in +addition to any other variables included in the linear predictor of \code{formula}} + +\item{data_train}{A\code{dataframe} containing the model response variable and covariates +required by the GAM \code{formula}. Should include columns: +'y' (the discrete outcomes; NAs allowed) +'series' (character or factor index of the series IDs) +'season' (numeric index of the seasonal time point for each observation; should not have any missing) +'year' the numeric index for year +'in_season' indicator for whether the observation is in season or not. If the counts tend to go to zero +during the off season (as in tick counts for example), setting this to zero can be useful as trends won't contribute during +during this time but they continue to evolve, allowing the trend from the past season to continue evolving rather than forcing +it to zero +Any other variables to be included in the linear predictor of \code{formula} must also be present} +} +\description{ +Plot mvjagam posterior predictions for a specified series +} diff --git a/man/plot_mvgam_trend.Rd b/man/plot_mvgam_trend.Rd new file mode 100644 index 00000000..949da301 --- /dev/null +++ b/man/plot_mvgam_trend.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot_mvgam_trend.R +\name{plot_mvgam_trend} +\alias{plot_mvgam_trend} +\title{Plot mvjagam latent trend for a specified series} +\usage{ +plot_mvgam_trend(out_gam_mod, series, data_test, data_train) +} +\arguments{ +\item{out_gam_mod}{\code{list} object returned from \code{mvjagam}} + +\item{series}{\code{integer} specifying which series in the set is to be plotted} + +\item{data_test}{\code{dataframe} containing at least 'series', 'season', 'year' and 'in_season' for the forecast horizon, in +addition to any other variables included in the linear predictor of \code{formula}} + +\item{data_train}{A\code{dataframe} containing the model response variable and covariates +required by the GAM \code{formula}. Should include columns: +'y' (the discrete outcomes; NAs allowed) +'series' (character or factor index of the series IDs) +'season' (numeric index of the seasonal time point for each observation; should not have any missing) +'year' the numeric index for year +'in_season' indicator for whether the observation is in season or not. If the counts tend to go to zero +during the off season (as in tick counts for example), setting this to zero can be useful as trends won't contribute during +during this time but they continue to evolve, allowing the trend from the past season to continue evolving rather than forcing +it to zero +Any other variables to be included in the linear predictor of \code{formula} must also be present} +} +\description{ +Plot mvjagam latent trend for a specified series +} diff --git a/test_mvjagam.R b/test_mvjagam.R new file mode 100644 index 00000000..c8266c01 --- /dev/null +++ b/test_mvjagam.R @@ -0,0 +1,285 @@ +#### Testing the mgjagam function #### +# Utility functions, too specific to put into the package but useful for our study +plot_mvgam_season = function(out_gam_mod, series, data_test, data_train, + xlab = 'Season'){ + + pred_dat <- expand.grid(series = levels(data_train$series)[series], + season = seq(min(data_train$season), + max(data_train$season), length.out = 100), + year = mean(data_train$year), + cum_gdd = mean(data_train$cum_gdd, na.rm = T), + siteID = as.character(unique(data_train$siteID[which(data_train$series == + levels(data_train$series)[series])]))) + Xp <- predict(out_gam_mod$mgcv_model, newdata = pred_dat, type = 'lpmatrix') + betas <- MCMCchains(out_gam_mod$jags_output, 'b') + plot(Xp %*% betas[1,] ~ pred_dat$season, ylim = range(Xp %*% betas[1,] + 1.5, + Xp %*% betas[1,] - 1.5), + + col = rgb(150, 0, 0, max = 255, alpha = 10), type = 'l', + ylab = paste0('F(season) for ', levels(data_train$series)[series]), + xlab = xlab) + for(i in 2:1000){ + lines(Xp %*% betas[i,] ~ pred_dat$season, + + col = rgb(150, 0, 0, max = 255, alpha = 10)) + } +} + +plot_mvgam_gdd = function(out_gam_mod, series, data_test, data_train, + xlab = 'Season'){ + + pred_dat <- expand.grid(series = levels(data_train$series)[series], + season = 20, + in_season = 1, + year = mean(data_train$year), + cum_gdd = seq(min(data_train$cum_gdd[which(data_train$series == + levels(data_train$series)[series])], + na.rm = T), + max(data_train$cum_gdd[which(data_train$series == + levels(data_train$series)[series])], + na.rm = T), length.out = 100), + siteID = as.character(unique(data_train$siteID[which(data_train$series == + levels(data_train$series)[series])]))) + Xp <- predict(out_gam_mod$mgcv_model, newdata = pred_dat, type = 'lpmatrix') + betas <- MCMCchains(out_gam_mod$jags_output, 'b') + plot(Xp %*% betas[1,] ~ pred_dat$cum_gdd, + ylim = range(Xp %*% betas[1,] + 1.5, + Xp %*% betas[1,] - 1.5), + + col = rgb(150, 0, 0, max = 255, alpha = 10), type = 'l', + ylab = paste0('Predicted peak count for ', levels(data_train$series)[series]), + xlab = 'Cumulative growing degree days') + for(i in 2:1000){ + lines(Xp %*% betas[i,] ~ pred_dat$cum_gdd, + + col = rgb(150, 0, 0, max = 255, alpha = 10)) + } + rug(data_train$cum_gdd[which(data_train$series == + levels(data_train$series)[series])]) +} + +#### NEON mv_gam #### +library(mvgam) +data("all_neon_tick_data") + +# Prep data for modelling +species = 'Ambloyomma_americanum' + +if(species == 'Ambloyomma_americanum'){ + + plotIDs <- c('SCBI_013', 'SERC_001', 'SERC_005', 'SERC_006', + 'SERC_002', 'SERC_012', 'KONZ_025', 'UKFS_001', + 'UKFS_004', 'UKFS_003', 'ORNL_002', 'ORNL_040', + 'ORNL_008', 'ORNL_007', 'ORNL_009', 'ORNL_003', + 'TALL_001', 'TALL_008', 'TALL_002') + model_dat <- all_neon_tick_data %>% + dplyr::mutate(target = amblyomma_americanum) %>% + dplyr::select(Year, epiWeek, plotID, target) %>% + dplyr::mutate(epiWeek = as.numeric(epiWeek)) %>% + dplyr::filter(Year > 2014 & Year < 2021) %>% + dplyr::mutate(Year_orig = Year) + + model_dat %>% + dplyr::full_join(expand.grid(plotID = unique(model_dat$plotID), + Year_orig = unique(model_dat$Year_orig), + epiWeek = seq(1, 52))) %>% + dplyr::left_join(all_neon_tick_data %>% + dplyr::select(siteID, plotID) %>% + dplyr::distinct()) %>% + # Remove winter tick abundances as we are not interested in modelling them + dplyr::filter(epiWeek > 14) %>% + dplyr::filter(epiWeek < 41) %>% + dplyr::mutate(series = plotID, + season = epiWeek - 14, + year = as.vector(scale(Year_orig)), + y = target) %>% + dplyr::select(-Year, -epiWeek, -target) %>% + dplyr::ungroup() %>% + dplyr::arrange(Year_orig, season, series) -> model_dat + + model_dat %>% + dplyr::left_join(all_neon_tick_data %>% + dplyr::ungroup() %>% + dplyr::select(Year, siteID, cum_sdd, cum_gdd) %>% + dplyr::mutate(Year_orig = Year) %>% + dplyr::select(-Year) %>% + dplyr::distinct()) -> model_dat +} + +# Set indicator for whether time point is 'in-season' or not (trends won't contribute during +# off season, as counts generally go to zero during this time) +model_dat = model_dat %>% + dplyr::mutate(in_season = ifelse(season >=3 & season <= 22, 1, 0)) %>% + dplyr::filter(siteID %in% c('SERC', 'TALL', 'UKFS'), + Year_orig <= 2019) %>% + dplyr::mutate(plotID = factor(plotID), + siteID = factor(siteID), + series = factor(series)) + +data_train = model_dat[1:(floor(nrow(model_dat) * 0.9)),] +data_test = model_dat[((floor(nrow(model_dat) * 0.9)) + 1):nrow(model_dat),] +(nrow(data_train) + nrow(data_test)) == nrow(model_dat) + +# Hypothesis testing +# NULL. There is no seasonal pattern to be estimated, and we simply let the latent +# factors and site-level effects of growing days influence the series dynamics +null_hyp = y ~ siteID + s(cum_gdd, by = siteID, k = 3) - 1 + +# 1. Do all series share same seasonal pattern, with any remaining variation due to +# non-seasonal local variation? +hyp1 = y ~ + siteID + + s(cum_gdd, by = siteID, k = 3) + + # Global cyclic seasonality term (smooth) + s(season, k = 12, m = 2, bs = 'cc') - 1 + +# 2. Do all series share same seasonal pattern but with different magnitudes +# (i.e. random intercepts per series)? +hyp2 = y ~ + siteID + + s(cum_gdd, by = siteID, k = 3) + + s(season, k = 12, m = 2, bs = 'cc') + + # Hierarchical variable intercepts + s(series, bs = 're') - 1 + +# 3. Is there evidence for global seasonality but each site's seasonal pattern deviates +# based on more local conditions? +hyp3 = y ~ + siteID + + s(cum_gdd, by = siteID, k = 3) + + s(season, k = 4, m = 2, bs = 'cc') + + s(series, bs = 're') + + # Site-level deviations from global pattern, which can be wiggly (m=1 to reduce concurvity); + # If these dominate, they will have smaller smoothing parameters and the global seasonality + # will become less important (larger smoothing parameter). Sites with the smallest smooth + # parameters are those that deviate the most from the global seasonality + s(season, by = siteID, m = 1, k = 8) - 1 + +# 4. Is there evidence for global seasonality but each plot's seasonal pattern deviates +# based on even more local conditions than above (i.e. plot-level is not as informative)? +hyp4 = y ~ + siteID + + s(cum_gdd, by = siteID, k = 3) + + s(season, k = 4, m = 2, bs = 'cc') + + s(series, bs = 're') + + # Series-level deviations from global pattern + s(season, by = series, m = 1, k = 8) - 1 + +# If evidence of gdd effects, can also let use a global smoother and then site-level +# deviations for a 'hierarchical' setup +use_mv <- T +out_gam_mod <- mvjagam(formula = hyp1, + data_train = data_train, + data_test = data_test, + n.adapt = 1000, + n.burnin = 1000, + n.iter = 1000, + thin = 1, + auto_update = FALSE, + use_mv = use_mv, + use_nb = FALSE, + # Laplace distribution emphasizes our prior that smooths should not be overly wiggly + # unless the data supports this + rho_prior = 'ddexp(5, 0.25)T(-12, 12)', + phi_prior = 'dbeta(2,2)', + tau_prior = 'dunif(0.1, 100)') + +# View the modified JAGS model file +out_gam_mod$model_file + +# Traces of key parameters +library(MCMCvis) + +# Negative binomial size parameter (set to 10,000 if use_nb = FALSE) +MCMCtrace(out_gam_mod$jags_output, c('r'), pdf = F, n.eff = T) + +# Penalties of smooth components (smaller means an effect is more nonlinear) +out_gam_mod$smooth_param_details # rhos are the logged versions of the lambdas +MCMCtrace(out_gam_mod$jags_output, c('rho'), pdf = F, n.eff = T) + +# AR1 persistence coefficients for latent dynamic factors (or for each +# series if use_mv = FALSE) +MCMCtrace(out_gam_mod$jags_output, c('phi'), pdf = F, n.eff = T) + +# Precision for latent dynamic factors (if using latent factors) +MCMCtrace(out_gam_mod$jags_output, c('tau_fac'), pdf = F, n.eff = T) + +# Precision for latent trends (if using independent Gaussian trends) +MCMCtrace(out_gam_mod$jags_output, c('tau'), pdf = F, n.eff = T) +dev.off() + +# Plot useful posterior quantities for a specific series +opar <- par() +par(mfrow = c(3, 1)) + +# Total number of series in the set +length(unique(data_train$series)) +series = 1 +plot_mvgam_season(out_gam_mod, series = series, data_test = data_test, + data_train = data_train, xlab = 'Epidemiological week') +plot_mvgam_fc(out_gam_mod, series = series, data_test = data_test, + data_train = data_train) +plot_mvgam_trend(out_gam_mod, series = series, data_test = data_test, + data_train = data_train) +par(opar) + +plot_mvgam_gdd(out_gam_mod, series = series, data_test = data_test, + data_train = data_train, xlab = 'Epidemiological week') + +if(use_mv){ + if(length(unique(data_train$series)) > 5){ + # Calculate the correlation matrix from the latent variables + samps <- jags.samples(out_gam_mod$jags_model, + variable.names = 'lv_coefs', + n.iter = 1000, thin = 1) + lv_coefs <- samps$lv_coefs + n_series <- dim(lv_coefs)[1] + n_lv <- dim(lv_coefs)[2] + n_samples <- prod(dim(lv_coefs)[3:4]) + + # Get arrat of latend variable loadings + coef_array <- array(NA, dim = c(n_series, n_lv, n_samples)) + for(i in 1:n_series){ + for(j in 1:n_lv){ + coef_array[i, j, ] <- c(lv_coefs[i, j, , 1], + lv_coefs[i, j, , 2]) + } + } + + # Variances of each series' latent trend are fixed at (1/100)^2 + eps_res <- rep((1/100)^2, n_series) + + # Posterior correlations based on latent variable loadings + correlations <- array(NA, dim = c(n_series, n_series, n_samples)) + for(i in 1:n_samples){ + correlations[,,i] <- cov2cor(tcrossprod(coef_array[,,i]) + diag(eps_res)) + } + mean_correlations <- apply(correlations, c(1,2), function(x) quantile(x, 0.5)) + + # Plot the mean posterior correlations + mean_correlations[upper.tri(mean_correlations)] <- NA + mean_correlations <- data.frame(mean_correlations) + rownames(mean_correlations) <- colnames(mean_correlations) <- levels(data_train$series) + + library(ggplot2) + ggplot(mean_correlations %>% + tibble::rownames_to_column("series1") %>% + tidyr::pivot_longer(-c(series1), names_to = "series2", values_to = "Correlation"), + aes(x = series1, y = series2)) + geom_tile(aes(fill = Correlation)) + + scale_fill_gradient2(low="darkred", mid="white", high="darkblue", + midpoint = 0, + breaks = seq(-1,1,length.out = 5), + limits = c(-1, 1), + name = 'Residual\ncorrelation') + labs(x = '', y = '') + theme_dark() + + theme(axis.text.x = element_text(angle = 45, hjust=1)) + + } +} + +#### If GAM component is LESS supported, we should see evidence in the form of: #### +# 1. Stronger residual correlations, suggesting we are missing some site-level structure +summary(as.vector(as.matrix(mean_correlations))) + +# 2. Smaller precisions for tau_fac (i.e. larger variance for latent dynamic factors) +MCMCsummary(out_gam_mod$jags_output, 'tau_fac') +