diff --git a/DESCRIPTION b/DESCRIPTION index 9d6d0905..befabfd4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -10,9 +10,9 @@ Description: This package is for fitting Bayesian Generalised Additive Models to The primary purpose of the package is to build a version of dynamic factor models that incorporates the flexibility of GAMs in the linear predictor and latent dynamic trend components that are useful for time series analysis and forecasting. Estimation is performed using Markov Chain Monte Carlo (MCMC) by - the Gibbs sampling software JAGS. The package also includes utilities for online updating of - forecast distributions with a recursive particle filter that uses sequential importance sampling to - assimilate new observations as they become available. + the Gibbs sampling software JAGS (requires version 4.3.0 or higher). The package also includes + utilities for online updating of forecast distributions with a recursive particle filter that + uses sequential importance sampling to assimilate new observations as they become available. Maintainer: Nicholas J Clark License: MIT + file LICENSE Depends: R (>= 3.6.0), mgcv, rjags, parallel diff --git a/NAMESPACE b/NAMESPACE index 47a678f9..d80e0865 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,10 +11,12 @@ export(pfilter_mvgam_init) export(pfilter_mvgam_online) export(pfilter_mvgam_smooth) export(plot_mvgam_fc) +export(plot_mvgam_resids) export(plot_mvgam_smooth) export(plot_mvgam_trend) export(plot_mvgam_uncertainty) export(roll_eval_mvgam) export(series_to_mvgam) export(sim_mvgam) +export(summary_mvgam) importFrom(magrittr,"%>%") diff --git a/NEON_manuscript/Figures/Amb_PITs.pdf b/NEON_manuscript/Figures/Amb_PITs.pdf index 3742d689..0a475337 100644 Binary files a/NEON_manuscript/Figures/Amb_PITs.pdf and b/NEON_manuscript/Figures/Amb_PITs.pdf differ diff --git a/NEON_manuscript/Figures/Amb_forecasts/ORNL_002.pdf b/NEON_manuscript/Figures/Amb_forecasts/ORNL_002.pdf index 5c1c5366..97d14f87 100644 Binary files a/NEON_manuscript/Figures/Amb_forecasts/ORNL_002.pdf and b/NEON_manuscript/Figures/Amb_forecasts/ORNL_002.pdf differ diff --git a/NEON_manuscript/Figures/Amb_forecasts/ORNL_003.pdf b/NEON_manuscript/Figures/Amb_forecasts/ORNL_003.pdf index ae6d06fa..e9d8260c 100644 Binary files a/NEON_manuscript/Figures/Amb_forecasts/ORNL_003.pdf and b/NEON_manuscript/Figures/Amb_forecasts/ORNL_003.pdf differ diff --git a/NEON_manuscript/Figures/Amb_forecasts/ORNL_007.pdf b/NEON_manuscript/Figures/Amb_forecasts/ORNL_007.pdf index bf437587..4d78b447 100644 Binary files a/NEON_manuscript/Figures/Amb_forecasts/ORNL_007.pdf and b/NEON_manuscript/Figures/Amb_forecasts/ORNL_007.pdf differ diff --git a/NEON_manuscript/Figures/Amb_forecasts/ORNL_008.pdf b/NEON_manuscript/Figures/Amb_forecasts/ORNL_008.pdf index 4a2640b9..53364112 100644 Binary files a/NEON_manuscript/Figures/Amb_forecasts/ORNL_008.pdf and b/NEON_manuscript/Figures/Amb_forecasts/ORNL_008.pdf differ diff --git a/NEON_manuscript/Figures/Amb_forecasts/ORNL_009.pdf b/NEON_manuscript/Figures/Amb_forecasts/ORNL_009.pdf index 43bda505..9b968b05 100644 Binary files a/NEON_manuscript/Figures/Amb_forecasts/ORNL_009.pdf and b/NEON_manuscript/Figures/Amb_forecasts/ORNL_009.pdf differ diff --git a/NEON_manuscript/Figures/Amb_forecasts/ORNL_040.pdf b/NEON_manuscript/Figures/Amb_forecasts/ORNL_040.pdf index 7a0fc64a..bb64fb6b 100644 Binary files a/NEON_manuscript/Figures/Amb_forecasts/ORNL_040.pdf and b/NEON_manuscript/Figures/Amb_forecasts/ORNL_040.pdf differ diff --git 
a/NEON_manuscript/Figures/Amb_forecasts/SERC_001.pdf b/NEON_manuscript/Figures/Amb_forecasts/SERC_001.pdf index d9fbe3d6..2853a1c9 100644 Binary files a/NEON_manuscript/Figures/Amb_forecasts/SERC_001.pdf and b/NEON_manuscript/Figures/Amb_forecasts/SERC_001.pdf differ diff --git a/NEON_manuscript/Figures/Amb_forecasts/SERC_002.pdf b/NEON_manuscript/Figures/Amb_forecasts/SERC_002.pdf index 087fa996..9eef64d5 100644 Binary files a/NEON_manuscript/Figures/Amb_forecasts/SERC_002.pdf and b/NEON_manuscript/Figures/Amb_forecasts/SERC_002.pdf differ diff --git a/NEON_manuscript/Figures/Amb_forecasts/SERC_005.pdf b/NEON_manuscript/Figures/Amb_forecasts/SERC_005.pdf index 38ae0bbd..733d76d3 100644 Binary files a/NEON_manuscript/Figures/Amb_forecasts/SERC_005.pdf and b/NEON_manuscript/Figures/Amb_forecasts/SERC_005.pdf differ diff --git a/NEON_manuscript/Figures/Amb_forecasts/SERC_006.pdf b/NEON_manuscript/Figures/Amb_forecasts/SERC_006.pdf index 3ef3d696..11e3adda 100644 Binary files a/NEON_manuscript/Figures/Amb_forecasts/SERC_006.pdf and b/NEON_manuscript/Figures/Amb_forecasts/SERC_006.pdf differ diff --git a/NEON_manuscript/Figures/Amb_forecasts/SERC_012.pdf b/NEON_manuscript/Figures/Amb_forecasts/SERC_012.pdf index 634d3c85..5a2a769f 100644 Binary files a/NEON_manuscript/Figures/Amb_forecasts/SERC_012.pdf and b/NEON_manuscript/Figures/Amb_forecasts/SERC_012.pdf differ diff --git a/NEON_manuscript/Figures/Amb_forecasts/TALL_001.pdf b/NEON_manuscript/Figures/Amb_forecasts/TALL_001.pdf index e7f0ebb5..f2be14f8 100644 Binary files a/NEON_manuscript/Figures/Amb_forecasts/TALL_001.pdf and b/NEON_manuscript/Figures/Amb_forecasts/TALL_001.pdf differ diff --git a/NEON_manuscript/Figures/Amb_forecasts/TALL_002.pdf b/NEON_manuscript/Figures/Amb_forecasts/TALL_002.pdf index 488ed1ad..ed90db35 100644 Binary files a/NEON_manuscript/Figures/Amb_forecasts/TALL_002.pdf and b/NEON_manuscript/Figures/Amb_forecasts/TALL_002.pdf differ diff --git a/NEON_manuscript/Figures/Amb_forecasts/TALL_008.pdf b/NEON_manuscript/Figures/Amb_forecasts/TALL_008.pdf index 51c59d29..5ab38c1b 100644 Binary files a/NEON_manuscript/Figures/Amb_forecasts/TALL_008.pdf and b/NEON_manuscript/Figures/Amb_forecasts/TALL_008.pdf differ diff --git a/NEON_manuscript/Figures/Amb_forecasts/UKFS_001.pdf b/NEON_manuscript/Figures/Amb_forecasts/UKFS_001.pdf index f8bd57c0..464de0ff 100644 Binary files a/NEON_manuscript/Figures/Amb_forecasts/UKFS_001.pdf and b/NEON_manuscript/Figures/Amb_forecasts/UKFS_001.pdf differ diff --git a/NEON_manuscript/Figures/Amb_forecasts/UKFS_003.pdf b/NEON_manuscript/Figures/Amb_forecasts/UKFS_003.pdf index 786e54e8..ed0f18db 100644 Binary files a/NEON_manuscript/Figures/Amb_forecasts/UKFS_003.pdf and b/NEON_manuscript/Figures/Amb_forecasts/UKFS_003.pdf differ diff --git a/NEON_manuscript/Figures/Amb_forecasts/UKFS_004.pdf b/NEON_manuscript/Figures/Amb_forecasts/UKFS_004.pdf index bb59cdf5..b205b254 100644 Binary files a/NEON_manuscript/Figures/Amb_forecasts/UKFS_004.pdf and b/NEON_manuscript/Figures/Amb_forecasts/UKFS_004.pdf differ diff --git a/NEON_manuscript/Figures/Amb_performances.pdf b/NEON_manuscript/Figures/Amb_performances.pdf index 4a640765..f6591fe2 100644 Binary files a/NEON_manuscript/Figures/Amb_performances.pdf and b/NEON_manuscript/Figures/Amb_performances.pdf differ diff --git a/NEON_manuscript/Figures/Amb_sitewise_analysis.pdf b/NEON_manuscript/Figures/Amb_sitewise_analysis.pdf index 838c5d31..438ea9d3 100644 Binary files a/NEON_manuscript/Figures/Amb_sitewise_analysis.pdf and 
b/NEON_manuscript/Figures/Amb_sitewise_analysis.pdf differ diff --git a/NEON_manuscript/Figures/Amb_trendcorrelations.pdf b/NEON_manuscript/Figures/Amb_trendcorrelations.pdf index bbefce48..0ebb596d 100644 Binary files a/NEON_manuscript/Figures/Amb_trendcorrelations.pdf and b/NEON_manuscript/Figures/Amb_trendcorrelations.pdf differ diff --git a/NEON_manuscript/Figures/Fig1_extrapolation_example.pdf b/NEON_manuscript/Figures/Fig1_extrapolation_example.pdf new file mode 100644 index 00000000..a96d51bb Binary files /dev/null and b/NEON_manuscript/Figures/Fig1_extrapolation_example.pdf differ diff --git a/NEON_manuscript/Figures/Fig1_simulation_drps_missing_plot.pdf b/NEON_manuscript/Figures/Fig2_simulation_drps_missing_plot.pdf similarity index 51% rename from NEON_manuscript/Figures/Fig1_simulation_drps_missing_plot.pdf rename to NEON_manuscript/Figures/Fig2_simulation_drps_missing_plot.pdf index 5565dce4..24797974 100644 Binary files a/NEON_manuscript/Figures/Fig1_simulation_drps_missing_plot.pdf and b/NEON_manuscript/Figures/Fig2_simulation_drps_missing_plot.pdf differ diff --git a/NEON_manuscript/Figures/Fig3_Ixodes_performances.pdf b/NEON_manuscript/Figures/Fig3_Ixodes_performances.pdf deleted file mode 100644 index 843ad327..00000000 Binary files a/NEON_manuscript/Figures/Fig3_Ixodes_performances.pdf and /dev/null differ diff --git a/NEON_manuscript/Figures/Fig2_simulation_coverage_nseries_plot.pdf b/NEON_manuscript/Figures/Fig3_simulation_coverage_nseries_plot.pdf similarity index 60% rename from NEON_manuscript/Figures/Fig2_simulation_coverage_nseries_plot.pdf rename to NEON_manuscript/Figures/Fig3_simulation_coverage_nseries_plot.pdf index 10863605..38cabff5 100644 Binary files a/NEON_manuscript/Figures/Fig2_simulation_coverage_nseries_plot.pdf and b/NEON_manuscript/Figures/Fig3_simulation_coverage_nseries_plot.pdf differ diff --git a/NEON_manuscript/Figures/Fig4_Ixodes_performances.pdf b/NEON_manuscript/Figures/Fig4_Ixodes_performances.pdf new file mode 100644 index 00000000..2a313f4b Binary files /dev/null and b/NEON_manuscript/Figures/Fig4_Ixodes_performances.pdf differ diff --git a/NEON_manuscript/Figures/Fig5_amb_seasonalities.pdf b/NEON_manuscript/Figures/Fig5_amb_seasonalities.pdf deleted file mode 100644 index 19b711f5..00000000 Binary files a/NEON_manuscript/Figures/Fig5_amb_seasonalities.pdf and /dev/null differ diff --git a/NEON_manuscript/Figures/Fig4_ixodes_example.pdf b/NEON_manuscript/Figures/Fig5_ixodes_example.pdf similarity index 97% rename from NEON_manuscript/Figures/Fig4_ixodes_example.pdf rename to NEON_manuscript/Figures/Fig5_ixodes_example.pdf index ca845e0b..fbd3b9c2 100644 Binary files a/NEON_manuscript/Figures/Fig4_ixodes_example.pdf and b/NEON_manuscript/Figures/Fig5_ixodes_example.pdf differ diff --git a/NEON_manuscript/Figures/Fig6_amb_seasonalities.pdf b/NEON_manuscript/Figures/Fig6_amb_seasonalities.pdf new file mode 100644 index 00000000..1ac8b758 Binary files /dev/null and b/NEON_manuscript/Figures/Fig6_amb_seasonalities.pdf differ diff --git a/NEON_manuscript/Figures/Fig6_amb_uncertainties.pdf b/NEON_manuscript/Figures/Fig7_amb_uncertainties.pdf similarity index 55% rename from NEON_manuscript/Figures/Fig6_amb_uncertainties.pdf rename to NEON_manuscript/Figures/Fig7_amb_uncertainties.pdf index 68fac955..f1513fad 100644 Binary files a/NEON_manuscript/Figures/Fig6_amb_uncertainties.pdf and b/NEON_manuscript/Figures/Fig7_amb_uncertainties.pdf differ diff --git a/NEON_manuscript/Figures/FigS1_simulation_drps_nseries_plot.pdf 
b/NEON_manuscript/Figures/FigS1_simulation_drps_nseries_plot.pdf index 0ff243b4..d66578eb 100644 Binary files a/NEON_manuscript/Figures/FigS1_simulation_drps_nseries_plot.pdf and b/NEON_manuscript/Figures/FigS1_simulation_drps_nseries_plot.pdf differ diff --git a/NEON_manuscript/Figures/FigS2_simulation_coverage_missing_plot.pdf b/NEON_manuscript/Figures/FigS2_simulation_coverage_missing_plot.pdf index b9d5b805..8f5042ae 100644 Binary files a/NEON_manuscript/Figures/FigS2_simulation_coverage_missing_plot.pdf and b/NEON_manuscript/Figures/FigS2_simulation_coverage_missing_plot.pdf differ diff --git a/NEON_manuscript/Manuscript/Clark_etal_2021_Dynamic_GAMs.docx b/NEON_manuscript/Manuscript/Clark_etal_2021_Dynamic_GAMs.docx new file mode 100644 index 00000000..f13095d1 Binary files /dev/null and b/NEON_manuscript/Manuscript/Clark_etal_2021_Dynamic_GAMs.docx differ diff --git a/NEON_manuscript/Manuscript/~$ark_etal_2021_Dynamic_GAMs.docx b/NEON_manuscript/Manuscript/~$ark_etal_2021_Dynamic_GAMs.docx new file mode 100644 index 00000000..32ca91cf Binary files /dev/null and b/NEON_manuscript/Manuscript/~$ark_etal_2021_Dynamic_GAMs.docx differ diff --git a/NEON_manuscript/amb_hyp_tests.R b/NEON_manuscript/amb_hyp_tests.R index 35367520..2caff84e 100644 --- a/NEON_manuscript/amb_hyp_tests.R +++ b/NEON_manuscript/amb_hyp_tests.R @@ -47,9 +47,9 @@ hyp3 = y ~ s(season, by = series, m = 1, k = 8) - 1 # Fit each hypothesis -n.burnin = 100000 -n.iter = 5000 -thin = 5 +n.burnin = 1000 +n.iter = 1000 +thin = 1 # Use default priors for latent drift terms and for smooth penalties fit_null <- fit_mvgam(data_train = all_data$data_train, @@ -315,22 +315,18 @@ for(i in seq_along(levels(all_data$data_train$series))){ mai = c(.38,.4,.45,.05), mgp = c(2, 1, 0)) plot_mvgam_fc(object = fit_null$out_gam_mod, data_test = all_data$data_test, - data_train = all_data$data_train, series = i) mtext(side = 3, 'Null', cex = 1, line = 1.55) plot_mvgam_fc(object = fit_hyp1$out_gam_mod, data_test = all_data$data_test, - data_train = all_data$data_train, series = i) mtext(side = 3, 'Hyp1', cex = 1, line = 1.35) plot_mvgam_fc(object = fit_hyp2$out_gam_mod, data_test = all_data$data_test, - data_train = all_data$data_train, series = i) mtext(side = 3, 'Hyp2', cex = 1, line = 1.55) plot_mvgam_fc(object = fit_hyp3$out_gam_mod, data_test = all_data$data_test, - data_train = all_data$data_train, series = i) mtext(side = 3, 'Hyp3', cex = 1, line = 1.55) diff --git a/NEON_manuscript/ixodes_hyp_tests.R b/NEON_manuscript/ixodes_hyp_tests.R index f48607c8..d8e9efbf 100644 --- a/NEON_manuscript/ixodes_hyp_tests.R +++ b/NEON_manuscript/ixodes_hyp_tests.R @@ -186,7 +186,7 @@ ggplot(plot_dat_ranks, panel.grid.minor = element_blank()) -> plot2 dir.create('NEON_manuscript/Figures', recursive = T, showWarnings = F) -pdf('NEON_manuscript/Figures/Fig3_Ixodes_performances.pdf', width = 6.25, height = 5) +pdf('NEON_manuscript/Figures/Fig4_Ixodes_performances.pdf', width = 6.25, height = 5) cowplot::plot_grid(plot1, plot2, ncol = 1) dev.off() diff --git a/NEON_manuscript/neon_analysis.R b/NEON_manuscript/neon_analysis.R index 940ecc35..759f1e73 100644 --- a/NEON_manuscript/neon_analysis.R +++ b/NEON_manuscript/neon_analysis.R @@ -10,7 +10,7 @@ library(mvgam) all_data <- prep_neon_data(species = 'Ixodes_scapularis', split_prop = 0.8) # Make plots for manuscript -pdf('NEON_manuscript/Figures/Fig4_ixodes_example.pdf', width = 6.25, height = 5.85) +pdf('NEON_manuscript/Figures/Fig5_ixodes_example.pdf', width = 6.25, height = 5.85) par(mfrow = c(2, 2), mgp 
= c(2.5, 1, 0), mai = c(0.6, 0.6, 0.2, 0.2)) @@ -39,7 +39,7 @@ dev.off() # Plot some of the varying seasonal shapes from the Amblyomma model load('NEON_manuscript/Results/amb_models.rda') all_data <- prep_neon_data(species = 'Ambloyomma_americanum', split_prop = 0.8) -pdf('NEON_manuscript/Figures/Fig5_amb_seasonalities.pdf', width = 6.25, height = 5.85) +pdf('NEON_manuscript/Figures/Fig6_amb_seasonalities.pdf', width = 6.25, height = 5.85) par(mfrow = c(2, 2), mgp = c(2.5, 1, 0), mai = c(0.6, 0.6, 0.2, 0.2)) @@ -58,14 +58,13 @@ for(i in c(1, 6, 9, 15)){ dev.off() # Plot some of the different uncertainty decompositions for Amblyomma model -pdf('NEON_manuscript/Figures/Fig6_amb_uncertainties.pdf', width = 6.25, height = 5.85) +pdf('NEON_manuscript/Figures/Fig7_amb_uncertainties.pdf', width = 6.25, height = 5.85) par(mfrow = c(2, 2), mgp = c(2.5, 1, 0), mai = c(0.6, 0.6, 0.2, 0.2)) for(i in c(4, 6, 9, 15)){ plot_mvgam_uncertainty(fit_hyp3$out_gam_mod, series = i, data_test = all_data$data_test, - data_train = all_data$data_train, legend_position = 'bottomleft') } diff --git a/NEON_manuscript/neon_utility_functions.R b/NEON_manuscript/neon_utility_functions.R index 270686d1..dfed8e20 100644 --- a/NEON_manuscript/neon_utility_functions.R +++ b/NEON_manuscript/neon_utility_functions.R @@ -135,9 +135,8 @@ plot_mvgam_season = function(out_gam_mod, series, data_test, data_train, levels(data_train$series)[series])]))) Xp <- predict(out_gam_mod$mgcv_model, newdata = pred_dat, type = 'lpmatrix') betas <- MCMCvis::MCMCchains(out_gam_mod$jags_output, 'b') - gam_comps <- MCMCvis::MCMCchains(out_gam_mod$jags_output, 'gam_comp')[,series] - plot(gam_comps[1] * (Xp %*% betas[1,]) ~ pred_dat$season, ylim = range(gam_comps[1] * (Xp %*% betas[1,]) + 2.5, - gam_comps[1] * (Xp %*% betas[1,]) - 2.5), + plot((Xp %*% betas[1,]) ~ pred_dat$season, ylim = range((Xp %*% betas[1,]) + 2.5, + (Xp %*% betas[1,]) - 2.5), col = rgb(150, 0, 0, max = 255, alpha = 10), type = 'l', ylab = paste0('F(season) for ', levels(data_train$series)[series]), @@ -146,9 +145,9 @@ plot_mvgam_season = function(out_gam_mod, series, data_test, data_train, rect(xleft = 4, xright = 18, ybottom = -100, ytop = 100, col = 'gray90', border = NA) abline(h = 0, lwd=1) - text(x=11,y=max(gam_comps[1] * (Xp %*% betas[1,]) + 2.3), labels = 'Peak tick season') + text(x=11,y=max((Xp %*% betas[1,]) + 2.3), labels = 'Peak tick season') for(i in 2:1000){ - lines(gam_comps[i] * (Xp %*% betas[i,]) ~ pred_dat$season, + lines((Xp %*% betas[i,]) ~ pred_dat$season, col = rgb(150, 0, 0, max = 255, alpha = 10)) } @@ -173,10 +172,9 @@ plot_mvgam_gdd = function(out_gam_mod, series, data_test, data_train, levels(data_train$series)[series])]))) Xp <- predict(out_gam_mod$mgcv_model, newdata = pred_dat, type = 'lpmatrix') betas <- MCMCvis::MCMCchains(out_gam_mod$jags_output, 'b') - gam_comps <- MCMCvis::MCMCchains(out_gam_mod$jags_output, 'gam_comp')[,series] preds <- matrix(NA, nrow = 1000, ncol = length(pred_dat$cum_gdd)) for(i in 1:1000){ - preds[i,] <- rnbinom(length(pred_dat$cum_gdd), mu = exp(gam_comps[i] * (Xp %*% betas[i,])), + preds[i,] <- rnbinom(length(pred_dat$cum_gdd), mu = exp((Xp %*% betas[i,])), size = MCMCvis::MCMCsummary(out_gam_mod$jags_output, 'r')$mean) } int <- apply(preds, diff --git a/NEON_manuscript/portal_example.R b/NEON_manuscript/portal_example.R index b50b9e62..89d90542 100644 --- a/NEON_manuscript/portal_example.R +++ b/NEON_manuscript/portal_example.R @@ -20,23 +20,24 @@ ds_resids = function(truth, fitted, size){ # Convert PP catches to 
timeseries and then to mvgam format catches <- portal_dat$PP -plot(catches, type = 'l') # Keep only data from the year 2004 onward for this example to make the series more manageable series <- xts::as.xts(subset(ts(catches, start = c(1979, 9), frequency = 12), start = c(293))) head(series) colnames(series) <- c('PP') +plot(series) PP_data <- series_to_mvgam(series, freq = 12, train_prop = 0.9) rm(series, portal_dat) -# Fit standard GAM to the training data using an AR3 model (the maximum order that mvgam allows) for the residuals. -# For a GAMM model, we cannot use negative binomial so will stick with Poisson (even though) -# a negative binomial is likely more appropriate here. We add a 'time' index for the -# form of the residual autocorrelation +# Fit standard GAM to the training data using an AR3 model for the residuals. +# For a GAMM model, we cannot use negative binomial so will stick with Poisson (even though +# a negative binomial is likely more appropriate here). We add a 'time' index for the +# form of the residual autocorrelation and use a 1st derivative penalty for the year effect +# to prevent linear extrapolation of the yearly trend when forecasting PP_data$data_train$time <- seq(1:NROW(PP_data$data_train)) gam_mod <- gamm(y ~ s(season, bs = 'cc', k = 8) + - s(year, bs = 'gp', k = 4) + ti(season, year), + s(year, bs = 'gp', k = 4, m = 1) + ti(season, year), data = PP_data$data_train, family = 'poisson', correlation = corARMA(form = ~ time, p = 3)) summary(gam_mod$gam) @@ -51,11 +52,14 @@ gam_resids <- ds_resids(truth = gam_mod$gam$y, size = 1000) acf(gam_resids, na.action = na.pass) +# Standardized residuals also show autocorrelation +acf(residuals(gam_mod$lme,type="normalized"),main="standardized residual ACF") + # Which kind of model might be suitable for the trend? -plot(gam_resids, type = 'l') +plot(residuals(gam_mod$lme,type="normalized"), type = 'l') forecast::auto.arima(gam_resids) -# DS residuals show strong autocorrelation. Do Pearson residuals? +# DS and standardized residuals both still show strong autocorrelation. Do Pearson residuals? gam_pearson_resids <- (gam_mod$gam$y - fv)/sqrt(fv) acf(gam_pearson_resids) forecast::auto.arima(gam_pearson_resids) @@ -65,25 +69,29 @@ forecast::auto.arima(gam_pearson_resids) # which also highlights existing autocorrelation gam.check(gam_mod$gam) -# Equivalent model using mvgam's dynamic trend component. +# Now a model using mvgam # These models generally require at least 5 - 15K iterations # of adaptation to tune the samplers efficiently, so it can be a little slow. Note that # the model will predict for the data_test observations by considering the outcomes as -# missing for these observations +# missing for these observations. 
We also incorporate prior knowledge about upper bounds on +# this series, which exist due to limits on the number of traps used in each trapping session PP_data$data_train$time <- NULL df_gam_mod <- mvjagam(data_train = PP_data$data_train, data_test = PP_data$data_test, formula = y ~ s(season, bs = c('cc'), k = 8) + - s(year, bs = 'gp', k = 4) + ti(season, year), + s(year, bs = 'gp', k = 4, m = 1) + ti(season, year), trend_model = 'AR3', - family = 'poisson', + family = 'nb', n.burnin = 10000, - n.iter = 2000, - thin = 2, - auto_update = F) + n.iter = 1000, + thin = 1, + auto_update = F, + upper_bounds = 100) +summary_mvgam(df_gam_mod) +summary(df_gam_mod$mgcv_model) # ACF of DS residuals shows no remaining autocorrelation -acf(df_gam_mod$resids$PP, na.action = na.pass) +plot_mvgam_resids(df_gam_mod, 1) # Smooth function plots from each model par(mfrow=c(1,2)) @@ -123,8 +131,10 @@ polygon(c(seq(1:(NCOL(cred_ints))), rev(seq(1:NCOL(cred_ints)))), lines(cred_ints[2,], col = rgb(150, 0, 0, max = 255), lwd = 2, lty = 'dashed') points(PP_data$data_test$y[1:12], pch = 16) -# The mgcv model overpredicts for the out of sample test set, though I don't know -# how to incorporate the autocorrelation component into this forecast +# The mgcv model overpredicts for the out of sample test set and has unreasonably narrow +# prediction intervals, though admittedly I don't know how to easily +# incorporate the autocorrelation component into this forecast. + # Forecast for the dynamic gam model on the same y-axis scale fits <- MCMCvis::MCMCchains(df_gam_mod$jags_output, 'ypred') fits <- fits[,(NROW(df_gam_mod$obs_data)+1):(NROW(df_gam_mod$obs_data)+13)] @@ -154,21 +164,23 @@ par(mfrow = c(1,1)) ## Online forecasting from the dynamic gam # Initiate particle filter by assimilating the next observation in data_test -pfilter_mvgam_init(object = df_gam_mod, n_particles = 20000, n_cores = 3, +pfilter_mvgam_init(object = df_gam_mod, n_particles = 40000, n_cores = 4, data_assim = PP_data$data_test) -# Assimilate some of the next observations -pfilter_mvgam_online(data_assim = PP_data$data_test[1:3,], n_cores = 3, +# Assimilate some of the next observations in the series using the +# kernel smoothing sequential monte carlo algorithm +pfilter_mvgam_online(data_assim = PP_data$data_test[1:5,], n_cores = 4, kernel_lambda = 1) # Forecast from particles using the covariate information in remaining data_test observations -fc <- pfilter_mvgam_fc(file_path = 'pfilter', n_cores = 3, +fc <- pfilter_mvgam_fc(file_path = 'pfilter', n_cores = 4, data_test = PP_data$data_test, ylim = c(0, 100), plot_legend = F) -# Compare to original forecast +# Compare the updated forecast to the original forecast par(mfrow=c(1,2)) plot_mvgam_fc(df_gam_mod, 1, data_test = PP_data$data_test, ylim = c(0, 100)) fc$PP() points(c(PP_data$data_train$y, PP_data$data_test$y), pch = 16) + diff --git a/NEON_manuscript/simulation_analysis.R b/NEON_manuscript/simulation_analysis.R index 7327dce2..145afc46 100644 --- a/NEON_manuscript/simulation_analysis.R +++ b/NEON_manuscript/simulation_analysis.R @@ -1,4 +1,105 @@ #### Simulation analysis #### +# Simulate a series with a nonlinear trend to visualise how a spline extrapolates +library(mvgam) +library(xts) +library(forecast) +data("AirPassengers") +set.seed(1e8) +dat <- floor(AirPassengers + cumsum(rnorm(length(AirPassengers), + sd = 35))) +dat <- dat + abs(min(dat)) +series <- ts(dat, start = c(1949, 1), frequency = 12) +fake_data <- series_to_mvgam(series, freq = 365, train_prop = 0.74) + +# Fit a GAM to 
the data using mgcv; use a wiggly thin plate regression spline for the +# trend and a cyclic smooth for the seasonality +gam_mod <- gam(y ~ s(year, k = 9, bs = 'tp') + + s(season, bs = 'cc', k = 10) + + ti(season, year), + data = fake_data$data_train, + family = nb(), + method = 'REML') + +# Calculate predictions for the mgcv model to inspect extrapolation behaviour +Xp <- predict(gam_mod, newdata = rbind(fake_data$data_train, + fake_data$data_test), type = 'lpmatrix') +vc <- vcov(gam_mod) +sim <- MASS::mvrnorm(1000, + mu = coef(gam_mod), Sigma = vc) +dims_needed <- dim(exp(Xp %*% t(sim))) +fits <- rnbinom(n = prod(dims_needed), mu = as.vector(exp(Xp %*% t(sim))), + size = gam_mod$family$getTheta(TRUE)) +fits <- t(matrix(fits, nrow = dims_needed[1], ncol = dims_needed[2])) + +# Plot the smooth function and forecast 95% and 68% HPD intervals +pdf('NEON_manuscript/Figures/Fig1_extrapolation_example.pdf', width = 6.25, height = 5.85) +par(mfrow = c(2, 2), + mgp = c(2.5, 1, 0), + mai = c(0.6, 0.6, 0.2, 0.2)) +plot(gam_mod, select = 1, shade = T, main = '2nd derivative penalty') +cred_ints <- apply(fits, 2, function(x) hpd(x, 0.95)) +ylims <- c(0, + max(cred_ints) + 2) +plot(cred_ints[3,] ~ seq(1:NCOL(cred_ints)), type = 'l', + col = rgb(1,0,0, alpha = 0), + ylim = ylims, + ylab = 'Predicted counts', + xlab = 'Time') +polygon(c(seq(1:(NCOL(cred_ints))), rev(seq(1:NCOL(cred_ints)))), + c(cred_ints[1,],rev(cred_ints[3,])), + col = rgb(150, 0, 0, max = 255, alpha = 100), border = NA) +cred_ints <- apply(fits, 2, function(x) hpd(x, 0.68)) +polygon(c(seq(1:(NCOL(cred_ints))), rev(seq(1:NCOL(cred_ints)))), + c(cred_ints[1,],rev(cred_ints[3,])), + col = rgb(150, 0, 0, max = 255, alpha = 180), border = NA) +lines(cred_ints[2,], col = rgb(150, 0, 0, max = 255), lwd = 2, lty = 'dashed') +lines(as.vector(series)) +abline(v = NROW(fake_data$data_train), + lty = 'dashed') +text(x=NROW(fake_data$data_train) + 5, + y=3000, labels = 'Forecast horizon', srt = -90) + +# Repeat with a 1st derivative penalty on the trend smooth to see how the +# extrapolation behaviour of the mgcv model changes +fake_data <- series_to_mvgam(series, freq = 365, train_prop = 0.74) +gam_mod <- gam(y ~ s(year, k = 9, bs = 'tp', m = 1) + + s(season, bs = 'cc', k = 10) + + ti(season, year), + data = fake_data$data_train, + family = nb(), + method = 'REML') +plot(gam_mod, select = 1, shade = T, main = '1st derivative penalty') +Xp <- predict(gam_mod, newdata = rbind(fake_data$data_train, + fake_data$data_test), type = 'lpmatrix') +vc <- vcov(gam_mod) +sim <- MASS::mvrnorm(1000, + mu = coef(gam_mod), Sigma = vc) +dims_needed <- dim(exp(Xp %*% t(sim))) +fits <- rnbinom(n = prod(dims_needed), mu = as.vector(exp(Xp %*% t(sim))), + size = gam_mod$family$getTheta(TRUE)) +fits <- t(matrix(fits, nrow = dims_needed[1], ncol = dims_needed[2])) +cred_ints <- apply(fits, 2, function(x) hpd(x, 0.95)) +ylims <- c(0, + max(cred_ints) + 2) +plot(cred_ints[3,] ~ seq(1:NCOL(cred_ints)), type = 'l', + col = rgb(1,0,0, alpha = 0), + ylim = ylims, + ylab = 'Predicted counts', + xlab = 'Time') +polygon(c(seq(1:(NCOL(cred_ints))), rev(seq(1:NCOL(cred_ints)))), + c(cred_ints[1,],rev(cred_ints[3,])), + col = rgb(150, 0, 0, max = 255, alpha = 100), border = NA) +cred_ints <- apply(fits, 2, function(x) hpd(x, 0.68)) +polygon(c(seq(1:(NCOL(cred_ints))), rev(seq(1:NCOL(cred_ints)))), + c(cred_ints[1,],rev(cred_ints[3,])), + col = rgb(150, 0, 0, max = 255, alpha = 180), border = NA) +lines(cred_ints[2,], col = rgb(150, 0, 0, max = 255), lwd = 2, lty = 'dashed') 
+lines(as.vector(series)) +abline(v = NROW(fake_data$data_train), + lty = 'dashed') +dev.off() + +#### Analysis plots from the simulation component #### load('NEON_manuscript/Results/sim_results.rda') run_parameters <- expand.grid(n_series = c(4, 12), T = c(72), @@ -81,7 +182,7 @@ ggplot(drps_plot_dat %>% scale_fill_viridis(discrete = T, begin = 0.2, end = 1, guide = FALSE) + theme_bw() + coord_flip() + labs(x = '', y = 'Normalised DRPS calibration (lower is better)', title = 'Strong trend') -> plot2 -pdf('NEON_manuscript/Figures/Fig1_simulation_drps_missing_plot.pdf') +pdf('NEON_manuscript/Figures/Fig2_simulation_drps_missing_plot.pdf') cowplot::plot_grid(plot1, plot2, ncol = 1) dev.off() @@ -154,6 +255,6 @@ ggplot(coverage_plot_dat %>% ylim(0, 1) + theme_bw() + coord_flip() + labs(x = '', y = '90% interval coverage', title = 'Strong trend') -> plot2 -pdf('NEON_manuscript/Figures/Fig2_simulation_coverage_nseries_plot.pdf') +pdf('NEON_manuscript/Figures/Fig3_simulation_coverage_nseries_plot.pdf') cowplot::plot_grid(plot1, plot2, ncol = 1) dev.off() diff --git a/R/compare_mvgams.R b/R/compare_mvgams.R index 145c2c22..461d7295 100644 --- a/R/compare_mvgams.R +++ b/R/compare_mvgams.R @@ -51,6 +51,11 @@ rownames(model_summary) <- c('Model 1', 'Model 2') cat('DRPS summaries per model (lower is better)\n') print(model_summary) +# Print 90% interval coverages for each model +cat('\n90% interval coverages per model (closer to 0.9 is better)\n') +cat('Model 1', mod1_eval$interval_coverage, '\n') +cat('Model 2', mod2_eval$interval_coverage) + # Set up plotting loop and return summary plots of DRPS ask <- TRUE @@ -68,12 +73,13 @@ if(i == 1){ mod2_eval$drps_horizon_summary$mean_drps) colnames(plot_dat) <- seq(1:NCOL(plot_dat)) barplot(plot_dat, + ylim = c(0, max(plot_dat, na.rm = T) * 1.5), beside = T, xlab = 'Forecast horizon', ylab = 'Mean DRPS', col = c("darkgrey", "black"), legend.text = c('Model 1', 'Model 2'), - args.legend = list(x = "topleft")) + args.legend = list(x = "top")) } if(ask){ diff --git a/R/eval_mvgam.R b/R/eval_mvgam.R index 0eb36f1e..5c9758fd 100644 --- a/R/eval_mvgam.R +++ b/R/eval_mvgam.R @@ -395,7 +395,7 @@ eval_mvgam = function(object, scores <- matrix(NA, nrow = length(truth), ncol = 2) for(i in indices_keep){ scores[i,] <- drps_score(truth = as.vector(truth)[i], - fc = fc[i,], interval_width) + fc = fc[,i], interval_width) } } scores
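The indexing fix above changes which slice of the forecast matrix gets scored: the corrected column indexing implies that draws are stored with posterior samples in rows and forecast horizons in columns, so each observed value must be compared against that horizon's column of draws, not a row. A minimal sketch of the corrected pattern, using fake draws and the package-internal drps_score() helper called in the loop above:

# 1000 posterior draws (rows) for 3 forecast horizons (columns)
fc <- matrix(rpois(1000 * 3, lambda = 5), nrow = 1000, ncol = 3)
truth <- c(4, 6, 5)
scores <- matrix(NA, nrow = length(truth), ncol = 2)
for(i in seq_along(truth)){
  # score each observation against the column of draws for its horizon
  scores[i,] <- drps_score(truth = truth[i], fc = fc[,i], interval_width = 0.9)
}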
diff --git a/R/hpd.R b/R/hpd.R index 8c2fd1e1..7d0ab3a8 100644 --- a/R/hpd.R +++ b/R/hpd.R @@ -4,8 +4,8 @@ #' #'@param x \code{vector} of values representing the distribution to be summarised #'@param coverage \code{numeric} value specifying the width of the HPD interval. Default is 0.95 -#'@return A \code{list} containing the reconciled forecast distributions for each series in \code{y}. Each element in -#'the \code{vector} with three values: lower estimate, median estimate and upper estimate of the HPD interval +#'@return A \code{matrix} containing the estimated lower and upper bounds of the interval along with the estimated +#'mode #' #'@export #' diff --git a/R/mvjagam.R b/R/mvjagam.R index aeff6f01..0fea3070 100644 --- a/R/mvjagam.R +++ b/R/mvjagam.R @@ -9,6 +9,10 @@ #'@param formula A \code{character} string specifying the GAM formula. These are exactly like the formula #'for a GLM except that smooth terms, s, te, ti and t2, can be added to the right hand side #'to specify that the linear predictor depends on smooth functions of predictors (or linear functionals of these). +#'@param knots An optional \code{list} containing user specified knot values to be used for basis construction. +#'For most bases the user simply supplies the knots to be used, which must match up with the k value supplied +#'(note that the number of knots is not always just k). Different terms can use different numbers of knots, +#'unless they share a covariate. #'@param data_train A \code{dataframe} containing the model response variable and covariates required by the GAM \code{formula}. Should include columns: ''y' (the discrete outcomes; \code{NA}s allowed) @@ -50,10 +54,15 @@ #'is computationally expensive in \code{JAGS} but can lead to better estimates when true bounds exist. Default is to remove #'truncation entirely (i.e. there is no upper bound for each series) #'@details A \code{\link[mgcv]{jagam}} model file is generated from \code{formula} and modified to include the latent -#'state space trends. Prior distributions for most important terms can be altered by the user to inspect model +#'state space trends. Prior distributions for most important model parameters can be altered by the user to inspect model #'sensitivities to given priors. Note that latent trends are estimated on the log scale so choose tau, AR and phi priors #'accordingly. The model parameters are estimated in a Bayesian framework using Gibbs sampling -#'in \code{\link[rjags]{jags.model}}. For the time being, all series are assumed to have the same overdispersion +#'in \code{\link[rjags]{jags.model}}. When using a dynamic factor model for the trends, factor loadings are given +#'regularized horseshoe priors to theoretically allow some factors to be dropped from the model by squeezing loadings for +#'an entire factor to zero. This is done to help protect against selecting more latent factors than are needed to +#'capture dependencies in the data, so it can often be advantageous to set n_lv to a slightly larger number. However, +#'larger numbers of factors come with additional computational costs, so these should be balanced as well. +#'For the time being, all series are assumed to have the same overdispersion #'parameter when using a negative binomial distribution, though this will be relaxed in future versions. For each #'series, randomized quantile (i.e. Dunn-Smyth) residuals are calculated for inspecting model diagnostics using #'medians of posterior predictions. If the fitted model is appropriate then Dunn-Smyth residuals will be @@ -70,6 +79,7 @@ #'@export mvjagam = function(formula, + knots, data_train, data_test, prior_simulation = FALSE, @@ -100,20 +110,40 @@ mvjagam = function(formula, # Estimate the GAM model using mgcv so that the linear predictor matrix can be easily calculated # when simulating from the JAGS model later on - ss_gam <- mgcv::gam(formula(formula), - data = data_train, - method = "REML", - family = poisson(), - drop.unused.levels = FALSE) + if(!missing(knots)){ + ss_gam <- mgcv::gam(formula(formula), + data = data_train, + method = "REML", + family = poisson(), + drop.unused.levels = FALSE, + knots = knots) + } else { + ss_gam <- mgcv::gam(formula(formula), + data = data_train, + method = "REML", + family = poisson(), + drop.unused.levels = FALSE) + } # Make jags file and appropriate data structures - ss_jagam <- mgcv::jagam(formula, - data = data_train, - family = poisson(), - drop.unused.levels = FALSE, - file = 'base_gam.txt', - sp.prior = 'gamma', - diagonalize = F) + if(!missing(knots)){ + ss_jagam <- mgcv::jagam(formula, + data = data_train, + family = poisson(), + drop.unused.levels = FALSE, + file = 'base_gam.txt', + sp.prior = 'gamma', + diagonalize = F, + knots = knots) + } else { + ss_jagam <- mgcv::jagam(formula, + data = data_train, + family = poisson(), + drop.unused.levels = FALSE, + file = 'base_gam.txt', + sp.prior = 'gamma', + diagonalize = F) + } # Fill with NAs if this is a simulation from the priors if(prior_simulation){
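A usage sketch for the new knots argument (data hypothetical, knot values illustrative only): the typical case is pinning the boundary knots of a cyclic seasonal smooth so the end of the cycle wraps smoothly back onto the start.

# Monthly data: fix the cyclic smooth's boundaries at 0.5 and 12.5 so that
# December joins smoothly onto January
mod <- mvjagam(formula = y ~ s(season, bs = 'cc', k = 8),
               knots = list(season = c(0.5, 12.5)),
               data_train = PP_data$data_train,
               data_test = PP_data$data_test,
               trend_model = 'AR3',
               family = 'nb')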
@@ -314,14 +344,20 @@ ## Latent factor loadings are penalized using a regularized horseshoe prior ## to allow loadings for entire factors to be 'dropped', reducing overfitting. Still - ## need to impose identifiability constraints by setting upper diagonal to zero - - ## Global shrinkage penalty (half cauchy) - lv_tau ~ dt(0, 1, 1)T(0, ) - - ## Shrinkage penalties for each factor (half cauchy) + ## need to impose identifiability constraints + + ## Global shrinkage penalty (half cauchy, following Gelman et + ## al. 2008: A weakly informative default prior distribution for logistic and other + ## regression models) + lv_tau ~ dscaled.gamma(0.5, 5) + + ## Shrinkage penalties for each factor, which are also half cauchy to allow + ## the entire factor's set of coefficients to be squeezed toward zero if supported by + ## the data.
The prior for individual factor penalties allows each factor to possibly + ## have a relatively large penalty, which shrinks the prior for that factor's loadings + ## substantially for (j in 1:n_lv){ - penalty[j] ~ dt(0, 1, 1)T(0, ) + penalty[j] ~ dscaled.gamma(0.5, 0.5) } ## Upper triangle of loading matrix set to zero @@ -333,20 +369,20 @@ mvjagam = function(formula, ## Positive constraints on loading diagonals for(j in 1:n_lv) { - lv_coefs[j, j] ~ dnorm(0, (1/lv_tau) * (1/penalty[j]))T(0, ); + lv_coefs[j, j] ~ dnorm(0, lv_tau * penalty[j])T(0, 1); } ## Lower diagonal free for(j in 2:n_lv){ for(j2 in 1:(j - 1)){ - lv_coefs[j, j2] ~ dnorm(0, (1/lv_tau) * (1/penalty[j2]))T(-1, 1); + lv_coefs[j, j2] ~ dnorm(0, lv_tau * penalty[j2])T(-1, 1); } } ## Other elements also free for(j in (n_lv + 1):n_series) { for(j2 in 1:n_lv){ - lv_coefs[j, j2] ~ dnorm(0, (1/lv_tau) * (1/penalty[j2]))T(-1, 1); + lv_coefs[j, j2] ~ dnorm(0, lv_tau * penalty[j2])T(-1, 1); } } @@ -615,7 +651,12 @@ mvjagam = function(formula, }) names(series_resids) <- levels(data_train$series) - return(list(jags_output = out_gam_mod, + return(list(call = formula, + family = ifelse(family == 'nb', + 'Negative Binomial', + 'Poisson'), + pregam = ss_jagam$pregam, + jags_output = out_gam_mod, model_file = model_file, mgcv_model = ss_gam, jags_model = gam_mod, diff --git a/R/plot_mvgam_fc.R b/R/plot_mvgam_fc.R index 2996fdd4..184686bc 100644 --- a/R/plot_mvgam_fc.R +++ b/R/plot_mvgam_fc.R @@ -7,13 +7,14 @@ #'Note this is only useful if the same \code{data_test} was also included when fitting the original model. #'@param hide_xlabels \code{logical}. If \code{TRUE}, no xlabels are printed to allow the user to add custom labels using #'\code{axis} from base \code{R} +#'@param ylab Optional \code{character} string specifying the y-axis label #'@param ylim Optional \code{vector} of y-axis limits (min, max) #'@details Posterior predictions are drawn from the fitted \code{mvjagam} and used to calculate 95 percent and #'68 percent highest posterior density credible intervals. These are plotted along with the true observed data #'that was used to train the model. 
#'@return A base \code{R} graphics plot #'@export -plot_mvgam_fc = function(object, series, data_test, hide_xlabels = FALSE, ylim){ +plot_mvgam_fc = function(object, series, data_test, hide_xlabels = FALSE, ylab, ylim){ data_train <- object$obs_data ends <- seq(0, dim(MCMCvis::MCMCchains(object$jags_output, 'ypred'))[2], @@ -31,17 +32,21 @@ plot_mvgam_fc = function(object, series, data_test, hide_xlabels = FALSE, ylim){ ylim <- c(0, max(int) + 2) } + if(missing(ylab)){ + ylab <- paste0('Estimated counts for ', levels(data_train$series)[series]) + } + if(hide_xlabels){ plot(preds_last, type = 'l', ylim = ylim , col = rgb(1,0,0, alpha = 0), - ylab = paste0('Estimated counts for ', levels(data_train$series)[series]), + ylab = ylab, xlab = '', xaxt = 'n') } else { plot(preds_last, type = 'l', ylim = ylim, col = rgb(1,0,0, alpha = 0), - ylab = paste0('Estimated counts for ', levels(data_train$series)[series]), + ylab = ylab, xlab = 'Time') } diff --git a/R/plot_mvgam_resids.R b/R/plot_mvgam_resids.R new file mode 100644 index 00000000..d2110681 --- /dev/null +++ b/R/plot_mvgam_resids.R @@ -0,0 +1,49 @@ +#'Residual diagnostics for a fitted mvjagam object +#' +#'This function takes a fitted \code{mvjagam} object and returns various residual diagnostic plots +#' +#'@param object \code{list} object returned from \code{mvjagam} +#'@param series \code{integer} specifying which series in the set is to be plotted +#'@author Nicholas J Clark +#'@details A total of four base \code{R} plots are generated to examine Dunn-Smyth residuals for +#'the specified series. Plots include a residuals vs fitted values plot, +#'a Q-Q plot, and two plots to check for any remaining temporal autocorrelation in the residuals. +#'Note, all plots use posterior medians of fitted values / residuals, so uncertainty is not represented. 
+#'@return A series of base \code{R} plots +#'@export +plot_mvgam_resids = function(object, series = 1){ + +data_train <- object$obs_data +ends <- seq(0, dim(MCMCvis::MCMCchains(object$jags_output, 'ypred'))[2], + length.out = NCOL(object$ytimes) + 1) +starts <- ends + 1 +starts <- c(1, starts[-c(1, (NCOL(object$ytimes)+1))]) +ends <- ends[-1] + +preds <- MCMCvis::MCMCchains(object$jags_output, 'ypred')[,starts[series]:ends[series]] +median_preds <- apply(preds, 2, function(x) hpd(x)[2]) +.pardefault <- par(no.readonly=T) +par(.pardefault) +par(mfrow = c(2, 2)) + +# Fitted vs residuals plot +plot(median_preds[1:length(object$resids[[series]])], + object$resids[[series]], + main = 'Resids vs Fitted Values', + xlab = 'Fitted values', + ylab = 'Residuals') + +# Q-Q plot +qqnorm(object$resids[[series]]) +qqline(object$resids[[series]], col = 2) + +# ACF plot +acf(object$resids[[series]], main = 'ACF', na.action = na.pass) + +# PACF plot +pacf(object$resids[[series]], main = 'pACF', na.action = na.pass) + +invisible() +par(.pardefault) + +} diff --git a/R/series_to_mvgam.R b/R/series_to_mvgam.R index 607af931..87205ed9 100644 --- a/R/series_to_mvgam.R +++ b/R/series_to_mvgam.R @@ -46,8 +46,8 @@ series_to_mvgam <- function(series, freq, train_prop = 0.85){ # Extract information on years and seasons from the series object if(type == 'ts'){ - dates <- lubridate::date(time(zoo::as.zoo(series))) - years <- lubridate::year(time(zoo::as.zoo(series))) + dates <- lubridate::date_decimal(as.numeric(time(series))) + years <- lubridate::year(dates) seasons <- as.vector(1 + ((time(series) %% 1) * frequency(series))) } @@ -81,8 +81,7 @@ series_to_mvgam <- function(series, freq, train_prop = 0.85){ season = rep(seasons, n_series), year = rep(years, n_series), date = rep(dates, n_series), - series = as.factor(sort(rep(series_names, T))), - in_season = 1) %>% + series = as.factor(sort(rep(series_names, T)))) %>% dplyr::arrange(year, season, series)
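The series_to_mvgam() change above swaps zoo-based date extraction for lubridate::date_decimal(), which maps the decimal times of a ts object directly to calendar dates. A small self-contained sketch of that logic (simulated series; dates land approximately at the start of each month):

library(lubridate)
# A monthly ts: time() returns decimal years (2019.000, 2019.083, ...)
series <- ts(rpois(24, lambda = 10), start = c(2019, 1), frequency = 12)
dates <- date_decimal(as.numeric(time(series)))  # decimal year -> calendar date
years <- year(dates)
# season index (1..12) recovered from the fractional part of the decimal time
seasons <- as.vector(1 + ((time(series) %% 1) * frequency(series)))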
diff --git a/R/summary_mvgam.R b/R/summary_mvgam.R new file mode 100644 index 00000000..ed6cb6c2 --- /dev/null +++ b/R/summary_mvgam.R @@ -0,0 +1,411 @@ +#'Summary for a fitted mvjagam object +#' +#'This function takes a fitted \code{mvjagam} object and prints various useful summaries from it +#' +#'@param object \code{list} object returned from \code{mvjagam} +#'@author Nicholas J Clark +#'@details A brief summary of the model's call is printed, along with posterior intervals for +#'some of the key parameters in the model. Note that some smooths have extra penalties on the null space, +#'so summaries for the \code{rho} parameters may include more penalty terms than the number of smooths in +#'the original model formula. The function also returns an approximate smooth term +#'significance table, but note that this significance comes with the usual caveats associated with calculations +#'of p-values for smooth terms in \code{\link[mgcv]{summary.gam}} (i.e. all p-values are computed without +#'considering uncertainty in the smoothing parameter estimates). +#'@return A \code{list} is printed on-screen showing the summaries for the model +#'@export +summary_mvgam = function(object){ +# Convert samples for betas and rhos to mgcv format using sim2jam; this is necessary to +# calculate effective degrees of freedom and approximate p-values for smooth terms + sam <- jags.samples(object$jags_model,c("b","rho"),n.iter=500,thin=1) + jam <- mgcv::sim2jam(sam, object$pregam, edf.type = 1) + jam$sp <- exp(sam$rho) + rm(sam) + + #### Functions used directly from Simon Wood's mgcv summary functions R script: + # https://github.com/cran/mgcv/blob/master/R/mgcv.r + testStat <- function(p,X,V,rank=NULL,type=0,res.df= -1) { + ## Implements Wood (2013) Biometrika 100(1), 221-228 + ## The type argument specifies the type of truncation to use. + ## on entry `rank' should be an edf estimate + ## 0. Default using the fractionally truncated pinv. + ## 1. Round down to k if k<= rank < k+0.05, otherwise up. + ## res.df is residual dof used to estimate scale. <=0 implies + ## fixed scale. + + qrx <- qr(X,tol=0) + R <- qr.R(qrx) + V <- R%*%V[qrx$pivot,qrx$pivot,drop=FALSE]%*%t(R) + V <- (V + t(V))/2 + ed <- eigen(V,symmetric=TRUE) + ## remove possible ambiguity from statistic... + siv <- sign(ed$vectors[1,]);siv[siv==0] <- 1 + ed$vectors <- sweep(ed$vectors,2,siv,"*") + + k <- max(0,floor(rank)) + nu <- abs(rank - k) ## fractional part of supplied edf + if (type==1) { ## round up is more than .05 above lower + if (rank > k + .05||k==0) k <- k + 1 + nu <- 0;rank <- k + } + + if (nu>0) k1 <- k+1 else k1 <- k + + ## check that actual rank is not below supplied rank+1 + r.est <- sum(ed$values > max(ed$values)*.Machine$double.eps^.9) + if (r.est < k1) {k1 <- k <- r.est;nu <- 0;rank <- r.est} + + ## Get the eigenvectors... + vec <- ed$vectors + if (k1 < ncol(vec)) vec <- vec[,1:k1,drop=FALSE] + + ## deal with the fractional part of the pinv... + if (nu>0&&k>0) { + if (k>1) vec[,1:(k-1)] <- t(t(vec[,1:(k-1)])/sqrt(ed$val[1:(k-1)])) + b12 <- .5*nu*(1-nu) + if (b12<0) b12 <- 0 + b12 <- sqrt(b12) + B <- matrix(c(1,b12,b12,nu),2,2) + ev <- diag(ed$values[k:k1]^-.5,nrow=k1-k+1) + B <- ev%*%B%*%ev + eb <- eigen(B,symmetric=TRUE) + rB <- eb$vectors%*%diag(sqrt(eb$values))%*%t(eb$vectors) + vec1 <- vec + vec1[,k:k1] <- t(rB%*%diag(c(-1,1))%*%t(vec[,k:k1])) + vec[,k:k1] <- t(rB%*%t(vec[,k:k1])) + } else { + vec1 <- vec <- if (k==0) t(t(vec)*sqrt(1/ed$val[1])) else + t(t(vec)/sqrt(ed$val[1:k])) + if (k==1) rank <- 1 + } + ## there is an ambiguity in the choice of test statistic, leading to slight + ## differences in the p-value computation depending on which of 2 alternatives + ## is arbitrarily selected. Following allows both to be computed and p-values + ## averaged (can't average test stat as dist then unknown) + d <- t(vec)%*%(R%*%p) + d <- sum(d^2) + d1 <- t(vec1)%*%(R%*%p) + d1 <- sum(d1^2) + ##d <- d1 ## uncomment to avoid averaging + + rank1 <- rank ## rank for lower tail pval computation below + + ## note that for <1 edf then d is not weighted by EDF, and instead is + ## simply referred to a chi-squared 1 + + if (nu>0) { ## mixture of chi^2 ref dist + if (k1==1) rank1 <- val <- 1 else { + val <- rep(1,k1) ##ed$val[1:k1] + rp <- nu+1 + val[k] <- (rp + sqrt(rp*(2-rp)))/2 + val[k1] <- (rp - val[k]) + } + + if (res.df <= 0) pval <- (psum.chisq(d,val)+psum.chisq(d1,val))/2 else { ## (liu2(d,val) + liu2(d1,val))/2 else + k0 <- max(1,round(res.df)) + pval <- (psum.chisq(0,c(val,-d/k0),df=c(rep(1,length(val)),k0)) + psum.chisq(0,c(val,-d1/k0),df=c(rep(1,length(val)),k0)) )/2 + #pval <- (simf(d,val,res.df) + simf(d1,val,res.df))/2 + } + } else { pval <- 2 } + ## integer case still needs computing, + ## OLD: also liu/pearson approx only good in + ## upper tail. 
In lower tail, 2 moment approximation is better (Can check this + ## by simply plotting the whole interesting range as a contour plot!) + ##if (pval > .5) + if (pval > 1) { + if (res.df <= 0) pval <- (pchisq(d,df=rank1,lower.tail=FALSE)+pchisq(d1,df=rank1,lower.tail=FALSE))/2 else + pval <- (pf(d/rank1,rank1,res.df,lower.tail=FALSE)+pf(d1/rank1,rank1,res.df,lower.tail=FALSE))/2 + } + list(stat=d,pval=min(1,pval),rank=rank) + } ## end of testStat + + reTest <- function(b,m) { + ## Test the mth smooth for equality to zero + ## and accounting for all random effects in model + + ## check that smooth penalty matrices are full size. + ## e.g. "fs" type smooths estimated by gamm do not + ## have full sized S matrices, and we can't compute + ## p-values here.... + if (ncol(b$smooth[[m]]$S[[1]]) != b$smooth[[m]]$last.para-b$smooth[[m]]$first.para+1) { + return(list(stat=NA,pval=NA,rank=NA)) + } + + ## find indices of random effects other than m + rind <- rep(0,0) + for (i in 1:length(b$smooth)) if (!is.null(b$smooth[[i]]$random)&&b$smooth[[i]]$random&&i!=m) rind <- c(rind,i) + ## get frequentist cov matrix of effects treating smooth terms in rind as random + rc <- recov(b,rind,m) + Ve <- rc$Ve + ind <- b$smooth[[m]]$first.para:b$smooth[[m]]$last.para + B <- mroot(Ve[ind,ind,drop=FALSE]) ## BB'=Ve + + Rm <- rc$Rm + + b.hat <- coef(b)[ind] + d <- Rm%*%b.hat + stat <- sum(d^2)/b$sig2 + ev <- eigen(crossprod(Rm%*%B)/b$sig2,symmetric=TRUE,only.values=TRUE)$values + ev[ev<0] <- 0 + rank <- sum(ev>max(ev)*.Machine$double.eps^.8) + + if (b$scale.estimated) { + #pval <- simf(stat,ev,b$df.residual) + k <- max(1,round(b$df.residual)) + pval <- psum.chisq(0,c(ev,-stat/k),df=c(rep(1,length(ev)),k)) + } else { + #pval <- liu2(stat,ev) + pval <- psum.chisq(stat,ev) + } + list(stat=stat,pval=pval,rank=rank) + } ## end reTest + + recov <- function(b,re=rep(0,0),m=0) { + ## b is a fitted gam object. re is an array of indices of + ## smooth terms to be treated as fully random.... + ## Returns frequentist Cov matrix based on the given + ## mapping from data to params, but with dist of data + ## corresponding to that implied by treating terms indexed + ## by re as random effects... (would be usual frequentist + ## if nothing treated as random) + ## if m>0, then this indexes a term, not in re, whose + ## unpenalized cov matrix is required, with the elements of re + ## dropped. + if (!inherits(b,"gam")) stop("recov works with fitted gam objects only") + if (is.null(b$full.sp)) sp <- b$sp else sp <- b$full.sp + if (length(re)<1) { + if (m>0) { + ## annoyingly, need total penalty + np <- length(coef(b)) + k <- 1;S1 <- matrix(0,np,np) + for (i in 1:length(b$smooth)) { + ns <- length(b$smooth[[i]]$S) + ind <- b$smooth[[i]]$first.para:b$smooth[[i]]$last.para + if (ns>0) for (j in 1:ns) { + S1[ind,ind] <- S1[ind,ind] + sp[k]*b$smooth[[i]]$S[[j]] + k <- k + 1 + } + } + LRB <- rbind(b$R,t(mroot(S1))) + ii <- b$smooth[[m]]$first.para:b$smooth[[m]]$last.para + ## ii is cols of LRB related to smooth m, which need + ## to be moved to the end... 
+ LRB <- cbind(LRB[,-ii],LRB[,ii]) + ii <- (ncol(LRB)-length(ii)+1):ncol(LRB) + Rm <- qr.R(qr(LRB,tol=0,LAPACK=FALSE))[ii,ii] ## unpivoted QR + } else Rm <- NULL + return(list(Ve=(t(b$Ve)+b$Ve)*.5,Rm=Rm)) + } + + if (m%in%re) stop("m can't be in re") + ## partition R into R1 ("fixed") and R2 ("random"), with S1 and S2 + p <- length(b$coefficients) + rind <- rep(FALSE,p) ## random coefficient index + for (i in 1:length(re)) { + rind[b$smooth[[re[i]]]$first.para:b$smooth[[re[i]]]$last.para] <- TRUE + } + p2 <- sum(rind) ## number random + p1 <- p - p2 ## number fixed + map <- rep(0,p) ## remaps param indices to indices in split version + map[rind] <- 1:p2 ## random + map[!rind] <- 1:p1 ## fixed + + ## split R... + R1 <- b$R[,!rind] ## fixed effect columns + R2 <- b$R[,rind] ## random effect columns + ## ever since I've known you, I've had a problem, + + ## assemble S1 and S2 + S1 <- matrix(0,p1,p1);S2 <- matrix(0,p2,p2) + + k <- 1 + for (i in 1:length(b$smooth)) { + ns <- length(b$smooth[[i]]$S) + ind <- map[b$smooth[[i]]$first.para:b$smooth[[i]]$last.para] + is.random <- i%in%re + if (ns>0) for (j in 1:ns) { + if (is.random) S2[ind,ind] <- S2[ind,ind] + sp[k]*b$smooth[[i]]$S[[j]] else + S1[ind,ind] <- S1[ind,ind] + sp[k]*b$smooth[[i]]$S[[j]] + k <- k + 1 + } + } + ## pseudoinvert S2 + if (nrow(S2)==1) { + S2[1,1] <- 1/sqrt(S2[1,1]) + } else if (max(abs(diag(diag(S2))-S2))==0) { + ds2 <- diag(S2) + ind <- ds2 > max(ds2)*.Machine$double.eps^.8 + ds2[ind] <- 1/ds2[ind];ds2[!ind] <- 0 + diag(S2) <- sqrt(ds2) + } else { + ev <- eigen((S2+t(S2))/2,symmetric=TRUE) + ind <- ev$values > max(ev$values)*.Machine$double.eps^.8 + ev$values[ind] <- 1/ev$values[ind];ev$values[!ind] <- 0 + ## S2 <- ev$vectors%*%(ev$values*t(ev$vectors)) + S2 <- sqrt(ev$values)*t(ev$vectors) + } + ## Cholesky of cov matrix.... + ## L <- chol(diag(p)+R2%*%S2%*%t(R2)) ## L'L = I + R2 S2^- R2' + L <- chol(diag(p) + crossprod(S2%*%t(R2))) + + ## now we need the square root of the unpenalized + ## cov matrix for m + if (m>0) { + ## llr version + LRB <- rbind(L%*%R1,t(mroot(S1))) + ii <- map[b$smooth[[m]]$first.para:b$smooth[[m]]$last.para] + ## ii is cols of LRB related to smooth m, which need + ## to be moved to the end... 
+ LRB <- cbind(LRB[,-ii],LRB[,ii]) + ii <- (ncol(LRB)-length(ii)+1):ncol(LRB) ## need to pick up final block + Rm <- qr.R(qr(LRB,tol=0,LAPACK=FALSE))[ii,ii,drop=FALSE] ## unpivoted QR + } else Rm <- NULL + + list(Ve= crossprod(L%*%b$R%*%b$Vp)/b$sig2, ## Frequentist cov matrix + Rm=Rm) + # mapi <- (1:p)[!rind] ## indexes mapi[j] is index of total coef vector to which jth row/col of Vb/e relates + + } ## end of recov + +#### Standard summary of formula and model arguments #### +message("GAM formula:") +print(object$call) +message() + +message("Family:") +cat(paste0(object$family, '\n')) +message() + +if(object$use_lv){ + message("Number of latent factors:") + cat(object$n_lv, '\n') + message() +} + +message('N series:') +cat(NCOL(object$ytimes), '\n') +message() + +message('N observations per series:') +cat(NROW(object$obs_data) / NCOL(object$ytimes), '\n') +message() + +if(object$family == 'Negative Binomial'){ + message("Dispersion parameter estimate:") + print(MCMCvis::MCMCsummary(object$jags_output, 'r')[,c(3:7)]) + message() +} + +#### Summary table and approximate tests for non-flat smooth functions #### +coef_names <- names(object$mgcv_model$coefficients) +m <- length(object$mgcv_model$smooth) +useR <- TRUE +residual.df<-length(object$y)-sum(object$edf) +est.disp <- object$mgcv_model$scale.estimated + +df <- edf1 <- edf <- s.pv <- chi.sq <- array(0, m) +if (m>0) { # form test statistics for each smooth + ## Bayesian p-values required + if (useR) X <- object$mgcv_model$R else { + sub.samp <- max(1000,2*length(object$coefficients)) + if (nrow(object$model)>sub.samp) { ## subsample to get X for p-values calc. + kind <- temp.seed(11) + ind <- sample(1:nrow(object$model),sub.samp,replace=FALSE) ## sample these rows from X + X <- predict(object,object$model[ind,],type="lpmatrix") + #RNGkind(kind[1],kind[2]) + #assign(".Random.seed",seed,envir=.GlobalEnv) ## RNG behaves as if it had not been used + temp.seed(kind) + } else { ## don't need to subsample + X <- model.matrix(object) + } + X <- X[!is.na(rowSums(X)),] ## exclude NA's (possible under na.exclude) + } ## end if (m>0) + ii <- 0 + + # Median beta params for smooths and their covariances + V <- cov(MCMCvis::MCMCchains(object$jags_output, 'b')) + object$mgcv_model$Vp <- V + object$mgcv_model$Vc <- V + p <- MCMCvis::MCMCsummary(object$jags_output, 'b')[,c(4)] + names(p) <- coef_names + object$mgcv_model$coefficients <- p + + # Smoothing parameters and EDF + object$mgcv_model$sp <- jam$sp + object$mgcv_model$edf <- jam$edf + + for (i in 1:m) { ## loop through smooths + start <- object$mgcv_model$smooth[[i]]$first.para;stop <- object$mgcv_model$smooth[[i]]$last.para + + edf1i <- edfi <- sum(object$mgcv_model$edf[start:stop]) # edf for this smooth + ## extract alternative edf estimate for this smooth, if possible... 
+ if (!is.null(object$mgcv_model$edf1)) edf1i <- sum(object$mgcv_model$edf1[start:stop]) + Xt <- X[start:stop,start:stop,drop=FALSE] + fx <- if (inherits(object$mgcv_model$smooth[[i]],"tensor.smooth")&& + !is.null(object$mgcv_model$smooth[[i]]$fx)) all(object$mgcv_model$smooth[[i]]$fx) else object$mgcv_model$smooth[[i]]$fixed + if (!fx&&object$mgcv_model$smooth[[i]]$null.space.dim==0&&!is.null(object$mgcv_model$R)) { ## random effect or fully penalized term + res <- reTest(object$mgcv_model,i) + } else { ## Inverted Nychka interval statistics + rdf <- -1 + res <- testStat(p[start:stop],Xt,V,min(ncol(Xt),edf1i),type=0,res.df = residual.df) + } + + if (!is.null(res)) { + ii <- ii + 1 + df[ii] <- res$rank + chi.sq[ii] <- res$stat + s.pv[ii] <- res$pval + edf1[ii] <- edf1i + edf[ii] <- edfi + names(chi.sq)[ii]<- object$mgcv_model$smooth[[i]]$label + } + } + if (ii==0) df <- edf1 <- edf <- s.pv <- chi.sq <- array(0, 0) else { + df <- df[1:ii];chi.sq <- chi.sq[1:ii];edf1 <- edf1[1:ii] + edf <- edf[1:ii];s.pv <- s.pv[1:ii] + } + if (!est.disp) { + s.table <- cbind(edf, df, chi.sq, s.pv) + dimnames(s.table) <- list(names(chi.sq), c("edf", "Ref.df", "Chi.sq", "p-value")) + } else { + s.table <- cbind(edf, df, chi.sq/df, s.pv) + dimnames(s.table) <- list(names(chi.sq), c("edf", "Ref.df", "F", "p-value")) + } + +} + +message('GAM smooth term approximate significances:') +printCoefmat(s.table, digits = 4, signif.stars = T) +message() + +message("GAM coefficient (beta) estimates:") +mvgam_coefs <- MCMCvis::MCMCsummary(object$jags_output, 'b')[,c(3:7)] +rownames(mvgam_coefs) <- coef_names +print(mvgam_coefs) +message() + +message("GAM smoothing parameter (rho) estimates:") +rho_coefs <- MCMCvis::MCMCsummary(object$jags_output, 'rho')[,c(3:7)] +rho_names <- unlist(lapply(seq(1:length(object$mgcv_model$smooth)), function(i){ + + number_seq <- seq(1:(1 + object$mgcv_model$smooth[[i]]$null.space.dim + + (length(object$mgcv_model$smooth[[i]]$sp) - 1))) + number_seq[1] <- '' + + paste0(rep(object$mgcv_model$smooth[[i]]$label, + 1 + object$mgcv_model$smooth[[i]]$null.space.dim + + (length(object$mgcv_model$smooth[[i]]$sp) - 1)), + number_seq) +})) +rownames(rho_coefs) <- rho_names +print(rho_coefs) +message() + +message("Latent trend drift and AR parameter estimates:") +print(MCMCvis::MCMCsummary(object$jags_output, c('phi', 'ar1', + 'ar2', 'ar3'))[,c(3:7)]) +message() +} + + + diff --git a/man/hpd.Rd b/man/hpd.Rd index 2d7c6840..d6ec22c0 100644 --- a/man/hpd.Rd +++ b/man/hpd.Rd @@ -12,8 +12,8 @@ hpd(x, coverage = 0.95) \item{coverage}{\code{numeric} value specifying the width of the HPD interval. Default is 0.95} } \value{ -A \code{list} containing the reconciled forecast distributions for each series in \code{y}. Each element in -the \code{vector} with three values: lower estimate, median estimate and upper estimate of the HPD interval +A \code{matrix} containing the estimated lower and upper bounds of the interval along with the estimated +mode } \description{ This function uses estimated densities to calculate HPD intervals. 
diff --git a/man/mvjagam.Rd b/man/mvjagam.Rd
index 0aa94b56..3d5363d3 100644
--- a/man/mvjagam.Rd
+++ b/man/mvjagam.Rd
@@ -6,6 +6,7 @@
\usage{
mvjagam(
  formula,
+  knots,
  data_train,
  data_test,
  prior_simulation = FALSE,
@@ -30,6 +31,11 @@ mvjagam(
for a GLM except that smooth terms, s, te, ti and t2, can be added to the right hand side
to specify that the linear predictor depends on smooth functions of predictors
(or linear functionals of these).}

+\item{knots}{An optional \code{list} containing user-specified knot values to be used for basis construction.
+For most bases the user simply supplies the knots to be used, which must match up with the k value supplied
+(note that the number of knots is not always just k). Different terms can use different numbers of knots,
+unless they share a covariate.}
+
\item{data_train}{A \code{dataframe} containing the model response variable and covariates
required by the GAM \code{formula}. Should include columns:
'y' (the discrete outcomes; \code{NA}s allowed)
@@ -100,10 +106,15 @@ dynamic factors to capture trend dependencies in a reduced dimension format, or
}
\details{
A \code{\link[mgcv]{jagam}} model file is generated from \code{formula} and modified to include the latent
-state space trends. Prior distributions for most important terms can be altered by the user to inspect model
+state space trends. Prior distributions for most important model parameters can be altered by the user to inspect model
sensitivities to given priors. Note that latent trends are estimated on the log scale so choose tau, AR and phi
priors accordingly. The model parameters are estimated in a Bayesian framework using Gibbs sampling
-in \code{\link[rjags]{jags.model}}. For the time being, all series are assumed to have the same overdispersion
+in \code{\link[rjags]{jags.model}}. When using a dynamic factor model for the trends, factor loadings are given
+regularized horseshoe priors that theoretically allow unneeded factors to be dropped from the model by squeezing
+the loadings for an entire factor toward zero. This helps protect against selecting more latent factors than are
+needed to capture dependencies in the data, so it can often be advantageous to set n_lv slightly larger than
+expected. However, larger numbers of factors come with additional computational costs, so this should be
+balanced against run time. For the time being, all series are assumed to have the same overdispersion
parameter when using a negative binomial distribution, though this will be relaxed in future versions. For
each series, randomized quantile (i.e. Dunn-Smyth) residuals are calculated for inspecting model diagnostics
using medians of posterior predictions. If the fitted model is appropriate then Dunn-Smyth residuals will be
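
As an illustration of the new knots argument (mirroring its use in test_mvjagam.R below), boundary knots for a cyclic seasonal smooth can be fixed at 0.5 and 12.5 so that a 12-month season wraps smoothly from December back into January. The library() call assumes the package name, and fake_data is assumed to come from sim_mvgam() as in the test script:

# Sketch of supplying user-specified knots for a cyclic smooth
library(mvgam)  # assumed package name for these exported functions
mod <- mvjagam(formula = y ~ s(season, bs = 'cc'),
               knots = list(season = c(0.5, 12.5)),
               data_train = fake_data$data_train,
               data_test = fake_data$data_test,
               family = 'nb')
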
diff --git a/man/plot_mvgam_fc.Rd b/man/plot_mvgam_fc.Rd
index 41d846f3..83faf334 100644
--- a/man/plot_mvgam_fc.Rd
+++ b/man/plot_mvgam_fc.Rd
@@ -4,7 +4,7 @@
\alias{plot_mvgam_fc}
\title{Plot mvjagam posterior predictions for a specified series}
\usage{
-plot_mvgam_fc(object, series, data_test, hide_xlabels = FALSE, ylim)
+plot_mvgam_fc(object, series, data_test, hide_xlabels = FALSE, ylab, ylim)
}
\arguments{
\item{object}{\code{list} object returned from \code{mvjagam}}
@@ -19,6 +19,8 @@ Note this is only useful if the same \code{data_test} was also included when fit
\item{hide_xlabels}{\code{logical}. If \code{TRUE}, no xlabels are printed to allow the user
to add custom labels using \code{axis} from base \code{R}}

+\item{ylab}{Optional \code{character} string specifying the y-axis label}
+
\item{ylim}{Optional \code{vector} of y-axis limits (min, max)}
}
\value{
diff --git a/man/plot_mvgam_resids.Rd b/man/plot_mvgam_resids.Rd
new file mode 100644
index 00000000..4d4db0cb
--- /dev/null
+++ b/man/plot_mvgam_resids.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot_mvgam_resids.R
+\name{plot_mvgam_resids}
+\alias{plot_mvgam_resids}
+\title{Residual diagnostics for a fitted mvjagam object}
+\usage{
+plot_mvgam_resids(object, series = 1)
+}
+\arguments{
+\item{object}{\code{list} object returned from \code{mvjagam}}
+
+\item{series}{\code{integer} specifying which series in the set is to be plotted}
+}
+\value{
+A series of base \code{R} plots
+}
+\description{
+This function takes a fitted \code{mvjagam} object and returns various residual diagnostic plots
+}
+\details{
+A total of four base \code{R} plots are generated to examine Dunn-Smyth residuals for
+the specified series. Plots include a residuals vs fitted values plot,
+a Q-Q plot, and two plots to check for any remaining temporal autocorrelation in the residuals.
+Note that all plots use posterior medians of fitted values / residuals, so uncertainty is not represented.
+}
+\author{
+Nicholas J Clark
+}
diff --git a/man/summary_mvgam.Rd b/man/summary_mvgam.Rd
new file mode 100644
index 00000000..efe2f800
--- /dev/null
+++ b/man/summary_mvgam.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/summary_mvgam.R
+\name{summary_mvgam}
+\alias{summary_mvgam}
+\title{Summary for a fitted mvjagam object}
+\usage{
+summary_mvgam(object)
+}
+\arguments{
+\item{object}{\code{list} object returned from \code{mvjagam}}
+}
+\value{
+A \code{list} is printed on-screen showing the summaries for the model
+}
+\description{
+This function takes a fitted \code{mvjagam} object and prints various useful summaries from it
+}
+\details{
+A brief summary of the model's call is printed, along with posterior intervals for
+some of the key parameters in the model. Note that some smooths have extra penalties on the null space,
+so summaries for the \code{rho} parameters may include more penalty terms than the number of smooths in
+the original model formula. The function also returns an approximate smooth term
+significance table, but note that this significance comes with the usual caveats associated with calculations
+of p-values for smooth terms in \code{\link[mgcv]{summary.gam}} (i.e. all p-values are computed without
+considering uncertainty in the smoothing parameter estimates), with the added challenge that
+Estimated Degrees of Freedom (edf) for each smooth term are taken from a call to \code{\link[mgcv]{gam}}
+that does not include the latent trend component of the model.
+}
+\author{
+Nicholas J Clark
+}
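
Both help files above lean on Dunn-Smyth (randomized quantile) residuals. A minimal sketch for Poisson counts with assumed known means; the package itself computes these from medians of posterior predictions:

# Dunn-Smyth residual sketch; with a well-specified model these residuals
# are approximately standard normal
dunn_smyth_sketch <- function(y, mu){
  a <- ppois(y - 1, lambda = mu)           # CDF just below the observation
  b <- ppois(y, lambda = mu)               # CDF at the observation
  u <- runif(length(y), min = a, max = b)  # randomise within the CDF jump
  qnorm(u)
}
# Residuals from a correct model should pass normality checks
qqnorm(dunn_smyth_sketch(rpois(500, 5), mu = rep(5, 500)))
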
diff --git a/pfilter/particles.rda b/pfilter/particles.rda
new file mode 100644
index 00000000..e26782a3
Binary files /dev/null and b/pfilter/particles.rda differ
diff --git a/test_mvjagam.R b/test_mvjagam.R
index 2681765b..171211c9 100644
--- a/test_mvjagam.R
+++ b/test_mvjagam.R
@@ -17,12 +17,19 @@ rm(series)
mod <- mvjagam(data_train = fake_data$data_train,
               data_test = fake_data$data_test,
               formula = y ~ s(season, bs = c('cc')),
+              knots = list(season = c(0.5, 12.5)),
               family = 'nb',
               use_lv = F,
               trend_model = 'AR3',
-              n.burnin = 10000,
+              n.burnin = 1000,
               auto_update = F)

+# Check the model summary and plot the smooth term
+summary_mvgam(mod)
+summary(mod$mgcv_model)
+plot_mvgam_resids(mod, 1)
+plot_mvgam_smooth(mod, 1, 'season')
+
# Fit a mis-specified model for testing the model comparison functions by
# smoothing on a white noise covariate
fake_data$data_train$fake_cov <- rnorm(NROW(fake_data$data_train))
@@ -37,22 +44,16 @@ mod2 <- mvjagam(data_train = fake_data$data_train,
                n.iter = 1000,
                thin = 1,
                auto_update = F)
+summary_mvgam(mod2)
+plot_mvgam_smooth(mod2, 1, 'fake_cov')
+plot_mvgam_resids(mod2)

# Compare the models using rolling forecast DRPS evaluation
compare_mvgams(mod, mod2, fc_horizon = 6,
-               n_evaluations = 25, n_cores = 4)
-
-# Summary plots and diagnostics for the preferred model (Model 1)
-# Check Dunn-Smyth residuals for autocorrelation
-plot(mod$resids$Air)
-lines(mod$resids$Air)
-acf(mod$resids$Air)
-pacf(mod$resids$Air)
-
-# Plot the estimated seasonality smooth function
-plot_mvgam_smooth(mod, smooth = 'season')
+               n_evaluations = 30, n_cores = 3)

# Plot the posterior distribution of in-sample and out-of-sample predictions
+# with true observations overlaid as points
plot_mvgam_fc(mod, series = 1, data_test = fake_data$data_test)

# Plot the estimated latent trend
@@ -70,8 +71,8 @@ par(mfrow=c(1,1))
pfilter_mvgam_init(object = mod, n_particles = 10000, n_cores = 2,
                   data_assim = fake_data$data_test)

-# Assimilate some observations
-pfilter_mvgam_online(data_assim = fake_data$data_test[1:2,], n_cores = 2,
+# Assimilate some of the next out of sample observations
+pfilter_mvgam_online(data_assim = fake_data$data_test[1:7,], n_cores = 2,
                     kernel_lambda = 1)

# Forecast from particles using the covariate information in remaining data_test observations
@@ -111,47 +112,44 @@ trends_mod <- mvjagam(data_train = trends_data$data_train,
                      data_test = trends_data$data_test,
                      formula = y ~ s(season, k = 6, m = 2, bs = 'cc') +
                        s(season, by = series, k = 10, m = 1) - 1,
+                     knots = list(season = c(0.5, 12.5)),
                      use_lv = T,
                      trend_model = 'AR3',
                      n_lv = 3,
                      family = 'nb',
-                     n.burnin = 5000,
+                     n.burnin = 1000,
                      n.iter = 1000,
                      thin = 1,
                      upper_bounds = rep(100, length(terms)),
                      auto_update = F)
+summary_mvgam(trends_mod)

-# Poor model for comparison
+# A fixed seasonality model for comparison
trends_mod2 <- mvjagam(data_train = trends_data$data_train,
                       data_test = trends_data$data_test,
-                      formula = y ~ s(season, k = 3, bs = 'cc') - 1,
+                      formula = y ~ s(season, k = 6, bs = 'cc') - 1,
+                      knots = list(season = c(0.5, 12.5)),
                       use_lv = T,
-                      trend_model = 'RW',
-                      n_lv = 2,
-                      family = 'poisson',
-                      n.burnin = 50,
-                      n.iter = 500,
+                      trend_model = 'AR3',
+                      n_lv = 3,
+                      family = 'nb',
+                      n.burnin = 1000,
+                      n.iter = 1000,
                       thin = 1,
                       upper_bounds = rep(100, length(terms)),
                       auto_update = F)
+summary_mvgam(trends_mod2)

-# Compare the models using rolling forecast DRPS evaluation
+# Compare the models using rolling forecast DRPS evaluation. Here we focus on
+# near-term forecasts (horizon = 3) when comparing model performances
par(mfrow = c(1,1))
-compare_mvgams(trends_mod, trends_mod2, fc_horizon = 12,
-               n_evaluations = 20)
+compare_mvgams(trends_mod, trends_mod2, fc_horizon = 3,
+               n_cores = 3, n_evaluations = 20, n_samples = 2500)

# Look at Dunn-Smyth residuals for some series from preferred model (Model 1)
-hist(trends_mod$resids$`dog tick`)
-plot(trends_mod$resids$`dog tick`)
-lines(trends_mod$resids$`dog tick`)
-acf(trends_mod$resids$`dog tick`)
-pacf(trends_mod$resids$`dog tick`)
-
-hist(trends_mod$resids$`la nina`)
-plot(trends_mod$resids$`la nina`)
-lines(trends_mod$resids$`la nina`)
-acf(trends_mod$resids$`la nina`)
-pacf(trends_mod$resids$`la nina`)
+plot_mvgam_resids(trends_mod, 1)
+plot_mvgam_resids(trends_mod, 2)
+plot_mvgam_resids(trends_mod, 3)

# Plot posterior predictive distributions
par(mfrow = c(4, 1))
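
The rolling evaluations above rank models by discrete rank probability score (DRPS). A minimal sketch for a single count observation, assuming DRPS is the squared distance between the forecast CDF and the observation's step CDF summed over the count support; the package's internal scoring code may differ in details:

# drps_sketch() is illustrative, not the package's scoring implementation;
# 'upper' truncates the count support and is assumed large enough
drps_sketch <- function(fc_draws, y_obs, upper = 1000){
  support <- 0:upper
  fc_cdf <- ecdf(fc_draws)(support)        # empirical forecast CDF
  obs_cdf <- as.numeric(support >= y_obs)  # step CDF of the observation
  sum((fc_cdf - obs_cdf)^2)
}
# Lower scores indicate sharper, better-calibrated forecasts
drps_sketch(rpois(1000, lambda = 5), y_obs = 4)
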