Skip to content

Commit

Permalink
fix error plotting factors with 2 levels
Browse files Browse the repository at this point in the history
  • Loading branch information
nicholasjclark committed Jun 15, 2023
1 parent 0562150 commit dd94a5a
Show file tree
Hide file tree
Showing 8 changed files with 238 additions and 80 deletions.
27 changes: 22 additions & 5 deletions R/dynamic.R
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,26 @@ dynamic = function(variable, rho = 5, stationary = TRUE){
#' @noRd
interpret_mvgam = function(formula, N){

facs <- colnames(attr(terms.formula(formula), 'factors'))

# Re-arrange so that random effects always come last
if(any(grepl('bs = \"re\"', facs, fixed = TRUE))){
newfacs <- facs[!grepl('bs = \"re\"', facs, fixed = TRUE)]
refacs <- facs[grepl('bs = \"re\"', facs, fixed = TRUE)]
int <- attr(terms.formula(formula), 'intercept')
newformula <- as.formula(paste(terms.formula(formula)[[2]], '~',paste(paste(newfacs, collapse = '+'),
'+',
paste(refacs, collapse = '+'),
collapse = '+'),
ifelse(int == 0, ' - 1', '')))
} else {
newformula <- formula
}
attr(newformula, '.Environment') <- attr(formula, '.Environment')

# Check if any terms use the dynamic wrapper
response <- terms.formula(formula)[[2]]
tf <- terms.formula(formula, specials = c("dynamic"))
response <- terms.formula(newformula)[[2]]
tf <- terms.formula(newformula, specials = c("dynamic"))
which_dynamics <- attr(tf,"specials")$dynamic

# Update the formula to the correct Gaussian Process spline
Expand Down Expand Up @@ -96,7 +113,7 @@ interpret_mvgam = function(formula, N){

# Replace dynamic terms with the correct specification
update_terms <- vector(length = length(which_dynamics))
old_rhs <- as.character(formula)[3]
old_rhs <- as.character(newformula)[3]
old_rhs <- gsub(" ", "", (old_rhs), fixed = TRUE)
old_rhs <- gsub(',stationary=TRUE|,stationary=FALSE', '',
old_rhs)
Expand All @@ -114,10 +131,10 @@ interpret_mvgam = function(formula, N){

# Return the updated formula for passing to mgcv
updated_formula <- as.formula(paste(response, '~', old_rhs))
attr(updated_formula, '.Environment') <- attr(formula, '.Environment')
attr(updated_formula, '.Environment') <- attr(newformula, '.Environment')

} else {
updated_formula <- formula
updated_formula <- newformula
}

return(updated_formula)
Expand Down
2 changes: 1 addition & 1 deletion R/evaluate_mvgams.R
Original file line number Diff line number Diff line change
Expand Up @@ -920,7 +920,7 @@ variogram_score = function(truth, fc, log = FALSE, weights){
if(i == j){
out[i,j] <- 0
} else {
v_fc <- quantile(abs(fc[i,] - fc[j,]) ^ 0.5, 0.5)
v_fc <- quantile(abs(fc[i,] - fc[j,]) ^ 0.5, 0.5, na.rm = TRUE)
v_dat <- abs(truth[i] - truth[j]) ^ 0.5
out[i,j] <- 2 * weights[i, j] * ((v_dat - v_fc) ^ 2)
}
Expand Down
145 changes: 101 additions & 44 deletions R/get_mvgam_priors.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,41 @@
#'Any other variables to be included in the linear predictor of \code{formula} must also be present
#'@param data_train Deprecated. Still works in place of \code{data} but users are recommended to use
#'\code{data} instead for more seamless integration into `R` workflows
#'@param family \code{character}. Must be either 'nb' (for Negative Binomial), 'tw' (for Tweedie) or 'poisson'
#'@param family \code{family} specifying the exponential observation family for the series. Currently supported
#'families are:
#'\itemize{
#' \item`nb()` for count data
#' \item`poisson()` for count data
#' \item`tweedie()` for count data (power parameter `p` fixed at `1.5`)
#' \item`gaussian()` for real-valued data
#' \item`betar()` for proportional data on `(0,1)`
#' \item`lognormal()` for non-negative real-valued data
#' \item`student_t()` for real-valued data
#' \item`Gamma()` for non-negative real-valued data}
#'See \code{\link{mvgam_families}} for more details
#'@param use_lv \code{logical}. If \code{TRUE}, use dynamic factors to estimate series'
#'latent trends in a reduced dimension format. If \code{FALSE}, estimate independent latent trends for each series
#'@param n_lv \code{integer} the number of latent dynamic factors to use if \code{use_lv == TRUE}.
#'Cannot be \code{>n_series}. Defaults arbitrarily to \code{min(2, floor(n_series / 2))}
#'@param trend_model \code{character} specifying the time series dynamics for the latent trend. Options are:
#''None' (no latent trend component; i.e. the GAM component is all that contributes to the linear predictor,
#'and the observation process is the only source of error; similarly to what is estimated by \code{\link[mgcv]{gam}}),
#''RW' (random walk with possible drift),
#''AR1' (AR1 model with intercept),
#''AR2' (AR2 model with intercept) or
#''AR3' (AR3 model with intercept) or
#''VAR1' (with possible drift; only available in \code{Stan}) or
#''GP' (Gaussian Process with squared exponential kernel;
#'only available for estimation in \code{Stan})
#'\itemize{
#' \item `None` (no latent trend component; i.e. the GAM component is all that contributes to the linear predictor,
#'and the observation process is the only source of error; similarly to what is estimated by \code{\link[mgcv]{gam}})
#' \item `RW` (random walk with possible drift)
#' \item `AR1` (with possible drift)
#' \item `AR2` (with possible drift)
#' \item `AR3` (with possible drift)
#' \item `VAR1` (with possible drift; only available in \code{Stan})
#' \item `GP` (Gaussian Process with squared exponential kernel;
#'only available in \code{Stan})}
#'@param trend_map Optional `data.frame` specifying which series should depend on which latent
#'trends. Useful for allowing multiple series to depend on the same latent trend process, but with
#'different observation processes. If supplied, a latent factor model is set up by setting
#'`use_lv = TRUE` and using the mapping to set up the shared trends. Needs to have column names
#'`series` and `trend`, with integer values in the `trend` column to state which trend each series
#'should depend on. The `series` column should have a single unique entry for each series in the
#'data (names should perfectly match factor levels of the `series` variable in `data`). See examples
#'in \code{\link{mvgam}} for details
#'@param drift \code{logical} estimate a drift parameter in the latent trend components. Useful if the latent
#'trend is expected to broadly follow a non-zero slope. Note that if the latent trend is more or less stationary,
#'the drift parameter can become unidentifiable, especially if an intercept term is included in the GAM linear
Expand Down Expand Up @@ -125,6 +145,7 @@ get_mvgam_priors = function(formula,
n_lv,
use_stan = TRUE,
trend_model = 'None',
trend_map,
drift = FALSE){

# Validate the family argument
Expand All @@ -139,9 +160,6 @@ get_mvgam_priors = function(formula,
"student",
"Gamma"))

# Validate the trend argument
trend_model <- evaluate_trend_model(trend_model)

if(missing("data") & missing("data_train")){
stop('Argument "data" is missing with no default')
}
Expand All @@ -150,6 +168,48 @@ get_mvgam_priors = function(formula,
data_train <- data
}

# Validate the trend argument
trend_model <- evaluate_trend_model(trend_model)

# Check trend_map is correctly specified
if(!missing(trend_map)){

# No point in trend mapping if trend model is 'None'
if(trend_model == 'None'){
stop('cannot set up latent trends when "trend_model = None"',
call. = FALSE)
}

# Trend mapping not supported by JAGS
if(!use_stan){
stop('trend mapping not available for JAGS',
call. = FALSE)
}

# trend_map must have an entry for each unique time series
if(!all(sort(trend_map$series) == sort(unique(data_train$series)))){
stop('argument "trend_map" must have an entry for every unique time series in "data"',
call. = FALSE)
}

# trend_map must not specify a greater number of trends than there are series
if(max(trend_map$trend) > length(unique(data_train$series))){
stop('argument "trend_map" specifies more latent trends than there are series in "data"',
call. = FALSE)
}

# trend_map must not skip any trends
if(!all(sort(unique(trend_map$trend)) == seq(1:max(trend_map$trend)))){
stop('argument "trend_map" must link at least one series to each latent trend')
}

# If trend_map correctly specified, set use_lv to TRUE
use_lv <- TRUE

# Model should be set up using dynamic factors of the correct length
n_lv <- max(trend_map$trend)
}

# JAGS cannot support latent GP or VAR trends
if(!use_stan & trend_model %in%c ('GP', 'VAR1')){
warning('gaussian process and VAR trends not yet supported for JAGS; reverting to Stan')
Expand Down Expand Up @@ -442,27 +502,19 @@ get_mvgam_priors = function(formula,

if(trend_model == 'RW'){
if(use_stan){
if(use_lv){
trend_df <- data.frame(param_name = c('vector<lower=0>[n_lv] sigma;'),
param_length = n_lv,
param_info = c('trend sd'),
prior = c('sigma ~ exponential(1);'),
example_change = c(
paste0('sigma ~ exponential(',
round(runif(min = 0.01, max = 1, n = 1), 2),
');'
)))
} else {
trend_df <- data.frame(param_name = c('vector<lower=0>[n_series] sigma;'),
param_length = length(unique(data_train$series)),
param_info = c('trend sd'),
prior = c('sigma ~ exponential(1);'),
example_change = c(
paste0('sigma ~ exponential(',
round(runif(min = 0.01, max = 1, n = 1), 2),
');'
)))
}
trend_df <- data.frame(param_name = c(paste0('vector<lower=0>[',
ifelse(use_lv, 'n_lv', 'n_series'),
'] sigma;')),
param_length = ifelse(use_lv,
n_lv,
length(unique(data_train$series))),
param_info = c('trend sd'),
prior = c('sigma ~ exponential(2);'),
example_change = c(
paste0('sigma ~ exponential(',
round(runif(min = 0.01, max = 1, n = 1), 2),
');'
)))

} else {
trend_df <- data.frame(param_name = c('vector<lower=0>[n_series] sigma'),
Expand Down Expand Up @@ -736,18 +788,23 @@ get_mvgam_priors = function(formula,

# Remove options for trend variance priors if using a dynamic factor model
if(use_lv){
trend_df %>%
dplyr::filter(!grepl(paste0('vector<lower=0>[',
ifelse(use_lv, 'n_lv', 'n_series'),
'] sigma;'), param_name)) -> trend_df
if(missing(trend_map)){
trend_df %>%
dplyr::filter(!grepl(paste0('vector<lower=0>[',
ifelse(use_lv, 'n_lv', 'n_series'),
'] sigma;'), param_name,
fixed = TRUE)) -> trend_df
}

if(use_stan){
trend_df <- rbind(trend_df,
data.frame(param_name = c('vector[M] L;'),
param_length = n_lv * length(unique(data_train$series)),
param_info = c('factor loadings'),
prior = c('L ~ student_t(5, 0, 1);'),
example_change = 'L ~ std_normal();'))
if(missing(trend_map)){
trend_df <- rbind(trend_df,
data.frame(param_name = c('vector[M] L;'),
param_length = n_lv * length(unique(data_train$series)),
param_info = c('factor loadings'),
prior = c('L ~ student_t(5, 0, 1);'),
example_change = 'L ~ std_normal();'))
}
}
}

Expand Down
24 changes: 17 additions & 7 deletions R/lfo_cv.mvgam.R
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ lfo_cv.mvgam = function(object,

# Initialize the process for i = min_t, generating a
# conditional forecast for all of the future data
data_splits <- cv_split(all_data, last_train = min_t)
data_splits <- cv_split(all_data, last_train = min_t,
fc_horizon = fc_horizon)

# Fit model to training and forecast the testing
fit_past <- update(object,
Expand Down Expand Up @@ -161,7 +162,8 @@ lfo_cv.mvgam = function(object,
refits <- c(refits, i)

# Subset the data to now include the last set of training observations
data_splits <- cv_split(all_data, last_train = i)
data_splits <- cv_split(all_data, last_train = i,
fc_horizon = fc_horizon)

# Re-fit the model
fit_past <- update(fit_past,
Expand Down Expand Up @@ -279,18 +281,24 @@ plot.mvgam_lfo = function(x, ...){

#' Function to generate training and testing splits
#' @noRd
cv_split = function(data, last_train){
if(class(data)[1] == 'list'){
cv_split = function(data, last_train, fc_horizon = 1){
if(inherits(data, 'list')){

# Find indices of training and testing splits
temp_dat = data.frame(time = data$time,
series = data$series) %>%
dplyr::mutate(index = dplyr::row_number()) %>%
dplyr::arrange(time, series)

indices_train <- temp_dat %>%
dplyr::filter(time <= last_train) %>%
dplyr::pull(index)

indices_test <- temp_dat %>%
dplyr::filter(time > last_train &
time <= (last_train + fc_horizon)) %>%
dplyr::pull(index)

# Split
data_train <- lapply(data, function(x){
if(is.matrix(x)){
Expand All @@ -302,18 +310,20 @@ cv_split = function(data, last_train){

data_test <- lapply(data, function(x){
if(is.matrix(x)){
matrix(x[-indices_train,], ncol = NCOL(x))
matrix(x[indices_test,], ncol = NCOL(x))
} else {
x[-indices_train]
x[indices_test]
}
})

} else {
data_train <- data %>%
dplyr::filter(time <= last_train) %>%
dplyr::arrange(time, series)

data_test <- data %>%
dplyr::filter(time > last_train) %>%
dplyr::filter(time > last_train &
time <= (last_train + fc_horizon)) %>%
dplyr::arrange(time, series)
}

Expand Down
4 changes: 3 additions & 1 deletion R/mvgam.R
Original file line number Diff line number Diff line change
Expand Up @@ -1347,7 +1347,8 @@ mvgam = function(formula,
trend_map = trend_map,
data_train = data_train,
ytimes = ytimes,
n_lv = n_lv)
n_lv = n_lv,
trend_model = trend_model)
vectorised$model_file <- trend_map_setup$model_file
vectorised$model_data <- trend_map_setup$model_data
}
Expand Down Expand Up @@ -1385,6 +1386,7 @@ mvgam = function(formula,
#### Return only the model file and all data / inits needed to run the model
# outside of mvgam ####
if(!run_model){
unlink('base_gam.txt')
output <- structure(list(call = orig_formula,
family = family_char,
trend_model = trend_model,
Expand Down
Loading

0 comments on commit dd94a5a

Please sign in to comment.