bayesian-process-models.qmd

# Applying Bayesian calibration methods to a process model {#sec-bayes-process}

In @sec-bayes, you learned how to estimate parameter values using Bayesian methods. Now, we will apply these methods to estimate parameters in our forest process model from @sec-process-model. Recall that the advantage of the Bayesian approach is that it estimates the parameters given the data, whereas likelihood approaches technically estimate the data given the parameters.

## Running MCMC on the forest process model

```{r}
#| echo: TRUE
#| message: FALSE
#| warning: FALSE
library(tidyverse)
library(patchwork)
source("R/helpers.R")
source("R/forest_model.R")
set.seed(100)
```

```{r}
site <- "OSBS"
```

Read in observations

```{r}
obs <- read_csv("data/site_carbon_data.csv", show_col_types = FALSE)
```

Set up dates of simulation, parameters, initial conditions, and meteorology inputs

```{r}
sim_dates <- seq(as_date("2022-01-01"), Sys.Date() - 2, by = "1 day")

ens_members <- 1
params <- list()
params$alpha <- rep(0.02, ens_members)
params$SLA <- rep(4.74, ens_members)
params$leaf_frac <- rep(0.315, ens_members)
params$Ra_frac <- rep(0.5, ens_members)
params$Rbasal <- rep(0.002, ens_members)
params$Q10 <- rep(2.1, ens_members)
params$litterfall_rate <- rep(1/(2.0*365), ens_members) #Two year leaf lifespan
params$litterfall_start <- rep(250, ens_members)
params$litterfall_length<- rep(60, ens_members)
params$mortality <- rep(0.00015, ens_members) #Wood lives about 18 years on average (all trees, branches, roots, course roots)
params$sigma.leaf <- rep(0.0, ens_members) #0.01 
params$sigma.wood <- rep(0.0, ens_members) #0.01 ## wood biomass
params$sigma.soil <- rep(0.0, ens_members)# 0.01
params <- as.data.frame(params)


state_init <- rep(NA, 3)

state_init[1] <- obs |> 
  filter(datetime %in% sim_dates,
         variable == "lai") |> 
  na.omit() |> 
  slice(1) |> 
  mutate(observation = observation / (mean(params$SLA) * 0.1)) |> 
  pull(observation)

state_init[2] <- obs |> 
  filter(variable == "wood",
         datetime %in% sim_dates) |> 
  na.omit() |> 
  slice(1) |> 
  pull(observation)

state_init[3] <- obs |> 
  filter(variable == "som") |> 
  na.omit() |> 
  slice(1) |> 
  pull(observation)

inputs <- get_historical_met(site = site, sim_dates, use_mean = TRUE)
inputs_ensemble <- assign_met_ensembles(inputs, ens_members)
```

Set up MCMC configuration

```{r}
#Set MCMC Configuration
num_iter <- 1500
log_likelihood_prior_current <- -10000000000
accept <- 0


#Initialize chain
num_pars <- 3
jump_params <- c(0.001, 0.0002, 1)
fit_params <- array(NA, dim = c(num_pars, num_iter))
fit_params[1, 1] <- params$alpha
fit_params[2, 1] <- params$Rbasal
fit_params[3, 1] <- params$litterfall_start
prior_mean <- c(0.029, 0.002, 200)
prior_sd <- c(0.005, 0.0005, 10)
```

Run MCMC

```{r}
for(iter in 2:num_iter){
  
  #Loop through parameter value
  
  for(j in 1:num_pars){
    
    proposed_pars <- fit_params[, iter - 1]
    proposed_pars[j] <- rnorm(1, mean = fit_params[j, iter - 1], sd = jump_params[j])
    
    log_prior <- dnorm(proposed_pars[1], mean = prior_mean[1], sd = prior_sd[1], log = TRUE) + 
      dnorm(proposed_pars[2], mean = prior_mean[2], sd = prior_sd[2], log = TRUE) +
      dnorm(proposed_pars[3], mean = prior_mean[3], sd = prior_sd[3], log = TRUE)
    
    params$alpha  <- proposed_pars[1]
    params$Rbasal  <- proposed_pars[2]
    params$litterfall_start <- proposed_pars[3]
    
    #Set initial conditions
    output <- array(NA, dim = c(length(sim_dates), ens_members, 12)) #12 is the number of outputs
    output[1, , 1] <- state_init[1]
    output[1, , 2] <- state_init[2]
    output[1, , 3] <- state_init[3]
    
    for(t in 2:length(sim_dates)){
      output[t, , ]  <- forest_model(t, 
                                     states = matrix(output[t-1 , , 1:3], nrow = ens_members) , 
                                     parms = params, 
                                     inputs = matrix(inputs_ensemble[t ,, ], nrow = ens_members))
    }
    
    output_df <- output_to_df(output, sim_dates, sim_name = "fitting")
    
    combined_output_obs <- combine_model_obs(output_df, 
                                             obs,
                                             variables = c("lai", "wood", "som", "nee"), 
                                             sds = c(0.1, 1, 20, 0.01))
    
    log_likelihood <- sum(dnorm(x =  combined_output_obs$observation, 
                                mean = combined_output_obs$prediction, 
                                sd = combined_output_obs$sds, log = TRUE))
    
    log_likelihood_prior_proposed <- log_prior + log_likelihood
    
    z <- exp(log_likelihood_prior_proposed - log_likelihood_prior_current)
    
    r <- runif(1, min = 0, max = 1)
    
    if(z >  r){
      fit_params[j, iter] <- proposed_pars[j]
      log_likelihood_prior_current <- log_likelihood_prior_proposed
      accept <- accept + 1
    }else{
      fit_params[j, iter] <- fit_params[j, iter - 1]
      log_likelihood_prior_current <- log_likelihood_prior_current #this calculation isn't necessary but is here to show you the logic
    }
  }
}
```

Examine acceptance rate (goal is 23-45%)

```{r}
accept / (num_iter * num_pars)
```

Process MCMC chain by removing the first 500 iterations and pivoting to a long format

```{r}
nburn <- 500
parameter_MCMC <- tibble(iter = nburn:num_iter,
            alpha = fit_params[1, nburn:num_iter],
            Rbasal = fit_params[2, nburn:num_iter],
            litterfall_start = fit_params[3, nburn:num_iter]) %>%
  pivot_longer(-iter, values_to = "value", names_to = "parameter")
```

@fig-chains

```{r}
#| warning: FALSE
#| fig-cap: The MCMC chains and posterior distributions for the calibrated parameters
#| label: fig-chains 

p1 <- ggplot(parameter_MCMC, aes(x = iter, y = value)) +
  geom_line() +
  facet_wrap(~parameter, scales = "free") +
  theme_bw()

p2 <- ggplot(parameter_MCMC, aes(x = value)) +
  geom_histogram() +
  facet_wrap(~parameter, scales = "free") +
  theme_bw()

p1 / p2
```

## Examining the influence of parameter optimization on model predictions

### Simulation with prior parameter distributions

```{r}
ens_members <- 100

inputs_ensemble <- assign_met_ensembles(inputs, ens_members)

#Set initial conditions
output <- array(NA, dim = c(length(sim_dates), ens_members, 12)) #12 is the number of outputs
output[1, , 1] <- state_init[1]
output[1, , 2] <- state_init[2]
output[1, , 3] <- state_init[3]

params <- list()
params$alpha <- rep(0.02, ens_members)
params$SLA <- rep(4.74, ens_members)
params$leaf_frac <- rep(0.315, ens_members)
params$Ra_frac <- rep(0.5, ens_members)
params$Rbasal <- rep(0.002, ens_members)
params$Q10 <- rep(2.1, ens_members)
params$litterfall_rate <- rep(1/(2.0*365), ens_members) #Two year leaf lifespan
params$litterfall_start <- rep(200, ens_members)
params$litterfall_length<- rep(70, ens_members)
params$mortality <- rep(0.00015, ens_members) #Wood lives about 18 years on average (all trees, branches, roots, course roots)
params$sigma.leaf <- rep(0.0, ens_members) #0.01 
params$sigma.wood <- rep(0.0, ens_members) #0.01 ## wood biomass
params$sigma.soil <- rep(0.0, ens_members)# 0.01
params <- as.data.frame(params)

#Replace parameters with prior distribution
params$alpha <- rnorm(ens_members, mean = prior_mean[1], sd = prior_sd[1])
params$Rbasal <- rnorm(ens_members, mean = prior_mean[2], sd = prior_sd[2])
params$litterfall_start <- rnorm(ens_members, mean = prior_mean[3], sd = prior_sd[3])

for(t in 2:length(sim_dates)){
  
  output[t, , ]  <- forest_model(t, 
                                 states = matrix(output[t-1 , , 1:3], nrow = ens_members) , 
                                 parms = params, 
                                 inputs = matrix(inputs_ensemble[t ,, ], nrow = ens_members))
}

output_df_no_optim <- output_to_df(output, sim_dates, sim_name = "using prior")
```

### Simulation with posterior parameter distributions

```{r}

#Set initial conditions
output <- array(NA, dim = c(length(sim_dates), ens_members, 12)) #12 is the number of outputs
output[1, , 1] <- state_init[1]
output[1, , 2] <- state_init[2]
output[1, , 3] <- state_init[3]

# Sample from posterior distributions.  Use the same index for each
index <- sample(nburn:num_iter, ens_members, replace = TRUE)

params$alpha <- fit_params[1, index]
params$Rbasal <- fit_params[2, index]
params$litterfall_start <- fit_params[3, index]

for(t in 2:length(sim_dates)){
  
  output[t, , ]  <- forest_model(t, 
                                 states = matrix(output[t-1 , , 1:3], nrow = ens_members) , 
                                 parms = params, 
                                 inputs = matrix(inputs_ensemble[t ,, ], nrow = ens_members))
}

output_df_optim <- output_to_df(output, sim_dates, sim_name = "using posteriors")

```

### Visualize the influence of optimization

@fig-sim-prior-post shows a simulation that uses the prior distribution and a simulation that uses the posterior distribution. This highlights the constraint provided by the data on the model predictions.

```{r}
#| warning: FALSE
#| fig-cap: Predictions using the priors and posterior distribution of parameters
#| label: fig-sim-prior-post

obs_filtered <- obs |> 
  filter(datetime > min(output_df_no_optim$datetime))


bind_rows(output_df_no_optim, output_df_optim) |> 
   summarise(median = median(prediction, na.rm = TRUE), 
            upper90 = quantile(prediction, 0.95, na.rm = TRUE),
            lower90 = quantile(prediction, 0.05, na.rm = TRUE),
            .by = c("datetime", "variable", "model_id")) |> 
  filter(variable %in% c("lai", "wood", "som", "nee")) |> 
  ggplot(aes(x = datetime)) +
  geom_ribbon(aes(ymin = lower90, ymax = upper90, fill = model_id), alpha = 0.7) +
  geom_line(aes(y = median, color = model_id)) +
  geom_point(data = obs_filtered, aes(x = datetime, y = observation), color = "blue", alpha = 0.5) +
  facet_wrap(~variable, scale = "free", ncol = 1) +
  theme_bw()
```

### Save posteriors for future use

The calibrated parameter chains will be used in later chapters.

```{r}
  write_csv(parameter_MCMC, "data/saved_parameter_chain.csv")
```

###