diff --git a/DESCRIPTION b/DESCRIPTION index 3617185a..c302a4ea 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: ibis.iSDM Type: Package Title: Modelling framework for integrated biodiversity distribution scenarios -Version: 0.0.8 +Version: 0.0.9 Authors@R: c(person(given = "Martin", family = "Jung", diff --git a/NEWS.md b/NEWS.md index e35044b1..6ddc9f4b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,6 @@ # ibis.iSDM 0.0.9 (current dev branch) #### New features +* Added new vignette on available functions for data preparation #67 #### Minor improvements and bug fixes * Small fix to `threshold()` now returning threshold values correctly. diff --git a/R/pseudoabsence.R b/R/pseudoabsence.R index 55c0463d..b45a4287 100644 --- a/R/pseudoabsence.R +++ b/R/pseudoabsence.R @@ -257,7 +257,7 @@ add_pseudoabsence <- function(df, field_occurrence = "observed", template = NULL bias <- terra::resample(bias, background, method = "bilinear") } # Normalize if not already set - if(terra::global(bias, 'max', na.rm = TRUE) > 1 || terra::global(bias, 'min', na.rm = TRUE) < 0 ){ + if(terra::global(bias, 'max', na.rm = TRUE)[,1] > 1 || terra::global(bias, 'min', na.rm = TRUE)[,1] < 0 ){ bias <- predictor_transform(bias, option = "norm") } } else { bias <- NULL } @@ -276,11 +276,11 @@ add_pseudoabsence <- function(df, field_occurrence = "observed", template = NULL # Now sample from all cells not occupied if(!is.null(bias)){ # Get probability values for cells where no sampling has been conducted - prob_bias <- bias[which(bg1[]==0)] + prob_bias <- bias[which(terra::values(bg1)[,1]==0)][,1] if(any(is.na(prob_bias))) prob_bias[is.na(prob_bias)] <- 0 - abs <- sample(which(bg1[]==0), size = nrpoints, replace = TRUE, prob = prob_bias) + abs <- sample(which(terra::values(bg1)[,1]==0), size = nrpoints, replace = TRUE, prob = prob_bias) } else { - abs <- sample(which(bg1[]==0), size = nrpoints, replace = TRUE) + abs <- sample(which(terra::values(bg1)[,1]==0), size = nrpoints, replace = TRUE) } } else if(method == "buffer"){ assertthat::assert_that(is.numeric(buffer_distance),msg = "Buffer distance parameter not numeric!") @@ -301,7 +301,7 @@ add_pseudoabsence <- function(df, field_occurrence = "observed", template = NULL # Now sample from all cells not occupied if(!is.null(bias)){ # Get probability values for cells where no sampling has been conducted - prob_bias <- bias[which(bg2[]==1)] + prob_bias <- bias[which(bg2[]==1)][,1] if(any(is.na(prob_bias))) prob_bias[is.na(prob_bias)] <- 0 abs <- sample(which(bg2[]==1), size = nrpoints, replace = TRUE, prob = prob_bias) } else { @@ -317,7 +317,7 @@ add_pseudoabsence <- function(df, field_occurrence = "observed", template = NULL bg2 <- terra::mask(bg1, mask = pol, inverse = !inside) if(!is.null(bias)){ # Get probability values for cells where no sampling has been conducted - prob_bias <- bias[which(bg2[]==0)] + prob_bias <- bias[which(bg2[]==0)][,1] if(any(is.na(prob_bias))) prob_bias[is.na(prob_bias)] <- 0 abs <- sample(which(bg2[]==0), size = nrpoints, replace = TRUE, prob = prob_bias) } else { @@ -340,11 +340,11 @@ add_pseudoabsence <- function(df, field_occurrence = "observed", template = NULL bg2 <- terra::mask(bg1, mask = layer, inverse = !inside) if(!is.null(bias)){ # Get probability values for cells where no sampling has been conducted - prob_bias <- bias[which(bg2[]==0)] + prob_bias <- bias[which(terra::values(bg2)[,1]==0)][,1] if(any(is.na(prob_bias))) prob_bias[is.na(prob_bias)] <- 0 - abs <- sample(which(bg2[]==0), size = nrpoints, replace = TRUE, prob = prob_bias) + abs <- sample(which(terra::values(bg2)[,1]==0), size = nrpoints, replace = TRUE, prob = prob_bias) } else { - abs <- sample(which(bg2[]==0), size = nrpoints, replace = TRUE) + abs <- sample(which(terra::values(bg2)[,1]==0), size = nrpoints, replace = TRUE) } rm(bg2) } else if(method == "zones"){ @@ -379,11 +379,11 @@ add_pseudoabsence <- function(df, field_occurrence = "observed", template = NULL bg2 <- terra::mask(bg1, mask = zones) if(!is.null(bias)){ # Get probability values for cells where no sampling has been conducted - prob_bias <- bias[which(bg2[]==0)] + prob_bias <- bias[which(terra::values(bg2)[,1]==0)][,1] if(any(is.na(prob_bias))) prob_bias[is.na(prob_bias)] <- 0 - abs <- sample(which(bg2[]==0), size = nrpoints, replace = TRUE, prob = prob_bias) + abs <- sample(which(terra::values(bg2)[,1]==0), size = nrpoints, replace = TRUE, prob = prob_bias) } else { - abs <- sample(which(bg2[]==0), size = nrpoints, replace = TRUE) + abs <- sample(which(terra::values(bg2)[,1]==0), size = nrpoints, replace = TRUE) } rm(bg2) } else if(method == "target"){ @@ -398,11 +398,11 @@ add_pseudoabsence <- function(df, field_occurrence = "observed", template = NULL bg2 <- terra::mask(bg1, mask = layer) if(!is.null(bias)){ # Get probability values for cells where no sampling has been conducted - prob_bias <- bias[which(bg2[]==0)] + prob_bias <- bias[which(terra::values(bg2)[,1]==0)][,1] if(any(is.na(prob_bias))) prob_bias[is.na(prob_bias)] <- 0 - abs <- sample(which(bg2[]==0), size = nrpoints, replace = TRUE, prob = prob_bias) + abs <- sample(which(terra::values(bg2)[,1]==0), size = nrpoints, replace = TRUE, prob = prob_bias) } else { - abs <- sample(which(bg2[]==0), size = nrpoints, replace = TRUE) + abs <- sample(which(terra::values(bg2)[,1]==0), size = nrpoints, replace = TRUE) } rm(bg2) } else { diff --git a/R/utils-spatial.R b/R/utils-spatial.R index ea48833e..93c97fc9 100644 --- a/R/utils-spatial.R +++ b/R/utils-spatial.R @@ -1278,7 +1278,9 @@ thin_observations <- function(df, background, env = NULL, method = "random", min ) check_package("dplyr") # Match method - method <- match.arg(method, choices = c("random", "spatial", "bias", "environmental", "zones"), several.ok = FALSE) + method <- match.arg(method, + choices = c("random", "spatial", "bias", "environmental", "zones"), + several.ok = FALSE) # Label background with id bg <- background @@ -1300,7 +1302,8 @@ thin_observations <- function(df, background, env = NULL, method = "random", min ras <- terra::rasterize(coords, bg) # Get the number of observations per grid cell # Bounds for thining - totake <- c(lower = minpoints, upper = max( terra::global(ras, "min", na.rm = TRUE)[,1], minpoints)) + totake <- c(lower = minpoints, + upper = max( terra::global(ras, "min", na.rm = TRUE)[,1], minpoints)) # -- # if(method == "random"){ @@ -1374,7 +1377,8 @@ thin_observations <- function(df, background, env = NULL, method = "random", min is.factor(zones)) if(!terra::compareGeom(bg, zones, stopOnError = FALSE)){ - zones <- alignRasters(zones, bg, method = "near", func = terra::modal, cl = FALSE) + zones <- alignRasters(zones, bg, method = "near", + func = terra::modal, cl = FALSE) } # Output vector @@ -1398,7 +1402,8 @@ thin_observations <- function(df, background, env = NULL, method = "random", min # Environmental clustering if(!terra::compareGeom(bg, env, stopOnError = FALSE)){ - env <- alignRasters(env, bg, method = "near", func = terra::modal, cl = FALSE) + env <- alignRasters(env, bg, method = "near", + func = terra::modal, cl = FALSE) } # If there are any factors, explode if(any(is.factor(env))){ diff --git a/inst/CITATION b/inst/CITATION index 45139f9d..39463526 100644 --- a/inst/CITATION +++ b/inst/CITATION @@ -21,5 +21,5 @@ citEntry( as.person("Maximilian H.K. Hesselbarth") ), year = "2023", - version = "0.0.5", + version = "0.0.9", textVersion = "Jung, M., Hesselbarth, H.K.M. (2023). An integrated species distribution modelling framework for heterogeneous biodiversity data. R package version 0.0.5") diff --git a/vignettes/articles/01_data_preparationhelpers.Rmd b/vignettes/articles/01_data_preparationhelpers.Rmd index 153af1d6..9e6eaadf 100644 --- a/vignettes/articles/01_data_preparationhelpers.Rmd +++ b/vignettes/articles/01_data_preparationhelpers.Rmd @@ -27,4 +27,326 @@ is_check <- ("CheckExEnv" %in% search()) || any(c("_R_CHECK_TIMINGS_", knitr::opts_chunk$set(fig.align = "center", eval = !is_check) ``` - +The first step with any species distribution modelling (SDM) is to collect and prepare +the necessary input data and parameters for the modelling. Predictors or +(environomental) covariates need to be acquired, in many cases formatted, +for example through masking and reprojections, and standardized. Similarly, +Biodiversity data are usually collected through independent field survey or from +available databases such as GBIF. In particular for the `ibis.iSDM` package it +can furthermore be useful to collect any other information available for a +biodiversity feature (e.g. species, habitat, ...) in question. Such as for example +broad delineation of an expert range or parameters related to the habitat preference +of a species. + +Preparing input data for the `ibis.iSDM` package requires a modest understanding +of modern geospatial R-packages, such as `sf`, `terra` and `stars`. If such +knowledge is not existing, it is advised to consult a search engine. Particularly +highlighted is the free [Geocomputation with R](https://r.geocompx.org/) book. +Many unexpected errors or patterns when using the package can usually be tracked +down to data preparation. + +```{r Load the package, warning = FALSE, message=FALSE, eval=TRUE} +# Load the package +library(ibis.iSDM) +library(terra) +library(sf) + +# Don't print out as many messages +options("ibis.setupmessages" = FALSE) +``` + +--- + +## Preparing and altering biodiversity data + +SDM approaches require observation biodiversity data, typically in the form of +presence-only or presence-absence data, which can be available in a range of +different formats from points to polygons. + +There are a range of [existing tools](https://linkinghub.elsevier.com/retrieve/pii/S0304380022003404) +that assist modellers in preparing and cleaning input data (for instance of +biases). This vignette does intend to give an overview of the options. Rather it +highlights the few functions that have been created specifically for the +`ibis.iSDM` package and that might help in some situations. + +### Adding pseudo-absence points to presence-only data + +Although in the philosophy of the `ibis.iSDM` package it is advisable to use +presence-only models in a Poisson point process modelling framework ('poipo' modelling +functions that use background points (see [Warton and Sheperd 2010](http://projecteuclid.org/euclid.aoas/1287409378)). +Yet, a good case can also be made to instead add pseudo-absence points to existing +presence-only data. This allows the use of logistic regressions and 'poipa' methods +in `ibis.iSDM` which are generally easier to interpret (response scale from 0 to 1) and +also faster to fit as model. + + +```{r Loading biodiversity testing data, message=FALSE} +## Lets load some testing data from the package + +# Background layer +background <- terra::rast(system.file("extdata/europegrid_50km.tif",package = "ibis.iSDM", mustWork = TRUE)) +# Load virtual species points +virtual_species <- sf::st_read(system.file("extdata/input_data.gpkg",package = "ibis.iSDM", mustWork = TRUE), "points",quiet = TRUE) +# Add a range +virtual_range <- sf::st_read(system.file('extdata/input_data.gpkg', package='ibis.iSDM'), 'range', quiet = TRUE) + +``` + +Adding pseudo-absence data in the `ibis.iSDM package` works by first specifiying +a *Pseudoabsence options* object that contains parameters how many and where +pseudo-absences are to be sampled. The respective function for this is called +`pseudoabs_settings()`. Further details on the available options here (there are many) can be found in +the help file. +By default the packages uses a random sampling of absence points and the +settings for this can be queried here `ibis_options()$ibis.pseudoabsence`. + +After such options have been defined, pseudoa-absence data can be added to any +point dataset via `add_pseudoabsence()`. + +Example: + +```{r Define and add pseudo-absence data, message=FALSE} + +# Define new settings for sampling points outside the minimum convex polygon of +# the known presence data +abs <- pseudoabs_settings(background = background, + nrpoints = 1000, # Sample 1000 points + method = "mcp", # Option for minimum convex polygon + inside = FALSE # Sample exclusively outside + ) +print( abs ) # See object, check abs$data for the options + +# Now add to the point data +point1 <- add_pseudoabsence(virtual_species, + # Point to the column with the presence information + field_occurrence = 'Observed', + settings = abs) + +plot(point1['Observed']) + +# --- # +# Another option sampling inside the range, but biased by a bias layer +bias <- terra::rast(system.file("extdata/predictors/hmi_mean_50km.tif", + package = "ibis.iSDM", mustWork = TRUE)) + +abs <- pseudoabs_settings(background = background, + nrpoints = 100, # Sample 100 points + method = "range", # Define range as method + inside = TRUE, # Sample exclusively inside + layer = virtual_range, # Define the range + bias = bias # Set a bias layer + ) + +# Add again to the point data +point2 <- add_pseudoabsence(virtual_species, + # Point to the column with the presence information + field_occurrence = 'Observed', + settings = abs) + +plot(point2['Observed']) + +``` + +### Thinning observations + +Many presence-only records are often spatially highly biased with varying +observational processes resulting in quite clustered point observations. For example, +urban areas and natural sites near them are considerably more often frequented by +citizens observed wildlife than sites in remote areas. +Particular for Poisson process models that can be problematic as such models critically +assume - without accounting for it - that the observational process is homogeneous +in space. + +Thinning observations is a method to remove point observations from areas that have +been "oversampled". Critically however it does only remove points from grid cells +of a provided background where this is the case and never removes the entire grid +cell fully. It can also be beneficial for model convergence and modelling speed, +as particular for well-sampled species (e.g. the common blackbird *Turdus merula*) +there are diminishing returns of fitting a SDM with like 1 million presence-only +points instead of just 20000 well separated ones. +The `ibis.iSDM` package has its own implementation for spatial thinning, but +one can also refer to [Aiello-Lammens et al.](https://doi.org/10.1111/ecog.01132) +for an alternative implementation and rationale for thinning. + +**Thinning needs to be conducted with care as it effectively discards data!** + +```{r Thinning, message=TRUE} +## We use the data loaded in above +plot(virtual_species['Observed'], main = "Original data") + +# Random thinning. Note the messages of number of thinned points +point1 <- thin_observations(df = virtual_species, + background = background, + method = 'random', + minpoints = 1 # Retain at minimum one point per grid cell! + ) + +plot(point1['Observed'], main = "Random thinning") + +# Another way: Use environmental thinning to retain enough points +# across the niche defined by a set of covariates +covariates <- terra::rast(list.files(system.file("extdata/predictors/", package = "ibis.iSDM", mustWork = TRUE), "*.tif",full.names = TRUE)) + +point2 <- thin_observations(df = virtual_species, + background = background, + env = covariates, + method = 'environmental', + minpoints = 5 # Retain at minimum five points! + ) + +plot(point2['Observed'], main = "Environmentally stratified data") + +``` + + +--- + +## Preparing and altering predictor data + +In order to be used for species distribution modelling all predictors need to +be provided in a common extent, grain size and geographic projections. They need +to align with the provided background extent to `distribution()` and should ideally +not contain no missing data. If there are missing data, the package will check and +remove during model fitting points that fall into any grid cells with missing data. + +The `ibis.iSDM` package has a number of convenience functions to modify input +predictors. These functions rather provide nuance(s) and variation to the modelling +process, rather than preparing the input data (which needs to be undertaken using +the `terra` package). + +```{r Load Predictors, message=FALSE} +# Load some test covariates +predictors <- terra::rast(list.files(system.file("extdata/predictors/", package = "ibis.iSDM", mustWork = TRUE), "*.tif",full.names = TRUE)) + +``` + + +### Transforming predictors + +For better model convergence it usually makes sense to bring all predictors to a +common unit, for example by noramlizing or scaling them. The `ibis.iSDM` package +has a convenience function that can be applied to any `terra` 'SpatRaster' object. + +**NOTE: This functionality are also available directly in `add_predictors()` as parameter!** + +```{r Transform predictors} +# Let's take a simple layer for an example +layer <- predictors$bio19_mean_50km + +# Transform it in various way +new1 <- predictor_transform(layer, option = "norm") +new2 <- predictor_transform(layer, option = "scale") + +new <- c(layer, new1, new2) +names(new) <- c("original", "normalized", "scaled") +terra::plot( new ) + +# Another common use case is to windsorize a layer, for example by removing +# top outliers form a prediction. +# Here the values are capped to a defined percentile +new3 <- predictor_transform(layer, option = "windsor", + # Clamp the upper values to the 90% percentile + windsor_props = c(0,.9)) + +new <- c(layer, new3) +names(new) <- c("original", "windsorized") +terra::plot( new ) + +``` + +Other options for transformation are also available and are listed in the methods +file. + +### Derivates of predictors + +A simple linear SDM (e.g. `engine_glmnet()`) includes the predictors as such and +thus assumes that any increase in the response variable follows a linear relationship +with the covariate. However, reality is not always that simple and usually it can +be assumed that many relationships are highly non-linear or otherwise complex. + +A standard way to introduce non-linearities to a linear algorithm is to create +derivates of predictors, such as for example a quadratic transformation of temperature. +The `ibis.iSDM` package has a convenience function that can be applied to any +`terra` 'SpatRaster' object to create such additional derivates for a model. +Note that this creates (in some cases substantial) additional predictors. + +**NOTE: This functionality are also available directly in `add_predictors()` as parameter!** + +```{r Creating derivates of predictors} +# Let's take a simple layer for an example +layer <- predictors$ndvi_mean_50km + +# Make a quadratic transformation +new1 <- predictor_derivate(layer, option = "quadratic") + +new <- c(layer, new1) +names(new) <- c("original", "quadratic") +terra::plot( new ) + +# Create some hinge transformations +new2 <- predictor_derivate(layer, option = "hinge", + # The number is controlled by the number of knots + nknots = 4 + ) +terra::plot( new2 ) + +# What does this do precisely? +# Lets check +df <- data.frame( ndvi = terra::values(layer), + terra::values(new2)) + +plot(df$ndvi_mean_50km, df[,2], ylab = "First hinge of ndvi", xlab = "NDVI") +plot(df$ndvi_mean_50km, df[,3], ylab = "Second hinge of ndvi",xlab = "NDVI") +plot(df$ndvi_mean_50km, df[,4], ylab = "Third hinge of ndvi", xlab = "NDVI") +plot(df$ndvi_mean_50km, df[,5], ylab = "Fourth hinge of ndvi",xlab = "NDVI") + +``` + +More fine-tuned control can also be achieved by creating specific interactions +among variables, for example if one expects climate to interact with forest cover. + +```{r Derivate interaction} +# Create interacting variables + +new <- predictor_derivate(predictors,option = "interaction", + int_variables = c("bio01_mean_50km", "CLC3_312_mean_50km")) + +plot(new, main = "Interaction variable") +``` + +### Homogenize missing data among predictors + +As mentioned above, during model training covariates will be extracted for each +biodiversity observational record. Missing data will in this case be discarded. +For example if 10 predictors are considered and a single one has a missing value +in one grid cell, the grid cell is considered missing among all other predictors +as well. +The `ibis.iSDM` package has some convenience functions to easily harmonize and check +the extent of missing data in a set of predictors which can be convenient for +assessing errors during data preparation. + +```{r} +# Make a subset of all predictors to show the concept +layers <- subset(predictors, c("aspect_mean_50km", + "CLC3_312_mean_50km", + "elevation_mean_50km")) + +# All these layers have identical data coverage. +# Now add missing data in one of the layers for testing +layers$CLC3_312_mean_50km[sample(1:ncell(layers), 1000)] <- NA + +# Harmonize the predictors +new <- predictor_homogenize_na(env = layers) + +# Now all the predictors have identical coverage of NA values +terra::plot(new) +# Or assess like this +plot(!terra::noNA(new$aspect_mean_50km), main = "Missing observations") + +``` + +--- + +## Preparing and altering future scenario data + +< Upcoming :) >