From b0bbedb202782f0341adf8d3d6f75e5840c136ab Mon Sep 17 00:00:00 2001 From: Martin Jung <3788377+Martin-Jung@users.noreply.github.com> Date: Thu, 1 Jun 2023 11:47:55 +0200 Subject: [PATCH] Merging current development branch into main (#46) * Starting to work on warnings, fixing dependencies and global vars (still some stuff to deal with) * Removal of magrittr #41 and checks for deps. in actions * Removal of cmdstanr as not on CRAN... * Update CITATION.cff * Fix check (#42) * Install extra-packages in setup-r-deps step * Fix cmethod consistency * Linewidth examples (usage still an issue but not sure how to break) * Starting to fix some deps warnings (still some left) * Forgot cmdstanr not on CRAN * Fix check (#43) * Install extra-packages in setup-r-deps step * Fix cmethod consistency * Linewidth examples (usage still an issue but not sure how to break) * Starting to fix some deps warnings (still some left) * Typo in gh-actions * Trying to fix call to require() in write_output. It's not beautiful, but working... I think... * Remove calls to require() * Remove no longer needed importFrom entries and collect them in ibis.iSDM-package.R * Some smaller cosmetic changes * No need to specify the namespace for functions from this package * Remove purrr * Small fixes to docs, split hyperparameter search and feature selection in `train`. Added Boruta * Small bug fix introduced with last commit [no-ci] * Small bug fix introduced with last commit [no-ci] * Update of citation and prep for release version 0.0.3 * Url fix * url fix attempt nr 3 * url fix attempt nr 4 --------- Co-authored-by: mhesselbarth Co-authored-by: Martin-Jung --- .Rbuildignore | 63 +- .github/workflows/R-CMD-check.yaml | 14 +- CITATION.cff | 608 ++-- DESCRIPTION | 281 +- NAMESPACE | 10 +- NEWS.md | 7 + R/add_biodiversity.R | 27 +- R/add_constraint.R | 1342 ++++----- R/add_constraint_MigClim.R | 6 +- R/add_control_bias.R | 4 +- R/add_latent.R | 2 +- R/add_log.R | 4 +- R/add_offset.R | 1262 ++++---- R/add_predictors.R | 2540 ++++++++--------- R/add_priors.R | 8 +- R/bdproto-biodiversitydataset.R | 604 ++-- R/bdproto-biodiversitydistribution.R | 1 - R/bdproto-biodiversityscenario.R | 1170 ++++---- R/bdproto-distributionmodel.R | 842 +++--- R/bdproto-predictors.R | 356 +-- R/effects.R | 6 +- R/engine_bart.R | 1230 ++++---- R/engine_breg.R | 1592 ++++++----- R/engine_gdb.R | 40 +- R/engine_glmnet.R | 1396 ++++----- R/engine_inla.R | 2046 ++++++------- R/engine_inlabru.R | 2500 ++++++++-------- R/engine_stan.R | 1680 +++++------ R/engine_xgboost.R | 1636 +++++------ R/ensemble.R | 1086 +++---- R/ibis.iSDM-package.R | 29 + R/misc.R | 370 ++- R/plot.R | 418 +-- R/project.R | 926 +++--- R/pseudoabsence.R | 18 +- R/similarity.R | 880 +++--- R/summary.R | 22 +- R/threshold.R | 752 ++--- R/train.R | 405 ++- R/utils-bart.R | 18 +- R/utils-breg.R | 388 +-- R/utils-gdb.R | 600 ++-- R/utils-glmnet.R | 328 +-- R/utils-inla.R | 90 +- R/utils-predictors.R | 1358 +++++---- R/utils-scenario.R | 1219 ++++---- R/utils-spatial.R | 2534 ++++++++-------- R/utils-stan.R | 1024 +++---- R/utils.R | 1591 +++++------ R/validate.R | 996 +++---- R/write_output.R | 1254 ++++---- README.Rmd | 8 +- README.md | 13 +- _pkgdown.yml | 2 +- man/add_biodiversity_poipa.Rd | 3 +- man/add_biodiversity_poipo.Rd | 2 +- man/add_biodiversity_polpa.Rd | 2 +- man/add_biodiversity_polpo.Rd | 2 +- man/add_constrain_MigClim.Rd | 6 +- man/add_control_bias.Rd | 4 +- man/add_latent_spatial.Rd | 2 +- man/add_log.Rd | 2 +- man/add_offset.Rd | 4 +- man/add_offset_bias.Rd | 4 +- man/add_predictors.Rd | 2 +-
man/add_predictors_globiom.Rd | 2 +- man/add_pseudoabsence.Rd | 7 +- man/coef.Rd | 6 +- man/effects.Rd | 6 +- man/engine_bart.Rd | 7 +- man/engine_breg.Rd | 9 + man/engine_gdb.Rd | 9 + man/engine_glmnet.Rd | 9 + man/engine_inla.Rd | 9 + man/engine_inlabru.Rd | 9 + man/engine_stan.Rd | 9 + man/engine_xgboost.Rd | 9 + man/find_correlated_predictors.Rd | 25 - man/get_priors.Rd | 8 +- man/ibis.iSDM-package.Rd | 28 + man/pipe.Rd | 35 - man/predictor_filter.Rd | 53 + ...edictors.Rd => predictors_filter_abess.Rd} | 24 +- man/predictors_filter_boruta.Rd | 49 + man/predictors_filter_collinearity.Rd | 36 + man/pseudoabs_settings.Rd | 9 +- man/rm_predictors.Rd | 4 +- man/sel_predictors.Rd | 4 +- man/similarity.Rd | 7 +- man/summary.Rd | 20 +- man/train.Rd | 56 +- man/write_output.Rd | 8 +- man/write_summary.Rd | 8 +- .../testthat/test_BiodiversityDistribution.R | 40 +- tests/testthat/test_Scenarios.R | 17 +- tests/testthat/test_functions.R | 5 +- tests/testthat/test_modelFits.R | 154 +- tests/testthat/test_objectinheritance.R | 30 +- tests/testthat/test_priors.R | 21 +- tests/testthat/test_rangesOffsets.R | 8 +- tests/testthat/test_trainINLA.R | 15 +- vignettes/articles/01_train_simple_model.Rmd | 16 +- vignettes/articles/02_integrate_data.Rmd | 68 +- .../articles/03_biodiversity_projections.Rmd | 24 +- .../06_frequently-asked-questions.Rmd | 49 +- vignettes/articles/contributing.Rmd | 5 +- 106 files changed, 19516 insertions(+), 19040 deletions(-) create mode 100644 R/ibis.iSDM-package.R delete mode 100644 man/find_correlated_predictors.Rd create mode 100644 man/ibis.iSDM-package.Rd delete mode 100644 man/pipe.Rd create mode 100644 man/predictor_filter.Rd rename man/{find_subset_of_predictors.Rd => predictors_filter_abess.Rd} (69%) create mode 100644 man/predictors_filter_boruta.Rd create mode 100644 man/predictors_filter_collinearity.Rd diff --git a/.Rbuildignore b/.Rbuildignore index abb08d19..fa2b885c 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,31 +1,32 @@ -^\.travis\.yml$ -^.*\.Rproj$ -^\.Rproj\.user$ -man-roxygen/* -\.github/* -vignettes/_output.yaml -src/stan_files/.*\.o$ -src/stan_files/.*\.cc$ -src/stan_files/.*\.hpp$ -src/stan_files/.*\.exe$ -tests/testthat/stan_files/.*\.o$ -tests/testthat/stan_files/.*\.cc$ -tests/testthat/stan_files/.*\.hpp$ -^\cleanup* -configure.ac -^inst/doc/[A-Za-z0-9\-_]+[.]svg$ -^inst/doc/[A-Za-z0-9\-_]+[.]txt$ -^inst/doc/[A-Za-z0-9\-_]+[.]csv -^inst/doc/\.install_extras$ -^_pkgdown\.yml$ -^docs$ -^\.github$ -^pkgdown$ -LICENSE.md -README.Rmd -^CITATION\.cff$ -^vignettes/articles$ -CONTRIBUTING.md -^cran-comments\.md$ -^revdep$ -^codecov\.yml$ +^\.travis\.yml$ +^.*\.Rproj$ +^\.Rproj\.user$ +man-roxygen/* +\.github/* +vignettes/_output.yaml +src/stan_files/.*\.o$ +src/stan_files/.*\.cc$ +src/stan_files/.*\.hpp$ +src/stan_files/.*\.exe$ +tests/testthat/stan_files/.*\.o$ +tests/testthat/stan_files/.*\.cc$ +tests/testthat/stan_files/.*\.hpp$ +^\cleanup* +configure.ac +^inst/doc/[A-Za-z0-9\-_]+[.]svg$ +^inst/doc/[A-Za-z0-9\-_]+[.]txt$ +^inst/doc/[A-Za-z0-9\-_]+[.]csv +^inst/doc/\.install_extras$ +^_pkgdown\.yml$ +^docs$ +^\.github$ +^pkgdown$ +LICENSE.md +README.Rmd +^CITATION\.cff$ +^vignettes/articles$ +CONTRIBUTING.md +^cran-comments\.md$ +^revdep$ +^codecov\.yml$ +.covrignore diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 53c2b342..3349b73c 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -40,6 +40,7 @@ jobs: r-version: ${{ matrix.config.r }} http-user-agent: ${{ 
matrix.config.http-user-agent }} use-public-rspm: true + extra-repositories: "https://inla.r-inla-download.org/R/testing" - name: Install gdal and igraph dependencies on Linux (gdal) run: | @@ -60,7 +61,18 @@ jobs: - uses: r-lib/actions/setup-r-dependencies@v2 with: dependencies: '"all"' - extra-packages: any::rcmdcheck + extra-packages: | + any::rcmdcheck, + any::glmnet, + any::glmnetUtils, + any::pdp, + stan-dev/cmdstanr, + any::igraph, + any::xgboost, + any::dbarts, + any::mboost, + any::inlabru, + any::BoomSpikeSlab needs: check - uses: r-lib/actions/check-r-package@v2 diff --git a/CITATION.cff b/CITATION.cff index 65c373b7..89c23af0 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -8,7 +8,7 @@ message: 'To cite package "ibis.iSDM" in publications use:' type: software license: CC-BY-4.0 title: 'ibis.iSDM: Modelling Framework for Integrated Biodiversity Distribution Scenarios' -version: 0.0.2 +version: 0.0.3 abstract: Integrated framework of modelling the distribution of species and ecosystems in a suitability framing. This package allows the estimation of integrated species distribution models (iSDM) based on several sources of evidence and provided presence-only @@ -63,6 +63,66 @@ references: email: hadley@rstudio.com year: '2023' version: '>= 0.2.0' +- type: software + title: doFuture + abstract: 'doFuture: A Universal Foreach Parallel Adapter using the Future API of + the ''future'' Package' + notes: Imports + url: https://doFuture.futureverse.org + repository: https://CRAN.R-project.org/package=doFuture + authors: + - family-names: Bengtsson + given-names: Henrik + email: henrikb@braju.com + year: '2023' + version: '>= 0.12.2' +- type: software + title: foreach + abstract: 'foreach: Provides Foreach Looping Construct' + notes: Imports + url: https://github.com/RevolutionAnalytics/foreach + repository: https://CRAN.R-project.org/package=foreach + authors: + - name: Microsoft + - family-names: Weston + given-names: Steve + year: '2023' +- type: software + title: future + abstract: 'future: Unified Parallel and Distributed Processing in R for Everyone' + notes: Imports + url: https://future.futureverse.org + repository: https://CRAN.R-project.org/package=future + authors: + - family-names: Bengtsson + given-names: Henrik + email: henrikb@braju.com + year: '2023' + version: '>= 1.23.0' +- type: software + title: dplyr + abstract: 'dplyr: A Grammar of Data Manipulation' + notes: Imports + url: https://dplyr.tidyverse.org + repository: https://CRAN.R-project.org/package=dplyr + authors: + - family-names: Wickham + given-names: Hadley + email: hadley@posit.co + orcid: https://orcid.org/0000-0003-4757-117X + - family-names: François + given-names: Romain + orcid: https://orcid.org/0000-0002-2444-4226 + - family-names: Henry + given-names: Lionel + - family-names: Müller + given-names: Kirill + orcid: https://orcid.org/0000-0002-1416-3412 + - family-names: Vaughan + given-names: Davis + email: davis@posit.co + orcid: https://orcid.org/0000-0003-4777-038X + year: '2023' - type: software title: geodist abstract: 'geodist: Fast, Dependency-Free Geodesic Distance Calculations' @@ -112,19 +172,16 @@ references: orcid: https://orcid.org/0000-0002-9415-4582 year: '2023' - type: software - title: magrittr - abstract: 'magrittr: A Forward-Pipe Operator for R' + title: graphics + abstract: 'R: A Language and Environment for Statistical Computing' notes: Imports - url: https://magrittr.tidyverse.org - repository: https://CRAN.R-project.org/package=magrittr authors: - - family-names: Bache - 
given-names: Stefan Milton - email: stefan@stefanbache.dk - - family-names: Wickham - given-names: Hadley - email: hadley@rstudio.com + - name: R Core Team + location: + name: Vienna, Austria year: '2023' + institution: + name: R Foundation for Statistical Computing - type: software title: methods abstract: 'R: A Language and Environment for Statistical Computing' @@ -160,41 +217,25 @@ references: institution: name: R Foundation for Statistical Computing - type: software - title: foreach - abstract: 'foreach: Provides Foreach Looping Construct' - notes: Imports - url: https://github.com/RevolutionAnalytics/foreach - repository: https://CRAN.R-project.org/package=foreach - authors: - - name: Microsoft - - family-names: Weston - given-names: Steve - year: '2023' -- type: software - title: future - abstract: 'future: Unified Parallel and Distributed Processing in R for Everyone' - notes: Imports - url: https://future.futureverse.org - repository: https://CRAN.R-project.org/package=future - authors: - - family-names: Bengtsson - given-names: Henrik - email: henrikb@braju.com - year: '2023' - version: '>= 1.23.0' -- type: software - title: doFuture - abstract: 'doFuture: A Universal Foreach Parallel Adapter using the Future API of - the ''future'' Package' + title: posterior + abstract: 'posterior: Tools for Working with Posterior Distributions' notes: Imports - url: https://doFuture.futureverse.org - repository: https://CRAN.R-project.org/package=doFuture + url: https://mc-stan.org/posterior/ + repository: https://CRAN.R-project.org/package=posterior authors: - - family-names: Bengtsson - given-names: Henrik - email: henrikb@braju.com + - family-names: Bürkner + given-names: Paul-Christian + email: paul.buerkner@gmail.com + - family-names: Gabry + given-names: Jonah + email: jsg2201@columbia.edu + - family-names: Kay + given-names: Matthew + email: mjskay@northwestern.edu + - family-names: Vehtari + given-names: Aki + email: Aki.Vehtari@aalto.fi year: '2023' - version: '>= 0.12.2' - type: software title: proto abstract: 'proto: Prototype Object-Based Programming' @@ -224,55 +265,6 @@ references: orcid: https://orcid.org/0000-0001-5872-2872 year: '2023' version: '>= 3.4-5' -- type: software - title: Rcpp - abstract: 'Rcpp: Seamless R and C++ Integration' - notes: Imports - url: https://www.rcpp.org - repository: https://CRAN.R-project.org/package=Rcpp - authors: - - family-names: Eddelbuettel - given-names: Dirk - - family-names: Francois - given-names: Romain - - family-names: Allaire - given-names: JJ - - family-names: Ushey - given-names: Kevin - - family-names: Kou - given-names: Qiang - - family-names: Russell - given-names: Nathan - - family-names: Ucar - given-names: Inaki - - family-names: Bates - given-names: Douglas - - family-names: Chambers - given-names: John - year: '2023' - version: '>= 0.12.0' -- type: software - title: RcppParallel - abstract: 'RcppParallel: Parallel Programming Tools for ''Rcpp''' - notes: Imports - url: https://rcppcore.github.io/RcppParallel/ - repository: https://CRAN.R-project.org/package=RcppParallel - authors: - - family-names: Allaire - given-names: JJ - email: jj@rstudio.com - - family-names: Francois - given-names: Romain - - family-names: Ushey - given-names: Kevin - email: kevin@rstudio.com - - family-names: Vandenbrouck - given-names: Gregory - - family-names: Geelnard - given-names: Marcus - - name: Intel - year: '2023' - version: '>= 5.0.1' - type: software title: rgeos abstract: 'rgeos: Interface to Geometry Engine - Open Source (''GEOS'')' @@ 
-288,23 +280,6 @@ references: given-names: Colin year: '2023' version: '>= 0.5-5' -- type: software - title: rgdal - abstract: 'rgdal: Bindings for the ''Geospatial'' Data Abstraction Library' - notes: Imports - url: https://gdal.org - repository: https://CRAN.R-project.org/package=rgdal - authors: - - family-names: Bivand - given-names: Roger - email: Roger.Bivand@nhh.no - orcid: https://orcid.org/0000-0003-2392-6140 - - family-names: Keitt - given-names: Tim - - family-names: Rowlingson - given-names: Barry - year: '2023' - version: '>= 1.5-32' - type: software title: sf abstract: 'sf: Simple Features for R' @@ -332,17 +307,16 @@ references: year: '2023' version: '>= 0.5' - type: software - title: testthat - abstract: 'testthat: Unit Testing for R' + title: stats + abstract: 'R: A Language and Environment for Statistical Computing' notes: Imports - url: https://testthat.r-lib.org - repository: https://CRAN.R-project.org/package=testthat authors: - - family-names: Wickham - given-names: Hadley - email: hadley@rstudio.com + - name: R Core Team + location: + name: Vienna, Austria year: '2023' - version: '>= 3.0.0' + institution: + name: R Foundation for Statistical Computing - type: software title: tibble abstract: 'tibble: Simple Data Frames' @@ -374,25 +348,16 @@ references: email: tytso@thunk.org year: '2023' - type: software - title: posterior - abstract: 'posterior: Tools for Working with Posterior Distributions' + title: utils + abstract: 'R: A Language and Environment for Statistical Computing' notes: Imports - url: https://mc-stan.org/posterior/ - repository: https://CRAN.R-project.org/package=posterior authors: - - family-names: Bürkner - given-names: Paul-Christian - email: paul.buerkner@gmail.com - - family-names: Gabry - given-names: Jonah - email: jsg2201@columbia.edu - - family-names: Kay - given-names: Matthew - email: mjskay@northwestern.edu - - family-names: Vehtari - given-names: Aki - email: Aki.Vehtari@aalto.fi + - name: R Core Team + location: + name: Vienna, Austria year: '2023' + institution: + name: R Foundation for Statistical Computing - type: software title: 'R: A Language and Environment for Statistical Computing' notes: Depends @@ -404,7 +369,109 @@ references: year: '2023' institution: name: R Foundation for Statistical Computing - version: '>= 4.0.0' + version: '>= 4.1.0' +- type: software + title: abind + abstract: 'abind: Combine Multidimensional Arrays' + notes: Suggests + repository: https://CRAN.R-project.org/package=abind + authors: + - family-names: Plate + given-names: Tony + email: tplate@acm.org + - family-names: Heiberger + given-names: Richard + year: '2023' +- type: software + title: BoomSpikeSlab + abstract: 'BoomSpikeSlab: MCMC for Spike and Slab Regression' + notes: Suggests + repository: https://CRAN.R-project.org/package=BoomSpikeSlab + authors: + - family-names: Scott + given-names: Steven L. 
+ email: steve.the.bayesian@gmail.com + year: '2023' + version: '>= 1.2.4' +- type: software + title: covr + abstract: 'covr: Test Coverage for Packages' + notes: Suggests + url: https://covr.r-lib.org + repository: https://CRAN.R-project.org/package=covr + authors: + - family-names: Hester + given-names: Jim + email: james.f.hester@gmail.com + year: '2023' +- type: software + title: inlabru + abstract: 'inlabru: Bayesian Latent Gaussian Modelling using INLA and Extensions' + notes: Suggests + url: http://www.inlabru.org + repository: https://CRAN.R-project.org/package=inlabru + authors: + - family-names: Lindgren + given-names: Finn + email: finn.lindgren@gmail.com + orcid: https://orcid.org/0000-0002-5833-2011 + - family-names: Bachl + given-names: Fabian E. + email: bachlfab@gmail.com + year: '2023' +- type: software + title: glmnet + abstract: 'glmnet: Lasso and Elastic-Net Regularized Generalized Linear Models' + notes: Suggests + url: https://glmnet.stanford.edu + repository: https://CRAN.R-project.org/package=glmnet + authors: + - family-names: Friedman + given-names: Jerome + - family-names: Hastie + given-names: Trevor + email: hastie@stanford.edu + - family-names: Tibshirani + given-names: Rob + - family-names: Narasimhan + given-names: Balasubramanian + - family-names: Tay + given-names: Kenneth + - family-names: Simon + given-names: Noah + - family-names: Yang + given-names: James + year: '2023' +- type: software + title: glmnetUtils + abstract: 'glmnetUtils: Utilities for ''Glmnet''' + notes: Suggests + url: https://github.com/hongooi73/glmnetUtils + repository: https://CRAN.R-project.org/package=glmnetUtils + authors: + - family-names: Ooi + given-names: Hong + email: hongooi73@gmail.com + year: '2023' +- type: software + title: dbarts + abstract: 'dbarts: Discrete Bayesian Additive Regression Trees Sampler' + notes: Suggests + url: https://github.com/vdorie/dbarts + repository: https://CRAN.R-project.org/package=dbarts + authors: + - family-names: Dorie + given-names: Vincent + email: vdorie@gmail.com + orcid: https://orcid.org/0000-0002-9576-3064 + - family-names: Chipman + given-names: Hugh + email: hugh.chipman@gmail.com + - family-names: McCulloch + given-names: Robert + email: robert.mcculloch1@gmail.com + year: '2023' + version: '>= 0.9-22' - type: software title: deldir abstract: 'deldir: Delaunay Triangulation and Dirichlet (Voronoi) Tessellation' @@ -414,6 +481,51 @@ references: - family-names: Turner given-names: Rolf year: '2023' +- type: software + title: doParallel + abstract: 'doParallel: Foreach Parallel Adaptor for the ''parallel'' Package' + notes: Suggests + url: https://github.com/RevolutionAnalytics/doparallel + repository: https://CRAN.R-project.org/package=doParallel + authors: + - family-names: Corporation + given-names: Microsoft + - family-names: Weston + given-names: Steve + year: '2023' +- type: software + title: ellipsis + abstract: 'ellipsis: Tools for Working with ...' + notes: Suggests + url: https://ellipsis.r-lib.org + repository: https://CRAN.R-project.org/package=ellipsis + authors: + - family-names: Wickham + given-names: Hadley + email: hadley@rstudio.com + year: '2023' +- type: software + title: igraph + abstract: 'igraph: Network Analysis and Visualization' + notes: Suggests + url: https://igraph.org + repository: https://CRAN.R-project.org/package=igraph + authors: + - family-names: file. 
+ given-names: See AUTHORS + year: '2023' +- type: software + title: knitr + abstract: 'knitr: A General-Purpose Package for Dynamic Report Generation in R' + notes: Suggests + url: https://yihui.org/knitr/ + repository: https://CRAN.R-project.org/package=knitr + authors: + - family-names: Xie + given-names: Yihui + email: xie@yihui.name + orcid: https://orcid.org/0000-0003-0645-5666 + year: '2023' - type: software title: mboost abstract: 'mboost: Model-Based Boosting' @@ -438,25 +550,6 @@ references: given-names: Benjamin orcid: https://orcid.org/0000-0003-2810-3186 year: '2023' -- type: software - title: dbarts - abstract: 'dbarts: Discrete Bayesian Additive Regression Trees Sampler' - notes: Suggests - url: https://github.com/vdorie/dbarts - repository: https://CRAN.R-project.org/package=dbarts - authors: - - family-names: Dorie - given-names: Vincent - email: vdorie@gmail.com - orcid: https://orcid.org/0000-0002-9576-3064 - - family-names: Chipman - given-names: Hugh - email: hugh.chipman@gmail.com - - family-names: McCulloch - given-names: Robert - email: robert.mcculloch1@gmail.com - year: '2023' - version: '>= 0.9-22' - type: software title: modEvA abstract: 'modEvA: Model Evaluation and Analysis' @@ -474,52 +567,15 @@ references: given-names: Real year: '2023' - type: software - title: xgboost - abstract: 'xgboost: Extreme Gradient Boosting' + title: ncmeta + abstract: 'ncmeta: Straightforward ''NetCDF'' Metadata' notes: Suggests - url: https://github.com/dmlc/xgboost - repository: https://CRAN.R-project.org/package=xgboost + url: https://github.com/hypertidy/ncmeta + repository: https://CRAN.R-project.org/package=ncmeta authors: - - family-names: Chen - given-names: Tianqi - email: tianqi.tchen@gmail.com - - family-names: He - given-names: Tong - email: hetong007@gmail.com - - family-names: Benesty + - family-names: Sumner given-names: Michael - email: michael@benesty.fr - - family-names: Khotilovich - given-names: Vadim - email: khotilovich@gmail.com - - family-names: Tang - given-names: Yuan - email: terrytangyuan@gmail.com - orcid: https://orcid.org/0000-0001-5243-233X - - family-names: Cho - given-names: Hyunsu - email: chohyu01@cs.washington.edu - - family-names: Chen - given-names: Kailong - - family-names: Mitchell - given-names: Rory - - family-names: Cano - given-names: Ignacio - - family-names: Zhou - given-names: Tianyi - - family-names: Li - given-names: Mu - - family-names: Xie - given-names: Junyuan - - family-names: Lin - given-names: Min - - family-names: Geng - given-names: Yifeng - - family-names: Li - given-names: Yutian - - family-names: Yuan - given-names: Jiaming - email: jm.yuan@outlook.com + email: mdsumner@gmail.com year: '2023' - type: software title: progress @@ -534,27 +590,44 @@ references: given-names: Rich year: '2023' - type: software - title: ellipsis - abstract: 'ellipsis: Tools for Working with ...' 
+ title: rmarkdown + abstract: 'rmarkdown: Dynamic Documents for R' notes: Suggests - url: https://ellipsis.r-lib.org - repository: https://CRAN.R-project.org/package=ellipsis + url: https://pkgs.rstudio.com/rmarkdown/ + repository: https://CRAN.R-project.org/package=rmarkdown authors: + - family-names: Allaire + given-names: JJ + email: jj@rstudio.com + - family-names: Xie + given-names: Yihui + email: xie@yihui.name + orcid: https://orcid.org/0000-0003-0645-5666 + - family-names: McPherson + given-names: Jonathan + email: jonathan@rstudio.com + - family-names: Luraschi + given-names: Javier + email: javier@rstudio.com + - family-names: Ushey + given-names: Kevin + email: kevin@rstudio.com + - family-names: Atkins + given-names: Aron + email: aron@rstudio.com - family-names: Wickham given-names: Hadley email: hadley@rstudio.com - year: '2023' -- type: software - title: doParallel - abstract: 'doParallel: Foreach Parallel Adaptor for the ''parallel'' Package' - notes: Suggests - url: https://github.com/RevolutionAnalytics/doparallel - repository: https://CRAN.R-project.org/package=doParallel - authors: - - family-names: Corporation - given-names: Microsoft - - family-names: Weston - given-names: Steve + - family-names: Cheng + given-names: Joe + email: joe@rstudio.com + - family-names: Chang + given-names: Winston + email: winston@rstudio.com + - family-names: Iannone + given-names: Richard + email: rich@rstudio.com + orcid: https://orcid.org/0000-0003-3925-190X year: '2023' - type: software title: rstan @@ -593,79 +666,66 @@ references: - family-names: Lysy given-names: Martin email: mlysy@uwaterloo.ca + - family-names: Johnson + given-names: Andrew year: '2023' version: '>= 2.1.1' - type: software - title: BoomSpikeSlab - abstract: 'BoomSpikeSlab: MCMC for Spike and Slab Regression' - notes: Suggests - repository: https://CRAN.R-project.org/package=BoomSpikeSlab - authors: - - family-names: Scott - given-names: Steven L. 
- email: steve.the.bayesian@gmail.com - year: '2023' - version: '>= 1.2.4' -- type: software - title: rmarkdown - abstract: 'rmarkdown: Dynamic Documents for R' + title: testthat + abstract: 'testthat: Unit Testing for R' notes: Suggests - url: https://pkgs.rstudio.com/rmarkdown/ - repository: https://CRAN.R-project.org/package=rmarkdown + url: https://testthat.r-lib.org + repository: https://CRAN.R-project.org/package=testthat authors: - - family-names: Allaire - given-names: JJ - email: jj@rstudio.com - - family-names: Xie - given-names: Yihui - email: xie@yihui.name - orcid: https://orcid.org/0000-0003-0645-5666 - - family-names: McPherson - given-names: Jonathan - email: jonathan@rstudio.com - - family-names: Luraschi - given-names: Javier - email: javier@rstudio.com - - family-names: Ushey - given-names: Kevin - email: kevin@rstudio.com - - family-names: Atkins - given-names: Aron - email: aron@rstudio.com - family-names: Wickham given-names: Hadley email: hadley@rstudio.com - - family-names: Cheng - given-names: Joe - email: joe@rstudio.com - - family-names: Chang - given-names: Winston - email: winston@rstudio.com - - family-names: Iannone - given-names: Richard - email: rich@rstudio.com - orcid: https://orcid.org/0000-0003-3925-190X year: '2023' - type: software - title: knitr - abstract: 'knitr: A General-Purpose Package for Dynamic Report Generation in R' + title: xgboost + abstract: 'xgboost: Extreme Gradient Boosting' notes: Suggests - url: https://yihui.org/knitr/ - repository: https://CRAN.R-project.org/package=knitr + url: https://github.com/dmlc/xgboost + repository: https://CRAN.R-project.org/package=xgboost authors: + - family-names: Chen + given-names: Tianqi + email: tianqi.tchen@gmail.com + - family-names: He + given-names: Tong + email: hetong007@gmail.com + - family-names: Benesty + given-names: Michael + email: michael@benesty.fr + - family-names: Khotilovich + given-names: Vadim + email: khotilovich@gmail.com + - family-names: Tang + given-names: Yuan + email: terrytangyuan@gmail.com + orcid: https://orcid.org/0000-0001-5243-233X + - family-names: Cho + given-names: Hyunsu + email: chohyu01@cs.washington.edu + - family-names: Chen + given-names: Kailong + - family-names: Mitchell + given-names: Rory + - family-names: Cano + given-names: Ignacio + - family-names: Zhou + given-names: Tianyi + - family-names: Li + given-names: Mu - family-names: Xie - given-names: Yihui - email: xie@yihui.name - orcid: https://orcid.org/0000-0003-0645-5666 - year: '2023' -- type: software - title: covr - abstract: 'covr: Test Coverage for Packages' - notes: Suggests - url: https://covr.r-lib.org - repository: https://CRAN.R-project.org/package=covr - authors: - - family-names: Hester - given-names: Jim - email: james.f.hester@gmail.com + given-names: Junyuan + - family-names: Lin + given-names: Min + - family-names: Geng + given-names: Yifeng + - family-names: Li + given-names: Yutian + - family-names: Yuan + given-names: Jiaming + email: jm.yuan@outlook.com year: '2023' diff --git a/DESCRIPTION b/DESCRIPTION index d6a81796..4dc3b7d8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,137 +1,144 @@ -Package: ibis.iSDM -Type: Package -Title: Modelling Framework for Integrated Biodiversity Distribution Scenarios -Version: 0.0.2 -Authors@R: - c(person(given = "Martin", - family = "Jung", - role = c("aut", "cre", "cph"), - email = "jung@iiasa.ac.at", - comment = c(ORCID = "0000-0002-7569-1390")), - person(given = "Maximilian H.K.", - family = "Hesselbarth", - role = c("ctb"), - email = 
"hesselbarth@iiasa.ac.at", - comment = c(ORCID = "0000-0003-1125-9918")) - ) -Maintainer: Martin Jung -Description: Integrated framework of modelling the distribution of species and ecosystems in a suitability framing. This package allows the estimation of integrated species distribution models (iSDM) based on several sources of evidence and provided presence-only and presence-absence datasets. It makes heavy use of point-process models for estimating habitat suitability and allows to include spatial latent effects and priors in the estimation. To do so 'ibis.iSDM' supports a number of engines for Bayesian and more non-parametric machine learning estimation. Further, the 'ibis.iSDM' is specifically customized to support spatial-temporal projections of habitat suitability into the future. -Language: en-GB -License: CC BY 4.0 -Encoding: UTF-8 -Imports: - assertthat (>= 0.2.0), - geodist, - ggplot2, - magrittr, - methods, - ncdf4, - parallel, - foreach, - future (>= 1.23.0), - doFuture (>= 0.12.2), - proto (>= 1.0.0), - raster (>= 3.4-5), - Rcpp (>= 0.12.0), - RcppParallel (>= 5.0.1), - rgeos (>= 0.5-5), - rgdal (>= 1.5-32), - sf (>= 0.9), - stars (>= 0.5), - testthat (>= 3.0.0), - tibble (>= 2.0.0), - uuid, - posterior -Depends: - R(>= 4.0.0) -Suggests: - deldir, - mboost, - dbarts (>= 0.9-22), - modEvA, - xgboost, - progress, - ellipsis, - doParallel, - rstan (>= 2.21.0), - rstantools (>= 2.1.1), - BoomSpikeSlab (>= 1.2.4), - rmarkdown, - knitr, - covr -URL: https://iiasa.github.io/ibis.iSDM/ -BugReports: https://github.com/iiasa/ibis.iSDM/issues -RoxygenNote: 7.2.3 -Config/testthat/edition: 3 -Collate: - 'utils.R' - 'bdproto.R' - 'waiver.R' - 'bdproto-biodiversitydataset.R' - 'bdproto-biodiversitydistribution.R' - 'add_biodiversity.R' - 'bdproto-biodiversityscenario.R' - 'add_constraint.R' - 'add_constraint_MigClim.R' - 'add_control_bias.R' - 'add_latent.R' - 'bdproto-log.R' - 'add_log.R' - 'add_offset.R' - 'bdproto-predictors.R' - 'add_predictors.R' - 'identifier.R' - 'bdproto-distributionmodel.R' - 'bdproto-priorlist.R' - 'bdproto-prior.R' - 'add_priors.R' - 'bdproto-engine.R' - 'bdproto-settings.R' - 'utils-spatial.R' - 'data.R' - 'distribution.R' - 'effects.R' - 'engine_bart.R' - 'engine_breg.R' - 'engine_gdb.R' - 'engine_glmnet.R' - 'utils-inla.R' - 'engine_inla.R' - 'engine_inlabru.R' - 'engine_stan.R' - 'engine_xgboost.R' - 'ensemble.R' - 'misc.R' - 'partial.R' - 'plot.R' - 'print.R' - 'prior_bart.R' - 'prior_breg.R' - 'prior_gdb.R' - 'prior_glmnet.R' - 'prior_inla.R' - 'prior_stan.R' - 'prior_xgb.R' - 'priors.R' - 'project.R' - 'pseudoabsence.R' - 'scenario.R' - 'similarity.R' - 'summary.R' - 'threshold.R' - 'train.R' - 'utils-bart.R' - 'utils-breg.R' - 'utils-gdb.R' - 'utils-glmnet.R' - 'utils-predictors.R' - 'utils-scenario.R' - 'utils-stan.R' - 'utils-xgboost.R' - 'validate.R' - 'write_output.R' - 'zzz.R' -Roxygen: list(markdown = TRUE) -Biarch: true -SystemRequirements: GNU make -VignetteBuilder: knitr +Package: ibis.iSDM +Type: Package +Title: Modelling Framework for Integrated Biodiversity Distribution Scenarios +Version: 0.0.3 +Authors@R: + c(person(given = "Martin", + family = "Jung", + role = c("aut", "cre", "cph"), + email = "jung@iiasa.ac.at", + comment = c(ORCID = "0000-0002-7569-1390")), + person(given = "Maximilian H.K.", + family = "Hesselbarth", + role = c("ctb"), + email = "hesselbarth@iiasa.ac.at", + comment = c(ORCID = "0000-0003-1125-9918")) + ) +Maintainer: Martin Jung +Description: Integrated framework of modelling the distribution of species and ecosystems in 
a suitability framing. This package allows the estimation of integrated species distribution models (iSDM) based on several sources of evidence and provided presence-only and presence-absence datasets. It makes heavy use of point-process models for estimating habitat suitability and allows to include spatial latent effects and priors in the estimation. To do so 'ibis.iSDM' supports a number of engines for Bayesian and more non-parametric machine learning estimation. Further, the 'ibis.iSDM' is specifically customized to support spatial-temporal projections of habitat suitability into the future. +Language: en-GB +License: CC BY 4.0 +Encoding: UTF-8 +Imports: + assertthat (>= 0.2.0), + doFuture (>= 0.12.2), + foreach, + future (>= 1.23.0), + dplyr, + geodist, + ggplot2, + graphics, + methods, + ncdf4, + parallel, + posterior, + proto (>= 1.0.0), + raster (>= 3.4-5), + rgeos (>= 0.5-5), + sf (>= 0.9), + stars (>= 0.5), + stats, + tibble (>= 2.0.0), + uuid, + utils +Depends: + R(>= 4.1.0) +Suggests: + abind, + BoomSpikeSlab (>= 1.2.4), + covr, + inlabru, + glmnet, + glmnetUtils, + dbarts (>= 0.9-22), + deldir, + doParallel, + ellipsis, + igraph, + knitr, + mboost, + modEvA, + ncmeta, + progress, + rmarkdown, + rstan (>= 2.21.0), + rstantools (>= 2.1.1), + testthat, + xgboost +URL: https://iiasa.github.io/ibis.iSDM/ +BugReports: https://github.com/iiasa/ibis.iSDM/issues +RoxygenNote: 7.2.3 +Config/testthat/edition: 3 +Roxygen: list(markdown = TRUE) +Biarch: true +SystemRequirements: GNU make +VignetteBuilder: knitr +Collate: + 'utils.R' + 'bdproto.R' + 'waiver.R' + 'bdproto-biodiversitydataset.R' + 'bdproto-biodiversitydistribution.R' + 'add_biodiversity.R' + 'bdproto-biodiversityscenario.R' + 'add_constraint.R' + 'add_constraint_MigClim.R' + 'add_control_bias.R' + 'add_latent.R' + 'bdproto-log.R' + 'add_log.R' + 'add_offset.R' + 'bdproto-predictors.R' + 'add_predictors.R' + 'identifier.R' + 'bdproto-distributionmodel.R' + 'bdproto-priorlist.R' + 'bdproto-prior.R' + 'add_priors.R' + 'bdproto-engine.R' + 'bdproto-settings.R' + 'utils-spatial.R' + 'data.R' + 'distribution.R' + 'effects.R' + 'engine_bart.R' + 'engine_breg.R' + 'engine_gdb.R' + 'engine_glmnet.R' + 'utils-inla.R' + 'engine_inla.R' + 'engine_inlabru.R' + 'engine_stan.R' + 'engine_xgboost.R' + 'ensemble.R' + 'ibis.iSDM-package.R' + 'misc.R' + 'partial.R' + 'plot.R' + 'print.R' + 'prior_bart.R' + 'prior_breg.R' + 'prior_gdb.R' + 'prior_glmnet.R' + 'prior_inla.R' + 'prior_stan.R' + 'prior_xgb.R' + 'priors.R' + 'project.R' + 'pseudoabsence.R' + 'scenario.R' + 'similarity.R' + 'summary.R' + 'threshold.R' + 'train.R' + 'utils-bart.R' + 'utils-breg.R' + 'utils-gdb.R' + 'utils-glmnet.R' + 'utils-predictors.R' + 'utils-scenario.R' + 'utils-stan.R' + 'utils-xgboost.R' + 'validate.R' + 'write_output.R' + 'zzz.R' diff --git a/NAMESPACE b/NAMESPACE index c3ff7d1d..5516466b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -30,7 +30,6 @@ S3method(summary,PredictorDataset) S3method(summary,PriorList) S3method(summary,Settings) S3method(summary,distribution) -export("%>%") export() export(BARTPrior) export(BARTPriors) @@ -113,6 +112,7 @@ export(new_waiver) export(partial) export(posterior_predict_stanfit) export(predictor_derivate) +export(predictor_filter) export(predictor_homogenize_na) export(predictor_transform) export(priors) @@ -196,12 +196,4 @@ import(raster) import(sf) importFrom(foreach,"%do%") importFrom(foreach,"%dopar%") -importFrom(magrittr,"%>%") -importFrom(methods,is) -importFrom(raster,as.data.frame) -importFrom(raster,init) 
-importFrom(raster,nlayers) -importFrom(raster,raster) -importFrom(raster,stack) importFrom(stats,effects) -importFrom(stats,na.omit) diff --git a/NEWS.md b/NEWS.md index 8be559f6..8f0267e1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# ibis.iSDM 0.0.3 + +* Removed magrittr dependency #41 +* Smaller improvements to documentation and removal of function calls that prevented CRAN submission. +* Made the separation from hyperparameter search functions clearer and added new option to filter highly correlated covariates via `train`. +* Added Boruta for iterative feature selection of predictor variables. + # ibis.iSDM 0.0.2 * Smaller documentation fixes, including to make sure examples and returns are in all exported function documentations. diff --git a/R/add_biodiversity.R b/R/add_biodiversity.R index 3d95fb69..224bd254 100644 --- a/R/add_biodiversity.R +++ b/R/add_biodiversity.R @@ -43,7 +43,7 @@ NULL #' # Load virtual species #' virtual_species <- sf::st_read("inst/extdata/input_data.gpkg", "points") #' # Define model -#' x <- distribution(background) %>% +#' x <- distribution(background) |> #' add_biodiversity_poipo(virtual_species) #' x #' } @@ -83,7 +83,7 @@ methods::setMethod( # Transform to background for analysis if(sf::st_crs(x$background) != sf::st_crs(poipo)){ - poipo <- poipo %>% sf::st_transform(crs = sf::st_crs(x$background)) + poipo <- poipo |> sf::st_transform(crs = sf::st_crs(x$background)) } if(docheck){ @@ -171,8 +171,7 @@ methods::setMethod( #' @examples #' \dontrun{ #' # Define model -#' x <- distribution(background) %>% -#' add_biodiversity_poipa(virtual_species) +#' x <- distribution(background) |> add_biodiversity_poipa(virtual_species) #' x #' } #' @name add_biodiversity_poipa @@ -214,7 +213,7 @@ methods::setMethod( # Transform to background for analysis if(sf::st_crs(x$background) != sf::st_crs(poipa)){ - poipa <- poipa %>% sf::st_transform(crs = sf::st_crs(x$background)) + poipa <- poipa |> sf::st_transform(crs = sf::st_crs(x$background)) } if(docheck){ @@ -300,7 +299,7 @@ methods::setMethod( #' #' @examples #' \dontrun{ -#' x <- distribution(mod) %>% +#' x <- distribution(mod) |> #' add_biodiversity_polpo(protectedArea) #' } #' @name add_biodiversity_polpo @@ -350,7 +349,7 @@ methods::setMethod( # Transform to background for analysis if(sf::st_crs(x$background) != sf::st_crs(polpo)){ - polpo <- polpo %>% sf::st_transform(crs = sf::st_crs(x$background)) + polpo <- polpo |> sf::st_transform(crs = sf::st_crs(x$background)) } # Simulate presence absence points rather than using the range directly @@ -369,7 +368,7 @@ methods::setMethod( size = simulate_points, prob = simulate_bias[which(!is.na(simulate_bias[]))], replace = TRUE) - poipo_on <- as.data.frame(raster::xyFromCell(simulate_bias, ptscell)) + poipo_on <- raster::as.data.frame(raster::xyFromCell(simulate_bias, ptscell)) poipo_on <- sf::st_as_sf(poipo_on, coords = c("x","y"),crs = sf::st_crs(simulate_bias)) } else { @@ -475,7 +474,7 @@ methods::setMethod( #' #' @examples #' \dontrun{ -#' x <- distribution(background) %>% +#' x <- distribution(background) |> #' add_biodiversity_polpa(protectedArea) #' } #' @name add_biodiversity_polpa @@ -522,7 +521,7 @@ methods::setMethod( # Transform to background for analysis if(sf::st_crs(x$background) != sf::st_crs(polpa)){ - polpa <- polpa %>% sf::st_transform(crs = sf::st_crs(x$background)) + polpa <- polpa |> sf::st_transform(crs = sf::st_crs(x$background)) } # Simulate presence absence points rather than using the range directly @@ -541,7 +540,7 @@ methods::setMethod( size =
simulate_points, prob = simulate_bias[which(!is.na(simulate_bias[]))], replace = TRUE) - poipa_on <- as.data.frame(raster::xyFromCell(simulate_bias, ptscell)) + poipa_on <- raster::as.data.frame(raster::xyFromCell(simulate_bias, ptscell)) poipa_on <- sf::st_as_sf(poipa_on, coords = c("x","y"),crs = sf::st_crs(simulate_bias)) } else { @@ -677,16 +676,16 @@ format_biodiversity_data <- function(x, field_occurrence, field_space = c("x","y assertthat::assert_that( all(assertthat::has_name(x, field_space)), msg ='No spatial column found in the dataset. Specify manually or set to [x] and [y].') # Select and format - out <- subset(x, select = c(field_space, field_occurrence) ) %>% + out <- subset(x, select = c(field_space, field_occurrence) ) |> tibble::as_tibble() } else { if(inherits(x, "Spatial")) x <- sf::st_as_sf(x) # First convert to sf - #if(inherits(x,'sf')) coords <- sf::st_coordinates(x) %>% tibble::as_tibble() + #if(inherits(x,'sf')) coords <- sf::st_coordinates(x) |> tibble::as_tibble() if(unique(sf::st_geometry_type(x)) %in% c("POINT","MULTIPOINT")){ # Take target column and append coordinates to it out <- cbind(subset(x, select = field_occurrence), - sf::st_coordinates(x)) %>% tibble::as_tibble() + sf::st_coordinates(x)) |> tibble::as_tibble() } else if(unique(sf::st_geometry_type(x)) %in% c('POLYGON','MULTIPOLYGON')){ # Return target column and spatial object as such out <- subset(x, select = field_occurrence) diff --git a/R/add_constraint.R b/R/add_constraint.R index 2a138ab2..cda3ba12 100644 --- a/R/add_constraint.R +++ b/R/add_constraint.R @@ -1,671 +1,671 @@ -#' @include utils.R bdproto-biodiversityscenario.R -NULL - -#' Add a constraint to an existing \code{scenario} -#' -#' @description This function adds a constraint to a [`BiodiversityScenario-class`] object to -#' constrain (future) projections. These constraints can for instance be constraints on a possible -#' dispersal distance, connectivity between identified patches or limitations on species adaptability. -#' **Most constraints require pre-calculated thresholds to be present in the [`BiodiversityScenario-class`] object!** -#' @param mod A [`BiodiversityScenario`] object with specified predictors. -#' @param method A [`character`] indicating the type of constraints to be added to the scenario. See details for more -#' information. -#' @param value For many dispersal constraints this is set as a [`numeric`] value specifying a -#' fixed constraint or constant in units \code{"m"} (Default: \code{NULL}). For kissmig the value needs to -#' give the number of iteration steps (or within-year migration steps). -#' For adaptability constraints this parameter specifies the extent (in units of standard deviation) to which extrapolations -#' should be performed. -#' @param type A [`character`] indicating the type used in the method. See for instance [`kissmig::kissmig`]. -#' @param layer A [`Raster`] object that can be used for boundary constraints (Default: \code{NULL}). -#' @param pext [`numeric`] indicator for [`kissmig`] of the probability a colonized cell becomes uncolonised, -#' i.e., the species gets locally extinct (Default: \code{0.1}). -#' @param pcor [`numeric`] probability that corner cells are considered in the 3x3 neighbourhood (Default: \code{0.2}). -#' @param ... passed on parameters. See also the specific methods for adding constraints. -#' -#' @seealso [`add_constraint_dispersal`], [`add_constraint_connectivity`], [`add_constraint_adaptability`], [`add_constraint_boundary`] -#' @details -#' Constraints can be added to scenario objects to increase or decrease the suitability of a given area for the -#' target feature. This function acts as a wrapper to add these constraints. -#' Currently supported are the following options: -#' **Dispersal**: -#' * \code{sdd_fixed} - Applies a fixed uniform dispersal distance per modelling timestep. -#' * \code{sdd_nexpkernel} - Applies a dispersal distance using a negative exponential kernel from its origin. -#' * \code{kissmig} - Applies the kissmig stochastic dispersal model. Requires [`kissmig`] package. Applied at each modelling time step. -#' * \code{migclim} - Applies the dispersal algorithm MigClim to the modelled objects. Requires [`MigClim`] package. -#' -#' A comprehensive overview of the benefits of including dispersal constraints in species distribution models -#' can be found in Bateman et al. (2013). -#' -#' **Connectivity**: -#' * \code{hardbarrier} - Defines a hard barrier to any dispersal events. By definition this sets all values larger -#' than \code{0} in the barrier layer to \code{0} in the projection. Barrier has to be provided through the \code{"resistance"} -#' parameter. -#' * \code{resistance} - Allows the provision of a static or dynamic layer that is multiplied with the projection at each -#' time step. Can for example be used to reduce the suitability of any given area (using pressures not included in the model). -#' The respective layer(s) have to be provided through the \code{"resistance"} parameter. Provided layers are incorporated as -#' \code{abs(resistance - 1)} and multiplied with the prediction. -#' -#' **Adaptability**: -#' * \code{nichelimit} - Specifies a limit on the environmental niche to only allow a modest amount of extrapolation beyond the known occurrences. This -#' can be particularly useful to limit the influence of increasing marginal responses and avoid biologically unrealistic projections. -#' -#' **Boundary**: -#' * \code{boundary} - Applies a hard boundary constraint on the projection, thus disallowing an expansion of a range outside -#' the provided layer. Similar to specifying projection limits (see [`distribution`]), but can be used to specifically -#' constrain a projection within a certain area (e.g. a species range or an island). -#' -#' @returns Adds constraints data to a [`BiodiversityScenario`] object. -#' @references -#' * Bateman, B. L., Murphy, H. T., Reside, A. E., Mokany, K., & VanDerWal, J. (2013). Appropriateness of full‐, partial‐and no‐dispersal scenarios in climate change impact modelling. Diversity and Distributions, 19(10), 1224-1234. -#' * Nobis MP and Normand S (2014) KISSMig - a simple model for R to account for limited migration in analyses of species distributions. Ecography 37: 1282-1287. -#' * Mendes, P., Velazco, S. J. E., de Andrade, A. F. A., & Júnior, P. D. M. (2020). Dealing with overprediction in species distribution models: How adding distance constraints can improve model accuracy. Ecological Modelling, 431, 109180.
-#' @examples -#' \dontrun{ -#' # Assumes that a trained 'model' object exists -#' mod <- scenario(model) |> -#' add_predictors(env = predictors, transform = 'scale', derivates = "none") |> -#' add_constraint_dispersal(method = "kissmig", value = 2, pext = 0.1) |> -#' project() -#' } -#' @name add_constraint -#' @family constraint -#' @aliases add_constraint -#' @keywords scenario -#' @exportMethod add_constraint -#' @export -NULL -methods::setGeneric("add_constraint", - signature = methods::signature("mod"), - function(mod, method, ...) standardGeneric("add_constraint")) - -#' @name add_constraint -#' @rdname add_constraint -#' @usage \S4method{add_constraint}{BiodiversityScenario, character}(mod, method) -methods::setMethod( - "add_constraint", - methods::signature(mod = "BiodiversityScenario"), - function(mod, method, ...) { - assertthat::assert_that( - inherits(mod, "BiodiversityScenario"), - !is.Waiver(mod$get_predictors()), - is.character(method) - ) - # Match method - method <- match.arg(arg = method, - choices = c("sdd_fixed", "sdd_nexpkernel", "kissmig", "migclim", - "hardbarrier","resistance", "boundary", - "nichelimit"), several.ok = FALSE) - - # Now call the respective functions individually - o <- switch(method, - # Fixed dispersal - "sdd_fixed" = add_constraint_dispersal(mod, method = "sdd_fixed", ...), - # Short-distance dispersal - "sdd_nexpkernel" = add_constraint_dispersal(mod, method = "sdd_nexpkernel", ...), - # Add kissmig dispersal - "kissmig" = add_constraint_dispersal(mod, method = "kissmig", ...), - # Using the migclim package - "migclim" = add_constraint_dispersal(mod, method = "migclim", ...), - # --- # - "hardbarrier" = add_constraint_connectivity(mod, method = "hardbarrier", ...), - # --- # - "resistance" = add_constraint_connectivity(mod, method = "resistance", ...), - # --- # - "nichelimit" = add_constraint_adaptability(mod, method = "nichelimit", ...), - # --- # - "boundary" = add_constraint_boundary(mod, ...) - ) - return(o) - } -) - -# ------------------------ # -#### Dispersal constraints #### - -#' @title Adds a dispersal constraint to a scenario object. -#' @name add_constraint_dispersal -#' @aliases add_constraint_dispersal -#' @inheritParams add_constraint -#' @family constraint -#' @keywords scenario -#' @exportMethod add_constraint_dispersal -#' @export -NULL -methods::setGeneric("add_constraint_dispersal", - signature = methods::signature("mod"), - function(mod, method, value = NULL, type = NULL, ...) standardGeneric("add_constraint_dispersal")) - -#' @name add_constraint_dispersal -#' @rdname add_constraint_dispersal -#' @usage \S4method{add_constraint_dispersal}{BiodiversityScenario, character, numeric}(mod, method, value) -methods::setMethod( - "add_constraint_dispersal", - methods::signature(mod = "BiodiversityScenario"), - function(mod, method, value = NULL, type = NULL, ...){ - assertthat::assert_that( - inherits(mod, "BiodiversityScenario"), - !is.Waiver(mod$get_predictors()), - is.character(method), - is.null(value) || is.numeric(value), - is.null(type) || is.character(type) - ) - # Match method - method <- match.arg(arg = method, - choices = c("sdd_fixed", "sdd_nexpkernel", "kissmig", "migclim"), several.ok = FALSE) - - # Other arguments supplied - dots <- list(...)
- argnames <- names(dots) - - # Check if there is already a dispersal constraint; if yes, raise a warning - if(!is.Waiver(mod$get_constraints())){ - # If there are any dispersal constraints in there, raise warning - if(any("dispersal" %in% names(mod$get_constraints()))){ - if(getOption('ibis.setupmessages')) myLog('[Estimation]', 'yellow', 'Overwriting existing dispersal constraint.') - } - } - - # Add processing method # - # --- # - cr <- list() - if(method == "sdd_fixed"){ - # Short-distance dispersal (Fixed) - assertthat::assert_that( - is.numeric(value), msg = "Fixed short distance dispersal needs an annual mean dispersal distance value." - ) - cr[['dispersal']] <- list(method = method, - params = c("mean_dispersal_distance" = value)) - } else if(method == "sdd_nexpkernel") { - # Negative exponential kernel - assertthat::assert_that( - is.numeric(value), msg = "Short distance negative exponential kernel dispersal needs an annual mean dispersal distance value." - ) - cr[['dispersal']] <- list(method = method, - params = c("mean_dispersal_distance" = value)) - } else if(method == "kissmig"){ - # Check parameters to be correct - check_package("kissmig") - # Gather some default parameters - if(is.null(type)) type <- "DIS" else match.arg(type, c("DIS", "FOC", "LOC", "NOC"), several.ok = FALSE) - assertthat::assert_that( - is.numeric(value), - value > 0, msg = "For kissmig the value needs to give the number of iteration steps (or within time migration steps)." - ) - # probability [0,1] a colonized cell becomes uncolonized between iteration steps, i.e., the species gets locally extinct - if("pext" %in% argnames) pext <- dots[["pext"]] else pext <- 0.1 - # probability [0,1] corner cells are considered in the 3x3 cell neighborhood. Following Nobis & Normand 2014, 0.2 is recommended for circular spread - if("pcor" %in% argnames) pcor <- dots[["pcor"]] else pcor <- 0.2 - - if(getOption('ibis.setupmessages')) myLog('[Estimation]', 'green', 'KISSMIG options: iterations=',value,'| pext=', pext,'| pcor=', pcor) - - cr[['dispersal']] <- list(method = method, - params = c("iteration" = value, - "type" = type, - "signed" = FALSE, - "pext" = pext, - "pcor" = pcor - )) - - } - if(method == "migclim"){ - # Using the MigClim package for calculating any transitions. - # This requires previously calculated thresholds! - out <- add_constraint_MigClim(mod = mod, ...) - return(out) - } else { - # --- # - new <- mod$set_constraints(cr) - return( - bdproto(NULL, new) - ) - } - - } -) - -#' Short-distance fixed dispersal function -#' @param baseline_threshold The [`RasterLayer`] with presence/absence information from a previous year. -#' @param new_suit A new [`RasterLayer`] object. -#' @param value A [`numeric`] value of the fixed dispersal threshold. In unit \code{'meters'}. -#' @param resistance A resistance [`RasterLayer`] object with values to be omitted during distance calculation (Default: \code{NULL}). -#' @noRd -#' @keywords internal -.sdd_fixed <- function(baseline_threshold, new_suit, value, resistance = NULL){ - assertthat::assert_that( - is.Raster(baseline_threshold), is.Raster(new_suit), - raster::compareRaster(baseline_threshold, new_suit), - is.numeric(value), - is.null(resistance) || is.Raster(resistance), - # Check that baseline threshold raster is binary - length(unique(baseline_threshold))==2 - ) - - # If a resistance layer is supplied, mark those cells to be omitted from the distance calculation.
- if(is.Raster(resistance)){ - baseline_threshold[resistance == 1] <- 2 - # Set resistance to the value omitted - resistance <- 2 - } - # Grow baseline raster by at most the dispersal distance (value) - # Furthermore divide by value to get a normalized distance - dis <- raster::gridDistance(baseline_threshold, origin = 1, omit = resistance) - ras_dis <- raster::clamp(dis, lower = 0, upper = value) / value - # Invert - ras_dis <- abs(ras_dis - 1) - - # Now multiply the net suitability projection with this mask - # Thus removing any grid cells outside - out <- new_suit * ras_dis - return(out) -} - -#' Short-distance negative exponential kernel dispersal function -#' @param baseline_threshold The [`RasterLayer`] with presence/absence information from a previous year. -#' @param new_suit A new [`RasterLayer`] object. -#' @param value A [`numeric`] value of the fixed dispersal threshold. In unit \code{'meters'}. -#' @param normalize Should a normalising constant be used for the exponential dispersal parameter (Default: \code{FALSE}). -#' @param resistance A resistance [`RasterLayer`] object with values to be omitted during distance calculation (Default: \code{NULL}). -#' @noRd -#' @keywords internal -.sdd_nexpkernel <- function(baseline_threshold, new_suit, value, normalize = FALSE, resistance = NULL){ - assertthat::assert_that( - is.Raster(baseline_threshold), is.Raster(new_suit), - raster::compareRaster(baseline_threshold, new_suit), - is.numeric(value), - is.null(resistance) || is.Raster(resistance), - # Check that baseline threshold raster is binary - length(unique(baseline_threshold))==2 - ) - - # If a resistance layer is supplied, mark those cells to be omitted from the distance calculation. - if(is.Raster(resistance)){ - baseline_threshold[resistance == 1] <- 2 - # Set resistance to the value omitted - resistance <- 2 - } - - # Inverse of mean dispersal distance - alpha <- 1/value - - # Grow baseline raster by using an exponentially weighted kernel - ras_dis <- raster::gridDistance(baseline_threshold, origin = 1, omit = resistance) - if(normalize){ - # Normalized (with a constant) negative exponential kernel - ras_dis <- raster::calc(ras_dis, fun = function(x) (1 / (2 * pi * value ^ 2)) * exp(-x / value) ) - } else { - ras_dis <- raster::calc(ras_dis, fun = function(x) exp(-alpha * x)) - } - - # Now multiply the net suitability projection with this mask - # Thus removing any non-suitable grid cells (0) and changing the value of those within reach - out <- new_suit * ras_dis - return(out) -} - -#' Keep it simple migration calculation. -#' @param baseline_threshold The [`RasterLayer`] with presence/absence information from a previous year. -#' @param new_suit A new [`RasterLayer`] object. -#' @param params A [vector] or [list] with passed on parameter values. -#' @param resistance A resistance [`RasterLayer`] object with values to be omitted during distance calculation (Default: \code{NULL}).
-#' @noRd -#' @keywords internal -.kissmig_dispersal <- function(baseline_threshold, new_suit, params, resistance = NULL){ - assertthat::assert_that( - is.Raster(baseline_threshold), is.Raster(new_suit), - raster::compareRaster(baseline_threshold, new_suit), - is.vector(params) || is.list(params), - is.null(resistance) || is.logical(resistance) || is.Raster(resistance), - # Check that baseline threshold raster is binary - length(unique(baseline_threshold))==2 - ) - - check_package('kissmig') - if(!isNamespaceLoaded("kissmig")) { attachNamespace("kissmig");requireNamespace("kissmig") } - - # Set suitability layer to 0 if resistance layer is set - if(is.Raster(resistance)){ - new_suit[resistance>0] <- 0 - } - - # Simulate kissmig for a given threshold and suitability raster - km <- kissmig::kissmig(O = baseline_threshold, - # Rescale newsuit to 0-1 - S = predictor_transform(new_suit, 'norm'), - it = as.numeric( params['iteration'] ), - type = params['type'], - pext = as.numeric(params['pext']), - pcor = as.numeric(params['pcor']) - ) - if(is.factor(km)) km <- raster::deratify(km, complete = TRUE) - - # Now multiply the net suitability projection with this mask - # Thus removing any non-suitable grid cells (0) and changing the value of those within reach - ns <- new_suit * km - - return( - raster::stack(km, ns) - ) -} - -# ------------------------ # -#### Connectivity constraints #### - -#' @title Adds a connectivity constraint to a scenario object. -#' @name add_constraint_connectivity -#' @aliases add_constraint_connectivity -#' @inheritParams add_constraint -#' @param resistance A [`RasterLayer`] object describing a resistance surface or barrier for use in -#' connectivity constraints (Default: \code{NULL}). -#' @family constraint -#' @keywords scenario -#' @exportMethod add_constraint_connectivity -#' @export -NULL -methods::setGeneric("add_constraint_connectivity", - signature = methods::signature("mod"), - function(mod, method, value = NULL, resistance = NULL, ...) standardGeneric("add_constraint_connectivity")) - -#' @name add_constraint_connectivity -#' @rdname add_constraint_connectivity -#' @usage \S4method{add_constraint_connectivity}{BiodiversityScenario, character, numeric, ANY}(mod, method, value, resistance) -methods::setMethod( - "add_constraint_connectivity", - methods::signature(mod = "BiodiversityScenario"), - function(mod, method, value = NULL, resistance = NULL, ...){ - assertthat::assert_that( - inherits(mod, "BiodiversityScenario"), - !is.Waiver(mod$get_predictors()), - is.character(method), - is.null(value) || is.numeric(value), - is.Raster(resistance) || is.null(resistance) - ) - # Match method - method <- match.arg(arg = method, - choices = c("hardbarrier", "resistance"), several.ok = FALSE) - - # Check if there is already a connectivity constraint; if yes, raise a warning - if(!is.Waiver(mod$get_constraints())){ - # If there are any connectivity constraints in there, raise warning - if(any( "connectivity" %in% names(mod$get_constraints()) )){ - if(getOption('ibis.setupmessages')) myLog('[Setup]','yellow','Overwriting existing connectivity constraint') - } - } - - # Add processing method # - # --- # - co <- list() - if(method == "hardbarrier"){ - # Assert hard barrier - assertthat::assert_that( - is.Raster(resistance), - !is.null(resistance), msg = "Set a hard barrier via the resistance parameter."
- ) - # Check that resistance layer is a binary mask - assertthat::assert_that(length(unique(resistance))<=2, - raster::cellStats(resistance,'max')>0, - msg = "Resistance layer should be a binary mark with values 0/1.") - co[['connectivity']] <- list(method = method, - params = c("resistance" = resistance)) - } else if(method == "resistance"){ - # Flexible resistance layer - assertthat::assert_that( - is.Raster(resistance), - !is.null(resistance), msg = "The method resistance requires a specified resistance raster." - ) - # If raster is stack with multiple layers, ensure that time - if(raster::nlayers(resistance)>1){ - # Check that layers have a z dimension and fall within the timeperiod - startend <- mod$get_timeperiod() - assertthat::assert_that( !is.null( raster::getZ(resistance) ), - all( range(raster::getZ(resistance))==startend ), - msg = "If a stack of layers is supplied as resistance, it needs a Z value of equal length to the predictors!") - } - times <- raster::getZ(resistance) - # If resistance layer is bigger than 1, normalize - if(any(raster::cellStats(resistance,'max')>1)){ - if(getOption('ibis.setupmessages')) myLog('[Setup]','yellow','Resistance values larger than 1. Normalizing...') - resistance <- predictor_transform(resistance, option = "norm") - } - resistance <- abs( resistance - 1 ) # Invert - if(!is.null(times)) resistance <- raster::setZ(resistance, times) # Reset times again if found - - co[['connectivity']] <- list(method = method, - params = c("resistance" = resistance)) - } - # --- # - new <- mod$set_constraints(co) - return( - bdproto(NULL, new) - ) - } -) - -# ------------------------ # -#### Adaptability constraints #### - -#' @title Adds an adaptability constraint to a scenario object -#' @description -#' Adaptability constraints assume that suitable habitat for species in (future) projections might be unsuitable if -#' it is outside the range of conditions currently observed for the species. -#' -#' Currently only `nichelimit` is implemented, which adds a simple constrain on the predictor parameter space, which -#' can be defined through the \code{"value"} parameter. For example by setting it to \code{1} (Default), any projections -#' are constrained to be within the range of at maximum 1 standard deviation from the range of covariates used for model -#' training. -#' @name add_constraint_adaptability -#' @aliases add_constraint_adaptability -#' @inheritParams add_constraint -#' @param names A [`character`] vector with names of the predictors for which an adaptability threshold should be set (Default: \code{NULL} for all). -#' @param value A [`numeric`] value in units of standard deviation (Default: \code{1}). -#' @param increment A [`numeric`] constant that is added to value at every time step (Default: \code{0}). -#' Allows incremental widening of the niche space, thus opening constraints. -#' @family constraint -#' @keywords scenario -#' @exportMethod add_constraint_adaptability -#' @export -NULL -methods::setGeneric("add_constraint_adaptability", - signature = methods::signature("mod"), - function(mod, method = "nichelimit", names = NULL, value = 1, increment = 0, ...) 
standardGeneric("add_constraint_adaptability")) - -#' @name add_constraint_adaptability -#' @rdname add_constraint_adaptability -#' @usage \S4method{add_constraint_adaptability}{BiodiversityScenario, character, character, numeric, numeric}(mod, method, names, value, increment) -methods::setMethod( - "add_constraint_adaptability", - methods::signature(mod = "BiodiversityScenario"), - function(mod, method = "nichelimit", names = NULL, value = 1, increment = 0, ...){ - assertthat::assert_that( - inherits(mod, "BiodiversityScenario"), - !is.Waiver(mod$get_predictors()), - is.character(method), - is.null(names) || is.character(names), - is.null(value) || is.numeric(value), - is.numeric(increment) - ) - # Match method - method <- match.arg(arg = method, - choices = c("nichelimit"), several.ok = FALSE) - - # Add processing method # - # --- # - co <- list() - if(method == "nichelimit"){ - # Add a constrain on parameter space, e.g. max 1 SD from training data covariates - assertthat::assert_that( - is.numeric(value), - is.null(names) || is.character(names), - value > 0, msg = "Specify a value threshold (SD) and names of predictors, for which - we do not expect the species to persist." - ) - if(is.null(names)) names <- character() - co[['adaptability']] <- list(method = method, - params = c("names" = names, "value" = value, - "increment" = increment)) - } - # --- # - new <- mod$set_constraints(co) - return( - bdproto(NULL, new) - ) - } -) - -#' Adaptability constrain by applying a limit on extrapolation beyond the niche -#' -#' @param newdata A [`data.frame`] with the information about new data layers. -#' @param model A [`list`] created by the modelling object containing the full predictors and biodiversity predictors. -#' @param names A [`character`] or \code{NULL} of the names of predictors. -#' @param value A [`numeric`] value in units of standard deviation (Default: \code{1}). -#' @param increment A [`numeric`] constant that is added to value at every time step (Default: \code{0}). -#' Allows incremental widening of the niche space, thus opening constraints. -#' @param increment_step A [`numeric`] indicating the number of time increment should be applied. 
-#' @keywords internal -#' @noRd -.nichelimit <- function(newdata, model, names = NULL, value = 1, increment = 0, increment_step = 1){ - assertthat::assert_that( - is.data.frame(newdata), - is.list(model), - is.numeric(as.numeric(value)), - is.null(names) || is.na(names) || is.character(names), - is.numeric(as.numeric(increment)), - is.numeric(as.numeric(increment_step)) - ) - # Check that names are present if set - if(is.null(names) || is.na(names)) names <- model$predictors_names - if(is.character(names) ) assertthat::assert_that(all(names %in% model$predictors_names)) - # Convert numeric paramters to numeric to be sure - value <- as.numeric(value) - increment <- as.numeric(increment) - increment_step <- as.numeric(increment_step) - # --- # - # Now calculate the range across each target predictor and occurrence dataset - df <- data.frame() - for(id in names(model$biodiversity)){ - sub <- model$biodiversity[[id]] - # Which are presence data - is_presence <- which(sub$observations[['observed']] > 0) - df <- rbind(df, - sub$predictors[is_presence, names]) - } - rr <- sapply(df, function(x) range(x, na.rm = TRUE)) # Calculate ranges - rsd <- sapply(df, function(x) sd(x, na.rm = TRUE)) # Calculate standard deviation - - # Apply value and increment if set - rsd <- rsd * (value + (increment*increment_step)) - rr[1,] <- rr[1,] - rsd; rr[2,] <- rr[2,] + rsd - - # Now 'clamp' all predictor values beyond these names to 0, e.g. partial out - nd <- newdata - for(n in names){ - # Calc min - min_ex <- which(nd[,n] < rr[1,n]) - max_ex <- which(nd[,n] > rr[2,n]) - if(length(min_ex)>0) nd[min_ex,n] <- NA - if(length(max_ex)>0) nd[max_ex,n] <- NA - # FIXME Or rather do a smooth logistic decay for less extreme? - } - return(nd) -} - -# ------------------------ # -#### Boundary constraints #### - -#' @title Adds a boundary constraint to a scenario object -#' @description -#' The purpose of boundary constraints is to limit a future projection within a specified area -#' (such as for example a range or ecoregion). This can help to limit unreasonable projections into geographic space. -#' -#' Similar to boundary constraints it is also possible to define a \code{"zone"} for the scenario projections, similar -#' as was done for model training. The difference to a boundary constraint is that the boundary constraint is applied posthoc -#' as a hard cut on any projection, while the zones would allow any projection (and other constraints) to be applied within -#' the zone. -#' **Note: Setting a boundary constraint for future projections effectively potentially suitable areas!** -#' @name add_constraint_boundary -#' @aliases add_constraint_boundary -#' @inheritParams add_constraint -#' @param layer A [`Raster`] or [`sf`] object with the same extent as the model background. Has to be binary and -#' is used for a posthoc masking of projected grid cells. -#' @family constraint -#' @keywords scenario -#' @exportMethod add_constraint_boundary -#' @export -NULL -methods::setGeneric("add_constraint_boundary", - signature = methods::signature("mod", "layer"), - function(mod, layer, ...) 
standardGeneric("add_constraint_boundary")) - -#' @name add_constraint_boundary -#' @rdname add_constraint_boundary -#' @usage \S4method{add_constraint_boundary}{BiodiversityScenario, sf, character}(mod, layer, method) -methods::setMethod( - "add_constraint_boundary", - methods::signature(mod = "BiodiversityScenario", layer = "sf"), - function(mod, layer, method = "boundary", ...){ - assertthat::assert_that( - inherits(mod, "BiodiversityScenario"), - inherits(layer, "sf"), - is.character(method) - ) - - # Rasterize the layer - # First try and dig out a layer from a predictor dataset if found - if(inherits( mod$get_predictors(), "PredictorDataSet")){ - ras <- mod$get_predictors()$get_data() |> stars_to_raster() - ras <- ras[[1]] - } else { - # Try and get the underlying model and its predictors - ras <- mod$get_model()$get_data() - } - assertthat::assert_that(is.Raster(ras)) - bb <- try({ raster::rasterize(layer, ras, 1)},silent = TRUE) - if(inherits(bb, "try-error")) stop("Provide a rasterized layer of the boundary constraint!") - - # Call again - o <- add_constraint_boundary(mod, layer = bb, method = method, ..) - - return( o ) - } -) - -#' @name add_constraint_boundary -#' @rdname add_constraint_boundary -#' @usage \S4method{add_constraint_boundary}{BiodiversityScenario, ANY, character}(mod, layer, method) -methods::setMethod( - "add_constraint_boundary", - methods::signature(mod = "BiodiversityScenario", layer = "ANY"), - function(mod, layer, method = "boundary", ...){ - assertthat::assert_that( - inherits(mod, "BiodiversityScenario"), - is.Raster(layer), - is.character(method) - ) - - # Check that layer is a single RasterLayer - if(!inherits(layer, "RasterLayer")){ - assertthat::assert_that(raster::nlayers(layer) == 1) - layer <- layer[[1]] - } - - # Add processing method # - # --- # - co <- list() - if(method == "boundary"){ - # Add a constrain on parameter space, e.g. max 1 SD from training data covariates - assertthat::assert_that( - length( unique( layer )) <=2 - ) - # If length of values is greater than 1, remove everything else by setting it to NA - if( length( unique( layer )) >1 ){ - layer[layer<1] <- NA - } - co[['boundary']] <- list(method = method, - params = c("layer" = layer)) - } - # --- # - new <- mod$set_constraints(co) - return( - bdproto(NULL, new) - ) - } -) +#' @include utils.R bdproto-biodiversityscenario.R +NULL + +#' Add a constraint to an existing \code{scenario} +#' +#' @description This function adds a constrain to a [`BiodiversityScenario-class`] object to +#' constrain (future) projections. These constrains can for instance be constraints on a possible +#' dispersal distance, connectivity between identified patches or limitations on species adaptability. +#' **Most constrains require pre-calculated thresholds to present in the [`BiodiversityScenario-class`] object!** +#' @param mod A [`BiodiversityScenario`] object with specified predictors. +#' @param method A [`character`] indicating the type of constraints to be added to the scenario. See details for more +#' information. +#' @param value For many dispersal [`constrain`] this is set as [`numeric`] value specifying a +#' fixed constrain or constant in units \code{"m"} (Default: \code{NULL}). For kissmig the value needs to +#' give the number of iteration steps (or within year migration steps). +#' For adaptability constraints this parameter specifies the extent (in units of standard deviation) to which extrapolations +#' should be performed. 
+#' @param type A [`character`] indicating the type used in the method. See for instance [`kissmig::kissmig`].
+#' @param layer A [`Raster`] object that can be used for boundary constraints (Default: \code{NULL}).
+#' @param pext [`numeric`] indicator for [`kissmig`] of the probability a colonized cell becomes uncolonised,
+#' i.e., the species gets locally extinct (Default: \code{0.1}).
+#' @param pcor [`numeric`] probability that corner cells are considered in the 3x3 neighbourhood (Default: \code{0.2}).
+#' @param ... passed on parameters. See also the specific methods for adding constraints.
+#'
+#' @seealso [`add_constraint_dispersal`], [`add_constraint_connectivity`], [`add_constraint_adaptability`], [`add_constraint_boundary`]
+#' @details
+#' Constraints can be added to scenario objects to increase or decrease the suitability of a given area for the
+#' target feature. This function acts as a wrapper to add these constraints.
+#' Currently supported are the following options:
+#' **Dispersal**:
+#' * \code{sdd_fixed} - Applies a fixed uniform dispersal distance per modelling timestep.
+#' * \code{sdd_nexpkernel} - Applies a dispersal distance using a negative exponential kernel from its origin.
+#' * \code{kissmig} - Applies the kissmig stochastic dispersal model. Requires the [`kissmig`] package. Applied at each modelling time step.
+#' * \code{migclim} - Applies the dispersal algorithm MigClim to the modelled objects. Requires the [`MigClim`] package.
+#'
+#' A comprehensive overview of the benefits of including dispersal constraints in species distribution models
+#' can be found in Bateman et al. (2013).
+#'
+#' **Connectivity**:
+#' * \code{hardbarrier} - Defines a hard barrier to any dispersal events. By definition this sets all values larger
+#' than \code{0} in the barrier layer to \code{0} in the projection. Barrier has to be provided through the \code{"resistance"}
+#' parameter.
+#' * \code{resistance} - Allows the provision of a static or dynamic layer that is multiplied with the projection at each
+#' time step. Can for example be used to reduce the suitability of any given area (using pressures not included in the model).
+#' The respective layer(s) have to be provided through the \code{"resistance"} parameter. Provided layers are incorporated as
+#' \code{abs(resistance - 1)} and multiplied with the prediction.
+#'
+#' **Adaptability**:
+#' * \code{nichelimit} - Specifies a limit on the environmental niche to only allow a modest amount of extrapolation beyond the known occurrences. This
+#' can be particularly useful to limit the influence of increasing marginal responses and avoid biologically unrealistic projections.
+#'
+#' **Boundary**:
+#' * \code{boundary} - Applies a hard boundary constraint on the projection, thus disallowing an expansion of a range outside
+#' the provided layer. Similar to specifying projection limits (see [`distribution`]), but can be used to specifically
+#' constrain a projection within a certain area (e.g. a species range or an island).
+#'
+#' @returns Adds constraints data to a [`BiodiversityScenario`] object.
+#' @references
+#' * Bateman, B. L., Murphy, H. T., Reside, A. E., Mokany, K., & VanDerWal, J. (2013). Appropriateness of full‐, partial‐and no‐dispersal scenarios in climate change impact modelling. Diversity and Distributions, 19(10), 1224-1234.
+#' * Nobis MP and Normand S (2014) KISSMig - a simple model for R to account for limited migration in analyses of species distributions. Ecography 37: 1282-1287.
+#' * Mendes, P., Velazco, S. J. E., de Andrade, A. F. A., & Júnior, P. D. M. (2020). Dealing with overprediction in species distribution models: How adding distance constraints can improve model accuracy. Ecological Modelling, 431, 109180.
+#' @examples
+#' \dontrun{
+#' # Assumes that a trained 'model' object exists
+#' mod <- scenario(model) |>
+#'  add_predictors(env = predictors, transform = 'scale', derivates = "none") |>
+#'  add_constraint_dispersal(method = "kissmig", value = 2, pext = 0.1) |>
+#'  project()
+#' }
+#' @name add_constraint
+#' @family constraint
+#' @aliases add_constraint
+#' @keywords scenario
+#' @exportMethod add_constraint
+#' @export
+NULL
+methods::setGeneric("add_constraint",
+                    signature = methods::signature("mod"),
+                    function(mod, method, ...) standardGeneric("add_constraint"))
+
+#' @name add_constraint
+#' @rdname add_constraint
+#' @usage \S4method{add_constraint}{BiodiversityScenario, character}(mod, method)
+methods::setMethod(
+  "add_constraint",
+  methods::signature(mod = "BiodiversityScenario"),
+  function(mod, method, ...) {
+    assertthat::assert_that(
+      inherits(mod, "BiodiversityScenario"),
+      !is.Waiver(mod$get_predictors()),
+      is.character(method)
+    )
+    # Match method
+    method <- match.arg(arg = method,
+                        choices = c("sdd_fixed", "sdd_nexpkernel", "kissmig", "migclim",
+                                    "hardbarrier","resistance", "boundary",
+                                    "nichelimit"), several.ok = FALSE)
+
+    # Now call the respective functions individually
+    o <- switch(method,
+                # Fixed dispersal
+                "sdd_fixed" = add_constraint_dispersal(mod, method = "sdd_fixed", ...),
+                # Short-distance dispersal
+                "sdd_nexpkernel" = add_constraint_dispersal(mod, method = "sdd_nexpkernel", ...),
+                # Add kissmig dispersal
+                "kissmig" = add_constraint_dispersal(mod, method = "kissmig", ...),
+                # Using the migclim package
+                "migclim" = add_constraint_dispersal(mod, method = "migclim", ...),
+                # --- #
+                "hardbarrier" = add_constraint_connectivity(mod, method = "hardbarrier", ...),
+                # --- #
+                "resistance" = add_constraint_connectivity(mod, method = "resistance", ...),
+                # --- #
+                "nichelimit" = add_constraint_adaptability(mod, method = "nichelimit", ...),
+                # --- #
+                "boundary" = add_constraint_boundary(mod, ...)
+    )
+    return(o)
+  }
+)
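For orientation, a minimal usage sketch of the wrapper defined above. This is hedged: `fit` (a trained distribution model) and `future_preds` (a stack of future predictors) are hypothetical stand-ins, and the point is only that `add_constraint()` dispatches on `method` to the specific functions.

```r
# Sketch only: `fit` and `future_preds` are assumed to exist.
sc <- scenario(fit) |>
  add_predictors(future_preds)

# These two calls are equivalent; the wrapper merely dispatches on `method`:
c1 <- sc |> add_constraint(method = "sdd_fixed", value = 5000)
c2 <- sc |> add_constraint_dispersal(method = "sdd_fixed", value = 5000)
```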
+
+# ------------------------ #
+#### Dispersal constraints ####
+
+#' @title Adds a dispersal constraint to a scenario object.
+#' @name add_constraint_dispersal
+#' @aliases add_constraint_dispersal
+#' @inheritParams add_constraint
+#' @family constraint
+#' @keywords scenario
+#' @exportMethod add_constraint_dispersal
+#' @export
+NULL
+methods::setGeneric("add_constraint_dispersal",
+                    signature = methods::signature("mod"),
+                    function(mod, method, value = NULL, type = NULL, ...) standardGeneric("add_constraint_dispersal"))
+
+#' @name add_constraint_dispersal
+#' @rdname add_constraint_dispersal
+#' @usage \S4method{add_constraint_dispersal}{BiodiversityScenario, character, numeric}(mod, method, value)
+methods::setMethod(
+  "add_constraint_dispersal",
+  methods::signature(mod = "BiodiversityScenario"),
+  function(mod, method, value = NULL, type = NULL, ...){
+    assertthat::assert_that(
+      inherits(mod, "BiodiversityScenario"),
+      !is.Waiver(mod$get_predictors()),
+      is.character(method),
+      is.null(value) || is.numeric(value),
+      is.null(type) || is.character(type)
+    )
+    # Match method
+    method <- match.arg(arg = method,
+                        choices = c("sdd_fixed", "sdd_nexpkernel", "kissmig", "migclim"), several.ok = FALSE)
+
+    # Other arguments supplied
+    dots <- list(...)
+    argnames <- names(dots)
+
+    # Check if there is already a dispersal constraint; if yes raise a warning
+    if(!is.Waiver(mod$get_constraints())){
+      # If there are any dispersal constraints in there, raise warning
+      if(any("dispersal" %in% names(mod$get_constraints()))){
+        if(getOption('ibis.setupmessages')) myLog('[Estimation]', 'yellow', 'Overwriting existing dispersal constraint.')
+      }
+    }
+
+    # Add processing method #
+    # --- #
+    cr <- list()
+    if(method == "sdd_fixed"){
+      # Short-distance dispersal (Fixed)
+      assertthat::assert_that(
+        is.numeric(value), msg = "Fixed short distance dispersal needs an annual mean dispersal distance value."
+      )
+      cr[['dispersal']] <- list(method = method,
+                                params = c("mean_dispersal_distance" = value))
+    } else if(method == "sdd_nexpkernel") {
+      # Negative exponential kernel
+      assertthat::assert_that(
+        is.numeric(value), msg = "Short distance negative exponential kernel dispersal needs an annual mean dispersal distance value."
+      )
+      cr[['dispersal']] <- list(method = method,
+                                params = c("mean_dispersal_distance" = value))
+    } else if(method == "kissmig"){
+      # Check parameters to be correct
+      check_package("kissmig")
+      # Gather some default parameters
+      if(is.null(type)) type <- "DIS" else match.arg(type, c("DIS", "FOC", "LOC", "NOC"), several.ok = FALSE)
+      assertthat::assert_that(
+        is.numeric(value),
+        value > 0, msg = "For kissmig the value needs to give the number of iteration steps (or within time migration steps)."
+      )
+      # probability [0,1] a colonized cell becomes uncolonized between iteration steps, i.e., the species gets locally extinct
+      if("pext" %in% argnames) pext <- dots[["pext"]] else pext <- 0.1
+      # probability [0,1] corner cells are considered in the 3x3 cell neighborhood. Following Nobis & Normand 2014, 0.2 is recommended for circular spread
+      if("pcor" %in% argnames) pcor <- dots[["pcor"]] else pcor <- 0.2
+
+      if(getOption('ibis.setupmessages')) myLog('[Estimation]', 'green', 'KISSMIG options: iterations=',value,'| pext=', pext,'| pcor=', pcor)
+
+      cr[['dispersal']] <- list(method = method,
+                                params = c("iteration" = value,
+                                           "type" = type,
+                                           "signed" = FALSE,
+                                           "pext" = pext,
+                                           "pcor" = pcor
+                                ))
+
+    }
+    if(method == "migclim"){
+      # Using the MigClim package for calculating any transitions.
+      # This requires previously calculated thresholds!
+      out <- add_constraint_MigClim(mod = mod, ...)
+      return(out)
+    } else {
+      # --- #
+      new <- mod$set_constraints(cr)
+      return(
+        bdproto(NULL, new)
+      )
+    }
+
+  }
+)
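The kissmig branch above pulls `pext` and `pcor` out of `...` and otherwise falls back to the defaults from Nobis & Normand (2014). A hedged sketch, reusing the hypothetical scenario object `sc` from before:

```r
# Sketch: all kissmig settings in one call; omitted arguments fall back to
# pext = 0.1 and pcor = 0.2 as in the branch above.
sc <- sc |>
  add_constraint_dispersal(method = "kissmig",
                           value = 2,    # iteration steps per projection step
                           type  = "DIS",
                           pext  = 0.1,  # probability of local extinction
                           pcor  = 0.2)  # corner-cell probability (circular spread)
```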
+
+#' Short-distance fixed dispersal function
+#' @param baseline_threshold The [`RasterLayer`] with presence/absence information from a previous year.
+#' @param new_suit A new [`RasterLayer`] object.
+#' @param value A [`numeric`] value of the fixed dispersal threshold. In unit \code{'meters'}.
+#' @param resistance A resistance [`RasterLayer`] object with values to be omitted during distance calculation (Default: \code{NULL}).
+#' @noRd
+#' @keywords internal
+.sdd_fixed <- function(baseline_threshold, new_suit, value, resistance = NULL){
+  assertthat::assert_that(
+    is.Raster(baseline_threshold), is.Raster(new_suit),
+    raster::compareRaster(baseline_threshold, new_suit),
+    is.numeric(value),
+    is.null(resistance) || is.Raster(resistance),
+    # Check that baseline threshold raster is binomial
+    length(unique(baseline_threshold))==2
+  )
+
+  # If a resistance layer is supplied, mark resistant cells so they are omitted from the distance calculation
+  if(is.Raster(resistance)){
+    baseline_threshold[resistance == 1] <- 2
+    # Set resistance to the value omitted
+    resistance <- 2
+  }
+  # Grow baseline raster by the amount of value at max
+  # Furthermore divide by value to get a normalized distance
+  dis <- raster::gridDistance(baseline_threshold, origin = 1, omit = resistance)
+  ras_dis <- raster::clamp(dis, lower = 0, upper = value) / value
+  # Invert
+  ras_dis <- abs(ras_dis - 1)
+
+  # Now multiply the net suitability projection with this mask
+  # Thus removing any grid cells outside
+  out <- new_suit * ras_dis
+  return(out)
+}
+
+#' Short-distance negative exponential kernel dispersal function
+#' @param baseline_threshold The [`RasterLayer`] with presence/absence information from a previous year
+#' @param new_suit A new [`RasterLayer`] object.
+#' @param value A [`numeric`] value of the mean dispersal distance. In unit \code{'meters'}.
+#' @param normalize Should a normalising constant be used for the exponential dispersal parameter (Default: \code{FALSE}).
+#' @param resistance A resistance [`RasterLayer`] object with values to be omitted during distance calculation (Default: \code{NULL}).
+#' @noRd
+#' @keywords internal
+.sdd_nexpkernel <- function(baseline_threshold, new_suit, value, normalize = FALSE, resistance = NULL){
+  assertthat::assert_that(
+    is.Raster(baseline_threshold), is.Raster(new_suit),
+    raster::compareRaster(baseline_threshold, new_suit),
+    is.numeric(value),
+    is.null(resistance) || is.Raster(resistance),
+    # Check that baseline threshold raster is binomial
+    length(unique(baseline_threshold))==2
+  )
+
+  # If a resistance layer is supplied, mark resistant cells so they are omitted from the distance calculation
+  if(is.Raster(resistance)){
+    baseline_threshold[resistance == 1] <- 2
+    # Set resistance to the value omitted
+    resistance <- 2
+  }
+
+  # Inverse of mean dispersal distance
+  alpha <- 1/value
+
+  # Grow baseline raster by using an exponentially weighted kernel
+  ras_dis <- raster::gridDistance(baseline_threshold, origin = 1, omit = resistance)
+  if(normalize){
+    # Normalized (with a constant) negative exponential kernel
+    ras_dis <- raster::calc(ras_dis, fun = function(x) (1 / (2 * pi * value ^ 2)) * exp(-x / value) )
+  } else {
+    ras_dis <- raster::calc(ras_dis, fun = function(x) exp(-alpha * x))
+  }
+
+  # Now multiply the net suitability projection with this mask
+  # Thus removing any non-suitable grid cells (0) and changing the value of those within reach
+  out <- new_suit * ras_dis
+  return(out)
+}
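To make the unnormalized kernel in `.sdd_nexpkernel()` concrete: with mean dispersal distance `value`, a cell at distance `d` from the occupied range retains a fraction `exp(-d / value)` of its suitability. A quick numeric check (arithmetic only, no package objects needed):

```r
# Unnormalized negative exponential kernel weights, as used above.
value <- 1000  # mean dispersal distance in meters
d <- c(0, 500, 1000, 2000, 5000)
round(exp(-(1 / value) * d), 3)
#> [1] 1.000 0.607 0.368 0.135 0.007
```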
+
+#' Keep it simple migration calculation.
+#' @param baseline_threshold The [`RasterLayer`] with presence/absence information from a previous year.
+#' @param new_suit A new [`RasterLayer`] object.
+#' @param params A [vector] or [list] with passed on parameter values.
+#' @param resistance A resistance [`RasterLayer`] object with values to be omitted during distance calculation (Default: \code{NULL}).
+#' @noRd
+#' @keywords internal
+.kissmig_dispersal <- function(baseline_threshold, new_suit, params, resistance = NULL){
+  assertthat::assert_that(
+    is.Raster(baseline_threshold), is.Raster(new_suit),
+    raster::compareRaster(baseline_threshold, new_suit),
+    is.vector(params) || is.list(params),
+    is.null(resistance) || is.logical(resistance) || is.Raster(resistance),
+    # Check that baseline threshold raster is binomial
+    length(unique(baseline_threshold))==2
+  )
+
+  check_package('kissmig')
+  if(!isNamespaceLoaded("kissmig")) { attachNamespace("kissmig");requireNamespace("kissmig") }
+
+  # Set suitability layer to 0 if resistance layer is set
+  if(is.Raster(resistance)){
+    new_suit[resistance>0] <- 0
+  }
+
+  # Simulate kissmig for a given threshold and suitability raster
+  km <- kissmig::kissmig(O = baseline_threshold,
+                         # Rescale newsuit to 0-1
+                         S = predictor_transform(new_suit, 'norm'),
+                         it = as.numeric( params['iteration'] ),
+                         type = params['type'],
+                         pext = as.numeric(params['pext']),
+                         pcor = as.numeric(params['pcor'])
+  )
+  if(is.factor(km)) km <- raster::deratify(km, complete = TRUE)
+
+  # Now multiply the net suitability projection with this mask
+  # Thus removing any non-suitable grid cells (0) and changing the value of those within reach
+  ns <- new_suit * km
+
+  return(
+    raster::stack(km, ns)
+  )
+}
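Note the helper's return shape: a two-layer stack with the kissmig occupancy surface first and the masked suitability second. A hedged sketch of unpacking it (`thr` and `suit` are hypothetical rasters; the function is internal, hence the `:::` here is for illustration only):

```r
# Illustration only: .kissmig_dispersal() is internal (@noRd).
res <- ibis.iSDM:::.kissmig_dispersal(baseline_threshold = thr, new_suit = suit,
                                      params = c(iteration = 2, type = "DIS",
                                                 pext = 0.1, pcor = 0.2))
km <- res[[1]]  # kissmig occupancy raster
ns <- res[[2]]  # suitability multiplied by occupancy
```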
+
+# ------------------------ #
+#### Connectivity constraints ####
+
+#' @title Adds a connectivity constraint to a scenario object.
+#' @name add_constraint_connectivity
+#' @aliases add_constraint_connectivity
+#' @inheritParams add_constraint
+#' @param resistance A [`RasterLayer`] object describing a resistance surface or barrier for use in
+#' connectivity constraints (Default: \code{NULL}).
+#' @family constraint
+#' @keywords scenario
+#' @exportMethod add_constraint_connectivity
+#' @export
+NULL
+methods::setGeneric("add_constraint_connectivity",
+                    signature = methods::signature("mod"),
+                    function(mod, method, value = NULL, resistance = NULL, ...) standardGeneric("add_constraint_connectivity"))
+
+#' @name add_constraint_connectivity
+#' @rdname add_constraint_connectivity
+#' @usage \S4method{add_constraint_connectivity}{BiodiversityScenario, character, numeric, ANY}(mod, method, value, resistance)
+methods::setMethod(
+  "add_constraint_connectivity",
+  methods::signature(mod = "BiodiversityScenario"),
+  function(mod, method, value = NULL, resistance = NULL, ...){
+    assertthat::assert_that(
+      inherits(mod, "BiodiversityScenario"),
+      !is.Waiver(mod$get_predictors()),
+      is.character(method),
+      is.null(value) || is.numeric(value),
+      is.Raster(resistance) || is.null(resistance)
+    )
+    # Match method
+    method <- match.arg(arg = method,
+                        choices = c("hardbarrier", "resistance"), several.ok = FALSE)
+
+    # Check if there is already a connectivity constraint; if yes raise a warning
+    if(!is.Waiver(mod$get_constraints())){
+      # If there are any connectivity constraints in there, raise warning
+      if(any( "connectivity" %in% names(mod$get_constraints()) )){
+        if(getOption('ibis.setupmessages')) myLog('[Setup]','yellow','Overwriting existing connectivity constraint')
+      }
+    }
+
+    # Add processing method #
+    # --- #
+    co <- list()
+    if(method == "hardbarrier"){
+      # Assert hard barrier
+      assertthat::assert_that(
+        is.Raster(resistance),
+        !is.null(resistance), msg = "Set a hard barrier via the resistance parameter."
+      )
+      # Check that resistance layer is a binary mask
+      assertthat::assert_that(length(unique(resistance))<=2,
+                              raster::cellStats(resistance,'max')>0,
+                              msg = "Resistance layer should be a binary mask with values 0/1.")
+      co[['connectivity']] <- list(method = method,
+                                   params = c("resistance" = resistance))
+    } else if(method == "resistance"){
+      # Flexible resistance layer
+      assertthat::assert_that(
+        is.Raster(resistance),
+        !is.null(resistance), msg = "The method resistance requires a specified resistance raster."
+      )
+      # If the raster is a stack with multiple layers, ensure that the time dimension matches the scenario period
+      if(raster::nlayers(resistance)>1){
+        # Check that layers have a z dimension and fall within the timeperiod
+        startend <- mod$get_timeperiod()
+        assertthat::assert_that( !is.null( raster::getZ(resistance) ),
+                                 all( range(raster::getZ(resistance))==startend ),
+                                 msg = "If a stack of layers is supplied as resistance, it needs a Z value of equal length to the predictors!")
+      }
+      times <- raster::getZ(resistance)
+      # If resistance layer values are larger than 1, normalize
+      if(any(raster::cellStats(resistance,'max')>1)){
+        if(getOption('ibis.setupmessages')) myLog('[Setup]','yellow','Resistance values larger than 1. Normalizing...')
+        resistance <- predictor_transform(resistance, option = "norm")
+      }
+      resistance <- abs( resistance - 1 ) # Invert
+      if(!is.null(times)) resistance <- raster::setZ(resistance, times) # Reset times again if found
+
+      co[['connectivity']] <- list(method = method,
+                                   params = c("resistance" = resistance))
+    }
+    # --- #
+    new <- mod$set_constraints(co)
+    return(
+      bdproto(NULL, new)
+    )
+  }
+)
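The `resistance` convention above is worth spelling out: layers are normalised to [0, 1] if needed, inverted via `abs(resistance - 1)`, and multiplied with each projection step, so high resistance suppresses suitability. A quick arithmetic check:

```r
# Multipliers implied by the inversion used above: resistance 0.8 keeps 20%.
resistance_values <- c(0, 0.25, 0.8, 1)
abs(resistance_values - 1)
#> [1] 1.00 0.75 0.20 0.00
```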
standardGeneric("add_constraint_adaptability")) + +#' @name add_constraint_adaptability +#' @rdname add_constraint_adaptability +#' @usage \S4method{add_constraint_adaptability}{BiodiversityScenario, character, character, numeric, numeric}(mod, method, names, value, increment) +methods::setMethod( + "add_constraint_adaptability", + methods::signature(mod = "BiodiversityScenario"), + function(mod, method = "nichelimit", names = NULL, value = 1, increment = 0, ...){ + assertthat::assert_that( + inherits(mod, "BiodiversityScenario"), + !is.Waiver(mod$get_predictors()), + is.character(method), + is.null(names) || is.character(names), + is.null(value) || is.numeric(value), + is.numeric(increment) + ) + # Match method + method <- match.arg(arg = method, + choices = c("nichelimit"), several.ok = FALSE) + + # Add processing method # + # --- # + co <- list() + if(method == "nichelimit"){ + # Add a constrain on parameter space, e.g. max 1 SD from training data covariates + assertthat::assert_that( + is.numeric(value), + is.null(names) || is.character(names), + value > 0, msg = "Specify a value threshold (SD) and names of predictors, for which + we do not expect the species to persist." + ) + if(is.null(names)) names <- character() + co[['adaptability']] <- list(method = method, + params = c("names" = names, "value" = value, + "increment" = increment)) + } + # --- # + new <- mod$set_constraints(co) + return( + bdproto(NULL, new) + ) + } +) + +#' Adaptability constrain by applying a limit on extrapolation beyond the niche +#' +#' @param newdata A [`data.frame`] with the information about new data layers. +#' @param model A [`list`] created by the modelling object containing the full predictors and biodiversity predictors. +#' @param names A [`character`] or \code{NULL} of the names of predictors. +#' @param value A [`numeric`] value in units of standard deviation (Default: \code{1}). +#' @param increment A [`numeric`] constant that is added to value at every time step (Default: \code{0}). +#' Allows incremental widening of the niche space, thus opening constraints. +#' @param increment_step A [`numeric`] indicating the number of time increment should be applied. 
+
+#' Adaptability constraint by applying a limit on extrapolation beyond the niche
+#'
+#' @param newdata A [`data.frame`] with the information about new data layers.
+#' @param model A [`list`] created by the modelling object containing the full predictors and biodiversity predictors.
+#' @param names A [`character`] or \code{NULL} of the names of predictors.
+#' @param value A [`numeric`] value in units of standard deviation (Default: \code{1}).
+#' @param increment A [`numeric`] constant that is added to value at every time step (Default: \code{0}).
+#' Allows incremental widening of the niche space, thus opening constraints.
+#' @param increment_step A [`numeric`] indicating the number of times the increment should be applied.
+#' @keywords internal
+#' @noRd
+.nichelimit <- function(newdata, model, names = NULL, value = 1, increment = 0, increment_step = 1){
+  assertthat::assert_that(
+    is.data.frame(newdata),
+    is.list(model),
+    is.numeric(as.numeric(value)),
+    is.null(names) || is.na(names) || is.character(names),
+    is.numeric(as.numeric(increment)),
+    is.numeric(as.numeric(increment_step))
+  )
+  # Check that names are present if set
+  if(is.null(names) || is.na(names)) names <- model$predictors_names
+  if(is.character(names) ) assertthat::assert_that(all(names %in% model$predictors_names))
+  # Convert numeric parameters to numeric to be sure
+  value <- as.numeric(value)
+  increment <- as.numeric(increment)
+  increment_step <- as.numeric(increment_step)
+  # --- #
+  # Now calculate the range across each target predictor and occurrence dataset
+  df <- data.frame()
+  for(id in names(model$biodiversity)){
+    sub <- model$biodiversity[[id]]
+    # Which are presence data
+    is_presence <- which(sub$observations[['observed']] > 0)
+    df <- rbind(df,
+                sub$predictors[is_presence, names])
+  }
+  rr <- sapply(df, function(x) range(x, na.rm = TRUE)) # Calculate ranges
+  rsd <- sapply(df, function(x) sd(x, na.rm = TRUE)) # Calculate standard deviation
+
+  # Apply value and increment if set
+  rsd <- rsd * (value + (increment*increment_step))
+  rr[1,] <- rr[1,] - rsd; rr[2,] <- rr[2,] + rsd
+
+  # Now 'clamp' all predictor values beyond these ranges to NA, e.g. partial out
+  nd <- newdata
+  for(n in names){
+    # Calc min
+    min_ex <- which(nd[,n] < rr[1,n])
+    max_ex <- which(nd[,n] > rr[2,n])
+    if(length(min_ex)>0) nd[min_ex,n] <- NA
+    if(length(max_ex)>0) nd[max_ex,n] <- NA
+    # FIXME Or rather do a smooth logistic decay for less extreme?
+  }
+  return(nd)
+}
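The clamping rule in `.nichelimit()` reduces to: any new predictor value outside `range ± value * SD` becomes `NA`. A toy illustration with made-up numbers:

```r
# Toy numbers only: observed range [2, 10] at presences, SD 2, value = 1.
x  <- c(-2, 5, 9, 20)   # new predictor values
lo <- 2 - 1 * 2         # lower bound: 0
hi <- 10 + 1 * 2        # upper bound: 12
x[x < lo | x > hi] <- NA
x
#> [1] NA  5  9 NA
```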
standardGeneric("add_constraint_boundary")) + +#' @name add_constraint_boundary +#' @rdname add_constraint_boundary +#' @usage \S4method{add_constraint_boundary}{BiodiversityScenario, sf, character}(mod, layer, method) +methods::setMethod( + "add_constraint_boundary", + methods::signature(mod = "BiodiversityScenario", layer = "sf"), + function(mod, layer, method = "boundary", ...){ + assertthat::assert_that( + inherits(mod, "BiodiversityScenario"), + inherits(layer, "sf"), + is.character(method) + ) + + # Rasterize the layer + # First try and dig out a layer from a predictor dataset if found + if(inherits( mod$get_predictors(), "PredictorDataSet")){ + ras <- mod$get_predictors()$get_data() |> stars_to_raster() + ras <- ras[[1]] + } else { + # Try and get the underlying model and its predictors + ras <- mod$get_model()$get_data() + } + assertthat::assert_that(is.Raster(ras)) + bb <- try({ raster::rasterize(layer, ras, 1)},silent = TRUE) + if(inherits(bb, "try-error")) stop("Provide a rasterized layer of the boundary constraint!") + + # Call again + o <- add_constraint_boundary(mod, layer = bb, method = method, ...) + + return( o ) + } +) + +#' @name add_constraint_boundary +#' @rdname add_constraint_boundary +#' @usage \S4method{add_constraint_boundary}{BiodiversityScenario, ANY, character}(mod, layer, method) +methods::setMethod( + "add_constraint_boundary", + methods::signature(mod = "BiodiversityScenario", layer = "ANY"), + function(mod, layer, method = "boundary", ...){ + assertthat::assert_that( + inherits(mod, "BiodiversityScenario"), + is.Raster(layer), + is.character(method) + ) + + # Check that layer is a single RasterLayer + if(!inherits(layer, "RasterLayer")){ + assertthat::assert_that(raster::nlayers(layer) == 1) + layer <- layer[[1]] + } + + # Add processing method # + # --- # + co <- list() + if(method == "boundary"){ + # Add a constrain on parameter space, e.g. max 1 SD from training data covariates + assertthat::assert_that( + length( unique( layer )) <=2 + ) + # If length of values is greater than 1, remove everything else by setting it to NA + if( length( unique( layer )) >1 ){ + layer[layer<1] <- NA + } + co[['boundary']] <- list(method = method, + params = c("layer" = layer)) + } + # --- # + new <- mod$set_constraints(co) + return( + bdproto(NULL, new) + ) + } +) diff --git a/R/add_constraint_MigClim.R b/R/add_constraint_MigClim.R index e8dc9eff..920f06b0 100644 --- a/R/add_constraint_MigClim.R +++ b/R/add_constraint_MigClim.R @@ -29,8 +29,10 @@ NULL #' @details The barrier parameter is defined through [add_barrier]. #' @seealso [`MigClim.userGuide()`] #' @references -#' * Engler R., Hordijk W. and Guisan A. The MIGCLIM R package – seamless integration of dispersal constraints into projections of species distribution models. Ecography, -#' * Robin Engler, Wim Hordijk and Loic Pellissier (2013). MigClim: Implementing dispersal into species distribution models. R package version 1.6. +#' * Engler R., Hordijk W. and Guisan A. The MIGCLIM R package – seamless integration of +#' dispersal constraints into projections of species distribution models. Ecography, +#' * Robin Engler, Wim Hordijk and Loic Pellissier (2013). MigClim: Implementing dispersal +#' into species distribution models. R package version 1.6. #' @returns Adds a MigClim onstrain to a [`BiodiversityScenario`] object. 
diff --git a/R/add_constraint_MigClim.R b/R/add_constraint_MigClim.R
index e8dc9eff..920f06b0 100644
--- a/R/add_constraint_MigClim.R
+++ b/R/add_constraint_MigClim.R
@@ -29,8 +29,10 @@ NULL
 #' @details The barrier parameter is defined through [add_barrier].
 #' @seealso [`MigClim.userGuide()`]
 #' @references
-#' * Engler R., Hordijk W. and Guisan A. The MIGCLIM R package – seamless integration of dispersal constraints into projections of species distribution models. Ecography,
-#' * Robin Engler, Wim Hordijk and Loic Pellissier (2013). MigClim: Implementing dispersal into species distribution models. R package version 1.6.
+#' * Engler R., Hordijk W. and Guisan A. The MIGCLIM R package – seamless integration of
+#' dispersal constraints into projections of species distribution models. Ecography,
+#' * Robin Engler, Wim Hordijk and Loic Pellissier (2013). MigClim: Implementing dispersal
+#' into species distribution models. R package version 1.6.
 #' @returns Adds a MigClim constraint to a [`BiodiversityScenario`] object.
 #' @examples
 #' \dontrun{
diff --git a/R/add_control_bias.R b/R/add_control_bias.R
index 369afc1a..7c96bc5d 100644
--- a/R/add_control_bias.R
+++ b/R/add_control_bias.R
@@ -37,8 +37,8 @@
 #' @keywords bias, offset
 #' @examples
 #' \dontrun{
-#' x <- distribution(background) %>%
-#'  add_predictors(covariates) %>%
+#' x <- distribution(background) |>
+#'  add_predictors(covariates) |>
 #'  add_control_bias(biasvariable, bias_value = NULL)
 #' }
 #' @name add_control_bias
diff --git a/R/add_latent.R b/R/add_latent.R
index 93160b59..bbf18fb8 100644
--- a/R/add_latent.R
+++ b/R/add_latent.R
@@ -47,7 +47,7 @@ NULL
 #' @keywords latent
 #' @examples
 #' \dontrun{
-#' distribution(background) %>% add_latent_spatial(method = "poly")
+#' distribution(background) |> add_latent_spatial(method = "poly")
 #' }
 #' @name add_latent_spatial
 NULL
diff --git a/R/add_log.R b/R/add_log.R
index d26b793f..e484fe3e 100644
--- a/R/add_log.R
+++ b/R/add_log.R
@@ -13,7 +13,7 @@ NULL
 #' @returns Adds a log file to a [`distribution`] object.
 #' @examples
 #' \dontrun{
-#' x <- distribution(background) %>%
+#' x <- distribution(background) |>
 #'  add_log()
 #' x
 #' }
@@ -40,7 +40,7 @@ methods::setMethod(
     is.character(filename),
     assertthat::has_extension(filename,'txt')
   )
-  # Messager
+  # Messenger
   if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Adding log file...')

   # Check whether a log is already present in the distribution file
diff --git a/R/add_offset.R b/R/add_offset.R
index e664aa5c..0b84a242 100644
--- a/R/add_offset.R
+++ b/R/add_offset.R
@@ -1,631 +1,631 @@
-#' Specify a spatial explicit offset
-#'
-#' @description
-#' Including offsets is another option to integrate spatial prior information
-#' in linear and additive regression models. Offsets shift the intercept of
-#' the regression fit by a certain amount. Although only one offset can be added
-#' to a regression model, it is possible to combine several spatial-explicit estimates into
-#' one offset by calculating the sum of all spatial-explicit layers.
-#'
-#' @details
-#' This function allows to set any specific offset to a regression model. The offset
-#' has to be provided as spatial [`RasterLayer`] object. This function simply adds the layer to
-#' a [`distribution()`] object.
-#' **Note that any transformation of the offset (such as \code{log}) has do be done externally!**
-#'
-#' If the layer is range and requires additional formatting, consider using the
-#' function [`add_offset_range()`] which has additional functionalities such such distance
-#' transformations.
-#'
-#' @note
-#' Since offsets only make sense for linear regressions (and not for instance
-#' regression tree based methods such as [engine_bart]), they do not work for all engines.
-#' Offsets specified for non-supported engines are ignored during the estimation
-#'
-#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object.
-#' @param layer A [`sf`] or [`RasterLayer`] object with the range for the target feature.
-#' @param add [`logical`] specifying whether new offset is to be added. Setting
-#' this parameter to \code{FALSE} replaces the current offsets with the new one (Default: \code{TRUE}).
-#' @param ... Other parameters or arguments (currently not supported)
-#' @references
-#' * Merow, C., Allen, J.M., Aiello-Lammens, M., Silander, J.A., 2016. Improving niche and range estimates with Maxent and point process models by integrating spatially explicit information. Glob. Ecol. Biogeogr. 25, 1022–1036.
https://doi.org/10.1111/geb.12453 -#' @returns Adds an offset to a [`distribution`] object. -#' @family offset -#' @keywords prior, offset -#' @examples -#' \dontrun{ -#' x <- distribution(background) %>% -#' add_predictors(covariates) %>% -#' add_offset(nicheEstimate) -#' } -#' @name add_offset -NULL - -#' @name add_offset -#' @rdname add_offset -#' @exportMethod add_offset -#' @export -methods::setGeneric( - "add_offset", - signature = methods::signature("x", "layer"), - function(x, layer, add = TRUE) standardGeneric("add_offset")) - -#' @name add_offset -#' @rdname add_offset -#' @usage \S4method{add_offset}{BiodiversityDistribution, raster}(x, layer) -methods::setMethod( - "add_offset", - methods::signature(x = "BiodiversityDistribution", layer = "RasterLayer"), - function(x, layer, add = TRUE) { - assertthat::assert_that(inherits(x, "BiodiversityDistribution"), - is.Raster(layer), - is.logical(add) - ) - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Adding spatial explicit offset...') - ori.name <- names(layer) - - # Check for infinite values - assertthat::assert_that( - all( is.finite(cellStats(layer, "range")) ), - msg = "Infinite values found in the layer (maybe log of 0?)." - ) - - # Check that background and range align, otherwise raise error - if(compareRaster(layer, x$background,stopiffalse = FALSE)){ - warning('Supplied layer does not align with background! Aligning them now...') - layer <- alignRasters(layer, x$background, method = 'bilinear', func = mean, cl = FALSE) - names(layer) <- ori.name - } - - # Check whether an offset exists already - if(!is.Waiver(x$offset) && add){ - # Add to current object - of <- x$offset - layer <- raster::resample(layer, of, method = 'bilinear', func = mean, cl = FALSE) - names(layer) <- ori.name # In case the layer name got lost - of <- raster::stack(of) |> raster::addLayer(layer) - x <- x$set_offset(of) - } else { - # Add as a new offset - x <- x$set_offset(layer) - } - return(x) - } -) - -#' Function to remove an offset -#' -#' @description -#' This is just a wrapper function for removing specified offsets from a [`BiodiversityDistribution-class`]) object. -#' -#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object. -#' @param layer A `character` pointing to the specific layer to be removed. If set to \code{NULL}, then -#' all offsets are removed from the object. -#' @returns Removes an offset from a [`distribution`] object. 
-#' @examples -#' \dontrun{ -#' rm_offset(model) -> model -#' } -#' @family offset -#' @keywords prior, offset, internal -#' @name rm_offset -NULL - -#' @name rm_offset -#' @rdname rm_offset -#' @exportMethod rm_offset -#' @export -methods::setGeneric( - "rm_offset", - signature = methods::signature("x", "layer"), - function(x, layer = NULL) standardGeneric("rm_offset")) - -#' @name rm_offset -#' @rdname rm_offset -#' @usage \S4method{rm_offset}{BiodiversityDistribution, character}(x, layer) -methods::setMethod( - "rm_offset", - methods::signature(x = "BiodiversityDistribution", layer = "character"), - function(x, layer = NULL) { - assertthat::assert_that(inherits(x, "BiodiversityDistribution"), - is.character(layer) || is.null(layer) - ) - # If no offset can be found, just return proto object - if(is.Waiver(x$offset)){ return(x) } - - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Setup]','yellow','Removing offsets.') - - offs <- x$get_offset() - if(!is.null(layer)){ - assertthat::assert_that(layer %in% offs, - msg = paste0("Specified offset ", layer, "not found in the offset list.")) - } - - # Now remove the offset - x$rm_offset() - } -) - -#### Bias offset ---- -#' Specify a spatial explicit offset as bias -#' -#' @description -#' Including offsets is another option to integrate spatial prior information -#' in linear and additive regression models. Offsets shift the intercept of -#' the regression fit by a certain amount. Although only one offset can be added -#' to a regression model, it is possible to combine several spatial-explicit estimates into -#' one offset by calculating the sum of all spatial-explicit layers. -#' -#' @details -#' This functions emulates the use of the [`add_offset()`] function, however applies an inverse -#' transformation to remove the provided layer from the overall offset. -#' So if for instance a offset is already specified (such as area), this function -#' removes the provided \code{bias.layer} from it via \code{"offset(log(off.area)-log(bias.layer))"} -#' -#' **Note that any transformation of the offset (such as \code{log}) has do be done externally!** -#' -#' If a generic offset is added, consider using the [`add_offset()`] function. If the layer is a expert-based range and -#' requires additional parametrization, consider using the -#' function [`add_offset_range()`] or the \code{bossMaps} R-package. -#' -#' @inheritParams add_offset -#' @param points An optional [`sf`] object with key points. The location of the points are then used to -#' calculate the probability that a cell has been sampled while accounting for area differences. -#' (Default: \code{NULL}). -#' @references -#' * Merow, C., Allen, J.M., Aiello-Lammens, M., Silander, J.A., 2016. Improving niche and range estimates with Maxent and point process models by integrating spatially explicit information. Glob. Ecol. Biogeogr. 25, 1022–1036. https://doi.org/10.1111/geb.12453 -#' @family offset -#' @keywords prior, offset -#' @returns Adds a bias offset to a [`distribution`] object. 
-#' @examples -#' \dontrun{ -#' x <- distribution(background) %>% -#' add_predictors(covariates) %>% -#' add_offset_bias(samplingBias) -#' } -#' @name add_offset_bias -NULL - -#' @name add_offset_bias -#' @rdname add_offset_bias -#' @exportMethod add_offset_bias -#' @export -methods::setGeneric( - "add_offset_bias", - signature = methods::signature("x", "layer"), - function(x, layer, add = TRUE, points = NULL) standardGeneric("add_offset_bias")) - -#' @name add_offset_bias -#' @rdname add_offset_bias -#' @usage \S4method{add_offset_bias}{BiodiversityDistribution, raster}(x, layer) -methods::setMethod( - "add_offset_bias", - methods::signature(x = "BiodiversityDistribution", layer = "RasterLayer"), - function(x, layer, add = TRUE, points = NULL) { - assertthat::assert_that(inherits(x, "BiodiversityDistribution"), - is.Raster(layer), - is.logical(add), - is.null(points) || inherits(points, 'sf') - ) - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Adding spatial explicit bias offset...') - ori.name <- names(layer) - - # Check that background and range align, otherwise raise error - if(compareRaster(layer, x$background,stopiffalse = FALSE)){ - warning('Supplied layer does not align with background! Aligning them now...') - layer <- alignRasters(layer, x$background, method = 'bilinear', func = mean, cl = FALSE) - names(layer) <- ori.name - } - if(is.null(points)){ - # Since it is a bias offset and removal is equivalent to simple subtraction, multiply with *-1 - layer <- layer * -1 - } else { - ## Count the number of records per cell - tab <- raster::cellFromXY(layer, sf::st_coordinates(points)) - r <- emptyraster(layer) - r[tab] <- layer[tab] - r <- raster::mask(r, background) - - ## Make zeros a very small number otherwise issues with log(0). - r[r[]==0] <- 1e-6 - suppressWarnings({ar <- raster::area(r)}) - - ## Calculate the probability that a cell has been sampled while accounting for area differences in lat/lon - ## Direction sign is negative and if area offset considered, use "+ offset(log(off.area)-log(off.bias))" - off.bias <- (-log(1-exp(-r * ar)) - log(ar)) - names(off.bias) <- "off.bias" - # Add bias as covariate - layer <- off.bias - } - - # Check for infinite values - assertthat::assert_that( - all( is.finite(cellStats(layer, "range")) ), - msg = "Infinite values found in the layer (maybe log of 0?)." - ) - - # Check whether an offset exists already - if(!is.Waiver(x$offset) && add){ - # Add to current object - of <- x$offset - layer <- raster::resample(layer, of, method = 'bilinear', func = mean, cl = FALSE) - names(layer) <- ori.name # In case the layer name got lost - of <- raster::stack(of) |> raster::addLayer(layer) - x <- x$set_offset(of) - } else { - # Add as a new offset - x <- x$set_offset(layer) - } - return(x) - } -) - -#### Add a range as offset ---- -#' Specify a expert-based species range as offset -#' -#' @description -#' This function has additional options compared to the more generic -#' [`add_offset()`], allowing customized options specifically for expert-based ranges as offsets or spatialized -#' polygon information on species occurrences. -#' If even more control is needed, the user is informed of the \pkg{bossMaps} package Merow et al. (2017). The \pkg{bossMaps} package -#' calculates - based on supplied point information - the probability of occurrences being inside vs outside the -#' range map and can thus be used as a method to 'improve' the mapping of a species range. 
-#' -#' @details -#' The output created by this function creates a [`RasterLayer`] to be added to a provided distribution object. Offsets -#' in regression models are likelihood specific as they are added directly to the overall estimate of \code{`y^hat`}. -#' -#' Note that all offsets created by this function are by default log-transformed before export. Background values -#' (e.g. beyond [`distance_max`]) are set to a very small constant (\code{1e-10}). -#' -#' @inheritParams add_offset -#' @param distance_max A [`numeric`] threshold on the maximum distance beyond the range that should be considered -#' to have a high likelihood of containing species occurrences (Default: \code{Inf} [m]). Can be set to \code{NULL} or \code{0} -#' to indicate that no distance should be calculated. -#' @param type A [`character`] denoting the type of model to which this offset is to be added. By default -#' it assumes a \code{'poisson'} distributed model and as a result the output created by this function will be log-transformed. -#' If however a \code{'binomial'} distribution is chosen, than the output will be \code{`logit`} transformed. -#' For integrated models leave at default. -#' @param presence_prop [`numeric`] giving the proportion of all records expected to be inside the range. By -#' default this is set to \code{0.9} indicating that 10% of all records are likely outside the range. -#' @param distance_clip [`logical`] as to whether distance should be clipped after the maximum distance (Default: \code{FALSE}). -#' @param fraction An optional [`RasterLayer`] object that is multiplied with digitized raster layer. -#' Can be used to for example to remove or reduce the expected value (Default: \code{NULL}). -#' @seealso [`bossMaps`] -#' @references -#' * Merow, C., Wilson, A.M., Jetz, W., 2017. Integrating occurrence data and expert maps for improved species range predictions. Glob. Ecol. Biogeogr. 26, 243–258. https://doi.org/10.1111/geb.12539 -#' * Merow, C., Allen, J.M., Aiello-Lammens, M., Silander, J.A., 2016. Improving niche and range estimates with Maxent and point process models by integrating spatially explicit information. Glob. Ecol. Biogeogr. 25, 1022–1036. https://doi.org/10.1111/geb.12453 -#' @returns Adds a range offset to a [`distribution`] object. 
-#' @examples -#' \dontrun{ -#' # Adds the offset to a distribution object -#' distribution(background) |> add_offset_range(species_range) -#' } -#' @keywords prior, offset -#' @family offset -#' @name add_offset_range -NULL - -#' @name add_offset_range -#' @rdname add_offset_range -#' @exportMethod add_offset_range -#' @export -methods::setGeneric( - "add_offset_range", - signature = methods::signature("x", "layer"), - function(x, layer, distance_max = Inf, type = "poisson", presence_prop = 0.9, - distance_clip = FALSE, fraction = NULL, add = TRUE) standardGeneric("add_offset_range")) - -#' Function for when raster is directly supplied (precomputed) -#' @name add_offset_range -#' @rdname add_offset_range -#' @usage \S4method{add_offset_range}{BiodiversityDistribution, raster}(x, layer) -methods::setMethod( - "add_offset_range", - methods::signature(x = "BiodiversityDistribution", layer = "RasterLayer"), - function(x, layer, fraction = NULL, add = TRUE) { - assertthat::assert_that(inherits(x, "BiodiversityDistribution"), - is.Raster(layer), - is.null(fraction) || is.Raster(fraction), - is.logical(add) - ) - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Adding range offset...') - - # Save name - ori.name <- names(layer) - - # Check for infinite values - assertthat::assert_that( - all( is.finite(cellStats(layer, "range")) ), - msg = "Infinite values found in the layer (maybe log of 0?)." - ) - - # Check that background and range align, otherwise raise error - if(compareRaster(layer, x$background,stopiffalse = FALSE)){ - warning('Supplied range does not align with background! Aligning them now...') - layer <- alignRasters(layer, x$background, method = 'bilinear', func = mean, cl = FALSE) - names(layer) <- ori.name # In case the layer name got lost - } - - # Multiply with fraction layer if set - if(!is.null(fraction)){ - # Rescale if necessary and set 0 to a small constant 1e-6 - if(raster::cellStats(fraction, "min") < 0) fraction <- predictor_transform(fraction, option = "norm") - fraction[fraction==0] <- 1e-6 - layer <- layer * fraction - } - - # Check whether an offset exists already - if(!is.Waiver(x$offset) && add){ - # Add to current object - of <- x$offset - layer <- raster::resample(layer, of, method = 'bilinear', func = mean, cl = FALSE) - names(layer) <- ori.name # In case the layer name got lost - of <- raster::stack(of) |> raster::addLayer(layer) - x <- x$set_offset(of) - } else { - # Add as a new offset - x <- x$set_offset(layer) - } - return(x) - } -) - -#' @name add_offset_range -#' @rdname add_offset_range -#' @usage \S4method{add_offset_range}{BiodiversityDistribution, sf}(x, layer) -methods::setMethod( - "add_offset_range", - methods::signature(x = "BiodiversityDistribution", layer = "sf"), - function(x, layer, distance_max = Inf, type = "poisson", presence_prop = 0.9, - distance_clip = FALSE, fraction = NULL, add = TRUE ) { - assertthat::assert_that(inherits(x, "BiodiversityDistribution"), - inherits(layer, 'sf'), - is.null(distance_max) || is.numeric(distance_max) || is.infinite(distance_max), - is.numeric(presence_prop), - is.logical(distance_clip), - is.null(fraction) || is.Raster(fraction), - is.character(type), - is.logical(add) - ) - # Match the type if set - type <- match.arg(type, c("poisson", "binomial"), several.ok = FALSE) - - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Adding range offset...') - - # Reproject if necessary - if(sf::st_crs(layer) != sf::st_crs(x$background)) layer <- 
sf::st_transform(layer, sf::st_crs(x$background)) - - # If distance max is null, set to 0 - if(is.null(distance_max)) distance_max <- 0 - - # Template raster for rasterization background - if(!is.Waiver(x$predictors)){ - temp <- emptyraster(x$predictors$get_data()) - } else { - # Try and guess an sensible background raster - myLog('[Setup]','red', - 'CAREFUL - This might not work without predictors already in the model. - Add offset after predictors') - temp <- raster::raster(raster::extent(x$background), - resolution = diff(sf::st_bbox(x$background)[c(1,3)]) / 100, - crs = sf::st_crs(x$background)) - } - - # Check to make the entries valid - if( any(!sf::st_is_valid(layer)) ){ - layer <- sf::st_make_valid(layer) # Check whether to make them valid - if( any(!sf::st_is_valid(layer)) ){ - # If still has errors, combine - layer <- layer |> sf::st_combine() |> sf::st_as_sf() - } - } - - # If layer has multiple entries join them - if(nrow(layer)>1) layer <- layer |> sf::st_union() |> sf::st_as_sf() - - # Rasterize the range - if( 'fasterize' %in% installed.packages()[,1] ){ - ras_range <- try({ fasterize::fasterize(layer, temp, field = NULL, background = NA) },silent = TRUE) - if(inherits(ras_range,"try-error")){ - myLog('[Setup]','yellow','Fasterize package needs to be re-installed!') - ras_range <- raster::rasterize(layer, temp, field = 1, background = NA) - } - } else { - ras_range <- raster::rasterize(layer, temp, field = 1, background = NA) - } - # Calculate distance if required - if(distance_max > 0){ - # Calculate a distance raster - dis <- raster::gridDistance(ras_range, origin = 1) - # If max distance is specified - if(distance_clip && is.finite(distance_max)){ - dis[dis > distance_max] <- NA # Set values above threshold to a very small constant - } - # Inverse of distance - if(is.infinite(distance_max)) distance_max <- cellStats(dis,"max") - # ---- # - alpha <- 1 / (distance_max / 4 ) # Divide by 4 for a quarter in each direction - # Grow baseline raster by using an exponentially weighted kernel - dis <- raster::calc(dis, fun = function(x) exp(-alpha * x)) - # Set the remaining ones to very small constant - dis[is.na(dis)] <- 1e-10 # Background values - dis <- raster::mask(dis, x$background) - - } else { - dis <- ras_range - dis[is.na(dis)] <- 1e-10 # Background values - dis <- raster::mask(dis, x$background) - } - - # Inside I want all X across the entire area for the PPMs, - # indicating a lambda per area of at least X/A (per unit area) within the range - suppressWarnings( ar <- raster::area(ras_range) ) # Calculate area - pres <- 1 + ( ( raster::cellStats(ar * ras_range, "sum") / raster::cellStats(ar, "sum")) * (presence_prop) ) - abs <- 1 + ( ( raster::cellStats(ar * ras_range, "sum") / raster::cellStats(ar, "sum")) * (1-presence_prop) ) - # Now set all values inside the range to pres and outside to abs - ras_range[ras_range == 1] <- pres - ras_range[is.na(ras_range)] <- abs - # Multiply with distance layer - ras_range <- ras_range * dis - # Normalize the result by dividing by the sum - ras_range <- ras_range / raster::cellStats(ras_range, "sum", na.rm = TRUE) - - # Multiply with fraction layer if set - if(!is.null(fraction)){ - # Rescale if necessary and set 0 to a small constant 1e-6 - if(raster::cellStats(fraction, "min") < 0) fraction <- predictor_transform(fraction, option = "norm") - fraction[fraction==0] <- 1e-6 - ras_range <- ras_range * fraction - } - - # -------------- # - # Log transform - ras_range <- log(ras_range) - # Rescaling does not affect relative differences. 
- ras_range <- raster::scale(ras_range, scale = F) - names(ras_range) <- "range_distance" - - assertthat::assert_that( - is.finite( raster::cellStats(ras_range, "max") ), - msg = "Range offset has infinite values. Check parameters!" - ) - - # Check whether an offset exists already - if(!is.Waiver(x$offset) && add){ - # Add to current object - of <- x$offset - ori.name <- names(ras_range) - ras_range <- raster::resample(ras_range, of, method = 'bilinear', func = mean, cl = FALSE) - names(ras_range) <- ori.name # In case the layer name got lost - of <- raster::stack(of) |> raster::addLayer(ras_range) - x <- x$set_offset(of) - } else { - # Add as a new offset - x <- x$set_offset(ras_range) - } - return(x) - } -) - -#' Specify elevational preferences as offset -#' -#' @description -#' This function implements the elevation preferences offset defined in Ellis‐Soto et al. (2021). -#' The code here was adapted from the Supporting materials script. -#' @details -#' Specifically this functions calculates a continuous decay and decreasing probability of a species to occur -#' from elevation limits. It requires a [`RasterLayer`] with elevation information. -#' A generalized logistic transform (aka Richard's curve) is used to calculate decay from the suitable elevational -#' areas, with the [`rate`] parameter allowing to vary the steepness of decline. -#' -#' Note that all offsets created by this function are by default log-transformed before export. In addition -#' this function also mean-centers the output as recommended by Ellis-Soto et al. -#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object. -#' @param elev A [`RasterLayer`] with the elevation for a given background. -#' @param pref A [`numeric`] vector of length \code{2} giving the lower and upper bound of known elevational preferences. -#' Can be set to \code{Inf} if unknown. -#' @param rate A [`numeric`] for the rate used in the offset (Default: \code{.0089}). This parameter specifies the -#' decay to near zero probability at elevation above and below the expert limits. -#' @param add [`logical`] specifying whether new offset is to be added. Setting -#' this parameter to \code{FALSE} replaces the current offsets with the new one (Default: \code{TRUE}). -#' @references -#' * Ellis‐Soto, D., Merow, C., Amatulli, G., Parra, J.L., Jetz, W., 2021. Continental‐scale 1 km hummingbird diversity derived from fusing point records with lateral and elevational expert information. Ecography (Cop.). 44, 640–652. https://doi.org/10.1111/ecog.05119 -#' * Merow, C., Allen, J.M., Aiello-Lammens, M., Silander, J.A., 2016. Improving niche and range estimates with Maxent and point process models by integrating spatially explicit information. Glob. Ecol. Biogeogr. 25, 1022–1036. https://doi.org/10.1111/geb.12453 -#' @returns Adds a elevational offset to a [`distribution`] object. 
-#' @examples -#' \dontrun{ -#' # Adds the offset to a distribution object -#' distribution(background) |> add_offset_elevation(dem, pref = c(400, 1200)) -#' } -#' @keywords prior, offset -#' @family offset -#' @name add_offset_elevation -NULL - -#' @name add_offset_elevation -#' @rdname add_offset_elevation -#' @exportMethod add_offset_elevation -#' @export -methods::setGeneric( - "add_offset_elevation", - signature = methods::signature("x", "elev", "pref"), - function(x, elev, pref, rate = .0089, add = TRUE) standardGeneric("add_offset_elevation")) - - -#' @name add_offset_elevation -#' @rdname add_offset_elevation -#' @usage \S4method{add_offset_elevation}{BiodiversityDistribution, raster, numeric}(x, elev, pref) -methods::setMethod( - "add_offset_elevation", - methods::signature(x = "BiodiversityDistribution", elev = "RasterLayer", pref = "numeric"), - function(x, elev, pref, rate = .0089, add = TRUE) { - assertthat::assert_that(inherits(x, "BiodiversityDistribution"), - is.Raster(elev), - is.numeric(pref), - length(pref)==2, - is.logical(add) - ) - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Adding elevation offset...') - - # Check for infinite values - assertthat::assert_that( - all( is.finite(cellStats(elev, "range")) ), - msg = "Infinite values found in the layer (maybe log of 0?)." - ) - - # Check that background and range align, otherwise raise error - if(compareRaster(elev, x$background,stopiffalse = FALSE)){ - warning('Supplied range does not align with background! Aligning them now...') - elev <- alignRasters(elev, x$background, method = 'bilinear', func = mean, cl = FALSE) - } - # Generalized logistic transform (aka Richard's curve) function from bossMaps. - genLogit <- function(x, lower = 0, upper = 1, rate = 0.04, skew = 0.2, shift = 0){ - upper - ((upper - lower)/((1 + exp(-rate * (x - shift)))^(1/skew))) - } - - # ---- # - # if(getOption("ibis.runparallel")) raster::beginCluster(n = getOption("ibis.nthread")) - # Now calculate the elevation offset by projecting the values onto the elevation layer - # max avail > min expert - tmp.elev1 = -1 * (elev - pref[1]) - tmp.elev1.1 = raster::calc(tmp.elev1, function(x) genLogit(x, 1,100, rate, .2 )) - # min avail < max expert - tmp.elev2 = elev - pref[2] - tmp.elev2.1 = raster::calc(tmp.elev2, function(x) genLogit(x, 1,100, rate, .2)) - # Combine both and calculate the minimum - elev.prior = min( raster::stack(tmp.elev1.1, tmp.elev2.1)) - rm(tmp.elev1,tmp.elev1.1,tmp.elev2,tmp.elev2.1) # clean up - - # Normalize the result by dividing by the sum - elev.prior = elev.prior / raster::cellStats(elev.prior, "sum", na.rm = TRUE) - # Mean center prior - elev.prior = log(elev.prior) - prior.means = raster::cellStats(elev.prior,"mean") - raster::values(elev.prior) = do.call('cbind',lapply(1:length(prior.means), function(x) raster::values(elev.prior[[x]]) + abs(prior.means[x]))) - names(elev.prior)='elev.prior' - - # if(getOption("ibis.runparallel")) raster::endCluster() - # ---- # - - # Check whether an offset exists already - if(!is.Waiver(x$offset) && add){ - # Add to current object - of <- x$offset - elev.prior <- raster::resample(elev.prior, of, method = 'bilinear', func = mean, cl = FALSE) - names(elev.prior) <- 'elev.prior' # In case the layer name got lost - of <- raster::stack(of) |> raster::addLayer(elev.prior) - x <- x$set_offset(of) - } else { - # Add as a new offset - x <- x$set_offset(elev.prior) - } - return(x) - } -) +#' Specify a spatial explicit offset +#' +#' @description +#' Including 
offsets is another option to integrate spatial prior information
+#' in linear and additive regression models. Offsets shift the intercept of
+#' the regression fit by a certain amount. Although only one offset can be added
+#' to a regression model, it is possible to combine several spatial-explicit estimates into
+#' one offset by calculating the sum of all spatial-explicit layers.
+#'
+#' @details
+#' This function allows setting any specific offset for a regression model. The offset
+#' has to be provided as a spatial [`RasterLayer`] object. This function simply adds the layer to
+#' a [`distribution()`] object.
+#' **Note that any transformation of the offset (such as \code{log}) has to be done externally!**
+#'
+#' If the layer is a range and requires additional formatting, consider using the
+#' function [`add_offset_range()`], which has additional functionalities such as distance
+#' transformations.
+#'
+#' @note
+#' Since offsets only make sense for linear regressions (and not for instance
+#' regression tree-based methods such as [engine_bart]), they do not work for all engines.
+#' Offsets specified for unsupported engines are ignored during estimation.
+#'
+#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object.
+#' @param layer A [`sf`] or [`RasterLayer`] object containing the offset for the target feature.
+#' @param add [`logical`] specifying whether a new offset is to be added. Setting
+#' this parameter to \code{FALSE} replaces the current offsets with the new one (Default: \code{TRUE}).
+#' @param ... Other parameters or arguments (currently not supported).
+#' @references
+#' * Merow, C., Allen, J.M., Aiello-Lammens, M., Silander, J.A., 2016. Improving niche and range estimates with Maxent and point process models by integrating spatially explicit information. Glob. Ecol. Biogeogr. 25, 1022–1036. https://doi.org/10.1111/geb.12453
+#' @returns Adds an offset to a [`distribution`] object.
+#' @family offset
+#' @keywords prior, offset
+#' @examples
+#' \dontrun{
+#'  x <- distribution(background) |>
+#'   add_predictors(covariates) |>
+#'   add_offset(nicheEstimate)
+#' }
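+#' # As the description notes, several spatially explicit estimates can be
+#' # combined into a single offset by summing them. A minimal sketch of this,
+#' # assuming `offset1` and `offset2` are hypothetical RasterLayer objects
+#' # already on the scale of the linear predictor (e.g. log-transformed):
+#' \dontrun{
+#'  combined <- sum(raster::stack(offset1, offset2))
+#'  x <- distribution(background) |>
+#'   add_offset(combined)
+#' }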
+#' @name add_offset
+NULL
+
+#' @name add_offset
+#' @rdname add_offset
+#' @exportMethod add_offset
+#' @export
+methods::setGeneric(
+  "add_offset",
+  signature = methods::signature("x", "layer"),
+  function(x, layer, add = TRUE) standardGeneric("add_offset"))
+
+#' @name add_offset
+#' @rdname add_offset
+#' @usage \S4method{add_offset}{BiodiversityDistribution, raster}(x, layer)
+methods::setMethod(
+  "add_offset",
+  methods::signature(x = "BiodiversityDistribution", layer = "RasterLayer"),
+  function(x, layer, add = TRUE) {
+    assertthat::assert_that(inherits(x, "BiodiversityDistribution"),
+                            is.Raster(layer),
+                            is.logical(add)
+    )
+    # Messenger
+    if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Adding spatial explicit offset...')
+    ori.name <- names(layer)
+
+    # Check for infinite values
+    assertthat::assert_that(
+      all( is.finite(raster::cellStats(layer, "range")) ),
+      msg = "Infinite values found in the layer (maybe log of 0?)."
+    )
+
+    # Check that background and layer align, otherwise align them
+    if(!raster::compareRaster(layer, x$background, stopiffalse = FALSE)){
+      warning('Supplied layer does not align with background! Aligning them now...')
+      layer <- alignRasters(layer, x$background, method = 'bilinear', func = mean, cl = FALSE)
+      names(layer) <- ori.name
+    }
+
+    # Check whether an offset exists already
+    if(!is.Waiver(x$offset) && add){
+      # Add to current object
+      of <- x$offset
+      layer <- raster::resample(layer, of, method = 'bilinear', func = mean, cl = FALSE)
+      names(layer) <- ori.name # In case the layer name got lost
+      of <- raster::stack(of) |> raster::addLayer(layer)
+      x <- x$set_offset(of)
+    } else {
+      # Add as a new offset
+      x <- x$set_offset(layer)
+    }
+    return(x)
+  }
+)
+
+#' Function to remove an offset
+#'
+#' @description
+#' This is just a wrapper function for removing specified offsets from a [`BiodiversityDistribution-class`] object.
+#'
+#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object.
+#' @param layer A `character` pointing to the specific layer to be removed. If set to \code{NULL}, then
+#' all offsets are removed from the object.
+#' @returns Removes an offset from a [`distribution`] object.
+#' @examples
+#' \dontrun{
+#'  rm_offset(model) -> model
+#' }
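+#' # A hypothetical sketch of removing only a single named offset; the name
+#' # "range_distance" is the layer name assigned by add_offset_range():
+#' \dontrun{
+#'  model <- rm_offset(model, layer = "range_distance")
+#' }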
+#' @family offset
+#' @keywords prior, offset, internal
+#' @name rm_offset
+NULL
+
+#' @name rm_offset
+#' @rdname rm_offset
+#' @exportMethod rm_offset
+#' @export
+methods::setGeneric(
+  "rm_offset",
+  signature = methods::signature("x", "layer"),
+  function(x, layer = NULL) standardGeneric("rm_offset"))
+
+#' @name rm_offset
+#' @rdname rm_offset
+#' @usage \S4method{rm_offset}{BiodiversityDistribution, character}(x, layer)
+methods::setMethod(
+  "rm_offset",
+  methods::signature(x = "BiodiversityDistribution", layer = "character"),
+  function(x, layer = NULL) {
+    assertthat::assert_that(inherits(x, "BiodiversityDistribution"),
+                            is.character(layer) || is.null(layer)
+    )
+    # If no offset can be found, just return proto object
+    if(is.Waiver(x$offset)){ return(x) }
+
+    # Messenger
+    if(getOption('ibis.setupmessages')) myLog('[Setup]','yellow','Removing offsets.')
+
+    offs <- x$get_offset()
+    if(!is.null(layer)){
+      assertthat::assert_that(layer %in% offs,
+                              msg = paste0("Specified offset ", layer, " not found in the offset list."))
+    }
+
+    # Now remove the offset
+    x$rm_offset()
+  }
+)
+
+#### Bias offset ----
+#' Specify a spatial explicit offset as bias
+#'
+#' @description
+#' Including offsets is another option to integrate spatial prior information
+#' in linear and additive regression models. Offsets shift the intercept of
+#' the regression fit by a certain amount. Although only one offset can be added
+#' to a regression model, it is possible to combine several spatial-explicit estimates into
+#' one offset by calculating the sum of all spatial-explicit layers.
+#'
+#' @details
+#' This function emulates the use of the [`add_offset()`] function, however it applies an inverse
+#' transformation to remove the provided layer from the overall offset.
+#' So if for instance an offset is already specified (such as area), this function
+#' removes the provided \code{bias.layer} from it via \code{"offset(log(off.area)-log(bias.layer))"}.
+#'
+#' **Note that any transformation of the offset (such as \code{log}) has to be done externally!**
+#'
+#' If a generic offset is added, consider using the [`add_offset()`] function. If the layer is an expert-based range and
+#' requires additional parametrization, consider using the
+#' function [`add_offset_range()`] or the \pkg{bossMaps} R-package.
+#'
+#' @inheritParams add_offset
+#' @param points An optional [`sf`] object with key points. The locations of the points are then used to
+#' calculate the probability that a cell has been sampled while accounting for area differences
+#' (Default: \code{NULL}).
+#' @references
+#' * Merow, C., Allen, J.M., Aiello-Lammens, M., Silander, J.A., 2016. Improving niche and range estimates with Maxent and point process models by integrating spatially explicit information. Glob. Ecol. Biogeogr. 25, 1022–1036. https://doi.org/10.1111/geb.12453
+#' @family offset
+#' @keywords prior, offset
+#' @returns Adds a bias offset to a [`distribution`] object.
+#' @examples
+#' \dontrun{
+#'  x <- distribution(background) |>
+#'   add_predictors(covariates) |>
+#'   add_offset_bias(samplingBias)
+#' }
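+#' # A hypothetical sketch of the points-based variant described in the details:
+#' # when an sf object with occurrence points is supplied, the bias offset is
+#' # derived from the per-cell sampling probability rather than from the raw
+#' # layer. `samplingBias` and `occurrence_points` are assumed placeholders.
+#' \dontrun{
+#'  x <- distribution(background) |>
+#'   add_offset_bias(samplingBias, points = occurrence_points)
+#' }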
+#' @name add_offset_bias
+NULL
+
+#' @name add_offset_bias
+#' @rdname add_offset_bias
+#' @exportMethod add_offset_bias
+#' @export
+methods::setGeneric(
+  "add_offset_bias",
+  signature = methods::signature("x", "layer"),
+  function(x, layer, add = TRUE, points = NULL) standardGeneric("add_offset_bias"))
+
+#' @name add_offset_bias
+#' @rdname add_offset_bias
+#' @usage \S4method{add_offset_bias}{BiodiversityDistribution, raster}(x, layer)
+methods::setMethod(
+  "add_offset_bias",
+  methods::signature(x = "BiodiversityDistribution", layer = "RasterLayer"),
+  function(x, layer, add = TRUE, points = NULL) {
+    assertthat::assert_that(inherits(x, "BiodiversityDistribution"),
+                            is.Raster(layer),
+                            is.logical(add),
+                            is.null(points) || inherits(points, 'sf')
+    )
+    # Messenger
+    if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Adding spatial explicit bias offset...')
+    ori.name <- names(layer)
+
+    # Check that background and layer align, otherwise align them
+    if(!raster::compareRaster(layer, x$background, stopiffalse = FALSE)){
+      warning('Supplied layer does not align with background! Aligning them now...')
+      layer <- alignRasters(layer, x$background, method = 'bilinear', func = mean, cl = FALSE)
+      names(layer) <- ori.name
+    }
+    if(is.null(points)){
+      # Since it is a bias offset and removal is equivalent to simple subtraction, multiply with *-1
+      layer <- layer * -1
+    } else {
+      ## Get the cells that contain sampling records
+      tab <- raster::cellFromXY(layer, sf::st_coordinates(points))
+      r <- emptyraster(layer)
+      r[tab] <- layer[tab]
+      r <- raster::mask(r, x$background)
+
+      ## Make zeros a very small number otherwise issues with log(0).
+      r[r[]==0] <- 1e-6
+      suppressWarnings({ar <- raster::area(r)})
+
+      ## Calculate the probability that a cell has been sampled while accounting for area differences in lat/lon
+      ## Direction sign is negative and if area offset considered, use "+ offset(log(off.area)-log(off.bias))"
+      off.bias <- (-log(1-exp(-r * ar)) - log(ar))
+      names(off.bias) <- "off.bias"
+      # Add bias as covariate
+      layer <- off.bias
+    }
+
+    # Check for infinite values
+    assertthat::assert_that(
+      all( is.finite(raster::cellStats(layer, "range")) ),
+      msg = "Infinite values found in the layer (maybe log of 0?)."
+    )
+
+    # Check whether an offset exists already
+    if(!is.Waiver(x$offset) && add){
+      # Add to current object
+      of <- x$offset
+      layer <- raster::resample(layer, of, method = 'bilinear', func = mean, cl = FALSE)
+      names(layer) <- ori.name # In case the layer name got lost
+      of <- raster::stack(of) |> raster::addLayer(layer)
+      x <- x$set_offset(of)
+    } else {
+      # Add as a new offset
+      x <- x$set_offset(layer)
+    }
+    return(x)
+  }
+)
+
+#### Add a range as offset ----
+#' Specify an expert-based species range as offset
+#'
+#' @description
+#' This function has additional options compared to the more generic
+#' [`add_offset()`], allowing customized options specifically for expert-based ranges as offsets or spatialized
+#' polygon information on species occurrences.
+#' If even more control is needed, the user is referred to the \pkg{bossMaps} package of Merow et al. (2017). The \pkg{bossMaps} package
+#' calculates - based on supplied point information - the probability of occurrences being inside vs outside the
+#' range map and can thus be used as a method to 'improve' the mapping of a species range.
+#'
+#' @details
+#' This function creates a [`RasterLayer`] to be added to a provided distribution object. Offsets
+#' in regression models are likelihood specific as they are added directly to the overall estimate of \code{`y^hat`}.
+#'
+#' Note that all offsets created by this function are by default log-transformed before export. Background values
+#' (e.g. beyond [`distance_max`]) are set to a very small constant (\code{1e-10}).
+#'
+#' @inheritParams add_offset
+#' @param distance_max A [`numeric`] threshold on the maximum distance beyond the range that should be considered
+#' to have a high likelihood of containing species occurrences (Default: \code{Inf} [m]). Can be set to \code{NULL} or \code{0}
+#' to indicate that no distance should be calculated.
+#' @param type A [`character`] denoting the type of model to which this offset is to be added. By default
+#' it assumes a \code{'poisson'} distributed model and as a result the output created by this function will be log-transformed.
+#' If however a \code{'binomial'} distribution is chosen, then the output will be \code{`logit`} transformed.
+#' For integrated models leave at default.
+#' @param presence_prop [`numeric`] giving the proportion of all records expected to be inside the range. By
+#' default this is set to \code{0.9}, indicating that 10% of all records are likely outside the range.
+#' @param distance_clip [`logical`] as to whether distance should be clipped after the maximum distance (Default: \code{FALSE}).
+#' @param fraction An optional [`RasterLayer`] object that is multiplied with the digitized raster layer.
+#' Can be used, for example, to remove or reduce the expected value (Default: \code{NULL}).
+#' @seealso [`bossMaps`]
+#' @references
+#' * Merow, C., Wilson, A.M., Jetz, W., 2017. Integrating occurrence data and expert maps for improved species range predictions. Glob. Ecol. Biogeogr. 26, 243–258. https://doi.org/10.1111/geb.12539
+#' * Merow, C., Allen, J.M., Aiello-Lammens, M., Silander, J.A., 2016. Improving niche and range estimates with Maxent and point process models by integrating spatially explicit information. Glob. Ecol. Biogeogr. 25, 1022–1036. https://doi.org/10.1111/geb.12453
+#' @returns Adds a range offset to a [`distribution`] object.
+#' @examples
+#' \dontrun{
+#'  # Adds the offset to a distribution object
+#'  distribution(background) |> add_offset_range(species_range)
+#' }
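+#' # A hypothetical sketch using the distance arguments described above:
+#' # decay up to 50 km beyond the range edge, clipped afterwards, with 90%
+#' # of records assumed inside the range (`species_range` is a placeholder).
+#' \dontrun{
+#'  distribution(background) |>
+#'   add_offset_range(species_range, distance_max = 50000,
+#'                    distance_clip = TRUE, presence_prop = 0.9)
+#' }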
+#' @keywords prior, offset
+#' @family offset
+#' @name add_offset_range
+NULL
+
+#' @name add_offset_range
+#' @rdname add_offset_range
+#' @exportMethod add_offset_range
+#' @export
+methods::setGeneric(
+  "add_offset_range",
+  signature = methods::signature("x", "layer"),
+  function(x, layer, distance_max = Inf, type = "poisson", presence_prop = 0.9,
+           distance_clip = FALSE, fraction = NULL, add = TRUE) standardGeneric("add_offset_range"))
+
+#' Function for when raster is directly supplied (precomputed)
+#' @name add_offset_range
+#' @rdname add_offset_range
+#' @usage \S4method{add_offset_range}{BiodiversityDistribution, raster}(x, layer)
+methods::setMethod(
+  "add_offset_range",
+  methods::signature(x = "BiodiversityDistribution", layer = "RasterLayer"),
+  function(x, layer, fraction = NULL, add = TRUE) {
+    assertthat::assert_that(inherits(x, "BiodiversityDistribution"),
+                            is.Raster(layer),
+                            is.null(fraction) || is.Raster(fraction),
+                            is.logical(add)
+    )
+    # Messenger
+    if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Adding range offset...')
+
+    # Save name
+    ori.name <- names(layer)
+
+    # Check for infinite values
+    assertthat::assert_that(
+      all( is.finite(raster::cellStats(layer, "range")) ),
+      msg = "Infinite values found in the layer (maybe log of 0?)."
+    )
+
+    # Check that background and range align, otherwise align them
+    if(!raster::compareRaster(layer, x$background, stopiffalse = FALSE)){
+      warning('Supplied range does not align with background! Aligning them now...')
+      layer <- alignRasters(layer, x$background, method = 'bilinear', func = mean, cl = FALSE)
+      names(layer) <- ori.name # In case the layer name got lost
+    }
+
+    # Multiply with fraction layer if set
+    if(!is.null(fraction)){
+      # Rescale if necessary and set 0 to a small constant 1e-6
+      if(raster::cellStats(fraction, "min") < 0) fraction <- predictor_transform(fraction, option = "norm")
+      fraction[fraction==0] <- 1e-6
+      layer <- layer * fraction
+    }
+
+    # Check whether an offset exists already
+    if(!is.Waiver(x$offset) && add){
+      # Add to current object
+      of <- x$offset
+      layer <- raster::resample(layer, of, method = 'bilinear', func = mean, cl = FALSE)
+      names(layer) <- ori.name # In case the layer name got lost
+      of <- raster::stack(of) |> raster::addLayer(layer)
+      x <- x$set_offset(of)
+    } else {
+      # Add as a new offset
+      x <- x$set_offset(layer)
+    }
+    return(x)
+  }
+)
+
+#' @name add_offset_range
+#' @rdname add_offset_range
+#' @usage \S4method{add_offset_range}{BiodiversityDistribution, sf}(x, layer)
+methods::setMethod(
+  "add_offset_range",
+  methods::signature(x = "BiodiversityDistribution", layer = "sf"),
+  function(x, layer, distance_max = Inf, type = "poisson", presence_prop = 0.9,
+           distance_clip = FALSE, fraction = NULL, add = TRUE ) {
+    assertthat::assert_that(inherits(x, "BiodiversityDistribution"),
+                            inherits(layer, 'sf'),
+                            is.null(distance_max) || is.numeric(distance_max) || is.infinite(distance_max),
+                            is.numeric(presence_prop),
+                            is.logical(distance_clip),
+                            is.null(fraction) || is.Raster(fraction),
+                            is.character(type),
+                            is.logical(add)
+    )
+    # Match the type if set
+    type <- match.arg(type, c("poisson", "binomial"), several.ok = FALSE)
+
+    # Messenger
+    if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Adding range offset...')
+
+    # Reproject if necessary
+    if(sf::st_crs(layer) != sf::st_crs(x$background)) layer <- sf::st_transform(layer, sf::st_crs(x$background))
+
+    # If distance max is null, set to 0
+    if(is.null(distance_max)) distance_max <- 0
+
+    # Template raster for rasterization background
+    if(!is.Waiver(x$predictors)){
+      temp <- emptyraster(x$predictors$get_data())
+    } else {
+      # Try and guess a sensible background raster
+      myLog('[Setup]','red',
+            'CAREFUL - This might not work without predictors already in the model.
+            Add offset after predictors')
+      temp <- raster::raster(raster::extent(x$background),
+                             resolution = diff(sf::st_bbox(x$background)[c(1,3)]) / 100,
+                             crs = sf::st_crs(x$background))
+    }
+
+    # Check that the entries are valid
+    if( any(!sf::st_is_valid(layer)) ){
+      layer <- sf::st_make_valid(layer) # Try to make them valid
+      if( any(!sf::st_is_valid(layer)) ){
+        # If still has errors, combine
+        layer <- layer |> sf::st_combine() |> sf::st_as_sf()
+      }
+    }
+
+    # If layer has multiple entries join them
+    if(nrow(layer)>1) layer <- layer |> sf::st_union() |> sf::st_as_sf()
+
+    # Rasterize the range
+    if( 'fasterize' %in% utils::installed.packages()[,1] ){
+      ras_range <- try({ fasterize::fasterize(layer, temp, field = NULL, background = NA) }, silent = TRUE)
+      if(inherits(ras_range,"try-error")){
+        myLog('[Setup]','yellow','Fasterize package needs to be re-installed!')
+        ras_range <- raster::rasterize(layer, temp, field = 1, background = NA)
+      }
+    } else {
+      ras_range <- raster::rasterize(layer, temp, field = 1, background = NA)
+    }
+    # Calculate distance if required
+    if(distance_max > 0){
+      # Calculate a distance raster
+      dis <- raster::gridDistance(ras_range, origin = 1)
+      # If max distance is specified
+      if(distance_clip && is.finite(distance_max)){
+        dis[dis > distance_max] <- NA # Set values above the threshold to NA (assigned the background constant below)
+      }
+      # Inverse of distance
+      if(is.infinite(distance_max)) distance_max <- raster::cellStats(dis,"max")
+      # ---- #
+      alpha <- 1 / (distance_max / 4 ) # Divide by 4 for a quarter in each direction
+      # Grow baseline raster by using an exponentially weighted kernel
+      dis <- raster::calc(dis, fun = function(x) exp(-alpha * x))
+      # Set the remaining ones to a very small constant
+      dis[is.na(dis)] <- 1e-10 # Background values
+      dis <- raster::mask(dis, x$background)
+
+    } else {
+      dis <- ras_range
+      dis[is.na(dis)] <- 1e-10 # Background values
+      dis <- raster::mask(dis, x$background)
+    }
+
+    # Within the range we expect X records across the entire area for the PPMs,
+    # indicating a lambda per area of at least X/A (per unit area) within the range
+    suppressWarnings( ar <- raster::area(ras_range) ) # Calculate area
+    pres <- 1 + ( ( raster::cellStats(ar * ras_range, "sum") / raster::cellStats(ar, "sum")) * (presence_prop) )
+    abs <- 1 + ( ( raster::cellStats(ar * ras_range, "sum") / raster::cellStats(ar, "sum")) * (1-presence_prop) )
+    # Now set all values inside the range to pres and outside to abs
+    ras_range[ras_range == 1] <- pres
+    ras_range[is.na(ras_range)] <- abs
+    # Multiply with distance layer
+    ras_range <- ras_range * dis
+    # Normalize the result by dividing by the sum
+    ras_range <- ras_range / raster::cellStats(ras_range, "sum", na.rm = TRUE)
+
+    # Multiply with fraction layer if set
+    if(!is.null(fraction)){
+      # Rescale if necessary and set 0 to a small constant 1e-6
+      if(raster::cellStats(fraction, "min") < 0) fraction <- predictor_transform(fraction, option = "norm")
+      fraction[fraction==0] <- 1e-6
+      ras_range <- ras_range * fraction
+    }
+
+    # -------------- #
+    # Log transform
+    ras_range <- log(ras_range)
+    # Rescaling does not affect relative differences.
+    ras_range <- raster::scale(ras_range, scale = FALSE)
+    names(ras_range) <- "range_distance"
+
+    assertthat::assert_that(
+      is.finite( raster::cellStats(ras_range, "max") ),
+      msg = "Range offset has infinite values. Check parameters!"
+    )
+
+    # Check whether an offset exists already
+    if(!is.Waiver(x$offset) && add){
+      # Add to current object
+      of <- x$offset
+      ori.name <- names(ras_range)
+      ras_range <- raster::resample(ras_range, of, method = 'bilinear', func = mean, cl = FALSE)
+      names(ras_range) <- ori.name # In case the layer name got lost
+      of <- raster::stack(of) |> raster::addLayer(ras_range)
+      x <- x$set_offset(of)
+    } else {
+      # Add as a new offset
+      x <- x$set_offset(ras_range)
+    }
+    return(x)
+  }
+)
+
+#' Specify elevational preferences as offset
+#'
+#' @description
+#' This function implements the elevation preferences offset defined in Ellis‐Soto et al. (2021).
+#' The code here was adapted from the Supporting materials script.
+#' @details
+#' Specifically, this function calculates a continuous decay in the probability of a species
+#' occurring beyond its elevational limits. It requires a [`RasterLayer`] with elevation information.
+#' A generalized logistic transform (aka Richard's curve) is used to calculate the decay from the suitable elevational
+#' areas, with the [`rate`] parameter allowing one to vary the steepness of the decline.
+#'
+#' Note that all offsets created by this function are by default log-transformed before export. In addition
+#' this function also mean-centers the output as recommended by Ellis-Soto et al.
+#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object.
+#' @param elev A [`RasterLayer`] with the elevation for a given background.
+#' @param pref A [`numeric`] vector of length \code{2} giving the lower and upper bound of known elevational preferences.
+#' Can be set to \code{Inf} if unknown.
+#' @param rate A [`numeric`] for the rate used in the offset (Default: \code{.0089}). This parameter specifies the
+#' decay to near-zero probability at elevations above and below the expert limits.
+#' @param add [`logical`] specifying whether a new offset is to be added. Setting
+#' this parameter to \code{FALSE} replaces the current offsets with the new one (Default: \code{TRUE}).
+#' @references
+#' * Ellis‐Soto, D., Merow, C., Amatulli, G., Parra, J.L., Jetz, W., 2021. Continental‐scale 1 km hummingbird diversity derived from fusing point records with lateral and elevational expert information. Ecography (Cop.). 44, 640–652. https://doi.org/10.1111/ecog.05119
+#' * Merow, C., Allen, J.M., Aiello-Lammens, M., Silander, J.A., 2016. Improving niche and range estimates with Maxent and point process models by integrating spatially explicit information. Glob. Ecol. Biogeogr. 25, 1022–1036. https://doi.org/10.1111/geb.12453
+#' @returns Adds an elevational offset to a [`distribution`] object.
+#' @examples
+#' \dontrun{
+#'  # Adds the offset to a distribution object
+#'  distribution(background) |> add_offset_elevation(dem, pref = c(400, 1200))
+#' }
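+#' # A hypothetical sketch for a species with no known upper limit, which the
+#' # documentation of `pref` allows: an infinite upper bound combined with a
+#' # steeper decay below the lower limit via a larger `rate` value.
+#' \dontrun{
+#'  distribution(background) |>
+#'   add_offset_elevation(dem, pref = c(1000, Inf), rate = 0.05)
+#' }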
+#' @keywords prior, offset
+#' @family offset
+#' @name add_offset_elevation
+NULL
+
+#' @name add_offset_elevation
+#' @rdname add_offset_elevation
+#' @exportMethod add_offset_elevation
+#' @export
+methods::setGeneric(
+  "add_offset_elevation",
+  signature = methods::signature("x", "elev", "pref"),
+  function(x, elev, pref, rate = .0089, add = TRUE) standardGeneric("add_offset_elevation"))
+
+
+#' @name add_offset_elevation
+#' @rdname add_offset_elevation
+#' @usage \S4method{add_offset_elevation}{BiodiversityDistribution, raster, numeric}(x, elev, pref)
+methods::setMethod(
+  "add_offset_elevation",
+  methods::signature(x = "BiodiversityDistribution", elev = "RasterLayer", pref = "numeric"),
+  function(x, elev, pref, rate = .0089, add = TRUE) {
+    assertthat::assert_that(inherits(x, "BiodiversityDistribution"),
+                            is.Raster(elev),
+                            is.numeric(pref),
+                            length(pref)==2,
+                            is.logical(add)
+    )
+    # Messenger
+    if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Adding elevation offset...')
+
+    # Check for infinite values
+    assertthat::assert_that(
+      all( is.finite(raster::cellStats(elev, "range")) ),
+      msg = "Infinite values found in the layer (maybe log of 0?)."
+    )
+
+    # Check that background and elevation layer align, otherwise align them
+    if(!raster::compareRaster(elev, x$background, stopiffalse = FALSE)){
+      warning('Supplied elevation layer does not align with background! Aligning them now...')
+      elev <- alignRasters(elev, x$background, method = 'bilinear', func = mean, cl = FALSE)
+    }
+    # Generalized logistic transform (aka Richard's curve) function from bossMaps.
+    genLogit <- function(x, lower = 0, upper = 1, rate = 0.04, skew = 0.2, shift = 0){
+      upper - ((upper - lower)/((1 + exp(-rate * (x - shift)))^(1/skew)))
+    }
+
+    # ---- #
+    # if(getOption("ibis.runparallel")) raster::beginCluster(n = getOption("ibis.nthread"))
+    # Now calculate the elevation offset by projecting the values onto the elevation layer
+    # max avail > min expert
+    tmp.elev1 <- -1 * (elev - pref[1])
+    tmp.elev1.1 <- raster::calc(tmp.elev1, function(x) genLogit(x, 1, 100, rate, .2))
+    # min avail < max expert
+    tmp.elev2 <- elev - pref[2]
+    tmp.elev2.1 <- raster::calc(tmp.elev2, function(x) genLogit(x, 1, 100, rate, .2))
+    # Combine both and calculate the cell-wise minimum
+    elev.prior <- min( raster::stack(tmp.elev1.1, tmp.elev2.1) )
+    rm(tmp.elev1, tmp.elev1.1, tmp.elev2, tmp.elev2.1) # clean up
+
+    # Normalize the result by dividing by the sum
+    elev.prior <- elev.prior / raster::cellStats(elev.prior, "sum", na.rm = TRUE)
+    # Mean center prior
+    elev.prior <- log(elev.prior)
+    prior.means <- raster::cellStats(elev.prior, "mean")
+    raster::values(elev.prior) <- do.call('cbind', lapply(1:length(prior.means), function(x) raster::values(elev.prior[[x]]) + abs(prior.means[x])))
+    names(elev.prior) <- 'elev.prior'
+
+    # if(getOption("ibis.runparallel")) raster::endCluster()
+    # ---- #
+
+    # Check whether an offset exists already
+    if(!is.Waiver(x$offset) && add){
+      # Add to current object
+      of <- x$offset
+      elev.prior <- raster::resample(elev.prior, of, method = 'bilinear', func = mean, cl = FALSE)
+      names(elev.prior) <- 'elev.prior' # In case the layer name got lost
+      of <- raster::stack(of) |> raster::addLayer(elev.prior)
+      x <- x$set_offset(of)
+    } else {
+      # Add as a new offset
+      x <- x$set_offset(elev.prior)
+    }
+    return(x)
+  }
+)
diff --git a/R/add_predictors.R b/R/add_predictors.R
index 376ca8d5..485b27f6
100644 --- a/R/add_predictors.R +++ b/R/add_predictors.R @@ -1,1270 +1,1270 @@ -#' @include utils.R bdproto.R bdproto-biodiversitydistribution.R bdproto-predictors.R bdproto-biodiversityscenario.R -NULL - -#' Add predictors to a Biodiversity distribution object -#' -#' @description -#' This function allows to add predictors to [distribution] or [BiodiversityScenario] -#' objects. Predictors are covariates that in spatial projection have to match -#' the geographic projection of the background layer in the [distribution] object. -#' This function furthermore allows to transform or create derivates of provided -#' predictors. -#' -#' A transformation takes the provided rasters and for instance rescales them or transforms -#' them through a principal component analysis ([prcomp]). In contrast, derivates leave -#' the original provided predictors alone, but instead create new ones, for instance by transforming -#' their values through a quadratic or hinge transformation. Note that this effectively -#' increases the number of predictors in the object, generally requiring stronger regularization by -#' the used [`engine`]. -#' Both transformations and derivates can also be combined. -#' Available options for transformation are: -#' * \code{'none'} - Leaves the provided predictors in the original scale. -#' * \code{'pca'} - Converts the predictors to principal components. Note that this -#' results in a renaming of the variables to principal component axes! -#' * \code{'scale'} - Transforms all predictors by applying [scale] on them. -#' * \code{'norm'} - Normalizes all predictors by transforming them to a scale from 0 to 1. -#' * \code{'windsor'} - Applies a windsorization to the target predictors. By default -#' this effectively cuts the predictors to the 0.05 and 0.95, thus helping to remove -#' extreme outliers. -#' -#' Available options for creating derivates are: -#' * \code{'none'} - No additional predictor derivates are created. -#' * \code{'quad'} - Adds quadratic transformed predictors. -#' * \code{'interaction'} - Add interacting predictors. Interactions need to be specified (\code{"int_variables"})! -#' * \code{'thresh'} - Add threshold transformed predictors. -#' * \code{'hinge'} - Add hinge transformed predictors. -#' * \code{'bin'} - Add predictors binned by their percentiles. -#' -#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object. -#' @param env A [`RasterStack-class`], [`RasterLayer-class`] or [`stars`] object. -#' @param names A [`vector`] of character names describing the environmental stack in case they should be renamed. -#' @param transform A [`vector`] stating whether predictors should be preprocessed in any way (Options: \code{'none'},\code{'pca'}, \code{'scale'}, \code{'norm'}) -#' @param derivates A Boolean check whether derivate features should be considered (Options: \code{'none'}, \code{'thresh'}, \code{'hinge'}, \code{'quad'}) ) -#' @param derivate_knots A single [`numeric`] or [`vector`] giving the number of knots for derivate creation if relevant (Default: \code{4}). -#' @param int_variables A [`vector`] with length greater or equal than \code{2} specifying the covariates (Default: \code{NULL}). 
-#' @param bgmask Check whether the environmental data should be masked with the background layer (Default: \code{TRUE}) -#' @param harmonize_na A [`logical`] value indicating of whether NA values should be harmonized among predictors (Default: \code{FALSE}) -#' @param explode_factors [`logical`] of whether any factor variables should be split up into binary variables (one per class). (Default: \code{FALSE}). -#' @param priors A [`PriorList-class`] object. Default is set to \code{NULL} which uses default prior assumptions. -#' @param ... Other parameters passed down -#' @note -#' **Important:** -#' Not every [`engine`] supported by the \pkg{ibis.iSDM} R-package allows missing data points -#' among extracted covariates. Thus any observation with missing data is generally removed prior -#' from model fitting. Thus ensure that covariates have appropriate no-data settings (for instance setting \code{NA} -#' values to \code{0} or another out of range constant). -#' -#' Not every engine does actually need covariates. For instance it is perfectly legit -#' to fit a model with only occurrence data and a spatial latent effect ([add_latent]). -#' This correspondents to a spatial kernel density estimate. -#' -#' Certain names such \code{"offset"} are forbidden as predictor variable names. The function -#' will return an error message if these are used. -#' @aliases add_predictors -#' @examples -#' \dontrun{ -#' obj <- distribution(background) %>% -#' add_predictors(covariates, transform = 'scale') -#' obj -#' } -#' @name add_predictors -NULL - -#' @name add_predictors -#' @rdname add_predictors -#' @exportMethod add_predictors -#' @export -methods::setGeneric( - "add_predictors", - signature = methods::signature("x", "env"), - function(x, env, names = NULL, transform = 'scale', derivates = 'none', derivate_knots = 4, int_variables = NULL, bgmask = TRUE, - harmonize_na = FALSE, explode_factors = FALSE, priors = NULL, ...) standardGeneric("add_predictors")) - -#' @name add_predictors -#' @rdname add_predictors -#' @usage \S4method{add_predictors}{BiodiversityDistribution,RasterBrick}(x, env) -methods::setMethod( - "add_predictors", - methods::signature(x = "BiodiversityDistribution", env = "RasterBrick"), - function(x, env, names = NULL, transform = 'scale', derivates = 'none', derivate_knots = 4, int_variables = NULL, - bgmask = TRUE, harmonize_na = FALSE, explode_factors = FALSE, priors = NULL, ... ) { - assertthat::assert_that(inherits(x, "BiodiversityDistribution"), - !missing(env)) - # Convert env to stack if it is a single layer only - env = raster::stack(env) - add_predictors(x, env, names, transform, derivates, derivate_knots, int_variables, bgmask, harmonize_na, explode_factors, priors, ...) - } -) - -#' @name add_predictors -#' @rdname add_predictors -#' @usage \S4method{add_predictors}{BiodiversityDistribution,RasterLayer}(x, env) -methods::setMethod( - "add_predictors", - methods::signature(x = "BiodiversityDistribution", env = "RasterLayer"), - function(x, env, names = NULL, transform = 'scale', derivates = 'none', derivate_knots = 4, int_variables = NULL, - bgmask = TRUE, harmonize_na = FALSE, explode_factors = FALSE, priors = NULL, ... ) { - assertthat::assert_that(inherits(x, "BiodiversityDistribution"), - !missing(env)) - # Convert env to stack if it is a single layer only - env = raster::stack(env) - add_predictors(x, env, names, transform, derivates, derivate_knots, int_variables, bgmask, harmonize_na, explode_factors, priors, ...) 
- } -) - -# TODO: Support other objects other than Raster stacks such as data.frames and stars objects -#' @name add_predictors -#' @rdname add_predictors -#' @usage \S4method{add_predictors}{BiodiversityDistribution,RasterStack}(x, env) -methods::setMethod( - "add_predictors", - methods::signature(x = "BiodiversityDistribution", env = "RasterStack"), - function(x, env, names = NULL, transform = 'scale', derivates = 'none', derivate_knots = 4, int_variables = NULL, - bgmask = TRUE, harmonize_na = FALSE, explode_factors = FALSE, priors = NULL, ... ) { - # Try and match transform and derivatives arguments - transform <- match.arg(transform, c('none','pca', 'scale', 'norm', 'windsor') , several.ok = TRUE) - derivates <- match.arg(derivates, c('none','thresh', 'hinge', 'quadratic', 'bin', 'interaction') , several.ok = TRUE) - - assertthat::assert_that(inherits(x, "BiodiversityDistribution"), - is.Raster(env), - all(transform == 'none') || all( transform %in% c('pca', 'scale', 'norm', 'windsor') ), - all(derivates == 'none') || all( derivates %in% c('thresh', 'hinge', 'quadratic', 'bin', 'interaction') ), - is.vector(derivate_knots) || is.numeric(derivate_knots), - is.null(names) || assertthat::is.scalar(names) || is.vector(names), - is.logical(explode_factors), - is.null(priors) || inherits(priors,'PriorList') - ) - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Adding predictors...') - - if(!is.null(names)) { - assertthat::assert_that(nlayers(env)==length(names), - all(is.character(names)), - msg = 'Provided names not of same length as environmental data.') - # Set names of env - names(env) <- names - } - - # Check that all names allowed - problematic_names <- grep("offset|w|weight|spatial_offset|Intercept|spatial.field", names(env),fixed = TRUE) - if( length(problematic_names)>0 ){ - stop(paste0("Some predictor names are not allowed as they might interfere with model fitting:", paste0(names(env)[problematic_names],collapse = " | "))) - } - - # If priors have been set, save them in the distribution object - if(!is.null(priors)) { - assertthat::assert_that( all( priors$varnames() %in% names(env) ) ) - x <- x$set_priors(priors) - } - # Harmonize NA values - if(harmonize_na){ - if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Harmonizing missing values...') - env <- predictor_homogenize_na(env, fill = FALSE) - } - - # Don't transform or create derivatives of factor variables - if(any(is.factor(env))){ - # Make subsets to join back later - env_f <- raster::subset(env, which(is.factor(env))) - env <- raster::subset(env, which(!is.factor(env))) - if(explode_factors){ - # Refactor categorical variables - if(inherits(env_f,'RasterLayer')){ - env_f <- explode_factorized_raster(env_f) - env <- addLayer(env, env_f) - } else { - o <- raster::stack() - for(layer in names(env_f)){ - o <- raster::addLayer(o, explode_factorized_raster(env_f[[layer]])) - } - env_f <- o;rm(o) - # Joining back to full raster stack - env <- raster::stack(env, env_f);rm(env_f) - } - has_factors <- FALSE # Set to false since factors have been exploded. 
- } else { has_factors <- TRUE } - } else { has_factors <- FALSE } - - # Standardization and scaling - if('none' %notin% transform){ - if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Transforming predictors...') - for(tt in transform) env <- predictor_transform(env, option = tt) - } - - # Calculate derivates if set - if('none' %notin% derivates){ - if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Creating predictor derivates...') - # Specific condition for interaction - if(any(derivates == "interaction")){ - assertthat::assert_that(is.vector(int_variables), length(int_variables)>=2) - attr(env, "int_variables") <- int_variables - } - new_env <- raster::stack() - for(dd in derivates) new_env <- raster::addLayer(new_env, predictor_derivate(env, option = dd, nknots = derivate_knots, int_variables = int_variables) ) - - # Add to env - env <- raster::addLayer(env, new_env) - } - - # Add factors back in if there are any. - # This is to avoid that they are transformed or similar - if(has_factors){ - env <- raster::addLayer(env, env_f) - } - attr(env, 'has_factors') <- has_factors - - # Assign an attribute to this object to keep track of it - attr(env,'transform') <- transform - - # Mask predictors with existing background layer - if(bgmask){ - env <- raster::mask(env, mask = x$background) - # Reratify, work somehow only on stacks - if(has_factors && any(is.factor(env)) ){ - new_env <- raster::stack(env) - new_env[[which(is.factor(env))]] <- raster::ratify(env[[which(is.factor(env))]]) - env <- new_env;rm(new_env) - } else env <- raster::stack(env) - } - - # Check whether predictors already exist, if so overwrite - if(!is.Waiver(x$predictors)) myLog('[Setup]','yellow','Overwriting existing predictors.') - - # Finally set the data to the BiodiversityDistribution object - x$set_predictors( - bdproto(NULL, PredictorDataset, - id = new_id(), - data = env, - ... - ) - ) - } -) - -#' @name add_predictors -#' @rdname add_predictors -#' @usage \S4method{add_predictors}{BiodiversityDistribution, stars}(x, env) -methods::setMethod( - "add_predictors", - methods::signature(x = "BiodiversityDistribution", env = "stars"), - function(x, env, names = NULL, transform = 'scale', derivates = 'none', derivate_knots = 4, int_variables = NULL, - bgmask = TRUE, harmonize_na = FALSE, explode_factors = FALSE, priors = NULL, ... ) { - assertthat::assert_that(inherits(x, "BiodiversityDistribution"), - !missing(env)) - if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Taking first time entry from object.') - - # Convert to raster - env <- stars_to_raster(env, which = 1) - if(is.list(env)) env <- env[[1]] - x <- add_predictors(x, env, names, transform, derivates, derivate_knots, int_variables, bgmask, harmonize_na, explode_factors, priors, ...) - return( x ) - } -) - -# Add elevational delineation as predictor ---- - -#' Create lower and upper limits for an elevational range and add them as separate predictors -#' -#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object. -#' @param layer A [`character`] stating the elevational layer in the Distribution object or [`RasterLayer`] object. -#' @param lower [`numeric`] value for a lower elevational preference of a species. -#' @param upper [`numeric`] value for a upper elevational preference of a species. -#' @param transform [`character`] Any optional transformation to be applied. Usually not needed (Default: \code{"none"}). 
-#' @name add_predictor_elevationpref -NULL - -#' @name add_predictor_elevationpref -#' @rdname add_predictor_elevationpref -#' @exportMethod add_predictor_elevationpref -#' @export -methods::setGeneric( - "add_predictor_elevationpref", - signature = methods::signature("x", "layer", "lower", "upper", "transform"), - function(x, layer, lower, upper, transform = "none") standardGeneric("add_predictor_elevationpref")) - -#' @name add_predictor_elevationpref -#' @rdname add_predictor_elevationpref -#' @usage \S4method{add_predictor_elevationpref}{BiodiversityDistribution, ANY, numeric, numeric, character}(x, layer, lower, upper, transform) -methods::setMethod( - "add_predictor_elevationpref", - methods::signature(x = "BiodiversityDistribution", layer = "ANY", lower = "numeric", upper = "numeric"), - function(x, layer, lower, upper, transform = "none") { - assertthat::assert_that(inherits(x, "BiodiversityDistribution"), - is.Raster(layer) || is.character(layer), - is.numeric(lower) || is.na(lower), - is.numeric(upper) || is.na(upper), - is.character(transform) - ) - # Messager - if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Formatting elevational preference predictors...') - - # If layer is a character, check that it is in the provided object - if(is.character(layer)){ - assertthat::assert_that(layer %in% x$get_predictor_names()) - layer <- x$predictors$get_data()[[layer]] - } else { - # If it is a raster - # Check that background and range align, otherwise raise error - if(compareRaster(layer, x$background,stopiffalse = FALSE)){ - warning('Supplied range does not align with background! Aligning them now...') - layer <- alignRasters(layer, x$background, method = 'bilinear', func = mean, cl = FALSE) - } - } - - # Format lower and upper preferences - if(is.na(lower)) lower <- raster::cellStats(layer, "min") - if(is.na(upper)) upper <- raster::cellStats(layer, "max") - - # Now create thresholded derivatives of lower and upper elevation - ras1 <- layer - # ras2[ras2 < lower] <- 0; ras2[ras2 > upper] <- 0; ras2[ras2 > 0] <- 1 # Both ways - ras1[layer < lower] <- 0; ras1[ras1 > lower] <- 1 - ras2 <- layer - ras2[ras2 < upper] <- 0; ras2[ras2 > 0] <- 1 - # If the minimum of those layers have equal min and max - if(raster::cellStats(ras1, "min") == raster::cellStats(ras1, "max")){ - o <- ras2 - # Ensure that all layers have a minimum and a maximum - o[is.na(o)] <- 0; o <- raster::mask(o, x$background) - names(o) <- c('elev_high') - } else { - o <- raster::stack(ras1, ras2) - # Ensure that all layers have a minimum and a maximum - o[is.na(o)] <- 0; o <- raster::mask(o, x$background) - names(o) <- c('elev_low', 'elev_high') - } - rm(ras1,ras2) - - # Add as predictor - if(is.Waiver(x$predictors)){ - x <- add_predictors(x, env = o, transform = transform, derivates = 'none') - } else { - for(n in names(o)){ - r <- o[[n]] - # If predictor transformation is specified, apply - if(transform != "none") r <- predictor_transform(r, option = transform) - x$predictors <- x$predictors$set_data(n, r) - rm(r) - } - } - return(x) - } -) - -# Add species ranges as predictor ---- -#' Add a range of a species as predictor to a distribution object -#' -#' @description -#' This function allows to add a species range which is usually drawn by experts in a separate process -#' as spatial explicit prior. Both [`sf`] and [`Raster`]-objects are supported as input. -#' -#' Users are advised to look at the [`bossMaps`] R-package presented as part of Merow et al. 
(2017), -#' which allows flexible calculation of non-linear distance transforms from the boundary of the range. -#' Outputs of this package could be added directly to this function. -#' **Note that this function adds the range as predictor and not as offset. For this purpose a separate function [`add_offset_range()`] exists.** -#' -#' Additional options allow to include the range either as \code{"binary"} or as \code{"distance"} transformed -#' predictor. The difference being that the range is either directly included as presence-only predictor or -#' alternatively with a linear distance transform from the range boundary. The parameter -#' \code{"distance_max"} can be specified to constrain this distance transform. -#' -#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object. -#' @param layer A [`sf`] or [`Raster`] object with the range for the target feature. -#' @param method [`character`] describing how the range should be included (\code{"binary"} | \code{"distance"}). -#' @param distance_max Numeric threshold on the maximum distance (Default: \code{NULL}). -#' @param fraction An optional [`RasterLayer`] object that is multiplied with digitized raster layer. -#' Can be used to for example to remove or reduce the expected value (Default: \code{NULL}). -#' @param priors A [`PriorList-class`] object. Default is set to NULL which uses default prior assumptions -#' @references -#' * Merow, C., Wilson, A. M., & Jetz, W. (2017). Integrating occurrence data and expert maps for improved species range predictions. Global Ecology and Biogeography, 26(2), 243–258. https://doi.org/10.1111/geb.12539 -#' @name add_predictor_range -NULL - -#' @name add_predictor_range -#' @rdname add_predictor_range -#' @exportMethod add_predictor_range -#' @export -methods::setGeneric( - "add_predictor_range", - signature = methods::signature("x", "layer", "method"), - function(x, layer, method = 'distance', distance_max = NULL, fraction = NULL, priors = NULL) standardGeneric("add_predictor_range")) - -#' Function for when distance raster is directly supplied (precomputed) -#' @name add_predictor_range -#' @rdname add_predictor_range -#' @usage \S4method{add_predictor_range}{BiodiversityDistribution, raster}(x, layer) -methods::setMethod( - "add_predictor_range", - methods::signature(x = "BiodiversityDistribution", layer = "RasterLayer"), - function(x, layer, method = 'precomputed_range', fraction = NULL, priors = NULL) { - assertthat::assert_that(inherits(x, "BiodiversityDistribution"), - is.Raster(layer), - is.Raster(fraction) || is.null(fraction), - is.character(method) - ) - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Adding range predictors...') - - # Check that background and range align, otherwise raise error - if(compareRaster(layer, x$background,stopiffalse = FALSE)){ - warning('Supplied range does not align with background! 
Aligning them now...') - layer <- alignRasters(layer, x$background, method = 'bilinear', func = mean, cl = FALSE) - } - names(layer) <- method - - # Multiply with fraction layer if set - if(!is.null(fraction)){ - # Rescale if necessary and set 0 to a small constant 1e-6 - if(raster::cellStats(fraction, "min") < 0) fraction <- predictor_transform(fraction, option = "norm") - fraction[fraction==0] <- 1e-6 - layer <- layer * fraction - } - - # Add as predictor - if(is.Waiver(x$predictors)){ - x <- add_predictors(x, env = layer, transform = 'none',derivates = 'none', priors) - } else { - x$predictors <- x$predictors$set_data('range_distance', layer) - if(!is.null(priors)) { - # FIXME: Ideally attempt to match varnames against supplied predictors vis match.arg or similar - assertthat::assert_that( all( priors$varnames() %in% names(layer) ) ) - x <- x$set_priors(priors) - } - } - return(x) - } -) - -#' @name add_predictor_range -#' @rdname add_predictor_range -#' @usage \S4method{add_predictor_range}{BiodiversityDistribution, sf}(x, layer) -methods::setMethod( - "add_predictor_range", - methods::signature(x = "BiodiversityDistribution", layer = "sf"), - function(x, layer, method = 'distance', distance_max = Inf, fraction = NULL, priors = NULL ) { - assertthat::assert_that(inherits(x, "BiodiversityDistribution"), - is.character(method), - inherits(layer, 'sf'), - method %in% c('binary','distance'), - is.null(fraction) || is.Raster(fraction), - is.null(distance_max) || is.numeric(distance_max) || is.infinite(distance_max), - is.null(priors) || inherits(priors,'PriorList') - ) - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Adding range predictors...') - - # Reproject if necessary - if(sf::st_crs(layer) != sf::st_crs(x$background)) layer <- sf::st_transform(layer, sf::st_crs(x$background)) - - # Template raster for background - if(!is.Waiver(x$predictors)){ - temp <- emptyraster(x$predictors$get_data()) - } else { - # TODO: Eventually make this work better - myLog('[Setup]','red','CAREFUL - This might not work without predictors already in the model.') - temp <- raster::raster(extent(x$background),resolution = 1) - } - - # Rasterize the range - if( 'fasterize' %in% utils::installed.packages()[,1] ){ - ras_range <- try({ fasterize::fasterize(layer, temp, field = NULL) }, silent = TRUE) - if(inherits(ras_range,"try-error")){ - myLog('[Setup]','yellow','Fasterize package needs to be re-installed!') - ras_range <- raster::rasterize(layer, temp, field = 1, background = NA) - } - } else { - ras_range <- raster::rasterize(layer, temp, field = 1, background = NA) - } - - # -------------- # - if(method == 'binary'){ - dis <- ras_range - dis[is.na(dis)] <- 0 - # Mask with temp again - dis <- raster::mask(dis, x$background) - names(dis) <- 'binary_range' - } else if(method == 'distance'){ - # Calculate the linear distance from the range - dis <- raster::gridDistance(ras_range, origin = 1) - dis <- raster::mask(dis, x$background) - # If max distance is specified - if(!is.null(distance_max) && !is.infinite(distance_max)){ - dis[dis > distance_max] <- NA # Set values above threshold to NA - attr(dis, "distance_max") <- distance_max - } else { distance_max <- raster::cellStats(dis, "max") } - # Grow baseline raster by using an exponentially weighted kernel - alpha <- 1 / (distance_max / 4 ) # Divide by 4 for a quarter in each direction - # Grow baseline raster by using an exponentially weighted kernel - dis <- raster::calc(dis, fun = function(x) exp(-alpha * x)) - # Convert to 
relative for better scaling in predictions - dis <- (dis / raster::cellStats(dis,'max')) - - # Set NA to 0 and mask again - dis[is.na(dis)] <- 0 - dis <- raster::mask(dis, x$background) - names(dis) <- 'distance_range' - } - - # Multiply with fraction layer if set - if(!is.null(fraction)){ - # Rescale if necessary and set 0 to a small constant 1e-6 - if(raster::cellStats(fraction, "min") < 0) fraction <- predictor_transform(fraction, option = "norm") - fraction[fraction==0] <- 1e-6 - layer <- layer * fraction - } - - # If priors have been set, save them in the distribution object - if(!is.null(priors)) { - # FIXME: Ideally attempt to match varnames against supplied predictors vis match.arg or similar - assertthat::assert_that( all( priors$varnames() %in% names(dis) ) ) - x <- x$set_priors(priors) - } - - # Add as predictor - if(is.Waiver(x$predictors)){ - x <- add_predictors(x, env = dis, transform = 'none',derivates = 'none') - } else { - x$predictors <- x$predictors$set_data('range_distance', dis) - } - return(x) - } -) - -#' Remove specific predictors from a [distribution] object -#' -#' @description -#' Remove a particular variable from an [distribution] object with a -#' [`PredictorDataset-class`]. -#' See Examples. -#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object. -#' @param names [`vector`] A Vector of character names describing the environmental stack. -#' @examples -#' \dontrun{ -#' distribution(background) %>% -#' add_predictors(my_covariates) %>% -#' rm_predictors(names = "Urban") -#' } -#' @name rm_predictors -NULL - -#' @name rm_predictors -#' @rdname rm_predictors -#' @exportMethod rm_predictors -#' @export -methods::setGeneric( - "rm_predictors", - signature = methods::signature("x", "names"), - function(x, names) standardGeneric("rm_predictors")) - -#' @name rm_predictors -#' @rdname rm_predictors -#' @usage \S4method{rm_predictors}{BiodiversityDistribution,vector}(x, names) -methods::setMethod( - "rm_predictors", - methods::signature(x = "BiodiversityDistribution", names = "character"), - # rm_predictors ---- - function(x, names ) { - assertthat::assert_that(inherits(x, "BiodiversityDistribution"), - is.character(names) || assertthat::is.scalar(names) || is.vector(names) - ) - # TODO: Maybe implement a flexible wildcard, base::startsWith() - # Is there anything to remove - assertthat::assert_that(!is.Waiver(x$predictors), - all( names %in% x$get_predictor_names() ), - msg = 'Suggested variables not in model!') - - # Finally set the data to the BiodiversityDistribution object - x$rm_predictors(names) - } -) - -#' Select specific predictors from a [distribution] object -#' -#' @description -#' This function allows - out of a [`character`] vector with the names -#' of an already added [`PredictorDataset-class`] object - to select a particular set of predictors. -#' See Examples. -#' -#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object. -#' @param names [`vector`] A Vector of character names describing the environmental stack. 
-#' @examples -#' \dontrun{ -#' distribution(background) %>% -#' add_predictors(my_covariates) %>% -#' sel_predictors(names = c("Forest", "Elevation")) -#' } -#' @name sel_predictors -NULL - -#' @name sel_predictors -#' @rdname sel_predictors -#' @exportMethod sel_predictors -#' @export -methods::setGeneric( - "sel_predictors", - signature = methods::signature("x", "names"), - function(x, names) standardGeneric("sel_predictors")) - -#' @name sel_predictors -#' @rdname sel_predictors -#' @usage \S4method{sel_predictors}{BiodiversityDistribution,vector}(x, names) -methods::setMethod( - "sel_predictors", - methods::signature(x = "BiodiversityDistribution", names = "character"), - # sel_predictors ---- - function(x, names ) { - assertthat::assert_that(inherits(x, "BiodiversityDistribution"), - is.character(names) || assertthat::is.scalar(names) || is.vector(names) - ) - # TODO: Maybe implement a flexible wildcard, base::startsWith() - # Is there anything to remove - assertthat::assert_that(!is.Waiver(x$predictors), - any( names %in% x$get_predictor_names() ), - msg = 'Suggested variables not in model!') - - # Get current predictors - varnames <- x$get_predictor_names() - varnames <- varnames[which(varnames %notin% names)] - - # Remove all predictors listed - if(length(varnames)>=1) x$rm_predictors(varnames) - } -) - -# ---------------- # -# Add predictor actions for scenario objects ---- -#' @name add_predictors -#' @rdname add_predictors -#' @usage \S4method{add_predictors}{BiodiversityScenario,RasterBrick}(x, env) -methods::setMethod( - "add_predictors", - methods::signature(x = "BiodiversityScenario", env = "RasterBrick"), - function(x, env, names = NULL, transform = 'none', derivates = 'none', - derivate_knots = 4, int_variables = NULL, harmonize_na = FALSE, ... ) { - assertthat::assert_that(inherits(x, "BiodiversityScenario"), - !missing(env)) - env <- raster_to_stars(env) # Convert to stars - - add_predictors(x, env, names = names, transform = transform, derivates = derivates, - derivate_knots = derivate_knots, int_variables = int_variables, harmonize_na = harmonize_na, ...) - } -) - -#' @name add_predictors -#' @rdname add_predictors -#' @usage \S4method{add_predictors}{BiodiversityScenario,RasterLayer}(x, env) -methods::setMethod( - "add_predictors", - methods::signature(x = "BiodiversityScenario", env = "RasterLayer"), - function(x, env, names = NULL, transform = 'none', derivates = 'none', - derivate_knots = 4, int_variables = NULL, harmonize_na = FALSE, ... ) { - assertthat::assert_that(inherits(x, "BiodiversityScenario"), - !missing(env)) - - env <- raster_to_stars(env) # Convert to stars - - add_predictors(x, env, names = names, transform = transform, derivates = derivates, - derivate_knots = derivate_knots, int_variables = int_variables, harmonize_na = harmonize_na, ...) - } -) - -#' @name add_predictors -#' @rdname add_predictors -#' @usage \S4method{add_predictors}{BiodiversityScenario,RasterStack}(x, env) -methods::setMethod( - "add_predictors", - methods::signature(x = "BiodiversityScenario", env = "RasterStack"), - function(x, env, names = NULL, transform = 'none', derivates = 'none', - derivate_knots = 4, int_variables = NULL, harmonize_na = FALSE, ... ) { - assertthat::assert_that(inherits(x, "BiodiversityScenario"), - !missing(env)) - - env <- raster_to_stars(env) # Convert to stars - - add_predictors(x, env, names = names, transform = transform, derivates = derivates, - derivate_knots = derivate_knots, int_variables = int_variables, harmonize_na = harmonize_na, ...) 
- } -) - -#' @name add_predictors -#' @rdname add_predictors -#' @usage \S4method{add_predictors}{BiodiversityScenario, stars}(x, env) -methods::setMethod( - "add_predictors", - methods::signature(x = "BiodiversityScenario", env = "stars"), - function(x, env, names = NULL, transform = 'none', derivates = 'none', - derivate_knots = 4, int_variables = NULL, harmonize_na = FALSE, ... ) { - # Try and match transform and derivatives arguments - transform <- match.arg(transform, c('none','pca', 'scale', 'norm', 'windsor') , several.ok = TRUE) - derivates <- match.arg(derivates, c('none','thresh', 'hinge', 'quadratic', 'bin') , several.ok = TRUE) - - assertthat::validate_that(inherits(env,'stars'),msg = 'Projection rasters need to be stars stack!') - assertthat::assert_that(inherits(x, "BiodiversityScenario"), - transform == 'none' || all( transform %in% c('pca', 'scale', 'norm', 'windsor') ), - derivates == 'none' || all( derivates %in% c('thresh', 'hinge', 'quadratic', 'bin') ), - is.vector(derivate_knots) || is.numeric(derivate_knots), - is.null(int_variables) || is.character(int_variables), - is.null(names) || assertthat::is.scalar(names) || is.vector(names), - is.logical(harmonize_na) - ) - # Some stars checks - assertthat::validate_that(length(env) >= 1) - - # Get model object - obj <- x$get_model() - - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Adding scenario predictors...') - - # Rename attributes if names is specified - if(!is.null(names)){ - assertthat::assert_that(length(names) == length(env)) - names(env) <- names - } - - # Harmonize NA values - if(harmonize_na){ - stop('Missing data harmonization for stars not yet implemented!') #TODO - if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Harmonizing missing values...') - env <- predictor_homogenize_na(env, fill = FALSE) - } - - # Standardization and scaling - if('none' %notin% transform){ - if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Transforming predictors...') - for(tt in transform) env <- predictor_transform(env, option = tt) - } - - # # Calculate derivates if set - if('none' %notin% derivates){ - # Get variable names - varn <- obj$get_coefficients()[['Feature']] - # Are there any derivates present in the coefficients? - if(any( length( grep("hinge__|bin__|quad__|thresh__", varn ) ) > 0 )){ - if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Creating predictor derivates...') - for(dd in derivates){ - if(any(grep(dd, varn))){ - env <- predictor_derivate(env, option = dd, nknots = derivate_knots, deriv = varn, int_variables = int_variables) - } else { - if(getOption('ibis.setupmessages')) myLog('[Setup]','red', paste0(derivates,' derivates should be created, but not found among coefficients!')) - } - } - } else { - if(getOption('ibis.setupmessages')) myLog('[Setup]','red','No derivates found among coefficients. None created for projection!') - } - } - - # Get, guess and format Time period - env_dim <- stars::st_dimensions(env) - timeperiod <- stars::st_get_dimension_values(env, - grep("year|time|date", names(env_dim), ignore.case = TRUE, value = TRUE) - ) - - # Check whether predictors already exist, if so overwrite - # TODO: In the future one could think of supplying predictors of varying grain - if(!is.Waiver(x$predictors)) myLog('[Setup]','yellow','Overwriting existing predictors.') - - # Finally set the data to the BiodiversityScenario object - x$set_predictors( - bdproto(NULL, PredictorDataset, - id = new_id(), - data = env, - timeperiod = timeperiod, - ... 
- ) - ) - } -) - -# --------------------- # -#### GLOBIOM specific code ---- - -#' Add GLOBIOM-DownScaleR derived predictors to a Biodiversity distribution object -#' -#' @description -#' This is a customized function to format and add downscaled land-use shares from -#' the [Global Biosphere Management Model (GLOBIOM)](https://iiasa.github.io/GLOBIOM/) to a -#' [distribution] or [BiodiversityScenario] in ibis.iSDM. GLOBIOM is a partial-equilibrium model -#' developed at IIASA and represents land-use sectors with a rich set of environmental and -#' socio-economic parameters, where for instance the agricultural and forestry sector are estimated through -#' dedicated process-based models. GLOBIOM outputs are spatial explicit and usually at a half-degree resolution globally. -#' For finer grain analyses GLOBIOM outputs can be produced in a downscaled format with a -#' customized statistical [downscaling module](https://github.com/iiasa/DownScale). -#' -#' The purpose of this script is to format the GLOBIOM outputs of *DownScale* for the use in the -#' ibis.iSDM package. -#' @details -#' See [`add_predictors()`] for additional parameters and customizations. -#' For more (manual) control the function for formatting the GLOBIOM data can also be -#' called directly via `formatGLOBIOM()`. -#' -#' @param x A [`BiodiversityDistribution-class`] or [`BiodiversityScenario-class`] object. -#' @param fname A [`character`] pointing to a netCDF with the GLOBIOM data. -#' @param names A [`vector`] of character names describing the environmental stack in case they should be renamed (Default: \code{NULL}). -#' @param transform A [`vector`] stating whether predictors should be preprocessed in any way (Options: \code{'none'},\code{'pca'}, \code{'scale'}, \code{'norm'}) -#' @param derivates A Boolean check whether derivate features should be considered (Options: \code{'none'}, \code{'thresh'}, \code{'hinge'}, \code{'quad'}) ) -#' @param derivate_knots A single [`numeric`] or [`vector`] giving the number of knots for derivate creation if relevant (Default: \code{4}). -#' @param int_variables A [`vector`] with length greater or equal than \code{2} specifying the covariates (Default: \code{NULL}). -#' @param bgmask Check whether the environmental data should be masked with the background layer (Default: \code{TRUE}) -#' @param harmonize_na A [`logical`] value indicating of whether NA values should be harmonized among predictors (Default: \code{FALSE}) -#' @param priors A [`PriorList-class`] object. Default is set to \code{NULL} which uses default prior assumptions. -#' @param ... Other parameters passed down -#' @seealso [add_predictors] -#' @examples -#' \dontrun{ -#' obj <- distribution(background) %>% -#' add_predictors_globiom(fname = "", transform = 'none') -#' obj -#' } -#' @name add_predictors_globiom -NULL - -#' @name add_predictors_globiom -#' @rdname add_predictors_globiom -#' @exportMethod add_predictors_globiom -#' @export -methods::setGeneric( - "add_predictors_globiom", - signature = methods::signature("x", "fname"), - function(x, fname, names = NULL, transform = 'none', derivates = 'none', derivate_knots = 4, int_variables = NULL, - bgmask = TRUE, harmonize_na = FALSE, - priors = NULL, ...) 
standardGeneric("add_predictors_globiom")) - -#' @name add_predictors_globiom -#' @rdname add_predictors_globiom -#' @usage \S4method{add_predictors_globiom}{BiodiversityDistribution, character}(x, fname) -methods::setMethod( - "add_predictors_globiom", - methods::signature(x = "BiodiversityDistribution", fname = "character"), - function(x, fname, names = NULL, transform = 'none', derivates = 'none', derivate_knots = 4, int_variables = NULL, - bgmask = TRUE, harmonize_na = FALSE, priors = NULL, ... ) { - # Try and match transform and derivatives arguments - transform <- match.arg(transform, c('none','pca', 'scale', 'norm', 'windsor'), several.ok = TRUE) - derivates <- match.arg(derivates, c('none','thresh', 'hinge', 'quadratic', 'bin'), several.ok = TRUE) - - # Check that file exists and has the correct endings - assertthat::assert_that(is.character(fname), - file.exists(fname), - assertthat::is.readable(fname), - assertthat::has_extension(fname, "nc"), - msg = "The provided path to GLOBIOM land-use shares could not be found or is not readable!" - ) - - assertthat::assert_that(inherits(x, "BiodiversityDistribution"), - is.null(names) || assertthat::is.scalar(names) || is.vector(names), - is.null(priors) || inherits(priors,'PriorList'), - is.vector(derivate_knots) || is.null(derivate_knots), - is.null(int_variables) || is.vector(int_variables) - ) - - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Formatting GLOBIOM inputs for species distribution modelling.') - - # Get and format the GLOBIOM data - env <- formatGLOBIOM(fname = fname, - oftype = "raster", - period = "reference", - template = x$background - ) - - if(is.list(env)) env <- env[[1]] # Take the first reference entry - assertthat::assert_that(is.Raster(env), - raster::nlayers(env)>0) - - if(!is.null(names)) { - assertthat::assert_that(nlayers(env)==length(names), - all(is.character(names)), - msg = 'Provided names not of same length as environmental data.') - # Set names of env - names(env) <- names - } - - # Check that all names allowed - problematic_names <- grep("offset|w|weight|spatial_offset|Intercept|spatial.field", names(env),fixed = TRUE) - if( length(problematic_names)>0 ){ - stop(paste0("Some predictor names are not allowed as they might interfere with model fitting:", paste0(names(env)[problematic_names],collapse = " | "))) - } - - # If priors have been set, save them in the distribution object - if(!is.null(priors)) { - assertthat::assert_that( all( priors$varnames() %in% names(env) ) ) - x <- x$set_priors(priors) - } - # Harmonize NA values - if(harmonize_na){ - if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Harmonizing missing values...') - env <- predictor_homogenize_na(env, fill = FALSE) - } - - # Standardization and scaling - if('none' %notin% transform){ - if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Transforming predictors...') - for(tt in transform) env <- predictor_transform(env, option = tt) - } - - # Calculate derivates if set - if('none' %notin% derivates){ - if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Creating predictor derivates...') - new_env <- raster::stack() - for(dd in derivates) new_env <- raster::addLayer(new_env, predictor_derivate(env, option = dd, nknots = derivate_knots, int_variables = int_variables) ) - - # Add to env - env <- raster::addLayer(env, new_env) - } - - # Generally not relevant for GLOBIOM unless created as derivate - attr(env, 'has_factors') <- FALSE - - # Assign an attribute to this object to keep track 
of it - attr(env,'transform') <- transform - - # Mask predictors with existing background layer - if(bgmask){ - env <- raster::mask(env, mask = x$background) - env <- raster::stack(env) - } - - # Check whether predictors already exist, if so overwrite - if(!is.Waiver(x$predictors)) myLog('[Setup]','yellow','Overwriting existing predictors.') - - # Finally set the data to the BiodiversityDistribution object - x$set_predictors( - bdproto(NULL, PredictorDataset, - id = new_id(), - data = env, - ... - ) - ) - } -) - -#' @name add_predictors_globiom -#' @rdname add_predictors_globiom -#' @usage \S4method{add_predictors_globiom}{BiodiversityScenario, character}(x, fname) -methods::setMethod( - "add_predictors_globiom", - methods::signature(x = "BiodiversityScenario", fname = "character"), - function(x, fname, names = NULL, transform = 'none', derivates = 'none', derivate_knots = 4, int_variables = NULL, - harmonize_na = FALSE, ... ) { - # Try and match transform and derivatives arguments - transform <- match.arg(transform, c('none','pca', 'scale', 'norm', 'windsor') , several.ok = TRUE) - derivates <- match.arg(derivates, c('none','thresh', 'hinge', 'quadratic', 'bin') , several.ok = TRUE) - - # Check that file exists and has the correct endings - assertthat::assert_that(is.character(fname), - file.exists(fname), - assertthat::is.readable(fname), - assertthat::has_extension(fname, "nc"), - msg = "The provided path to GLOBIOM land-use shares could not be found or is not readable!" - ) - assertthat::assert_that(inherits(x, "BiodiversityScenario"), - is.null(names) || assertthat::is.scalar(names) || is.vector(names), - is.logical(harmonize_na), - is.vector(derivate_knots) || is.null(derivate_knots), - is.null(int_variables) || is.vector(int_variables) - ) - - # Get model object - obj <- x$get_model() - - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Adding GLOBIOM predictors to scenario object...') - - # Get and format the GLOBIOM data - env <- formatGLOBIOM(fname = fname, - oftype = "stars", - period = "projection", - template = obj$model$background - ) - assertthat::assert_that( inherits(env, "stars") ) - - # Rename attributes if names is specified - if(!is.null(names)){ - assertthat::assert_that(length(names) == length(env)) - names(env) <- names - } - - # Harmonize NA values - if(harmonize_na){ - stop('Missing data harmonization for stars not yet implemented!') #TODO - if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Harmonizing missing values...') - env <- predictor_homogenize_na(env, fill = FALSE) - } - - # Standardization and scaling - if('none' %notin% transform){ - if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Transforming predictors...') - for(tt in transform) env <- predictor_transform(env, option = tt) - } - - # # Calculate derivates if set - if('none' %notin% derivates){ - # Get variable names - varn <- obj$get_coefficients()[['Feature']] - # Are there any derivates present in the coefficients? 
- if(any( length( grep("hinge__|bin__|quad__|thresh__", varn ) ) > 0 )){ - if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Creating predictor derivates...') - for(dd in derivates){ - if(any(grep(dd, varn))){ - env <- predictor_derivate(env, option = dd, nknots = derivate_knots, int_variables = int_variables, deriv = varn) - } else { - if(getOption('ibis.setupmessages')) myLog('[Setup]','red', paste0(derivates,' derivates should be created, but not found among coefficients!')) - } - } - } else { - if(getOption('ibis.setupmessages')) myLog('[Setup]','red','No derivates found among coefficients. None created for projection!') - } - } - - # Get and format Time period - env_dim <- stars::st_dimensions(env) - timeperiod <- stars::st_get_dimension_values(env, "time", center = TRUE) - if(is.numeric(timeperiod)){ - # Format to Posix. Assuming years only - timeperiod <- as.POSIXct(paste0(timeperiod,"-01-01")) - } - if(anyNA(timeperiod)) stop('Third dimension is not a time value!') - - # Check whether predictors already exist, if so overwrite - if(!is.Waiver(x$predictors)) myLog('[Setup]','yellow','Overwriting existing predictors.') - - # Finally set the data to the BiodiversityScenario object - x$set_predictors( - bdproto(NULL, PredictorDataset, - id = new_id(), - data = env, - timeperiod = timeperiod, - ... - ) - ) - } -) - -#' Function to format a prepared GLOBIOM netCDF file for use in Ibis.iSDM -#' -#' @description -#' This function expects a downscaled GLOBIOM output as created in the BIOCLIMA project. -#' Likely of little use for anyone outside IIASA. -#' -#' @param fname A filename in [`character`] pointing to a GLOBIOM output in netCDF format. -#' @param oftype A [`character`] denoting the output type (Default: \code{'raster'}). -#' @param ignore A [`vector`] of variables to be ignored (Default: \code{NULL}). -#' @param period A [`character`] limiting the period to be returned from the formatted data. -#' Options include \code{"reference"} for the first entry, \code{"projection"} for all entries but the first, -#' and \code{"all"} for all entries (Default: \code{"reference"}). -#' @param template An optional [`RasterLayer`] object towards which projects should be transformed. -#' @param verbose [`logical`] on whether to be chatty. -#' -#' @examples \dontrun{ -#' # Expects a filename pointing to a netCDF file. 
-#' covariates <- formatBIOCLIMA(fname) -#' } -#' @keywords internal, utils -formatGLOBIOM <- function(fname, oftype = "raster", ignore = NULL, - period = "all", template = NULL, - verbose = getOption("ibis.setupmessages")){ - assertthat::assert_that( - file.exists(fname), - assertthat::has_extension(fname, "nc"), - is.character(oftype), - is.null(ignore) || is.character(ignore), - is.character(period), - is.character(fname), - is.logical(verbose) - ) - period <- match.arg(period, c("reference", "projection", "all"), several.ok = FALSE) - check_package("stars") - check_package("dplyr") - check_package("cubelyr") - check_package("ncdf4") - - # Try and load in the GLOBIOM file to get the attributes - fatt <- ncdf4::nc_open(fname) - if(verbose) myLog('[Setup]','green',"Found ", fatt$ndims, " dimensions and ", fatt$nvars, " variables") - - # Get all dimension names and variable names - dims <- names(fatt$dim) - vars <- names(fatt$var) - if(!is.null(ignore)) assertthat::assert_that( all( ignore %in% vars ) ) - - attrs <- list() # For storing the attributes - sc <- vector() # For storing the scenario files - - # Now open the netcdf file with stats - if( length( grep("netcdf", stars:::detect.driver(fname), ignore.case = TRUE) )>0 ){ - if(verbose){ - myLog('[Predictor]','green',"Loading in predictor file...") - pb <- progress::progress_bar$new(total = length(vars), - format = "Loading :variable (:spin) [:bar] :percent") - } - - for(v in vars) { - if(verbose) pb$tick(tokens = list(variable = v)) - if(!is.null(ignore)) if(ignore == v) next() - - # Get and save the attributes of each variable - attrs[[v]] <- ncdf4::ncatt_get(fatt, varid = v, verbose = FALSE) - - # Load in the variable - suppressWarnings( - suppressMessages( - ff <- stars::read_ncdf(fname, - var = v, - proxy = FALSE, - make_time = TRUE, # Make time on 'time' band - make_units = FALSE # To avoid unnecessary errors due to unknown units - ) - ) - ) - - # Sometimes variables don't seem to have a time dimension - if(!"time" %in% names(stars::st_dimensions(ff))) next() - - # Crop to background extent if set - if(!is.null(template)){ - # FIXME: Currently this code, while working clips too much of Europe. 
- # Likely need to - # bbox <- sf::st_bbox(template) |> sf::st_as_sfc() |> - # sf::st_transform(crs = sf::st_crs(ff)) - # suppressMessages( - # ff <- ff |> stars:::st_crop.stars(bbox) - # ) - } - - # Record dimensions for later - full_dis <- stars::st_dimensions(ff) - - # Get dimensions other that x,y and time and split - # Commonly used column names - check = c("x","X","lon","longitude", "y", "Y", "lat", "latitude", "time", "Time", "year", "Year") - chk <- which(!names(stars::st_dimensions(ff)) %in% check) - - if(length(chk)>0){ - for(i in chk){ - col_class <- names(stars::st_dimensions(ff))[i] - # FIXME: Dirty hack to remove forest zoning - if(length( grep("zone",col_class,ignore.case = T) )>0) next() - - # And class units as description from over - class_units <- fatt$dim[[col_class]]$units - class_units <- class_units |> - strsplit(";") |> - # Remove emptyspace and special symbols - sapply(function(y) gsub("[^0-9A-Za-z///' ]", "" , y, ignore.case = TRUE) ) |> - sapply(function(y) gsub(" ", "" , y, ignore.case = TRUE) ) - # Convert to vector and make names - class_units <- paste0( - v, "__", - make.names(unlist(class_units)) |> as.vector() - ) - - ff <- ff %>% stars:::split.stars(col_class) %>% setNames(nm = class_units) - - # FIXME: Dirty hack to deal with the forest zone dimension - # If there are more dimensions than 3, aggregate over them - if( length(stars::st_dimensions(ff)) >3){ - # Aggregate spatial-temporally - ff <- stars::st_apply(ff, c("longitude", "latitude", "time"), sum, na.rm = TRUE) - } - } - } - - # Finally aggregate - if(!is.null(template) && is.Raster(template)){ - # FIXME: - # MJ 14/11/2022 - The code below is buggy, resulting in odd curvilinear extrapolations for Europe - # Hacky approach now is to convert to raster, crop, project and then convert back. - ff <- hack_project_stars(ff, template) - # Make background - # bg <- stars::st_as_stars(template) - # - # # Get resolution - # res <- sapply(stars::st_dimensions(bg), "[[", "delta") - # res[1:2] = abs(res[1:2]) # Assumes the first too entries are the coordinates - # assertthat::assert_that(!anyNA(res)) - # - # # And warp by projecting and resampling - # ff <- ff |> st_transform(crs = sf::st_crs(template)) |> - # stars::st_warp(crs = sf::st_crs(bg), - # cellsize = res, - # method = "near") |> - # stars:::st_transform.stars(crs = sf::st_crs(template)) - # Overwrite full dimensions - full_dis <- stars::st_dimensions(ff) - } - # Now append to vector - sc <- c(sc, ff) - rm(ff) - } - invisible(gc()) - assertthat::assert_that(length(names(full_dis))>=3) - - # Format sc object as stars and set dimensions again - sc <- stars::st_as_stars(sc) - assertthat::assert_that(length(sc)>0) - full_dis <- full_dis[c( - grep("x|longitude",names(full_dis), ignore.case = TRUE,value = TRUE), - grep("y|latitude",names(full_dis), ignore.case = TRUE,value = TRUE), - grep("year|time",names(full_dis), ignore.case = TRUE,value = TRUE) - )] # Order assumed to be correct - stars:::st_dimensions(sc) <- full_dis # Target dimensions - - } else { stop("Fileformat not recognized!")} - - # Get time dimension (without applying offset) so at the centre - times <- stars::st_get_dimension_values(sc, "time", center = TRUE) - - # Make checks on length of times and if equal to one, drop. check. - if(length(times)==1){ - if(period == "projection") stop("Found only a single time slot. Projections not possible.") - if(verbose) myLog('[Setup]','yellow','Found only a single time point in file. 
Dropping time dimension.') - # Drop the time dimension - sc <- stars:::adrop.stars(sc, drop = which(names(stars::st_dimensions(sc)) == "time") ) - } - - # Formate times unit and convert to posix if not already set - if(is.numeric(times) && length(times) > 1){ - # Assume year and paste0 as properly POSIX formatted - times <- as.POSIXct( paste0(times, "-01-01") ) - sc <- stars::st_set_dimensions(sc, "time", times) - } - - # Depending on the period, slice the input data - if(period == "reference"){ - # Get the first entry and filter - if(length(times)>1){ - # In case times got removed - times_first <- stars::st_get_dimension_values(sc, "time")[1] - sc <- sc %>% stars:::filter.stars(time == times_first) - times <- times_first;rm(times_first) - } - } else if(period == "projection"){ - # Remove the first time entry instead, only using the last entries - times_allbutfirst <- stars::st_get_dimension_values(sc, "time")[-1] - sc <- sc %>% stars:::filter.stars(time %in% times_allbutfirst) - times <- times_allbutfirst; rm(times_allbutfirst) - } - assertthat::assert_that(length(times)>0, - length(sc)>=1) - - # Create raster template if set - if(!is.null(template)){ - # Check that template is a raster, otherwise rasterize for GLOBIOM use - if(inherits(template, "sf")){ - o <- sc %>% stars:::slice.stars("time" , 1) %>% as("Raster") - if("fasterize" %in% utils::installed.packages()[,1]){ - template <- fasterize::fasterize(sf = template, raster = o, field = NULL) - } else { - template <- raster::rasterize(template, o, field = 1) - } - rm(o) - } - } - - # Now format outputs depending on type, either returning the raster or the stars object - if(oftype == "raster"){ - # Output type raster, use function from utils_scenario - out <- stars_to_raster(sc, which = NULL, template = template) - return(out) - } else { return( sc ) } -} +#' @include utils.R bdproto.R bdproto-biodiversitydistribution.R bdproto-predictors.R bdproto-biodiversityscenario.R +NULL + +#' Add predictors to a Biodiversity distribution object +#' +#' @description +#' This function allows adding predictors to [distribution] or [BiodiversityScenario] +#' objects. Predictors are covariates whose spatial projection has to match +#' the geographic projection of the background layer in the [distribution] object. +#' This function furthermore allows transforming the provided predictors or creating +#' derivates of them. +#' +#' A transformation takes the provided rasters and for instance rescales them or transforms +#' them through a principal component analysis ([prcomp]). In contrast, derivates leave +#' the original provided predictors alone, but instead create new ones, for instance by transforming +#' their values through a quadratic or hinge transformation. Note that this effectively +#' increases the number of predictors in the object, generally requiring stronger regularization by +#' the used [`engine`]. +#' Both transformations and derivates can also be combined. +#' Available options for transformation are: +#' * \code{'none'} - Leaves the provided predictors in the original scale. +#' * \code{'pca'} - Converts the predictors to principal components. Note that this +#' results in a renaming of the variables to principal component axes! +#' * \code{'scale'} - Transforms all predictors by applying [scale] on them. +#' * \code{'norm'} - Normalizes all predictors by transforming them to a scale from 0 to 1. +#' * \code{'windsor'} - Applies a windsorization to the target predictors.
By default +#' this effectively cuts the predictors to the 0.05 and 0.95 quantiles, thus helping to remove +#' extreme outliers. +#' +#' Available options for creating derivates are: +#' * \code{'none'} - No additional predictor derivates are created. +#' * \code{'quadratic'} - Adds quadratic transformed predictors. +#' * \code{'interaction'} - Adds interacting predictors. Interactions need to be specified (\code{"int_variables"})! +#' * \code{'thresh'} - Adds threshold transformed predictors. +#' * \code{'hinge'} - Adds hinge transformed predictors. +#' * \code{'bin'} - Adds predictors binned by their percentiles. +#' +#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object. +#' @param env A [`RasterStack-class`], [`RasterLayer-class`] or [`stars`] object. +#' @param names A [`vector`] of character names describing the environmental stack in case they should be renamed. +#' @param transform A [`vector`] stating whether predictors should be preprocessed in any way (Options: \code{'none'}, \code{'pca'}, \code{'scale'}, \code{'norm'}, \code{'windsor'}) +#' @param derivates A [`vector`] stating whether derivate features should be created (Options: \code{'none'}, \code{'thresh'}, \code{'hinge'}, \code{'quadratic'}, \code{'bin'}, \code{'interaction'}) +#' @param derivate_knots A single [`numeric`] or [`vector`] giving the number of knots for derivate creation if relevant (Default: \code{4}). +#' @param int_variables A [`vector`] of length greater than or equal to \code{2} specifying the covariates used for interactions (Default: \code{NULL}). +#' @param bgmask Check whether the environmental data should be masked with the background layer (Default: \code{TRUE}) +#' @param harmonize_na A [`logical`] value indicating whether NA values should be harmonized among predictors (Default: \code{FALSE}) +#' @param explode_factors [`logical`] indicating whether any factor variables should be split up into binary variables (one per class) (Default: \code{FALSE}). +#' @param priors A [`PriorList-class`] object. Default is set to \code{NULL} which uses default prior assumptions. +#' @param ... Other parameters passed down. +#' @note +#' **Important:** +#' Not every [`engine`] supported by the \pkg{ibis.iSDM} R-package allows missing data points +#' among extracted covariates. Any observation with missing data is thus generally removed prior +#' to model fitting. Ensure that covariates have appropriate no-data settings (for instance setting \code{NA} +#' values to \code{0} or another out-of-range constant). +#' +#' Not every engine actually needs covariates. For instance it is perfectly legitimate +#' to fit a model with only occurrence data and a spatial latent effect ([add_latent]). +#' This corresponds to a spatial kernel density estimate. +#' +#' Certain names such as \code{"offset"} are forbidden as predictor variable names. The function +#' will return an error message if these are used. +#' @aliases add_predictors +#' @examples +#' \dontrun{ +#' obj <- distribution(background) |> +#' add_predictors(covariates, transform = 'scale') +#' obj +#' } +#' @name add_predictors +NULL + +#' @name add_predictors +#' @rdname add_predictors +#' @exportMethod add_predictors +#' @export +methods::setGeneric( + "add_predictors", + signature = methods::signature("x", "env"), + function(x, env, names = NULL, transform = 'scale', derivates = 'none', derivate_knots = 4, int_variables = NULL, bgmask = TRUE, + harmonize_na = FALSE, explode_factors = FALSE, priors = NULL, ...)
standardGeneric("add_predictors")) + +#' @name add_predictors +#' @rdname add_predictors +#' @usage \S4method{add_predictors}{BiodiversityDistribution,RasterBrick}(x, env) +methods::setMethod( + "add_predictors", + methods::signature(x = "BiodiversityDistribution", env = "RasterBrick"), + function(x, env, names = NULL, transform = 'scale', derivates = 'none', derivate_knots = 4, int_variables = NULL, + bgmask = TRUE, harmonize_na = FALSE, explode_factors = FALSE, priors = NULL, ... ) { + assertthat::assert_that(inherits(x, "BiodiversityDistribution"), + !missing(env)) + # Convert env to a stack if it is a single layer only + env <- raster::stack(env) + add_predictors(x, env, names, transform, derivates, derivate_knots, int_variables, bgmask, harmonize_na, explode_factors, priors, ...) + } +) + +#' @name add_predictors +#' @rdname add_predictors +#' @usage \S4method{add_predictors}{BiodiversityDistribution,RasterLayer}(x, env) +methods::setMethod( + "add_predictors", + methods::signature(x = "BiodiversityDistribution", env = "RasterLayer"), + function(x, env, names = NULL, transform = 'scale', derivates = 'none', derivate_knots = 4, int_variables = NULL, + bgmask = TRUE, harmonize_na = FALSE, explode_factors = FALSE, priors = NULL, ... ) { + assertthat::assert_that(inherits(x, "BiodiversityDistribution"), + !missing(env)) + # Convert env to a stack if it is a single layer only + env <- raster::stack(env) + add_predictors(x, env, names, transform, derivates, derivate_knots, int_variables, bgmask, harmonize_na, explode_factors, priors, ...) + } +) + +# TODO: Support objects other than Raster stacks, such as data.frames and stars objects +#' @name add_predictors +#' @rdname add_predictors +#' @usage \S4method{add_predictors}{BiodiversityDistribution,RasterStack}(x, env) +methods::setMethod( + "add_predictors", + methods::signature(x = "BiodiversityDistribution", env = "RasterStack"), + function(x, env, names = NULL, transform = 'scale', derivates = 'none', derivate_knots = 4, int_variables = NULL, + bgmask = TRUE, harmonize_na = FALSE, explode_factors = FALSE, priors = NULL, ...
) { + # Try and match transform and derivates arguments + transform <- match.arg(transform, c('none','pca', 'scale', 'norm', 'windsor') , several.ok = TRUE) + derivates <- match.arg(derivates, c('none','thresh', 'hinge', 'quadratic', 'bin', 'interaction') , several.ok = TRUE) + + assertthat::assert_that(inherits(x, "BiodiversityDistribution"), + is.Raster(env), + all(transform == 'none') || all( transform %in% c('pca', 'scale', 'norm', 'windsor') ), + all(derivates == 'none') || all( derivates %in% c('thresh', 'hinge', 'quadratic', 'bin', 'interaction') ), + is.vector(derivate_knots) || is.numeric(derivate_knots), + is.null(names) || assertthat::is.scalar(names) || is.vector(names), + is.logical(explode_factors), + is.null(priors) || inherits(priors,'PriorList') + ) + # Messenger + if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Adding predictors...') + + if(!is.null(names)) { + assertthat::assert_that(raster::nlayers(env)==length(names), + all(is.character(names)), + msg = 'Provided names not of same length as environmental data.') + # Set names of env + names(env) <- names + } + + # Check that all names are allowed. Match against the exact reserved names, + # since a fixed-string grep of an alternation pattern would never match. + problematic_names <- which( names(env) %in% c("offset", "w", "weight", "spatial_offset", "Intercept", "spatial.field") ) + if( length(problematic_names)>0 ){ + stop(paste0("Some predictor names are not allowed as they might interfere with model fitting: ", paste0(names(env)[problematic_names],collapse = " | "))) + } + + # If priors have been set, save them in the distribution object + if(!is.null(priors)) { + assertthat::assert_that( all( priors$varnames() %in% names(env) ) ) + x <- x$set_priors(priors) + } + # Harmonize NA values + if(harmonize_na){ + if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Harmonizing missing values...') + env <- predictor_homogenize_na(env, fill = FALSE) + } + + # Don't transform or create derivates of factor variables + if(any(is.factor(env))){ + # Make subsets to join back later + env_f <- raster::subset(env, which(is.factor(env))) + env <- raster::subset(env, which(!is.factor(env))) + if(explode_factors){ + # Refactor categorical variables + if(inherits(env_f,'RasterLayer')){ + env_f <- explode_factorized_raster(env_f) + env <- raster::addLayer(env, env_f) + } else { + o <- raster::stack() + for(layer in names(env_f)){ + o <- raster::addLayer(o, explode_factorized_raster(env_f[[layer]])) + } + env_f <- o;rm(o) + # Joining back to full raster stack + env <- raster::stack(env, env_f);rm(env_f) + } + has_factors <- FALSE # Set to FALSE since factors have been exploded. + } else { has_factors <- TRUE } + } else { has_factors <- FALSE } + + # Standardization and scaling + if('none' %notin% transform){ + if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Transforming predictors...') + for(tt in transform) env <- predictor_transform(env, option = tt) + } + + # Calculate derivates if set + if('none' %notin% derivates){ + if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Creating predictor derivates...') + # Specific condition for interaction + if(any(derivates == "interaction")){ + assertthat::assert_that(is.vector(int_variables), length(int_variables)>=2) + attr(env, "int_variables") <- int_variables + } + new_env <- raster::stack() + for(dd in derivates) new_env <- raster::addLayer(new_env, predictor_derivate(env, option = dd, nknots = derivate_knots, int_variables = int_variables) ) + + # Add to env + env <- raster::addLayer(env, new_env) + } + + # Add factors back in if there are any.
+ # This avoids them being transformed or used for derivate creation + if(has_factors){ + env <- raster::addLayer(env, env_f) + } + attr(env, 'has_factors') <- has_factors + + # Assign an attribute to this object to keep track of it + attr(env,'transform') <- transform + + # Mask predictors with existing background layer + if(bgmask){ + env <- raster::mask(env, mask = x$background) + # Re-ratify; this currently only works reliably on stacks + if(has_factors && any(is.factor(env)) ){ + new_env <- raster::stack(env) + new_env[[which(is.factor(env))]] <- raster::ratify(env[[which(is.factor(env))]]) + env <- new_env;rm(new_env) + } else env <- raster::stack(env) + } + + # Check whether predictors already exist, if so overwrite + if(!is.Waiver(x$predictors)) myLog('[Setup]','yellow','Overwriting existing predictors.') + + # Finally set the data to the BiodiversityDistribution object + x$set_predictors( + bdproto(NULL, PredictorDataset, + id = new_id(), + data = env, + ... + ) + ) + } +) + +#' @name add_predictors +#' @rdname add_predictors +#' @usage \S4method{add_predictors}{BiodiversityDistribution, stars}(x, env) +methods::setMethod( + "add_predictors", + methods::signature(x = "BiodiversityDistribution", env = "stars"), + function(x, env, names = NULL, transform = 'scale', derivates = 'none', derivate_knots = 4, int_variables = NULL, + bgmask = TRUE, harmonize_na = FALSE, explode_factors = FALSE, priors = NULL, ... ) { + assertthat::assert_that(inherits(x, "BiodiversityDistribution"), + !missing(env)) + if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Taking first time entry from object.') + + # Convert to raster + env <- stars_to_raster(env, which = 1) + if(is.list(env)) env <- env[[1]] + x <- add_predictors(x, env, names, transform, derivates, derivate_knots, int_variables, bgmask, harmonize_na, explode_factors, priors, ...) + return( x ) + } +) + +# Add elevational delineation as predictor ---- + +#' Create lower and upper limits for an elevational range and add them as separate predictors +#' +#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object. +#' @param layer A [`character`] naming the elevational layer in the distribution object, or a [`RasterLayer`] object. +#' @param lower [`numeric`] value for a lower elevational preference of a species. +#' @param upper [`numeric`] value for an upper elevational preference of a species. +#' @param transform [`character`] Any optional transformation to be applied. Usually not needed (Default: \code{"none"}).
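+#' @examples +#' \dontrun{ +#' # A minimal usage sketch; "background", "covariates" and the layer name "elevation" +#' # are hypothetical placeholders, not objects shipped with the package. +#' x <- distribution(background) |> +#' add_predictors(covariates) |> +#' add_predictor_elevationpref(layer = "elevation", lower = 200, upper = 1500) +#' }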
+#' @name add_predictor_elevationpref +NULL + +#' @name add_predictor_elevationpref +#' @rdname add_predictor_elevationpref +#' @exportMethod add_predictor_elevationpref +#' @export +methods::setGeneric( + "add_predictor_elevationpref", + signature = methods::signature("x", "layer", "lower", "upper", "transform"), + function(x, layer, lower, upper, transform = "none") standardGeneric("add_predictor_elevationpref")) + +#' @name add_predictor_elevationpref +#' @rdname add_predictor_elevationpref +#' @usage \S4method{add_predictor_elevationpref}{BiodiversityDistribution, ANY, numeric, numeric, character}(x, layer, lower, upper, transform) +methods::setMethod( + "add_predictor_elevationpref", + methods::signature(x = "BiodiversityDistribution", layer = "ANY", lower = "numeric", upper = "numeric"), + function(x, layer, lower, upper, transform = "none") { + assertthat::assert_that(inherits(x, "BiodiversityDistribution"), + is.Raster(layer) || is.character(layer), + is.numeric(lower) || is.na(lower), + is.numeric(upper) || is.na(upper), + is.character(transform) + ) + # Messenger + if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Formatting elevational preference predictors...') + + # If layer is a character, check that it is in the provided object + if(is.character(layer)){ + assertthat::assert_that(layer %in% x$get_predictor_names()) + layer <- x$predictors$get_data()[[layer]] + } else { + # If it is a raster + # Check that background and layer align, otherwise warn and align them + if(!raster::compareRaster(layer, x$background, stopiffalse = FALSE)){ + warning('Supplied layer does not align with background! Aligning them now...') + layer <- alignRasters(layer, x$background, method = 'bilinear', func = mean, cl = FALSE) + } + } + + # Format lower and upper preferences + if(is.na(lower)) lower <- raster::cellStats(layer, "min") + if(is.na(upper)) upper <- raster::cellStats(layer, "max") + + # Now create thresholded derivates of lower and upper elevation + ras1 <- layer + # ras2[ras2 < lower] <- 0; ras2[ras2 > upper] <- 0; ras2[ras2 > 0] <- 1 # Both ways + ras1[layer < lower] <- 0; ras1[ras1 > lower] <- 1 + ras2 <- layer + ras2[ras2 < upper] <- 0; ras2[ras2 > 0] <- 1 + # If the lower layer is constant (equal minimum and maximum), keep only the upper one + if(raster::cellStats(ras1, "min") == raster::cellStats(ras1, "max")){ + o <- ras2 + # Ensure that all layers have a minimum and a maximum + o[is.na(o)] <- 0; o <- raster::mask(o, x$background) + names(o) <- c('elev_high') + } else { + o <- raster::stack(ras1, ras2) + # Ensure that all layers have a minimum and a maximum + o[is.na(o)] <- 0; o <- raster::mask(o, x$background) + names(o) <- c('elev_low', 'elev_high') + } + rm(ras1,ras2) + + # Add as predictor + if(is.Waiver(x$predictors)){ + x <- add_predictors(x, env = o, transform = transform, derivates = 'none') + } else { + for(n in names(o)){ + r <- o[[n]] + # If predictor transformation is specified, apply + if(transform != "none") r <- predictor_transform(r, option = transform) + x$predictors <- x$predictors$set_data(n, r) + rm(r) + } + } + return(x) + } +) + +# Add species ranges as predictor ---- +#' Add a range of a species as predictor to a distribution object +#' +#' @description +#' This function allows adding a species range, which is usually drawn by experts in a separate process, +#' as a spatially explicit prior. Both [`sf`] and [`Raster`]-objects are supported as input. +#' +#' Users are advised to look at the [`bossMaps`] R-package presented as part of Merow et al.
(2017), +#' which allows flexible calculation of non-linear distance transforms from the boundary of the range. +#' Outputs of this package could be added directly to this function. +#' **Note that this function adds the range as predictor and not as offset. For this purpose a separate function [`add_offset_range()`] exists.** +#' +#' Additional options allow including the range either as \code{"binary"} or as \code{"distance"} transformed +#' predictor. The difference is that the range is either directly included as a presence-only predictor or +#' alternatively with a linear distance transform from the range boundary. The parameter +#' \code{"distance_max"} can be specified to constrain this distance transform. +#' +#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object. +#' @param layer A [`sf`] or [`Raster`] object with the range for the target feature. +#' @param method [`character`] describing how the range should be included (\code{"binary"} | \code{"distance"}). +#' @param distance_max Numeric threshold on the maximum distance (Default: \code{NULL}). +#' @param fraction An optional [`RasterLayer`] object that is multiplied with the digitized raster layer. +#' Can be used, for example, to remove or reduce the expected value (Default: \code{NULL}). +#' @param priors A [`PriorList-class`] object. Default is set to \code{NULL} which uses default prior assumptions. +#' @references +#' * Merow, C., Wilson, A. M., & Jetz, W. (2017). Integrating occurrence data and expert maps for improved species range predictions. Global Ecology and Biogeography, 26(2), 243–258. https://doi.org/10.1111/geb.12539 +#' @name add_predictor_range +NULL + +#' @name add_predictor_range +#' @rdname add_predictor_range +#' @exportMethod add_predictor_range +#' @export +methods::setGeneric( + "add_predictor_range", + signature = methods::signature("x", "layer", "method"), + function(x, layer, method = 'distance', distance_max = NULL, fraction = NULL, priors = NULL) standardGeneric("add_predictor_range")) + +#' Function for when a distance raster is directly supplied (precomputed) +#' @name add_predictor_range +#' @rdname add_predictor_range +#' @usage \S4method{add_predictor_range}{BiodiversityDistribution, RasterLayer}(x, layer) +methods::setMethod( + "add_predictor_range", + methods::signature(x = "BiodiversityDistribution", layer = "RasterLayer"), + function(x, layer, method = 'precomputed_range', fraction = NULL, priors = NULL) { + assertthat::assert_that(inherits(x, "BiodiversityDistribution"), + is.Raster(layer), + is.Raster(fraction) || is.null(fraction), + is.character(method) + ) + # Messenger + if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Adding range predictors...') + + # Check that background and range align, otherwise warn and align them + if(!raster::compareRaster(layer, x$background, stopiffalse = FALSE)){ + warning('Supplied range does not align with background!
Aligning them now...') + layer <- alignRasters(layer, x$background, method = 'bilinear', func = mean, cl = FALSE) + } + names(layer) <- method + + # Multiply with fraction layer if set + if(!is.null(fraction)){ + # Rescale if necessary and set 0 to a small constant 1e-6 + if(raster::cellStats(fraction, "min") < 0) fraction <- predictor_transform(fraction, option = "norm") + fraction[fraction==0] <- 1e-6 + layer <- layer * fraction + } + + # Add as predictor + if(is.Waiver(x$predictors)){ + x <- add_predictors(x, env = layer, transform = 'none', derivates = 'none', priors = priors) + } else { + x$predictors <- x$predictors$set_data('range_distance', layer) + if(!is.null(priors)) { + # FIXME: Ideally attempt to match varnames against supplied predictors via match.arg or similar + assertthat::assert_that( all( priors$varnames() %in% names(layer) ) ) + x <- x$set_priors(priors) + } + } + return(x) + } +) + +#' @name add_predictor_range +#' @rdname add_predictor_range +#' @usage \S4method{add_predictor_range}{BiodiversityDistribution, sf}(x, layer) +methods::setMethod( + "add_predictor_range", + methods::signature(x = "BiodiversityDistribution", layer = "sf"), + function(x, layer, method = 'distance', distance_max = Inf, fraction = NULL, priors = NULL ) { + assertthat::assert_that(inherits(x, "BiodiversityDistribution"), + is.character(method), + inherits(layer, 'sf'), + method %in% c('binary','distance'), + is.null(fraction) || is.Raster(fraction), + is.null(distance_max) || is.numeric(distance_max) || is.infinite(distance_max), + is.null(priors) || inherits(priors,'PriorList') + ) + # Messenger + if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Adding range predictors...') + + # Reproject if necessary + if(sf::st_crs(layer) != sf::st_crs(x$background)) layer <- sf::st_transform(layer, sf::st_crs(x$background)) + + # Template raster for background + if(!is.Waiver(x$predictors)){ + temp <- emptyraster(x$predictors$get_data()) + } else { + # TODO: Eventually make this work better + myLog('[Setup]','red','CAREFUL - This might not work without predictors already in the model.') + temp <- raster::raster(raster::extent(x$background), resolution = 1) + } + + # Rasterize the range + if( 'fasterize' %in% utils::installed.packages()[,1] ){ + ras_range <- try({ fasterize::fasterize(layer, temp, field = NULL) }, silent = TRUE) + if(inherits(ras_range,"try-error")){ + myLog('[Setup]','yellow','Fasterize package needs to be re-installed!') + ras_range <- raster::rasterize(layer, temp, field = 1, background = NA) + } + } else { + ras_range <- raster::rasterize(layer, temp, field = 1, background = NA) + } + + # -------------- # + if(method == 'binary'){ + dis <- ras_range + dis[is.na(dis)] <- 0 + # Mask with the background again + dis <- raster::mask(dis, x$background) + names(dis) <- 'binary_range' + } else if(method == 'distance'){ + # Calculate the linear distance from the range + dis <- raster::gridDistance(ras_range, origin = 1) + dis <- raster::mask(dis, x$background) + # If max distance is specified + if(!is.null(distance_max) && !is.infinite(distance_max)){ + dis[dis > distance_max] <- NA # Set values above threshold to NA + attr(dis, "distance_max") <- distance_max + } else { distance_max <- raster::cellStats(dis, "max") } + # Determine the decay rate from the maximum distance + alpha <- 1 / (distance_max / 4 ) # Divide by 4 for a quarter in each direction + # Grow baseline raster by using an exponentially weighted kernel + dis <- raster::calc(dis, fun = function(x) exp(-alpha * x)) + # Convert to
relative for better scaling in predictions + dis <- (dis / raster::cellStats(dis,'max')) + + # Set NA to 0 and mask again + dis[is.na(dis)] <- 0 + dis <- raster::mask(dis, x$background) + names(dis) <- 'distance_range' + } + + # Multiply with fraction layer if set + if(!is.null(fraction)){ + # Rescale if necessary and set 0 to a small constant 1e-6 + if(raster::cellStats(fraction, "min") < 0) fraction <- predictor_transform(fraction, option = "norm") + fraction[fraction==0] <- 1e-6 + # Apply to the rasterized layer (not the sf input) + dis <- dis * fraction + } + + # If priors have been set, save them in the distribution object + if(!is.null(priors)) { + # FIXME: Ideally attempt to match varnames against supplied predictors via match.arg or similar + assertthat::assert_that( all( priors$varnames() %in% names(dis) ) ) + x <- x$set_priors(priors) + } + + # Add as predictor + if(is.Waiver(x$predictors)){ + x <- add_predictors(x, env = dis, transform = 'none', derivates = 'none') + } else { + x$predictors <- x$predictors$set_data('range_distance', dis) + } + return(x) + } +) + +#' Remove specific predictors from a [distribution] object +#' +#' @description +#' Remove a particular variable from a [distribution] object with a +#' [`PredictorDataset-class`]. +#' See Examples. +#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object. +#' @param names A [`vector`] of character names describing the environmental stack. +#' @examples +#' \dontrun{ +#' distribution(background) |> +#' add_predictors(my_covariates) |> +#' rm_predictors(names = "Urban") +#' } +#' @name rm_predictors +NULL + +#' @name rm_predictors +#' @rdname rm_predictors +#' @exportMethod rm_predictors +#' @export +methods::setGeneric( + "rm_predictors", + signature = methods::signature("x", "names"), + function(x, names) standardGeneric("rm_predictors")) + +#' @name rm_predictors +#' @rdname rm_predictors +#' @usage \S4method{rm_predictors}{BiodiversityDistribution,vector}(x, names) +methods::setMethod( + "rm_predictors", + methods::signature(x = "BiodiversityDistribution", names = "character"), + # rm_predictors ---- + function(x, names ) { + assertthat::assert_that(inherits(x, "BiodiversityDistribution"), + is.character(names) || assertthat::is.scalar(names) || is.vector(names) + ) + # TODO: Maybe implement a flexible wildcard, base::startsWith() + # Is there anything to remove + assertthat::assert_that(!is.Waiver(x$predictors), + all( names %in% x$get_predictor_names() ), + msg = 'Suggested variables not in model!') + + # Finally set the data to the BiodiversityDistribution object + x$rm_predictors(names) + } +) + +#' Select specific predictors from a [distribution] object +#' +#' @description +#' This function allows selecting a particular set of predictors, given as a [`character`] +#' vector of names, from an already added [`PredictorDataset-class`] object. +#' See Examples. +#' +#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object. +#' @param names A [`vector`] of character names describing the environmental stack.
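+#' @note Internally this inverts the selection: every predictor whose name is not part of \code{names} is removed from the object.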
+#' @examples +#' \dontrun{ +#' distribution(background) |> +#' add_predictors(my_covariates) |> +#' sel_predictors(names = c("Forest", "Elevation")) +#' } +#' @name sel_predictors +NULL + +#' @name sel_predictors +#' @rdname sel_predictors +#' @exportMethod sel_predictors +#' @export +methods::setGeneric( + "sel_predictors", + signature = methods::signature("x", "names"), + function(x, names) standardGeneric("sel_predictors")) + +#' @name sel_predictors +#' @rdname sel_predictors +#' @usage \S4method{sel_predictors}{BiodiversityDistribution,vector}(x, names) +methods::setMethod( + "sel_predictors", + methods::signature(x = "BiodiversityDistribution", names = "character"), + # sel_predictors ---- + function(x, names ) { + assertthat::assert_that(inherits(x, "BiodiversityDistribution"), + is.character(names) || assertthat::is.scalar(names) || is.vector(names) + ) + # TODO: Maybe implement a flexible wildcard, base::startsWith() + # Is there anything to remove + assertthat::assert_that(!is.Waiver(x$predictors), + any( names %in% x$get_predictor_names() ), + msg = 'Suggested variables not in model!') + + # Get current predictors + varnames <- x$get_predictor_names() + varnames <- varnames[which(varnames %notin% names)] + + # Remove all predictors listed + if(length(varnames)>=1) x$rm_predictors(varnames) + } +) + +# ---------------- # +# Add predictor actions for scenario objects ---- +#' @name add_predictors +#' @rdname add_predictors +#' @usage \S4method{add_predictors}{BiodiversityScenario,RasterBrick}(x, env) +methods::setMethod( + "add_predictors", + methods::signature(x = "BiodiversityScenario", env = "RasterBrick"), + function(x, env, names = NULL, transform = 'none', derivates = 'none', + derivate_knots = 4, int_variables = NULL, harmonize_na = FALSE, ... ) { + assertthat::assert_that(inherits(x, "BiodiversityScenario"), + !missing(env)) + env <- raster_to_stars(env) # Convert to stars + + add_predictors(x, env, names = names, transform = transform, derivates = derivates, + derivate_knots = derivate_knots, int_variables = int_variables, harmonize_na = harmonize_na, ...) + } +) + +#' @name add_predictors +#' @rdname add_predictors +#' @usage \S4method{add_predictors}{BiodiversityScenario,RasterLayer}(x, env) +methods::setMethod( + "add_predictors", + methods::signature(x = "BiodiversityScenario", env = "RasterLayer"), + function(x, env, names = NULL, transform = 'none', derivates = 'none', + derivate_knots = 4, int_variables = NULL, harmonize_na = FALSE, ... ) { + assertthat::assert_that(inherits(x, "BiodiversityScenario"), + !missing(env)) + + env <- raster_to_stars(env) # Convert to stars + + add_predictors(x, env, names = names, transform = transform, derivates = derivates, + derivate_knots = derivate_knots, int_variables = int_variables, harmonize_na = harmonize_na, ...) + } +) + +#' @name add_predictors +#' @rdname add_predictors +#' @usage \S4method{add_predictors}{BiodiversityScenario,RasterStack}(x, env) +methods::setMethod( + "add_predictors", + methods::signature(x = "BiodiversityScenario", env = "RasterStack"), + function(x, env, names = NULL, transform = 'none', derivates = 'none', + derivate_knots = 4, int_variables = NULL, harmonize_na = FALSE, ... ) { + assertthat::assert_that(inherits(x, "BiodiversityScenario"), + !missing(env)) + + env <- raster_to_stars(env) # Convert to stars + + add_predictors(x, env, names = names, transform = transform, derivates = derivates, + derivate_knots = derivate_knots, int_variables = int_variables, harmonize_na = harmonize_na, ...) 
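+ # The call above dispatches to the stars method below, which handles transformations, derivates and the time dimension.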
+ } +) + +#' @name add_predictors +#' @rdname add_predictors +#' @usage \S4method{add_predictors}{BiodiversityScenario, stars}(x, env) +methods::setMethod( + "add_predictors", + methods::signature(x = "BiodiversityScenario", env = "stars"), + function(x, env, names = NULL, transform = 'none', derivates = 'none', + derivate_knots = 4, int_variables = NULL, harmonize_na = FALSE, ... ) { + # Try and match transform and derivatives arguments + transform <- match.arg(transform, c('none','pca', 'scale', 'norm', 'windsor') , several.ok = TRUE) + derivates <- match.arg(derivates, c('none','thresh', 'hinge', 'quadratic', 'bin') , several.ok = TRUE) + + assertthat::validate_that(inherits(env,'stars'),msg = 'Projection rasters need to be stars stack!') + assertthat::assert_that(inherits(x, "BiodiversityScenario"), + transform == 'none' || all( transform %in% c('pca', 'scale', 'norm', 'windsor') ), + derivates == 'none' || all( derivates %in% c('thresh', 'hinge', 'quadratic', 'bin') ), + is.vector(derivate_knots) || is.numeric(derivate_knots), + is.null(int_variables) || is.character(int_variables), + is.null(names) || assertthat::is.scalar(names) || is.vector(names), + is.logical(harmonize_na) + ) + # Some stars checks + assertthat::validate_that(length(env) >= 1) + + # Get model object + obj <- x$get_model() + + # Messenger + if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Adding scenario predictors...') + + # Rename attributes if names is specified + if(!is.null(names)){ + assertthat::assert_that(length(names) == length(env)) + names(env) <- names + } + + # Harmonize NA values + if(harmonize_na){ + stop('Missing data harmonization for stars not yet implemented!') #TODO + if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Harmonizing missing values...') + env <- predictor_homogenize_na(env, fill = FALSE) + } + + # Standardization and scaling + if('none' %notin% transform){ + if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Transforming predictors...') + for(tt in transform) env <- predictor_transform(env, option = tt) + } + + # # Calculate derivates if set + if('none' %notin% derivates){ + # Get variable names + varn <- obj$get_coefficients()[['Feature']] + # Are there any derivates present in the coefficients? + if(any( length( grep("hinge__|bin__|quad__|thresh__", varn ) ) > 0 )){ + if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Creating predictor derivates...') + for(dd in derivates){ + if(any(grep(dd, varn))){ + env <- predictor_derivate(env, option = dd, nknots = derivate_knots, deriv = varn, int_variables = int_variables) + } else { + if(getOption('ibis.setupmessages')) myLog('[Setup]','red', paste0(derivates,' derivates should be created, but not found among coefficients!')) + } + } + } else { + if(getOption('ibis.setupmessages')) myLog('[Setup]','red','No derivates found among coefficients. None created for projection!') + } + } + + # Get, guess and format Time period + env_dim <- stars::st_dimensions(env) + timeperiod <- stars::st_get_dimension_values(env, + grep("year|time|date", names(env_dim), ignore.case = TRUE, value = TRUE) + ) + + # Check whether predictors already exist, if so overwrite + # TODO: In the future one could think of supplying predictors of varying grain + if(!is.Waiver(x$predictors)) myLog('[Setup]','yellow','Overwriting existing predictors.') + + # Finally set the data to the BiodiversityScenario object + x$set_predictors( + bdproto(NULL, PredictorDataset, + id = new_id(), + data = env, + timeperiod = timeperiod, + ... 
+    )
+    )
+  }
+)
+
+# --------------------- #
+#### GLOBIOM specific code ----
+
+#' Add GLOBIOM-DownScaleR derived predictors to a Biodiversity distribution object
+#'
+#' @description
+#' This is a customized function to format and add downscaled land-use shares from
+#' the [Global Biosphere Management Model (GLOBIOM)](https://iiasa.github.io/GLOBIOM/) to a
+#' [distribution] or [BiodiversityScenario] in ibis.iSDM. GLOBIOM is a partial-equilibrium model
+#' developed at IIASA and represents land-use sectors with a rich set of environmental and
+#' socio-economic parameters, where for instance the agricultural and forestry sectors are
+#' represented through dedicated process-based models. GLOBIOM outputs are spatially explicit
+#' and usually at a half-degree resolution globally.
+#' For finer-grained analyses GLOBIOM outputs can be produced in a downscaled format with a
+#' customized statistical [downscaling module](https://github.com/iiasa/DownScale).
+#'
+#' The purpose of this function is to format the GLOBIOM outputs of *DownScale* for use in the
+#' ibis.iSDM package.
+#' @details
+#' See [`add_predictors()`] for additional parameters and customizations.
+#' For more (manual) control the function for formatting the GLOBIOM data can also be
+#' called directly via `formatGLOBIOM()`.
+#'
+#' @param x A [`BiodiversityDistribution-class`] or [`BiodiversityScenario-class`] object.
+#' @param fname A [`character`] pointing to a netCDF with the GLOBIOM data.
+#' @param names A [`vector`] of character names describing the environmental stack in case they should be renamed (Default: \code{NULL}).
+#' @param transform A [`vector`] stating whether predictors should be preprocessed in any way (Options: \code{'none'}, \code{'pca'}, \code{'scale'}, \code{'norm'}, \code{'windsor'}).
+#' @param derivates A [`vector`] stating whether derivate features should be created (Options: \code{'none'}, \code{'thresh'}, \code{'hinge'}, \code{'quadratic'}, \code{'bin'}).
+#' @param derivate_knots A single [`numeric`] or [`vector`] giving the number of knots for derivate creation if relevant (Default: \code{4}).
+#' @param int_variables A [`vector`] with length greater than or equal to \code{2} specifying the covariates (Default: \code{NULL}).
+#' @param bgmask A [`logical`] check whether the environmental data should be masked with the background layer (Default: \code{TRUE}).
+#' @param harmonize_na A [`logical`] value indicating whether NA values should be harmonized among predictors (Default: \code{FALSE}).
+#' @param priors A [`PriorList-class`] object. Default is set to \code{NULL} which uses default prior assumptions.
+#' @param ... Other parameters passed down.
+#' @seealso [add_predictors]
+#' @examples
+#' \dontrun{
+#' obj <- distribution(background) |>
+#'  add_predictors_globiom(fname = "", transform = 'none')
+#' obj
+#' }
+#' @name add_predictors_globiom
+NULL
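+# Usage sketch (editorial illustration; the file name below is hypothetical):
+# as noted in the details above, the formatting step can also be run manually
+# and the result added like any other predictor stack.
+# lu <- formatGLOBIOM(fname = "globiom_shares.nc", oftype = "raster",
+#                     period = "reference", template = background)
+# obj <- distribution(background) |> add_predictors(lu, transform = 'none')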
+#' @name add_predictors_globiom
+#' @rdname add_predictors_globiom
+#' @exportMethod add_predictors_globiom
+#' @export
+methods::setGeneric(
+  "add_predictors_globiom",
+  signature = methods::signature("x", "fname"),
+  function(x, fname, names = NULL, transform = 'none', derivates = 'none', derivate_knots = 4, int_variables = NULL,
+           bgmask = TRUE, harmonize_na = FALSE,
+           priors = NULL, ...) standardGeneric("add_predictors_globiom"))
+
+#' @name add_predictors_globiom
+#' @rdname add_predictors_globiom
+#' @usage \S4method{add_predictors_globiom}{BiodiversityDistribution,character}(x, fname)
+methods::setMethod(
+  "add_predictors_globiom",
+  methods::signature(x = "BiodiversityDistribution", fname = "character"),
+  function(x, fname, names = NULL, transform = 'none', derivates = 'none', derivate_knots = 4, int_variables = NULL,
+           bgmask = TRUE, harmonize_na = FALSE, priors = NULL, ... ) {
+    # Try and match transform and derivates arguments
+    transform <- match.arg(transform, c('none','pca', 'scale', 'norm', 'windsor'), several.ok = TRUE)
+    derivates <- match.arg(derivates, c('none','thresh', 'hinge', 'quadratic', 'bin'), several.ok = TRUE)
+
+    # Check that file exists and has the correct endings
+    assertthat::assert_that(is.character(fname),
+                            file.exists(fname),
+                            assertthat::is.readable(fname),
+                            assertthat::has_extension(fname, "nc"),
+                            msg = "The provided path to GLOBIOM land-use shares could not be found or is not readable!"
+    )
+
+    assertthat::assert_that(inherits(x, "BiodiversityDistribution"),
+                            is.null(names) || assertthat::is.scalar(names) || is.vector(names),
+                            is.null(priors) || inherits(priors,'PriorList'),
+                            is.vector(derivate_knots) || is.null(derivate_knots),
+                            is.null(int_variables) || is.vector(int_variables)
+    )
+
+    # Messenger
+    if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Formatting GLOBIOM inputs for species distribution modelling.')
+
+    # Get and format the GLOBIOM data
+    env <- formatGLOBIOM(fname = fname,
+                         oftype = "raster",
+                         period = "reference",
+                         template = x$background
+    )
+
+    if(is.list(env)) env <- env[[1]] # Take the first reference entry
+    assertthat::assert_that(is.Raster(env),
+                            raster::nlayers(env)>0)
+
+    if(!is.null(names)) {
+      assertthat::assert_that(raster::nlayers(env)==length(names),
+                              all(is.character(names)),
+                              msg = 'Provided names not of same length as environmental data.')
+      # Set names of env
+      names(env) <- names
+    }
+
+    # Check that all names are allowed. The pattern uses alternation and
+    # anchors, so regular (non-fixed) expression matching is required here.
+    problematic_names <- grep("^(offset|w|weight|spatial_offset|Intercept|spatial\\.field)$", names(env))
+    if( length(problematic_names)>0 ){
+      stop(paste0("Some predictor names are not allowed as they might interfere with model fitting: ", paste0(names(env)[problematic_names],collapse = " | ")))
+    }
+
+    # If priors have been set, save them in the distribution object
+    if(!is.null(priors)) {
+      assertthat::assert_that( all( priors$varnames() %in% names(env) ) )
+      x <- x$set_priors(priors)
+    }
+    # Harmonize NA values
+    if(harmonize_na){
+      if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Harmonizing missing values...')
+      env <- predictor_homogenize_na(env, fill = FALSE)
+    }
+
+    # Standardization and scaling
+    if('none' %notin% transform){
+      if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Transforming predictors...')
+      for(tt in transform) env <- predictor_transform(env, option = tt)
+    }
+
+    # Calculate derivates if set
+    if('none' %notin% derivates){
+      if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Creating predictor derivates...')
+      new_env <- raster::stack()
+      for(dd in derivates) new_env <- raster::addLayer(new_env, predictor_derivate(env, option = dd, nknots = derivate_knots, int_variables = int_variables) )
+
+      # Add to env
+      env <- raster::addLayer(env, new_env)
+    }
+
+    # Generally not relevant for GLOBIOM unless created as derivate
+    attr(env, 'has_factors') <- FALSE
+
+    # Assign an attribute to this object to keep track
of it + attr(env,'transform') <- transform + + # Mask predictors with existing background layer + if(bgmask){ + env <- raster::mask(env, mask = x$background) + env <- raster::stack(env) + } + + # Check whether predictors already exist, if so overwrite + if(!is.Waiver(x$predictors)) myLog('[Setup]','yellow','Overwriting existing predictors.') + + # Finally set the data to the BiodiversityDistribution object + x$set_predictors( + bdproto(NULL, PredictorDataset, + id = new_id(), + data = env, + ... + ) + ) + } +) + +#' @name add_predictors_globiom +#' @rdname add_predictors_globiom +#' @usage \S4method{add_predictors_globiom}{BiodiversityScenario, character}(x, fname) +methods::setMethod( + "add_predictors_globiom", + methods::signature(x = "BiodiversityScenario", fname = "character"), + function(x, fname, names = NULL, transform = 'none', derivates = 'none', derivate_knots = 4, int_variables = NULL, + harmonize_na = FALSE, ... ) { + # Try and match transform and derivatives arguments + transform <- match.arg(transform, c('none','pca', 'scale', 'norm', 'windsor') , several.ok = TRUE) + derivates <- match.arg(derivates, c('none','thresh', 'hinge', 'quadratic', 'bin') , several.ok = TRUE) + + # Check that file exists and has the correct endings + assertthat::assert_that(is.character(fname), + file.exists(fname), + assertthat::is.readable(fname), + assertthat::has_extension(fname, "nc"), + msg = "The provided path to GLOBIOM land-use shares could not be found or is not readable!" + ) + assertthat::assert_that(inherits(x, "BiodiversityScenario"), + is.null(names) || assertthat::is.scalar(names) || is.vector(names), + is.logical(harmonize_na), + is.vector(derivate_knots) || is.null(derivate_knots), + is.null(int_variables) || is.vector(int_variables) + ) + + # Get model object + obj <- x$get_model() + + # Messenger + if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Adding GLOBIOM predictors to scenario object...') + + # Get and format the GLOBIOM data + env <- formatGLOBIOM(fname = fname, + oftype = "stars", + period = "projection", + template = obj$model$background + ) + assertthat::assert_that( inherits(env, "stars") ) + + # Rename attributes if names is specified + if(!is.null(names)){ + assertthat::assert_that(length(names) == length(env)) + names(env) <- names + } + + # Harmonize NA values + if(harmonize_na){ + stop('Missing data harmonization for stars not yet implemented!') #TODO + if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Harmonizing missing values...') + env <- predictor_homogenize_na(env, fill = FALSE) + } + + # Standardization and scaling + if('none' %notin% transform){ + if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Transforming predictors...') + for(tt in transform) env <- predictor_transform(env, option = tt) + } + + # # Calculate derivates if set + if('none' %notin% derivates){ + # Get variable names + varn <- obj$get_coefficients()[['Feature']] + # Are there any derivates present in the coefficients? 
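+      # Note (editorial): derivate predictors created during model fitting are
+      # identifiable by a prefix in the coefficient names ('hinge__', 'bin__',
+      # 'quad__' or 'thresh__' followed by the variable name), which is what
+      # the check below relies on before re-creating the same derivates for
+      # the projection.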
+      if(any( length( grep("hinge__|bin__|quad__|thresh__", varn ) ) > 0 )){
+        if(getOption('ibis.setupmessages')) myLog('[Setup]','green','Creating predictor derivates...')
+        for(dd in derivates){
+          if(any(grep(dd, varn))){
+            env <- predictor_derivate(env, option = dd, nknots = derivate_knots, int_variables = int_variables, deriv = varn)
+          } else {
+            if(getOption('ibis.setupmessages')) myLog('[Setup]','red', paste0(dd,' derivates should be created, but not found among coefficients!'))
+          }
+        }
+      } else {
+        if(getOption('ibis.setupmessages')) myLog('[Setup]','red','No derivates found among coefficients. None created for projection!')
+      }
+    }
+
+    # Get and format the time period
+    env_dim <- stars::st_dimensions(env)
+    timeperiod <- stars::st_get_dimension_values(env, "time", center = TRUE)
+    if(is.numeric(timeperiod)){
+      # Format to POSIX, assuming years only
+      timeperiod <- as.POSIXct(paste0(timeperiod,"-01-01"))
+    }
+    if(anyNA(timeperiod)) stop('Third dimension is not a time value!')
+
+    # Check whether predictors already exist, if so overwrite
+    if(!is.Waiver(x$predictors)) myLog('[Setup]','yellow','Overwriting existing predictors.')
+
+    # Finally set the data to the BiodiversityScenario object
+    x$set_predictors(
+      bdproto(NULL, PredictorDataset,
+              id = new_id(),
+              data = env,
+              timeperiod = timeperiod,
+              ...
+      )
+    )
+  }
+)
+
+#' Function to format a prepared GLOBIOM netCDF file for use in ibis.iSDM
+#'
+#' @description
+#' This function expects a downscaled GLOBIOM output as created in the BIOCLIMA project.
+#' Likely of little use for anyone outside IIASA.
+#'
+#' @param fname A filename in [`character`] pointing to a GLOBIOM output in netCDF format.
+#' @param oftype A [`character`] denoting the output type (Default: \code{'raster'}).
+#' @param ignore A [`vector`] of variables to be ignored (Default: \code{NULL}).
+#' @param period A [`character`] limiting the period to be returned from the formatted data.
+#' Options include \code{"reference"} for the first entry, \code{"projection"} for all entries but the first,
+#' and \code{"all"} for all entries (Default: \code{"all"}).
+#' @param template An optional [`RasterLayer`] object onto which projections should be transformed.
+#' @param verbose [`logical`] on whether to be chatty.
+#'
+#' @examples \dontrun{
+#' # Expects a filename pointing to a netCDF file.
+#' covariates <- formatGLOBIOM(fname)
+#' }
+#' @keywords internal, utils
+formatGLOBIOM <- function(fname, oftype = "raster", ignore = NULL,
+                          period = "all", template = NULL,
+                          verbose = getOption("ibis.setupmessages")){
+  assertthat::assert_that(
+    file.exists(fname),
+    assertthat::has_extension(fname, "nc"),
+    is.character(oftype),
+    is.null(ignore) || is.character(ignore),
+    is.character(period),
+    is.character(fname),
+    is.logical(verbose)
+  )
+  period <- match.arg(period, c("reference", "projection", "all"), several.ok = FALSE)
+  check_package("stars")
+  check_package("dplyr")
+  check_package("cubelyr")
+  check_package("ncdf4")
+
+  # Try and load in the GLOBIOM file to get the attributes
+  fatt <- ncdf4::nc_open(fname)
+  if(verbose) myLog('[Setup]','green',"Found ", fatt$ndims, " dimensions and ", fatt$nvars, " variables")
+
+  # Get all dimension names and variable names
+  dims <- names(fatt$dim)
+  vars <- names(fatt$var)
+  if(!is.null(ignore)) assertthat::assert_that( all( ignore %in% vars ) )
+
+  attrs <- list() # For storing the attributes
+  sc <- vector()  # For storing the scenario files
+
+  # Now open the netCDF file with stars
+  if( length( grep("netcdf", stars::detect.driver(fname), ignore.case = TRUE) )>0 ){
+    if(verbose){
+      myLog('[Predictor]','green',"Loading in predictor file...")
+      pb <- progress::progress_bar$new(total = length(vars),
+                                       format = "Loading :variable (:spin) [:bar] :percent")
+    }
+
+    for(v in vars) {
+      if(verbose) pb$tick(tokens = list(variable = v))
+      # Skip ignored variables ('ignore' can be a vector, hence %in% rather than ==)
+      if(!is.null(ignore)) if(v %in% ignore) next()
+
+      # Get and save the attributes of each variable
+      attrs[[v]] <- ncdf4::ncatt_get(fatt, varid = v, verbose = FALSE)
+
+      # Load in the variable
+      suppressWarnings(
+        suppressMessages(
+          ff <- stars::read_ncdf(fname,
+                                 var = v,
+                                 proxy = FALSE,
+                                 make_time = TRUE,  # Make time on 'time' band
+                                 make_units = FALSE # To avoid unnecessary errors due to unknown units
+          )
+        )
+      )
+
+      # Sometimes variables don't seem to have a time dimension
+      if(!"time" %in% names(stars::st_dimensions(ff))) next()
+
+      # Crop to background extent if set
+      if(!is.null(template)){
+        # FIXME: Currently this code, while working, clips too much of Europe.
+        # Likely needs to be something like:
+        # bbox <- sf::st_bbox(template) |> sf::st_as_sfc() |>
+        #   sf::st_transform(crs = sf::st_crs(ff))
+        # suppressMessages(
+        #   ff <- ff |> stars:::st_crop.stars(bbox)
+        # )
+      }
+
+      # Record dimensions for later
+      full_dis <- stars::st_dimensions(ff)
+
+      # Get dimensions other than x, y and time and split
+      # Commonly used column names
+      check = c("x","X","lon","longitude", "y", "Y", "lat", "latitude", "time", "Time", "year", "Year")
+      chk <- which(!names(stars::st_dimensions(ff)) %in% check)
+
+      if(length(chk)>0){
+        for(i in chk){
+          col_class <- names(stars::st_dimensions(ff))[i]
+          # FIXME: Dirty hack to remove forest zoning
+          if(length( grep("zone",col_class,ignore.case = T) )>0) next()
+
+          # Take the class units as descriptions from the netCDF attributes
+          class_units <- fatt$dim[[col_class]]$units
+          class_units <- class_units |>
+            strsplit(";") |>
+            # Remove empty space and special symbols
+            sapply(function(y) gsub("[^0-9A-Za-z///' ]", "" , y, ignore.case = TRUE) ) |>
+            sapply(function(y) gsub(" ", "" , y, ignore.case = TRUE) )
+          # Convert to vector and make names
+          class_units <- paste0(
+            v, "__",
+            make.names(unlist(class_units)) |> as.vector()
+          )
+
+          ff <- ff |> stars:::split.stars(col_class) |> stats::setNames(nm = class_units)
+
+          # FIXME: Dirty hack to deal with the forest zone dimension
+          # If there are more dimensions than 3, aggregate over them
+          if( length(stars::st_dimensions(ff)) >3){
+            # Aggregate spatio-temporally
+            ff <- stars::st_apply(ff, c("longitude", "latitude", "time"), sum, na.rm = TRUE)
+          }
+        }
+      }
+
+      # Finally aggregate
+      if(!is.null(template) && is.Raster(template)){
+        # FIXME:
+        # MJ 14/11/2022 - The code below is buggy, resulting in odd curvilinear extrapolations for Europe.
+        # Hacky approach for now is to convert to raster, crop, project and then convert back.
+        ff <- hack_project_stars(ff, template)
+        # Make background
+        # bg <- stars::st_as_stars(template)
+        #
+        # # Get resolution
+        # res <- sapply(stars::st_dimensions(bg), "[[", "delta")
+        # res[1:2] = abs(res[1:2]) # Assumes the first two entries are the coordinates
+        # assertthat::assert_that(!anyNA(res))
+        #
+        # # And warp by projecting and resampling
+        # ff <- ff |> st_transform(crs = sf::st_crs(template)) |>
+        #   stars::st_warp(crs = sf::st_crs(bg),
+        #                  cellsize = res,
+        #                  method = "near") |>
+        #   stars:::st_transform.stars(crs = sf::st_crs(template))
+        # Overwrite full dimensions
+        full_dis <- stars::st_dimensions(ff)
+      }
+      # Now append to vector
+      sc <- c(sc, ff)
+      rm(ff)
+    }
+    invisible(gc())
+    assertthat::assert_that(length(names(full_dis))>=3)
+
+    # Format sc object as stars and set dimensions again
+    sc <- stars::st_as_stars(sc)
+    assertthat::assert_that(length(sc)>0)
+    full_dis <- full_dis[c(
+      grep("x|longitude",names(full_dis), ignore.case = TRUE,value = TRUE),
+      grep("y|latitude",names(full_dis), ignore.case = TRUE,value = TRUE),
+      grep("year|time",names(full_dis), ignore.case = TRUE,value = TRUE)
+    )] # Order assumed to be correct
+    stars::st_dimensions(sc) <- full_dis # Target dimensions
+
+  } else { stop("File format not recognized!") }
+
+  # Get the time dimension values (without applying an offset), i.e. at the centre
+  times <- stars::st_get_dimension_values(sc, "time", center = TRUE)
+
+  # Check the length of times; if equal to one, drop the time dimension.
+  if(length(times)==1){
+    if(period == "projection") stop("Found only a single time slot. Projections not possible.")
+    if(verbose) myLog('[Setup]','yellow','Found only a single time point in file.
Dropping time dimension.') + # Drop the time dimension + sc <- stars:::adrop.stars(sc, drop = which(names(stars::st_dimensions(sc)) == "time") ) + } + + # Formate times unit and convert to posix if not already set + if(is.numeric(times) && length(times) > 1){ + # Assume year and paste0 as properly POSIX formatted + times <- as.POSIXct( paste0(times, "-01-01") ) + sc <- stars::st_set_dimensions(sc, "time", times) + } + + # Depending on the period, slice the input data + if(period == "reference"){ + # Get the first entry and filter + if(length(times)>1){ + # In case times got removed + times_first <- stars::st_get_dimension_values(sc, "time")[1] + sc <- sc |> stars:::filter.stars(time == times_first) + times <- times_first;rm(times_first) + } + } else if(period == "projection"){ + # Remove the first time entry instead, only using the last entries + times_allbutfirst <- stars::st_get_dimension_values(sc, "time")[-1] + sc <- sc |> stars:::filter.stars(time %in% times_allbutfirst) + times <- times_allbutfirst; rm(times_allbutfirst) + } + assertthat::assert_that(length(times)>0, + length(sc)>=1) + + # Create raster template if set + if(!is.null(template)){ + # Check that template is a raster, otherwise rasterize for GLOBIOM use + if(inherits(template, "sf")){ + o <- sc |> stars:::slice.stars("time" , 1) |> methods::as("Raster") + if("fasterize" %in% utils::installed.packages()[,1]){ + template <- fasterize::fasterize(sf = template, raster = o, field = NULL) + } else { + template <- raster::rasterize(template, o, field = 1) + } + rm(o) + } + } + + # Now format outputs depending on type, either returning the raster or the stars object + if(oftype == "raster"){ + # Output type raster, use function from utils_scenario + out <- stars_to_raster(sc, which = NULL, template = template) + return(out) + } else { return( sc ) } +} diff --git a/R/add_priors.R b/R/add_priors.R index 51b49733..997f887b 100644 --- a/R/add_priors.R +++ b/R/add_priors.R @@ -141,10 +141,10 @@ methods::setMethod( #' @aliases get_priors #' @examples #' \dontrun{ -#' mod <- distribution(background) %>% -#' add_predictors(covariates) %>% -#' add_biodiversity_poipo(points) %>% -#' engine_inlabru() %>% +#' mod <- distribution(background) |> +#' add_predictors(covariates) |> +#' add_biodiversity_poipo(points) |> +#' engine_inlabru() |> #' train() #' get_priors(mod, target_engine = "BART") #' } diff --git a/R/bdproto-biodiversitydataset.R b/R/bdproto-biodiversitydataset.R index aab77078..15b43646 100644 --- a/R/bdproto-biodiversitydataset.R +++ b/R/bdproto-biodiversitydataset.R @@ -1,302 +1,302 @@ -#' @include utils.R waiver.R bdproto.R -NULL - -#' @export -if (!methods::isClass("BiodiversityDatasetCollection")) methods::setOldClass("BiodiversityDatasetCollection") -if (!methods::isClass("BiodiversityDataset")) methods::setOldClass("BiodiversityDataset") -NULL - -#' BiodiversityDatasetCollection super prototype description -#' -#' Acts a container for BiodiversityDataset within -#' -#' @name BiodiversityDatasetCollection-class -#' @keywords bdproto -#' @family bdproto -#' @aliases BiodiversityDatasetCollection -NULL - -#' @export -BiodiversityDatasetCollection <- bdproto( - "BiodiversityDatasetCollection", - data = list(), - # Print the names of all Biodiversity datasets - print = function(self) { - message(self$show()) - }, - # Print a summary of all datasets - show = function(self){ - ty = if(is.Waiver(self$get_types())){ - ty = '\n \033[31mNone\033[39m'} else { ty = paste0("\n ",self$get_types()) } - if(self$length()>0){ obs = paste0(" 
<",self$get_observations()," records>") } else obs = '' - # FIXME: Prettify - paste0(self$name(),':', - paste0(ty,obs,collapse = '') - ) - }, - # Name of this object - name = function(self){ - 'Biodiversity data' - }, - # Types of all biodiversity datasets - get_types = function(self, short = FALSE){ - if (base::length(self$data) > 0) - { - return(sapply(self$data, function(z) z$get_type( short ) )) - } else return(new_waiver()) - }, - # Get names. Format if necessary - get_names = function(self, format = FALSE){ - x <- lapply(self$data, function(z) z$name) - if(format) x <- make.names( tolower(x) ) - x - }, - # Add a new Biodiversity dataset to this collection - set_data = function(self, x, value){ - assertthat::assert_that(assertthat::is.string(x), - inherits(value, "BiodiversityDataset")) - self$data[[x]] <- value - invisible() - }, - # Get a specific Biodiversity dataset by name - get_data_object = function(self, id) { - assertthat::assert_that(is.Id(id) || is.character(id) ) - if (!id %in% names(self$data)) - return(new_waiver()) - return(self$data[[id]]) - }, - # Get biodiversity observations - get_data = function(self, id){ - assertthat::assert_that(is.Id(id) || is.character(id)) - o <- self$get_data_object(id) - o$get_data() - }, - # Get coordinates for a given biodiversity dataset. Else return a wkt object - get_coordinates = function(self, id){ - assertthat::assert_that(is.Id(id) || is.character(id)) - # Get data - o <- self$get_data(id) - o <- guess_sf(o) - # Add lowercase coordinates for consistency - o$x <- sf::st_coordinates(o)[,1] - o$y <- sf::st_coordinates(o)[,2] - - # Return coordinates - if(hasName(o,'geom')) sf::st_coordinates(o) else o[,c('x','y')] - }, - # Remove a specific biodiversity dataset by id - rm_data = function(self, id) { - assertthat::assert_that(is.Id(id) || is.character(id), - id %in% names(self$data) ) - self$data[[id]] <- NULL - invisible() - }, - # Number of Biodiversity Datasets in connection - length = function(self) { - base::length(self$data) - }, - # Get number of observations of all datasets - get_observations = function(self) { - x <- sapply(self$data, function(z) z$get_observations()) - x - }, - # Get equations - get_equations = function(self){ - x <- lapply(self$data, function(z) z$get_equation()) - x - }, - # Get family - get_families = function(self){ - x <- lapply(self$data, function(z) z$get_family()) - x - }, - # Get custom link functions - get_links = function(self){ - x <- lapply(self$data, function(z) z$get_link() ) - x - }, - # Get fields with observation columns - get_columns_occ = function(self){ - x <- lapply(self$data, function(z) z$get_column_occ()) - x - }, - # Get weights - get_weights = function(self){ - x <- lapply(self$data, function(z) z$get_weight()) - x - }, - # Get ids of all assets in collection - get_ids = function(self){ - x <- lapply(self$data, function(z) z$id) - x - }, - # Search for a specific biodiversity dataset with type - get_id_byType = function(self, type){ - assertthat::assert_that(is.character(type), !missing(type)) - # Check whether type is correctly set - if(type %notin% c("Point - Presence only","Point - Presence absence",'Polygon - Presence only','Polygon - Presence absence')){ - type <- switch(type, - 'poipo' = "Point - Presence only", - 'poipa' = "Point - Presence absence", - 'polpo' = 'Polygon - Presence only', - 'polpa' = 'Polygon - Presence absence' - ) - } - if(is.null(type)) stop('Dataset type not found!') - w <- which(self$get_types() %in% type) - self$get_types()[w] - }, - # Get id by name - 
get_id_byName = function(self, name){ - assertthat::assert_that(is.character(name), !missing(name)) - # Get id(s) of dataset with given name - r <- lapply(self$data, function(z) z$name) - if(name %in% r){ - r[which(r==name)] - } else character(0) - }, - # Show equations of all datasets - show_equations = function(self, msg = TRUE) { - # Get equations - x <- self$get_equations() - - # new names - # n <- c( - # poipo = 'Point - Presence only', - # poipa = 'Point - Presence absence', - # polpo = 'Polygon - Presence only', - # polpa = 'Polygon - Presence absence' - # ) - # names(x) <- as.vector( n[match(names(x), names(n))] ) - # Prettify - o <- paste0(names(x),":\n ",x,collapse = '\n') - if(msg) message(o) else o - }, - # Plot the whole collection - plot = function(self){ - # FIXME: Can quite likely be beautified - # Get observed columns - cols <- self$get_columns_occ() - - par.ori <- par(no.readonly = TRUE) - # Base plot - g <- ggplot2::ggplot() + ggplot2::geom_sf() + ggplot2::labs( title = self$name()) - # Adding the other elements - for(dataset in names(cols)){ - - if('Polygon - Presence only' == self$get_types()[dataset] ) g <- g + ggplot2::geom_sf(data = st_as_sf(self$get_data(dataset))[cols[[dataset]]], fill = 'lightblue', alpha = .35 ) - - if('Polygon - Presence absence' == self$get_types()[dataset] ){ - dd <- st_as_sf(self$get_data(dataset))[cols[[dataset]]] - dd[[cols[[dataset]]]] <- factor(dd[[cols[[dataset]]]]) - g <- g + ggplot2::geom_sf(data = dd, fill = 'lightgreen', alpha = .35 ) - } - - if('Point - Presence only' == self$get_types()[dataset] ) g <- g + ggplot2::geom_sf(data = st_as_sf(self$get_data(dataset))[cols[[dataset]]], colour = 'grey20', alpha = .5 ) - - if('Point - Presence absence' == self$get_types()[dataset] ){ - dd <- st_as_sf(self$get_data(dataset))[cols[[dataset]]] - dd[[cols[[dataset]]]] <- factor(dd[[cols[[dataset]]]]) - dd$observed <- dd[[cols[[dataset]]]] - g <- g + ggplot2::geom_sf(data = dd, ggplot2::aes(colour = observed) ) - } - } - g - } - -) - - -#' BiodiversityDataset prototype description -#' -#' @name BiodiversityDataset-class -#' @aliases BiodiversityDataset -#' @keywords bdproto -NULL - -#' @export -BiodiversityDataset <- bdproto( - "BiodiversityDataset", - name = character(0), - id = character(0), - equation = new_waiver(), - family = character(0), - link = new_waiver(), - type = new_waiver(), - weight = new_waiver(), - field_occurrence = character(0), - data = new_waiver(), - # Set new equation - set_equation = function(self, x){ - assertthat::assert_that(inherits(x, "formula")) - self$formula <- x - }, - # Get equation - get_equation = function(self){ - if(is.Waiver(self$equation)) return('') - self$equation - }, - # Function to print the equation - show_equation = function(self){ - if(!is.Waiver(equation) && !is.null(equation)) - message(equation) - else message('None set. 
Default equation used (response ~ .)') - }, - # Printing function - print = function(self){ - message(paste0('Biodiversity data:', - '\n Name: ',self$name, - '\n Type: ',self$get_type() - )) - }, - # Return name - name = function(self){ - self$name - }, - # Get Id - id = function(self){ - self$id - }, - # Get type - get_type = function(self, short = FALSE){ - if(short){ - self$type - } else { - switch (self$type, - poipo = 'Point - Presence only', - poipa = 'Point - Presence absence', - polpo = 'Polygon - Presence only', - polpa = 'Polygon - Presence absence' - ) - } - }, - # Get field with occurrence information - get_column_occ = function(self){ - self$field_occurrence - }, - # Get family - get_family = function(self){ - assertthat::assert_that(is.character(self$family)) - self$family - }, - # Get custom link function - get_link = function(self){ - self$link - }, - # Get data - get_data = function(self){ - self$data - }, - # Get weight - get_weight = function(self){ - self$weight - }, - # Print input messages - show = function(self) { - self$print() - }, - # Collect info statistics - get_observations = function(self) { - nrow(self$data) - } -) +#' @include utils.R waiver.R bdproto.R +NULL + +#' @export +if (!methods::isClass("BiodiversityDatasetCollection")) methods::setOldClass("BiodiversityDatasetCollection") +if (!methods::isClass("BiodiversityDataset")) methods::setOldClass("BiodiversityDataset") +NULL + +#' BiodiversityDatasetCollection super prototype description +#' +#' Acts a container for BiodiversityDataset within +#' +#' @name BiodiversityDatasetCollection-class +#' @keywords bdproto +#' @family bdproto +#' @aliases BiodiversityDatasetCollection +NULL + +#' @export +BiodiversityDatasetCollection <- bdproto( + "BiodiversityDatasetCollection", + data = list(), + # Print the names of all Biodiversity datasets + print = function(self) { + message(self$show()) + }, + # Print a summary of all datasets + show = function(self){ + ty = if(is.Waiver(self$get_types())){ + ty = '\n \033[31mNone\033[39m'} else { ty = paste0("\n ",self$get_types()) } + if(self$length()>0){ obs = paste0(" <",self$get_observations()," records>") } else obs = '' + # FIXME: Prettify + paste0(self$name(),':', + paste0(ty,obs,collapse = '') + ) + }, + # Name of this object + name = function(self){ + 'Biodiversity data' + }, + # Types of all biodiversity datasets + get_types = function(self, short = FALSE){ + if (base::length(self$data) > 0) + { + return(sapply(self$data, function(z) z$get_type( short ) )) + } else return(new_waiver()) + }, + # Get names. Format if necessary + get_names = function(self, format = FALSE){ + x <- lapply(self$data, function(z) z$name) + if(format) x <- make.names( tolower(x) ) + x + }, + # Add a new Biodiversity dataset to this collection + set_data = function(self, x, value){ + assertthat::assert_that(assertthat::is.string(x), + inherits(value, "BiodiversityDataset")) + self$data[[x]] <- value + invisible() + }, + # Get a specific Biodiversity dataset by name + get_data_object = function(self, id) { + assertthat::assert_that(is.Id(id) || is.character(id) ) + if (!id %in% names(self$data)) + return(new_waiver()) + return(self$data[[id]]) + }, + # Get biodiversity observations + get_data = function(self, id){ + assertthat::assert_that(is.Id(id) || is.character(id)) + o <- self$get_data_object(id) + o$get_data() + }, + # Get coordinates for a given biodiversity dataset. 
Else return a wkt object + get_coordinates = function(self, id){ + assertthat::assert_that(is.Id(id) || is.character(id)) + # Get data + o <- self$get_data(id) + o <- guess_sf(o) + # Add lowercase coordinates for consistency + o$x <- sf::st_coordinates(o)[,1] + o$y <- sf::st_coordinates(o)[,2] + + # Return coordinates + if(utils::hasName(o,'geom')) sf::st_coordinates(o) else o[,c('x','y')] + }, + # Remove a specific biodiversity dataset by id + rm_data = function(self, id) { + assertthat::assert_that(is.Id(id) || is.character(id), + id %in% names(self$data) ) + self$data[[id]] <- NULL + invisible() + }, + # Number of Biodiversity Datasets in connection + length = function(self) { + base::length(self$data) + }, + # Get number of observations of all datasets + get_observations = function(self) { + x <- sapply(self$data, function(z) z$get_observations()) + x + }, + # Get equations + get_equations = function(self){ + x <- lapply(self$data, function(z) z$get_equation()) + x + }, + # Get family + get_families = function(self){ + x <- lapply(self$data, function(z) z$get_family()) + x + }, + # Get custom link functions + get_links = function(self){ + x <- lapply(self$data, function(z) z$get_link() ) + x + }, + # Get fields with observation columns + get_columns_occ = function(self){ + x <- lapply(self$data, function(z) z$get_column_occ()) + x + }, + # Get weights + get_weights = function(self){ + x <- lapply(self$data, function(z) z$get_weight()) + x + }, + # Get ids of all assets in collection + get_ids = function(self){ + x <- lapply(self$data, function(z) z$id) + x + }, + # Search for a specific biodiversity dataset with type + get_id_byType = function(self, type){ + assertthat::assert_that(is.character(type), !missing(type)) + # Check whether type is correctly set + if(type %notin% c("Point - Presence only","Point - Presence absence",'Polygon - Presence only','Polygon - Presence absence')){ + type <- switch(type, + 'poipo' = "Point - Presence only", + 'poipa' = "Point - Presence absence", + 'polpo' = 'Polygon - Presence only', + 'polpa' = 'Polygon - Presence absence' + ) + } + if(is.null(type)) stop('Dataset type not found!') + w <- which(self$get_types() %in% type) + self$get_types()[w] + }, + # Get id by name + get_id_byName = function(self, name){ + assertthat::assert_that(is.character(name), !missing(name)) + # Get id(s) of dataset with given name + r <- lapply(self$data, function(z) z$name) + if(name %in% r){ + r[which(r==name)] + } else character(0) + }, + # Show equations of all datasets + show_equations = function(self, msg = TRUE) { + # Get equations + x <- self$get_equations() + + # new names + # n <- c( + # poipo = 'Point - Presence only', + # poipa = 'Point - Presence absence', + # polpo = 'Polygon - Presence only', + # polpa = 'Polygon - Presence absence' + # ) + # names(x) <- as.vector( n[match(names(x), names(n))] ) + # Prettify + o <- paste0(names(x),":\n ",x,collapse = '\n') + if(msg) message(o) else o + }, + # Plot the whole collection + plot = function(self){ + # FIXME: Can quite likely be beautified + # Get observed columns + cols <- self$get_columns_occ() + + par.ori <- graphics::par(no.readonly = TRUE) + # Base plot + g <- ggplot2::ggplot() + ggplot2::geom_sf() + ggplot2::labs( title = self$name()) + # Adding the other elements + for(dataset in names(cols)){ + + if('Polygon - Presence only' == self$get_types()[dataset] ) g <- g + ggplot2::geom_sf(data = st_as_sf(self$get_data(dataset))[cols[[dataset]]], fill = 'lightblue', alpha = .35 ) + + if('Polygon - Presence absence' 
== self$get_types()[dataset] ){ + dd <- st_as_sf(self$get_data(dataset))[cols[[dataset]]] + dd[[cols[[dataset]]]] <- factor(dd[[cols[[dataset]]]]) + g <- g + ggplot2::geom_sf(data = dd, fill = 'lightgreen', alpha = .35 ) + } + + if('Point - Presence only' == self$get_types()[dataset] ) g <- g + ggplot2::geom_sf(data = st_as_sf(self$get_data(dataset))[cols[[dataset]]], colour = 'grey20', alpha = .5 ) + + if('Point - Presence absence' == self$get_types()[dataset] ){ + dd <- st_as_sf(self$get_data(dataset))[cols[[dataset]]] + dd[[cols[[dataset]]]] <- factor(dd[[cols[[dataset]]]]) + dd$observed <- dd[[cols[[dataset]]]] + g <- g + ggplot2::geom_sf(data = dd, ggplot2::aes(colour = observed) ) + } + } + g + } + +) + + +#' BiodiversityDataset prototype description +#' +#' @name BiodiversityDataset-class +#' @aliases BiodiversityDataset +#' @keywords bdproto +NULL + +#' @export +BiodiversityDataset <- bdproto( + "BiodiversityDataset", + name = character(0), + id = character(0), + equation = new_waiver(), + family = character(0), + link = new_waiver(), + type = new_waiver(), + weight = new_waiver(), + field_occurrence = character(0), + data = new_waiver(), + # Set new equation + set_equation = function(self, x){ + assertthat::assert_that(inherits(x, "formula")) + self$formula <- x + }, + # Get equation + get_equation = function(self){ + if(is.Waiver(self$equation)) return('') + self$equation + }, + # Function to print the equation + show_equation = function(self){ + if(!is.Waiver(equation) && !is.null(equation)) + message(equation) + else message('None set. Default equation used (response ~ .)') + }, + # Printing function + print = function(self){ + message(paste0('Biodiversity data:', + '\n Name: ',self$name, + '\n Type: ',self$get_type() + )) + }, + # Return name + name = function(self){ + self$name + }, + # Get Id + id = function(self){ + self$id + }, + # Get type + get_type = function(self, short = FALSE){ + if(short){ + self$type + } else { + switch (self$type, + poipo = 'Point - Presence only', + poipa = 'Point - Presence absence', + polpo = 'Polygon - Presence only', + polpa = 'Polygon - Presence absence' + ) + } + }, + # Get field with occurrence information + get_column_occ = function(self){ + self$field_occurrence + }, + # Get family + get_family = function(self){ + assertthat::assert_that(is.character(self$family)) + self$family + }, + # Get custom link function + get_link = function(self){ + self$link + }, + # Get data + get_data = function(self){ + self$data + }, + # Get weight + get_weight = function(self){ + self$weight + }, + # Print input messages + show = function(self) { + self$print() + }, + # Collect info statistics + get_observations = function(self) { + nrow(self$data) + } +) diff --git a/R/bdproto-biodiversitydistribution.R b/R/bdproto-biodiversitydistribution.R index 427ff204..c596f45f 100644 --- a/R/bdproto-biodiversitydistribution.R +++ b/R/bdproto-biodiversitydistribution.R @@ -37,7 +37,6 @@ BiodiversityDistribution <- bdproto( # Self printing function print = function(self) { - # TODO: Prettify below # Query information from the distribution object ex <- self$show_background_info() pn <- ifelse(is.Waiver(self$get_predictor_names()),'None',name_atomic(self$get_predictor_names(), "predictors")) diff --git a/R/bdproto-biodiversityscenario.R b/R/bdproto-biodiversityscenario.R index 5ecf2b0a..f5830610 100644 --- a/R/bdproto-biodiversityscenario.R +++ b/R/bdproto-biodiversityscenario.R @@ -1,585 +1,585 @@ -#' @include utils.R bdproto.R -NULL - -#' @export -if 
(!methods::isClass("BiodiversityScenario")) methods::setOldClass("BiodiversityScenario") -NULL - -#' Prototype for a biodiversity scenario from a trained model -#' -#' Base [`proto`] class for any biodiversity scenario objects. -#' Serves as container that supplies data and functions to -#' other [`proto`] classes. -#' -#' @name BiodiversityScenario-class -#' @aliases BiodiversityScenario -#' @family bdproto -#' @keywords bdproto -NULL - -#' @export -BiodiversityScenario <- bdproto( - "BiodiversityScenario", - modelobject = new_waiver(), # The id of the model - modelid = new_waiver(), - limits = new_waiver(), - predictors = new_waiver(), - constraints = new_waiver(), - scenarios = new_waiver(), - # Print message with summary of model - print = function(self) { - # Check that model exists - fit <- self$get_model() - timeperiod <- self$get_timeperiod() - # Get set predictors and time period - pn = ifelse(is.Waiver(self$get_predictor_names()),'None',name_atomic(self$get_predictor_names(), "predictors")) - tp = ifelse(is.Waiver(timeperiod),'None', - paste0( - paste0( timeperiod,collapse = ' -- '), - ' (',round(as.numeric(difftime(self$get_timeperiod()[2],self$get_timeperiod()[1],unit="weeks"))/52.25,1),' years)' - ) - ) - # Constrains - cs <- self$get_constraints() - if(!is.Waiver(cs)) cs <- vapply(cs, function(x) x$method, character(1)) - # Thresholds - tr <- self$get_threshold() - - message(paste0( - ifelse(is.Waiver(self$limits),"Spatial-temporal scenario:","Spatial-temporal scenario (limited):"), - '\n Used model: ',ifelse(is.Waiver(fit) || isFALSE(fit), text_red('None'), class(fit)[1] ), - "\n --------- ", - "\n Predictors: ", pn, - "\n Time period: ", tp, - ifelse(!is.Waiver(cs)||!is.Waiver(tr), "\n --------- ", ""), - ifelse(is.Waiver(cs),"", paste0("\n Constraints: ", text_green(paste(paste0(names(cs),' (',cs,')'),collapse = ', ')) ) ), - ifelse(is.Waiver(tr),"", paste0("\n Threshold: ", round(tr[1], 3),' (',names(tr[1]),')') ), - "\n --------- ", - "\n Scenarios fitted: ", ifelse(is.Waiver(self$scenarios),text_yellow('None'), text_green('Yes')) - ) - ) - }, - # Verify that set Model exist and check self-validity - verify = function(self){ - assertthat::validate_that( !is.Waiver(self$modelobject), - !is.Waiver(self$modelid) - ) - # Get Model object and check that ID is correct - if(inherits(self$modelobject, "DistributionModel")){ - x <- self$modelobject - } else { - x <- get(self$modelobject) - } - assertthat::validate_that(x$id == self$modelid) - # Check that objects are correctly set or found - assertthat::assert_that(is.Waiver(self$get_predictors()) || inherits(self$get_predictors(), "PredictorDataset")) - assertthat::assert_that(is.Waiver(self$get_data()) || (inherits(self$get_data(), "stars") || is.Raster(self$get_data())) ) - assertthat::assert_that(is.Waiver(self$get_constraints()) || is.list(self$get_constraints())) - invisible() - }, - # Show the name of the Model - show = function(self) { - if(is.character(self$modelobject)){ - return( self$modelobject ) - } else { - return( fit$model$runname ) - } - }, - # Get projection - get_projection = function(self){ - return( self$predictors$get_projection() ) - }, - # Get resolution - get_resolution = function(self){ - return( self$predictors$get_resolution() ) - }, - # Get Model - get_model = function(self){ - if(is.Waiver(self$modelobject)) return( new_waiver() ) - else { - if(inherits(self$modelobject, "DistributionModel")){ - return( self$modelobject ) - } else { - if(!exists(self$modelobject)) return( FALSE ) else { - return( 
get(self$modelobject) ) - } - } - } - }, - # Get provided limits - get_limits = function(self){ - if(is.Waiver(self$limits)) return(NULL) - return(self$limits) - }, - # Get Model predictors - get_predictor_names = function(self) { - if(is.Waiver(self$predictors)) return(self$predictors) - if(inherits(self$predictors, "PredictorDataset")) { - self$predictors$get_names() - } else { - stop("Feature data is of an unrecognized class") - } - }, - # Get time period of projection - get_timeperiod = function(self, what = "range"){ - if(is.Waiver(self$predictors)) return(self$predictors) - if(inherits(self$predictors, "PredictorDataset")) { - if(what == "range"){ - return( - c( min(as.Date(self$predictors$timeperiod)), max(as.Date(self$predictors$timeperiod)) ) - ) - } else { - return( - sort( self$predictors$timeperiod ) - ) - } - } - }, - # Get constrains for model - get_constraints = function(self){ - return( self$constraints ) - }, - # Get thresholds if specified - get_threshold = function(self){ - if('threshold' %notin% names(self)) return( new_waiver() ) - return( self$threshold ) - }, - # Apply specific threshold - apply_threshold = function(self, tr = new_waiver()){ - # Assertions - if(is.Waiver(tr)) assertthat::assert_that( is.numeric(self$threshold), msg = 'No threshold value found.') - assertthat::assert_that( !is.Waiver(self$scenarios), msg = 'No scenarios found.') - # Get prediction and threshold - sc <- self$get_data() - if(!is.Waiver(tr)) tr <- self$threshold - # reclassify to binary - sc[sc < tr] <- 0; sc[sc >= tr] <- 1 - names(sc) <- 'presence' - return(sc) - }, - # Set Predictors - set_predictors = function(self, x){ - assertthat::assert_that(inherits(x, "PredictorDataset")) - bdproto(NULL, self, predictors = x) - }, - # Set constrains - set_constraints = function(self, x){ - if(!is.Waiver(self$get_constraints())){ - cr <- self$get_constraints() - # FIXME: Remove duplicates - bdproto(NULL, self, constraints = c(cr, x)) - } else { - bdproto(NULL, self, constraints = x) - } - }, - # Get Predictors - get_predictors = function(self){ - return(self$predictors) - }, - # Remove predictors - rm_predictors = function(self, names){ - if(is.Waiver(self$predictors) || is.null(self$predictors)) return(NULL) - if(missing(names)){ - names <- self$get_predictor_names() # Assume all names - } - assertthat::assert_that( - is.character(names) || assertthat::is.scalar(names) || is.vector(names) - ) - # Get predictor collection - prcol <- bdproto(NULL, self) - # Set the object - prcol$predictors$rm_data(names) - if(length(prcol$get_predictor_names())==0) prcol$predictors <- new_waiver() - return(prcol) - }, - # Get scenario predictions - get_data = function(self, what = "scenarios"){ - return(self[[what]]) - }, - # Plot the prediction - plot = function(self, what = "suitability", which = NULL, ...){ - if(is.Waiver(self$get_data())){ - if(getOption('ibis.setupmessages')) myLog('[Scenario]','red','No scenarios found') - invisible() - } else { - # Get unique number of data values. 
Surely there must be an easier val - vals <- self$get_data()[what] %>% stars:::pull.stars() %>% as.vector() %>% unique() - vals <- length(na.omit(vals)) - if(vals>2) col <- ibis_colours$sdm_colour else col <- c('grey25','coral') - if(is.null(which)){ - stars:::plot.stars( self$get_data()[what], breaks = "equal", col = col ) - } else { - # Assert that which is actually within the dimensions - assertthat::assert_that(which <= dim(self$get_data())[3], - msg = "Band selection out of bounds.") - obj <- self$get_data()[what,,,which] - stars:::plot.stars( obj, breaks = "equal", col = col, - main = paste0(what," for ", stars::st_get_dimension_values(obj, "band") ) ) - } - } - }, - # Convenience function to plot thresholds if set - plot_threshold = function(self, which = NULL){ - # Check that baseline and scenario thresholds are all there - if(!( 'threshold' %in% attributes(self$get_data())$names )) return(new_waiver()) - - self$plot(what = "threshold", which = which) - }, - # Plot Migclim results if existing - plot_migclim = function(self){ - # Get scenarios - mc <- self$get_data("scenarios_migclim") - if(is.Waiver(mc)) return(mc) - - # Otherwise plot the raster - ras <- mc$raster - - # Colour coding from MigClim::MigClim.plot - rstVals <- sort(raster::unique(ras)) - negativeNb <- length(which(rstVals < 0)) - positiveNb <- length(which(rstVals > 1 & rstVals < 30000)) - zeroExists <- any(rstVals == 0) - oneExists <- any(rstVals == 1) - unilimtedExists <- any(rstVals == 30000) - Colors <- rep("yellow", negativeNb) - if(zeroExists) Colors <- c(Colors, "grey94") - if (oneExists) Colors <- c(Colors, "black") - Colors <- c(Colors, rainbow(positiveNb, start = 0, end = 0.4)) - if (unilimtedExists) Colors <- c(Colors, "pink") - - # Plot - # 0 - Cells that have never been occupied and are unsuitable habitat at the end of the simulation - # 1 - Cells that belong to the species' initial distribution and that have remained occupied during the entire simulation. - # 1 < value < 30 000 - determine the dispersal step during which it was colonized. E.g. 101 is first dispersal even in first step - # 30 0000 - Potentially suitable cells that remained uncolonized - # <0 - Negative values indicate cells that were once occupied but have become decolonized. 
Code as for colonization - dev.new(width = 7, height = 7 * ((ymax(ras) - ymin(ras))/(xmax(ras) - xmin(ras)))) - plot(ras, col = Colors, breaks = c(min(rstVals) - 1, rstVals), legend = FALSE, - main = "Newly colonized and stable habitats") - }, - # Plot animation of scenarios - plot_animation = function(self, what = "suitability", fname = NULL){ - assertthat::assert_that(!is.Waiver(self$get_data()) ) - check_package('gganimate') - # Get scenarios - obj <- self$get_data()[what] - - # Make the animation plot - g <- ggplot2::ggplot() + - stars::geom_stars(data = obj, downsample = c(1,1,0)) + - ggplot2::coord_equal() + - ggplot2::theme_bw(base_size = 20) + - ggplot2::scale_x_discrete(expand=c(0,0)) + - ggplot2::scale_y_discrete(expand=c(0,0)) + - ggplot2::scale_fill_gradientn(colours = ibis_colours$sdm_colour, na.value = NA) + - # Animation options - gganimate::transition_time(band) + - gganimate::ease_aes() + - ggplot2::labs(x = '', y ='', title = "{frame_time}") - - if(is.null(fname)){ - gganimate::anim_save(animation = g, fname) - } else { g } - }, - #Plot relative change between baseline and projected thresholds - plot_relative_change = function(self, position = NULL, variable = 'mean', plot = TRUE){ - # Default position is the last one - assertthat::assert_that(is.null(position) || is.numeric(position) || is.character(position), - is.character(variable)) - # Threshold - obj <- self$get_model() - thresh_reference <- grep('threshold',obj$show_rasters(),value = T) - # If there is more than one threshold only use the one from variable - if(length(thresh_reference)>1) { - warning('More than one baseline threshold. Using the first one.') - thresh_reference <- grep(variable, thresh_reference,value = T)[1] - } - # Check that baseline and scenarios are all there - assertthat::assert_that( - !is.Waiver(self$get_data()), - 'threshold' %in% attributes(self$get_data())$names, - length(thresh_reference) >0 & is.character(thresh_reference), - is.Raster( self$get_model()$get_data('prediction') ), - msg = "Threshold not found!" - ) - - # Not get the baseline raster - baseline <- self$get_model()$get_data(thresh_reference) - # And the last scenario prediction - scenario <- self$get_data()['threshold'] - time <- stars::st_get_dimension_values(scenario, which = 3) # 3 assumed to be time band - if(is.numeric(position)) position <- time[position] - if(is.null(position)) position <- time[length(time)] - final <- scenario %>% - stars:::filter.stars(band == position) %>% - as('Raster') - raster::projection(final) <- raster::projection(baseline) - # -- # - if(!inherits(final, 'RasterLayer')) final <- final[[1]] # In case it is a rasterbrick or similar - if(!compareRaster(baseline, final,stopiffalse = FALSE)) final <- alignRasters(final, baseline, cl = FALSE) # In case they somehow differ? 
- - # Calculate overlays - diff_f <- raster::overlay(baseline, final, fun = function(x, y){x + y * 2}) - diff_f <- raster::ratify(diff_f) - # 0 = Unsuitable | 1 = Loss | 2 = Gain | 3 = stable - rat <- levels(diff_f)[[1]] - rat <- merge.data.frame(rat, data.frame(ID = seq(0,3), diff = c("Unsuitable", "Loss", "Gain", "Stable")),all = TRUE) - levels(diff_f) <- rat - diff_f <- raster::mask(diff_f, baseline) - rm(baseline, final) - - # Plot - if(plot){ - # Colours - cols <- c("Unsuitable" = "gray92", "Loss" = "#DE646A", "Gain" = "cyan3", "Stable" = "gray60") - - # Convert to raster - diff_ff <- as.data.frame(diff_f, xy = TRUE) - names(diff_ff)[3] <- "Change" - diff_ff$Change <- factor(diff_ff$Change, levels = names(cols)) - - # Use ggplot - g <- ggplot2::ggplot() + - ggplot2::coord_equal() + - ggplot2::geom_raster(data = diff_ff, ggplot2::aes(x = x, y = y, fill = Change)) + - ggplot2::theme_light(base_size = 18) + - ggplot2::scale_x_discrete(expand=c(0,0)) + - ggplot2::scale_y_discrete(expand=c(0,0)) + - ggplot2::scale_fill_manual(values = cols, na.value = 'transparent') + - ggplot2::theme(legend.position = "bottom") + - ggplot2::labs(x = "", y = "", title = paste0('Change between baseline and ', position)) - return(g) - } else { - # Return - return(diff_f) - } - }, - # Summarize the change in layers between timesteps - summary = function(self, layer = "threshold", plot = FALSE, relative = FALSE){ - # Check that baseline and scenario thresholds are all there - assertthat::assert_that( - !is.Waiver(self$get_data()), - is.logical(plot), is.logical(relative) - ) - if( layer == "threshold" & 'threshold' %in% attributes(self$get_data())$names ){ - # TODO: Try and get rid of dplyr dependency. Currently too much work to not use it - check_package("dplyr") - # Get the scenario predictions and from there the thresholds - scenario <- self$get_data()['threshold'] - time <- stars::st_get_dimension_values(scenario,which = 'band') - assertthat::assert_that(!is.na(sf::st_crs(scenario)), msg = "Scenario not correctly projected.") - # HACK: Add area to stars - ar <- stars:::st_area.stars(scenario) - # Get the unit - ar_unit <- units::deparse_unit(ar$area) - new <- as(scenario,"Raster") * as(ar, "Raster") - new <- raster::setZ(new, time) - # Convert to scenarios to data.frame - df <- stars:::as.data.frame.stars(stars:::st_as_stars(new)) %>% subset(., complete.cases(.)) - names(df)[4] <- "area" - # --- # - # Now calculate from this data.frame several metrics related to the area and change in area - df <- df %>% dplyr::group_by(x,y) %>% dplyr::mutate(id = dplyr::cur_group_id()) %>% - dplyr::ungroup() %>% dplyr::select(-x,-y) %>% - dplyr::mutate(area = dplyr::if_else(is.na(area), 0, area)) %>% # Convert missing data to 0 - dplyr::arrange(id, band) - df$area <- units::as_units(df$area, units::as_units(ar_unit)) # Set Units - # Convert to km2 and remove units as this causes issues with dplyr - df$area <- units::set_units(df$area, "km2") %>% units::drop_units() - - # Total amount of area occupied for a given time step - out <- df %>% dplyr::group_by(band) %>% dplyr::summarise(area_km2 = sum(area, na.rm = TRUE)) - out$totarea <- raster::cellStats((new[[1]]>=0) * as(ar, "Raster"), "sum") - if(units::deparse_unit(units::as_units(ar_unit)) == "m2") { - out$totarea <- out$totarea / 1e6 - out <- dplyr::rename(out, totarea_km2 = totarea) - } - - # Total amount of area lost / gained / stable since previous time step - totchange_occ <- df %>% - dplyr::group_by(id) %>% - dplyr::mutate(change = (area - dplyr::lag(area)) ) %>% 
dplyr::ungroup() %>% - subset(., complete.cases(.)) - o <- totchange_occ %>% dplyr::group_by(band) %>% - dplyr::summarise(totchange_stable_km2 = sum(area[change == 0]), - totchange_gain_km2 = sum(change[change > 0]), - totchange_loss_km2 = sum(change[change < 0])) - out <- out %>% dplyr::left_join(o, by = "band") - - if(relative == TRUE){ - # Finally calculate relative change to baseline (first entry) for all entries where this is possible - relChange <- function(v, fac = 100) (((v- v[1]) / v[1]) * fac) - out <- subset(out, select = c("band", "area_km2", "totarea_km2")) - out[,c("area_km2")] <- apply( out[,c("area_km2")], 2, relChange) - } - } else { - # Get the scenario predictions and from there the thresholds - scenario <- self$get_data()['suitability'] - times <- stars::st_get_dimension_values(scenario, which = 'band') - # Get area - ar <- stars:::st_area.stars(scenario) - # Get the unit - ar_unit <- units::deparse_unit(ar$area) - - # TODO: Check whether one could not simply multiply with area (poisson > density, binomial > suitable area) - mod <- self$get_model() - scenario <- scenario * ar - out <- summarise_projection(scenario, fun = "mean", relative = relative) - } - - if(plot){ - if( 'threshold' %in% attributes(self$get_data())$names ){ - if(has_name(out,"band")) out <- dplyr::rename(out, "time" = "band") - ggplot2::ggplot(out, - ggplot2::aes(x = time, y = as.numeric(area_km2))) + - ggplot2::theme_classic(base_size = 18) + - ggplot2::geom_line(size = 2) + - ggplot2::labs(x = "Time", y = expression(Area(km^2))) - } else { - ggplot2::ggplot(out, - ggplot2::aes(x = time, - y = suitability_q50, - ymin = suitability_q25, - ymax = suitability_q75)) + - ggplot2::theme_classic(base_size = 18) + - ggplot2::geom_ribbon(fill = "grey90") + - ggplot2::geom_line(size = 2) + - ggplot2::labs(x = "Time", y = expression(Area(km^2)), title = "Relative suitable habitat") - } - } - return(out) - }, - # Summarize beforeafter - summary_beforeafter = function(self){ - # Check that baseline and scenario thresholds are all there - assertthat::assert_that( - !is.Waiver(self$get_data()), - ( 'threshold' %in% attributes(self$get_data())$names ), - msg = "This function only works with added thresholds." 
- ) - scenario <- self$get_data()['threshold'] - - # Get runname - runname <- self$get_model()[["model"]]$runname - - return( - tibble::add_column( summarise_change(scenario), runname = runname, .before = 1) - ) - }, - # Calculate slopes - calc_scenarios_slope = function(self, what = 'suitability', plot = TRUE, oftype = "stars"){ - if(is.Waiver(self$get_data())) return( new_waiver() ) - assertthat::assert_that(what %in% attributes(self$get_data())$names ) - - oo <- self$get_data()[what] - tt <- as.numeric( stars::st_get_dimension_values(self$scenarios, 3) ) - # Calc pixel-wise linear slope - out <- stars::st_apply( - oo, - 1:2, - function(x) { - if (anyNA(x)) - NA_real_ - else - lm.fit(cbind(1, tt), x)$coefficients[2] - } - ) - names(out) <- 'linear_coefficient' - if(oftype == "stars"){ - if(plot) stars:::plot.stars(out, breaks = "fisher", col = c(ibis_colours$divg_bluered[1:10],"grey90",ibis_colours$divg_bluered[11:20])) - } else { - out <- as(out, "Raster") - if(plot) plot(out, col = c(ibis_colours$divg_bluered[1:10],"grey90",ibis_colours$divg_bluered[11:20])) - } - return(out) - }, - # Save object - save = function(self, fname, type = 'tif', dt = 'FLT4S'){ - assertthat::assert_that( - !missing(fname), - is.character(fname), - is.character(type), - !is.Waiver(self$get_data()), - is.character(dt) - ) - # Match input types - type <- match.arg(type, c('gtif','gtiff','tif','nc','ncdf', 'feather'), several.ok = FALSE) - dt <- match.arg(dt, c('LOG1S','INT1S','INT1U','INT2S','INT2U','INT4S','INT4U','FLT4S','FLT8S'), several.ok = FALSE ) - - if(file.exists(fname)) warning('Overwritting existing file...') - - # Respecify type if output filename has already been set - if(gsub('\\.','',raster::extension(fname)) != type) type <- gsub('\\.','',raster::extension(fname)) - - # Change output type for stars - dtstars <- switch(dt, - "LOG1S" = "Byte", - "INT1U" = "UInt16", - "INT1S" = "Int16", - "INT2U" = "UInt16", - "INT2S" = "Int16", - "INT4S" = "Int32", - "INT4U" = "UInt32", - "FLT4S" = "Float32", - "FLT8S" = "Float64" - ) - - # Get scenario object - ras <- self$get_data() - # If Migclim has been computed, save as well - if(!is.Waiver(self$scenarios_migclim)) ras_migclim <- self$get_data("scenarios_migclim") - - if(type %in% c('gtif','gtiff','tif')){ - # Write stars output for every band - for(i in 1:length(ras)){ - # Band specific output - fname2 <- paste0( tools::file_path_sans_ext(fname), "__", names(ras)[i], raster::extension(fname)) - stars::write_stars( - obj = ras, - layer = i, - dsn = fname2, - options = c("COMPRESS=DEFLATE"), - type = ifelse(is.factor(ras[[1]]), "Byte", dtstars), - NA_value = NA_real_, - update = ifelse(file.exists(fname2), TRUE, FALSE), - normalize_path = TRUE, - progress = TRUE - ) - } - if(!is.Waiver(self$scenarios_migclim)){ - fname2 <- paste0( tools::file_path_sans_ext(fname), "__migclim", raster::extension(fname)) - writeGeoTiff(ras_migclim, fname = fname, dt = dt) - } - } else if(type %in% c('nc','ncdf')) { - # Save as netcdf, for now in individual files - for(i in 1:length(ras)){ - # Band specific output - fname2 <- paste0( tools::file_path_sans_ext(fname), "__", names(ras)[i], raster::extension(fname)) - stars::write_stars( - obj = ras, - layer = 1:length(ras), - dsn = fname2, - type = ifelse(is.factor(ras[[1]]), "Byte", dtstars), - NA_value = NA_real_, - update = ifelse(file.exists(fname2), TRUE, FALSE), - normalize_path = TRUE, - progress = TRUE - ) - } - if(!is.Waiver(self$scenarios_migclim)){ - fname2 <- paste0( tools::file_path_sans_ext(fname), "__migclim", 
raster::extension(fname))
- writeNetCDF(ras_migclim, fname = fname, varName = "MigCLIM output", dt = dt)
- }
- } else if(type %in% 'feather'){
- assertthat::assert_that('feather' %in% installed.packages()[,1],
- msg = 'Feather package not installed!')
- fname <- paste0( tools::file_path_sans_ext(fname), "__migclim", ".feather")
- feather::write_feather(ras, path = fname)
- if(!is.Waiver(self$scenarios_migclim)){
- fname2 <- paste0( tools::file_path_sans_ext(fname), "__migclim", raster::extension(fname))
- feather::write_feather(ras, path = fname)
- }
- }
- invisible()
- }
-)
+#' @include utils.R bdproto.R
+NULL
+
+#' @export
+if (!methods::isClass("BiodiversityScenario")) methods::setOldClass("BiodiversityScenario")
+NULL
+
+#' Prototype for a biodiversity scenario from a trained model
+#'
+#' Base [`proto`] class for any biodiversity scenario objects.
+#' Serves as container that supplies data and functions to
+#' other [`proto`] classes.
+#'
+#' @name BiodiversityScenario-class
+#' @aliases BiodiversityScenario
+#' @family bdproto
+#' @keywords bdproto
+NULL
+
+#' @export
+BiodiversityScenario <- bdproto(
+ "BiodiversityScenario",
+ modelobject = new_waiver(), # The id or object of the trained model
+ modelid = new_waiver(),
+ limits = new_waiver(),
+ predictors = new_waiver(),
+ constraints = new_waiver(),
+ scenarios = new_waiver(),
+ # Print message with summary of model
+ print = function(self) {
+ # Check that model exists
+ fit <- self$get_model()
+ timeperiod <- self$get_timeperiod()
+ # Get set predictors and time period
+ pn = ifelse(is.Waiver(self$get_predictor_names()),'None',name_atomic(self$get_predictor_names(), "predictors"))
+ tp = ifelse(is.Waiver(timeperiod),'None',
+ paste0(
+ paste0( timeperiod,collapse = ' -- '),
+ ' (',round(as.numeric(difftime(self$get_timeperiod()[2],self$get_timeperiod()[1],unit="weeks"))/52.25,1),' years)'
+ )
+ )
+ # Constraints
+ cs <- self$get_constraints()
+ if(!is.Waiver(cs)) cs <- vapply(cs, function(x) x$method, character(1))
+ # Thresholds
+ tr <- self$get_threshold()
+
+ message(paste0(
+ ifelse(is.Waiver(self$limits),"Spatial-temporal scenario:","Spatial-temporal scenario (limited):"),
+ '\n Used model: ',ifelse(is.Waiver(fit) || isFALSE(fit), text_red('None'), class(fit)[1] ),
+ "\n --------- ",
+ "\n Predictors: ", pn,
+ "\n Time period: ", tp,
+ ifelse(!is.Waiver(cs)||!is.Waiver(tr), "\n --------- ", ""),
+ ifelse(is.Waiver(cs),"", paste0("\n Constraints: ", text_green(paste(paste0(names(cs),' (',cs,')'),collapse = ', ')) ) ),
+ ifelse(is.Waiver(tr),"", paste0("\n Threshold: ", round(tr[1], 3),' (',names(tr[1]),')') ),
+ "\n --------- ",
+ "\n Scenarios fitted: ", ifelse(is.Waiver(self$scenarios),text_yellow('None'), text_green('Yes'))
+ )
+ )
+ },
+ # Verify that the set Model exists and check self-validity
+ verify = function(self){
+ assertthat::validate_that( !is.Waiver(self$modelobject),
+ !is.Waiver(self$modelid)
+ )
+ # Get Model object and check that ID is correct
+ if(inherits(self$modelobject, "DistributionModel")){
+ x <- self$modelobject
+ } else {
+ x <- get(self$modelobject)
+ }
+ assertthat::validate_that(x$id == self$modelid)
+ # Check that objects are correctly set or found
+ assertthat::assert_that(is.Waiver(self$get_predictors()) || inherits(self$get_predictors(), "PredictorDataset"))
+ assertthat::assert_that(is.Waiver(self$get_data()) || (inherits(self$get_data(), "stars") || is.Raster(self$get_data())) )
+ assertthat::assert_that(is.Waiver(self$get_constraints()) || is.list(self$get_constraints()))
+ invisible()
+ },
+ # Show the name of the Model
+ show = function(self) {
+ if(is.character(self$modelobject)){
+ return( self$modelobject )
+ } else {
+ return( self$modelobject$model$runname )
+ }
+ },
+ # Get projection
+ get_projection = function(self){
+ return( self$predictors$get_projection() )
+ },
+ # Get resolution
+ get_resolution = function(self){
+ return( self$predictors$get_resolution() )
+ },
+ # Get Model
+ get_model = function(self){
+ if(is.Waiver(self$modelobject)) return( new_waiver() )
+ else {
+ if(inherits(self$modelobject, "DistributionModel")){
+ return( self$modelobject )
+ } else {
+ if(!exists(self$modelobject)) return( FALSE ) else {
+ return( get(self$modelobject) )
+ }
+ }
+ }
+ },
+ # Get provided limits
+ get_limits = function(self){
+ if(is.Waiver(self$limits)) return(NULL)
+ return(self$limits)
+ },
+ # Get Model predictors
+ get_predictor_names = function(self) {
+ if(is.Waiver(self$predictors)) return(self$predictors)
+ if(inherits(self$predictors, "PredictorDataset")) {
+ self$predictors$get_names()
+ } else {
+ stop("Feature data is of an unrecognized class")
+ }
+ },
+ # Get time period of projection
+ get_timeperiod = function(self, what = "range"){
+ if(is.Waiver(self$predictors)) return(self$predictors)
+ if(inherits(self$predictors, "PredictorDataset")) {
+ if(what == "range"){
+ return(
+ c( min(as.Date(self$predictors$timeperiod)), max(as.Date(self$predictors$timeperiod)) )
+ )
+ } else {
+ return(
+ sort( self$predictors$timeperiod )
+ )
+ }
+ }
+ },
+ # Get constraints for model
+ get_constraints = function(self){
+ return( self$constraints )
+ },
+ # Get thresholds if specified
+ get_threshold = function(self){
+ if('threshold' %notin% names(self)) return( new_waiver() )
+ return( self$threshold )
+ },
+ # Apply specific threshold
+ apply_threshold = function(self, tr = new_waiver()){
+ # Assertions
+ if(is.Waiver(tr)) assertthat::assert_that( is.numeric(self$threshold), msg = 'No threshold value found.')
+ assertthat::assert_that( !is.Waiver(self$scenarios), msg = 'No scenarios found.')
+ # Get prediction and threshold
+ sc <- self$get_data()
+ if(is.Waiver(tr)) tr <- self$threshold # Fall back to the stored threshold value
+ # reclassify to binary
+ sc[sc < tr] <- 0; sc[sc >= tr] <- 1
+ names(sc) <- 'presence'
+ return(sc)
+ },
+ # Set Predictors
+ set_predictors = function(self, x){
+ assertthat::assert_that(inherits(x, "PredictorDataset"))
+ bdproto(NULL, self, predictors = x)
+ },
+ # Set constraints
+ set_constraints = function(self, x){
+ if(!is.Waiver(self$get_constraints())){
+ cr <- self$get_constraints()
+ # FIXME: Remove duplicates
+ bdproto(NULL, self, constraints = c(cr, x))
+ } else {
+ bdproto(NULL, self, constraints = x)
+ }
+ },
+ # Get Predictors
+ get_predictors = function(self){
+ return(self$predictors)
+ },
+ # Remove predictors
+ rm_predictors = function(self, names){
+ if(is.Waiver(self$predictors) || is.null(self$predictors)) return(NULL)
+ if(missing(names)){
+ names <- self$get_predictor_names() # Assume all names
+ }
+ assertthat::assert_that(
+ is.character(names) || assertthat::is.scalar(names) || is.vector(names)
+ )
+ # Get predictor collection
+ prcol <- bdproto(NULL, self)
+ # Set the object
+ prcol$predictors$rm_data(names)
+ if(length(prcol$get_predictor_names())==0) prcol$predictors <- new_waiver()
+ return(prcol)
+ },
+ # Get scenario predictions
+ get_data = function(self, what = "scenarios"){
+ return(self[[what]])
+ },
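# The reclassification in apply_threshold() above is plain stars arithmetic; a
# minimal sketch with a hypothetical stars object 'sc' and an assumed threshold:
library(stars)
sc <- st_as_stars(matrix(runif(16), 4, 4)) # stand-in for a scenario layer
tr <- 0.5 # assumed threshold value
sc[sc < tr] <- 0  # below threshold -> absence
sc[sc >= tr] <- 1 # at or above threshold -> presence
names(sc) <- "presence"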
+ # Plot the prediction
+ plot = function(self, what = "suitability", which = NULL, ...){
+ if(is.Waiver(self$get_data())){
+ if(getOption('ibis.setupmessages')) myLog('[Scenario]','red','No scenarios found')
+ invisible()
+ } else {
+ # Get unique number of data values. Surely there must be an easier way
+ vals <- self$get_data()[what] |> stars:::pull.stars() |> as.vector() |> unique()
+ vals <- length(stats::na.omit(vals))
+ if(vals>2) col <- ibis_colours$sdm_colour else col <- c('grey25','coral')
+ if(is.null(which)){
+ stars:::plot.stars( self$get_data()[what], breaks = "equal", col = col )
+ } else {
+ # Assert that which is actually within the dimensions
+ assertthat::assert_that(which <= dim(self$get_data())[3],
+ msg = "Band selection out of bounds.")
+ obj <- self$get_data()[what,,,which]
+ stars:::plot.stars( obj, breaks = "equal", col = col,
+ main = paste0(what," for ", stars::st_get_dimension_values(obj, "band") ) )
+ }
+ }
+ },
+ # Convenience function to plot thresholds if set
+ plot_threshold = function(self, which = NULL){
+ # Check that baseline and scenario thresholds are all there
+ if(!( 'threshold' %in% attributes(self$get_data())$names )) return(new_waiver())
+
+ self$plot(what = "threshold", which = which)
+ },
+ # Plot Migclim results if existing
+ plot_migclim = function(self){
+ # Get scenarios
+ mc <- self$get_data("scenarios_migclim")
+ if(is.Waiver(mc)) return(mc)
+
+ # Otherwise plot the raster
+ ras <- mc$raster
+
+ # Colour coding from MigClim::MigClim.plot
+ rstVals <- sort(raster::unique(ras))
+ negativeNb <- length(which(rstVals < 0))
+ positiveNb <- length(which(rstVals > 1 & rstVals < 30000))
+ zeroExists <- any(rstVals == 0)
+ oneExists <- any(rstVals == 1)
+ unlimitedExists <- any(rstVals == 30000)
+ Colors <- rep("yellow", negativeNb)
+ if(zeroExists) Colors <- c(Colors, "grey94")
+ if (oneExists) Colors <- c(Colors, "black")
+ Colors <- c(Colors, rainbow(positiveNb, start = 0, end = 0.4))
+ if (unlimitedExists) Colors <- c(Colors, "pink")
+
+ # Plot
+ # 0 - Cells that have never been occupied and are unsuitable habitat at the end of the simulation
+ # 1 - Cells that belong to the species' initial distribution and that have remained occupied during the entire simulation.
+ # 1 < value < 30 000 - The value encodes the dispersal step during which the cell was colonized, e.g. 101 is the first dispersal event in the first step
+ # 30 000 - Potentially suitable cells that remained uncolonized
+ # < 0 - Negative values indicate cells that were once occupied but have become decolonized; coded as for colonization.
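# Sketch: the legend above expressed as a small classifier for MigClim cell
# values (value conventions exactly as documented in the comment block):
migclim_class <- function(v) {
  ifelse(v < 0, "Decolonized",
  ifelse(v == 0, "Never occupied / unsuitable",
  ifelse(v == 1, "Initial distribution, stable",
  ifelse(v == 30000, "Suitable but never colonized", "Colonized"))))
}
migclim_class(c(-101, 0, 1, 101, 30000))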
+ grDevices::dev.new(width = 7, height = 7 * ((raster::ymax(ras) - raster::ymin(ras))/(raster::xmax(ras) - raster::xmin(ras))))
+ plot(ras, col = Colors, breaks = c(min(rstVals) - 1, rstVals), legend = FALSE,
+ main = "Newly colonized and stable habitats")
+ },
+ # Plot animation of scenarios
+ plot_animation = function(self, what = "suitability", fname = NULL){
+ assertthat::assert_that(!is.Waiver(self$get_data()) )
+ check_package('gganimate')
+ # Get scenarios
+ obj <- self$get_data()[what]
+
+ # Make the animation plot
+ g <- ggplot2::ggplot() +
+ stars::geom_stars(data = obj, downsample = c(1,1,0)) +
+ ggplot2::coord_equal() +
+ ggplot2::theme_bw(base_size = 20) +
+ ggplot2::scale_x_discrete(expand=c(0,0)) +
+ ggplot2::scale_y_discrete(expand=c(0,0)) +
+ ggplot2::scale_fill_gradientn(colours = ibis_colours$sdm_colour, na.value = NA) +
+ # Animation options
+ gganimate::transition_time(band) +
+ gganimate::ease_aes() +
+ ggplot2::labs(x = '', y ='', title = "{frame_time}")
+
+ # Save the animation if a filename is given, otherwise return the plot object
+ if(!is.null(fname)){
+ gganimate::anim_save(filename = fname, animation = g)
+ } else { g }
+ },
+ # Plot relative change between baseline and projected thresholds
+ plot_relative_change = function(self, position = NULL, variable = 'mean', plot = TRUE){
+ # Default position is the last one
+ assertthat::assert_that(is.null(position) || is.numeric(position) || is.character(position),
+ is.character(variable))
+ # Threshold
+ obj <- self$get_model()
+ thresh_reference <- grep('threshold',obj$show_rasters(),value = TRUE)
+ # If there is more than one threshold only use the one from variable
+ if(length(thresh_reference)>1) {
+ warning('More than one baseline threshold. Using the first one.')
+ thresh_reference <- grep(variable, thresh_reference,value = TRUE)[1]
+ }
+ # Check that baseline and scenarios are all there
+ assertthat::assert_that(
+ !is.Waiver(self$get_data()),
+ 'threshold' %in% attributes(self$get_data())$names,
+ length(thresh_reference) >0 & is.character(thresh_reference),
+ is.Raster( self$get_model()$get_data('prediction') ),
+ msg = "Threshold not found!"
+ )
+
+ # Now get the baseline raster
+ baseline <- self$get_model()$get_data(thresh_reference)
+ # And the last scenario prediction
+ scenario <- self$get_data()['threshold']
+ time <- stars::st_get_dimension_values(scenario, which = 3) # 3 assumed to be time band
+ if(is.numeric(position)) position <- time[position]
+ if(is.null(position)) position <- time[length(time)]
+ final <- scenario |>
+ stars:::filter.stars(band == position) |>
+ methods::as('Raster')
+ raster::projection(final) <- raster::projection(baseline)
+ # -- #
+ if(!inherits(final, 'RasterLayer')) final <- final[[1]] # In case it is a rasterbrick or similar
+ if(!raster::compareRaster(baseline, final, stopiffalse = FALSE)) final <- alignRasters(final, baseline, cl = FALSE) # In case they somehow differ
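# Sketch: the overlay below combines the two binary layers as x + y * 2; a
# small truth table makes the resulting change codes explicit.
x <- c(0, 1, 0, 1) # baseline threshold (0/1)
y <- c(0, 0, 1, 1) # scenario threshold (0/1)
data.frame(baseline = x, scenario = y, code = x + y * 2)
# code: 0 = Unsuitable, 1 = Loss, 2 = Gain, 3 = Stable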
+
+ # Calculate overlays
+ diff_f <- raster::overlay(baseline, final, fun = function(x, y){x + y * 2})
+ diff_f <- raster::ratify(diff_f)
+ # 0 = Unsuitable | 1 = Loss | 2 = Gain | 3 = Stable
+ rat <- levels(diff_f)[[1]]
+ rat <- merge.data.frame(rat, data.frame(ID = seq(0,3), diff = c("Unsuitable", "Loss", "Gain", "Stable")),all = TRUE)
+ levels(diff_f) <- rat
+ diff_f <- raster::mask(diff_f, baseline)
+ rm(baseline, final)
+
+ # Plot
+ if(plot){
+ # Colours
+ cols <- c("Unsuitable" = "gray92", "Loss" = "#DE646A", "Gain" = "cyan3", "Stable" = "gray60")
+
+ # Convert to a data.frame for plotting
+ diff_ff <- raster::as.data.frame(diff_f, xy = TRUE)
+ names(diff_ff)[3] <- "Change"
+ diff_ff$Change <- factor(diff_ff$Change, levels = names(cols))
+
+ # Use ggplot
+ g <- ggplot2::ggplot() +
+ ggplot2::coord_equal() +
+ ggplot2::geom_raster(data = diff_ff, ggplot2::aes(x = x, y = y, fill = Change)) +
+ ggplot2::theme_light(base_size = 18) +
+ ggplot2::scale_x_discrete(expand=c(0,0)) +
+ ggplot2::scale_y_discrete(expand=c(0,0)) +
+ ggplot2::scale_fill_manual(values = cols, na.value = 'transparent') +
+ ggplot2::theme(legend.position = "bottom") +
+ ggplot2::labs(x = "", y = "", title = paste0('Change between baseline and ', position))
+ return(g)
+ } else {
+ # Return
+ return(diff_f)
+ }
+ },
+ # Summarize the change in layers between timesteps
+ summary = function(self, layer = "threshold", plot = FALSE, relative = FALSE){
+ # Check that baseline and scenario thresholds are all there
+ assertthat::assert_that(
+ !is.Waiver(self$get_data()),
+ is.logical(plot), is.logical(relative)
+ )
+ if( layer == "threshold" & 'threshold' %in% attributes(self$get_data())$names ){
+ # TODO: Try and get rid of dplyr dependency. Currently too much work to not use it
+ check_package("dplyr")
+ # Get the scenario predictions and from there the thresholds
+ scenario <- self$get_data()['threshold']
+ time <- stars::st_get_dimension_values(scenario,which = 'band')
+ assertthat::assert_that(!is.na(sf::st_crs(scenario)), msg = "Scenario not correctly projected.")
+ # HACK: Add area to stars
+ ar <- stars:::st_area.stars(scenario)
+ # Get the unit
+ ar_unit <- units::deparse_unit(ar$area)
+ new <- methods::as(scenario,"Raster") * methods::as(ar, "Raster")
+ new <- raster::setZ(new, time)
+ # Convert the scenarios to a data.frame
+ df <- stars:::as.data.frame.stars(stars:::st_as_stars(new)) |> (\(.)
subset(., stats::complete.cases(.)))() + names(df)[4] <- "area" + # --- # + # Now calculate from this data.frame several metrics related to the area and change in area + df <- df |> dplyr::group_by(x,y) |> dplyr::mutate(id = dplyr::cur_group_id()) |> + dplyr::ungroup() |> dplyr::select(-x,-y) |> + dplyr::mutate(area = dplyr::if_else(is.na(area), 0, area)) |> # Convert missing data to 0 + dplyr::arrange(id, band) + df$area <- units::as_units(df$area, units::as_units(ar_unit)) # Set Units + # Convert to km2 and remove units as this causes issues with dplyr + df$area <- units::set_units(df$area, "km2") |> units::drop_units() + + # Total amount of area occupied for a given time step + out <- df |> dplyr::group_by(band) |> dplyr::summarise(area_km2 = sum(area, na.rm = TRUE)) + out$totarea <- raster::cellStats((new[[1]]>=0) * methods::as(ar, "Raster"), "sum") + if(units::deparse_unit(units::as_units(ar_unit)) == "m2") { + out$totarea <- out$totarea / 1e6 + out <- dplyr::rename(out, totarea_km2 = totarea) + } + + # Total amount of area lost / gained / stable since previous time step + totchange_occ <- df |> + dplyr::group_by(id) |> + dplyr::mutate(change = (area - dplyr::lag(area)) ) |> dplyr::ungroup() |> + (\(.) subset(., stats::complete.cases(.)))() + o <- totchange_occ |> dplyr::group_by(band) |> + dplyr::summarise(totchange_stable_km2 = sum(area[change == 0]), + totchange_gain_km2 = sum(change[change > 0]), + totchange_loss_km2 = sum(change[change < 0])) + out <- out |> dplyr::left_join(o, by = "band") + + if(relative == TRUE){ + # Finally calculate relative change to baseline (first entry) for all entries where this is possible + relChange <- function(v, fac = 100) (((v- v[1]) / v[1]) * fac) + out <- subset(out, select = c("band", "area_km2", "totarea_km2")) + out[,c("area_km2")] <- apply( out[,c("area_km2")], 2, relChange) + } + } else { + # Get the scenario predictions and from there the thresholds + scenario <- self$get_data()['suitability'] + times <- stars::st_get_dimension_values(scenario, which = 'band') + # Get area + ar <- stars:::st_area.stars(scenario) + # Get the unit + ar_unit <- units::deparse_unit(ar$area) + + # TODO: Check whether one could not simply multiply with area (poisson > density, binomial > suitable area) + mod <- self$get_model() + scenario <- scenario * ar + out <- summarise_projection(scenario, fun = "mean", relative = relative) + } + + if(plot){ + if( 'threshold' %in% attributes(self$get_data())$names ){ + if(has_name(out,"band")) out <- dplyr::rename(out, "time" = "band") + ggplot2::ggplot(out, + ggplot2::aes(x = time, y = as.numeric(area_km2))) + + ggplot2::theme_classic(base_size = 18) + + ggplot2::geom_line(size = 2) + + ggplot2::labs(x = "Time", y = expression(Area(km^2))) + } else { + ggplot2::ggplot(out, + ggplot2::aes(x = time, + y = suitability_q50, + ymin = suitability_q25, + ymax = suitability_q75)) + + ggplot2::theme_classic(base_size = 18) + + ggplot2::geom_ribbon(fill = "grey90") + + ggplot2::geom_line(size = 2) + + ggplot2::labs(x = "Time", y = expression(Area(km^2)), title = "Relative suitable habitat") + } + } + return(out) + }, + # Summarize beforeafter + summary_beforeafter = function(self){ + # Check that baseline and scenario thresholds are all there + assertthat::assert_that( + !is.Waiver(self$get_data()), + ( 'threshold' %in% attributes(self$get_data())$names ), + msg = "This function only works with added thresholds." 
+ )
+ scenario <- self$get_data()['threshold']
+
+ # Get runname
+ runname <- self$get_model()[["model"]]$runname
+
+ return(
+ tibble::add_column( summarise_change(scenario), runname = runname, .before = 1)
+ )
+ },
+ # Calculate slopes
+ calc_scenarios_slope = function(self, what = 'suitability', plot = TRUE, oftype = "stars"){
+ if(is.Waiver(self$get_data())) return( new_waiver() )
+ assertthat::assert_that(what %in% attributes(self$get_data())$names )
+
+ oo <- self$get_data()[what]
+ tt <- as.numeric( stars::st_get_dimension_values(self$scenarios, 3) )
+ # Calc pixel-wise linear slope
+ out <- stars::st_apply(
+ oo,
+ 1:2,
+ function(x) {
+ if (anyNA(x))
+ NA_real_
+ else
+ stats::lm.fit(cbind(1, tt), x)$coefficients[2]
+ }
+ )
+ names(out) <- 'linear_coefficient'
+ if(oftype == "stars"){
+ if(plot) stars:::plot.stars(out, breaks = "fisher", col = c(ibis_colours$divg_bluered[1:10],"grey90",ibis_colours$divg_bluered[11:20]))
+ } else {
+ out <- methods::as(out, "Raster")
+ if(plot) plot(out, col = c(ibis_colours$divg_bluered[1:10],"grey90",ibis_colours$divg_bluered[11:20]))
+ }
+ return(out)
+ },
+ # Save object
+ save = function(self, fname, type = 'tif', dt = 'FLT4S'){
+ assertthat::assert_that(
+ !missing(fname),
+ is.character(fname),
+ is.character(type),
+ !is.Waiver(self$get_data()),
+ is.character(dt)
+ )
+ # Match input types
+ type <- match.arg(type, c('gtif','gtiff','tif','nc','ncdf', 'feather'), several.ok = FALSE)
+ dt <- match.arg(dt, c('LOG1S','INT1S','INT1U','INT2S','INT2U','INT4S','INT4U','FLT4S','FLT8S'), several.ok = FALSE )
+
+ if(file.exists(fname)) warning('Overwriting existing file...')
+
+ # Respecify type if output filename has already been set
+ if(gsub('\\.','',raster::extension(fname)) != type) type <- gsub('\\.','',raster::extension(fname))
+
+ # Change output type for stars
+ dtstars <- switch(dt,
+ "LOG1S" = "Byte",
+ "INT1U" = "UInt16",
+ "INT1S" = "Int16",
+ "INT2U" = "UInt16",
+ "INT2S" = "Int16",
+ "INT4S" = "Int32",
+ "INT4U" = "UInt32",
+ "FLT4S" = "Float32",
+ "FLT8S" = "Float64"
+ )
+
+ # Get scenario object
+ ras <- self$get_data()
+ # If Migclim has been computed, save as well
+ if(!is.Waiver(self$scenarios_migclim)) ras_migclim <- self$get_data("scenarios_migclim")
+
+ if(type %in% c('gtif','gtiff','tif')){
+ # Write stars output for every band
+ for(i in 1:length(ras)){
+ # Band specific output
+ fname2 <- paste0( tools::file_path_sans_ext(fname), "__", names(ras)[i], raster::extension(fname))
+ stars::write_stars(
+ obj = ras,
+ layer = i,
+ dsn = fname2,
+ options = c("COMPRESS=DEFLATE"),
+ type = ifelse(is.factor(ras[[1]]), "Byte", dtstars),
+ NA_value = NA_real_,
+ update = ifelse(file.exists(fname2), TRUE, FALSE),
+ normalize_path = TRUE,
+ progress = TRUE
+ )
+ }
+ if(!is.Waiver(self$scenarios_migclim)){
+ fname2 <- paste0( tools::file_path_sans_ext(fname), "__migclim", raster::extension(fname))
+ writeGeoTiff(ras_migclim, fname = fname2, dt = dt)
+ }
+ } else if(type %in% c('nc','ncdf')) {
+ # Save as netcdf, for now in individual files
+ for(i in 1:length(ras)){
+ # Band specific output
+ fname2 <- paste0( tools::file_path_sans_ext(fname), "__", names(ras)[i], raster::extension(fname))
+ stars::write_stars(
+ obj = ras,
+ layer = i,
+ dsn = fname2,
+ type = ifelse(is.factor(ras[[1]]), "Byte", dtstars),
+ NA_value = NA_real_,
+ update = ifelse(file.exists(fname2), TRUE, FALSE),
+ normalize_path = TRUE,
+ progress = TRUE
+ )
+ }
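# Sketch: how the band-specific file names above are assembled (hypothetical
# inputs; tools::file_path_sans_ext() strips the extension before suffixing).
fname <- "scenario_run.tif"
bands <- c("suitability", "threshold")
paste0(tools::file_path_sans_ext(fname), "__", bands, ".tif")
# "scenario_run__suitability.tif" "scenario_run__threshold.tif"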
"__migclim", raster::extension(fname)) + writeNetCDF(ras_migclim, fname = fname, varName = "MigCLIM output", dt = dt) + } + } else if(type %in% 'feather'){ + assertthat::assert_that('feather' %in% utils::installed.packages()[,1], + msg = 'Feather package not installed!') + fname <- paste0( tools::file_path_sans_ext(fname), "__migclim", ".feather") + feather::write_feather(ras, path = fname) + if(!is.Waiver(self$scenarios_migclim)){ + fname2 <- paste0( tools::file_path_sans_ext(fname), "__migclim", raster::extension(fname)) + feather::write_feather(ras, path = fname) + } + } + invisible() + } +) diff --git a/R/bdproto-distributionmodel.R b/R/bdproto-distributionmodel.R index 682c811d..f7acb522 100644 --- a/R/bdproto-distributionmodel.R +++ b/R/bdproto-distributionmodel.R @@ -1,421 +1,421 @@ -#' @include utils.R bdproto.R identifier.R -NULL - -#' @export -if (!methods::isClass("DistributionModel")) methods::setOldClass("DistributionModel") -NULL - -#' Prototype for the trained Model object -#' -#' All trained Models should inherit the options here -#' -#' @name DistributionModel-class -#' @aliases DistributionModel -#' @family bdproto -#' @keywords bdproto -NULL - -#' @export -DistributionModel <- bdproto( - "DistributionModel", - id = character(), # An id for any trained model - model = list(), - fits = list(), # List of fits with data - # Print message with summary of model - print = function(self) { - # TODO: Have a lot more information in here and to be prettified - - # Check whether prediction exists and number of layers - has_prediction <- "prediction" %in% self$show_rasters() - # Check whether threshold has been calculated - has_threshold <- grep('threshold',self$show_rasters(),value = TRUE)[1] - - # FIXME: Have engine-specific code moved to engine - if(inherits(self, 'INLA-Model') || inherits(self, 'INLABRU-Model') ){ - if( length( self$fits ) != 0 ){ - # Get strongest effects - ms <- subset(tidy_inla_summary(self$get_data('fit_best')), - select = c('variable', 'mean')) - ms <- ms[order(ms$mean,decreasing = TRUE),] # Sort - - message(paste0( - 'Trained ',class(self)[1],' (',self$show(),')', - '\n \033[2mStrongest summary effects:\033[22m', - '\n \033[34mPositive:\033[39m ', name_atomic(ms$variable[ms$mean>0]), - '\n \033[31mNegative:\033[39m ', name_atomic(ms$variable[ms$mean<0]), - ifelse(has_prediction, - paste0("\n Prediction fitted: ",text_green("yes")), - ""), - ifelse(!is.na(has_threshold), - paste0("\n Threshold created: ",text_green("yes")), - "") - )) - } - } else if( inherits(self, 'GDB-Model') ) { - - # Get Variable importance - vi <- mboost::varimp( - self$get_data('fit_best') - ) - vi <- sort( vi[which(vi>0)],decreasing = TRUE ) - - message(paste0( - 'Trained ',class(self)[1],' (',self$show(),')', - '\n \033[2mStrongest effects:\033[22m', - '\n ', name_atomic(names(vi)), - ifelse(has_prediction, - paste0("\n Prediction fitted: ",text_green("yes")), - ""), - ifelse(!is.na(has_threshold), - paste0("\n Threshold created: ",text_green("yes")), - "") - )) - } else if( inherits(self, 'BART-Model') ) { - # Calculate variable importance from the posterior trees - vi <- varimp.bart(self$get_data('fit_best')) - - message(paste0( - 'Trained ',class(self)[1],' (',self$show(),')', - '\n \033[2mStrongest effects:\033[22m', - '\n ', name_atomic(vi$names), - ifelse(has_prediction, - paste0("\n Prediction fitted: ",text_green("yes")), - ""), - ifelse(!is.na(has_threshold), - paste0("\n Threshold created: ",text_green("yes")), - "") - )) - } else if( inherits(self, 'STAN-Model') ) { - # 
Calculate variable importance from the posterior - vi <- rstan::summary(self$get_data('fit_best'))$summary |> as.data.frame() |> - tibble::rownames_to_column(var = "parameter") |> as.data.frame() - # Get beta coefficients only - vi <- vi[grep("beta", vi$parameter,ignore.case = TRUE),] - - # Get variable names from model object - # FIXME: This might not work for all possible modelling objects. For instance - model <- self$model - assertthat::assert_that(nrow(vi) == length(model$predictors_names), - length(vi$parameter) == length(model$predictors_names)) - vi$parameter <- model$predictors_names - - vi <- vi[order(abs(vi$mean),decreasing = TRUE),] - message(paste0( - 'Trained ',class(self)[1],' (',self$show(),')', - '\n \033[2mStrongest summary effects:\033[22m', - '\n \033[34mPositive:\033[39m ', name_atomic(vi$parameter[vi$mean>0]), - '\n \033[31mNegative:\033[39m ', name_atomic(vi$parameter[vi$mean<0]) - )) - } else if( inherits(self, 'XGBOOST-Model') ) { - vi <- xgboost::xgb.importance(model = self$get_data('fit_best'),) - - message(paste0( - 'Trained ',class(self)[1],' (',self$show(),')', - '\n \033[2mStrongest effects:\033[22m', - '\n ', name_atomic(vi$Feature), - ifelse(has_prediction, - paste0("\n Prediction fitted: ",text_green("yes")), - ""), - ifelse(!is.na(has_threshold), - paste0("\n Threshold created: ",text_green("yes")), - "") - )) - } else if( inherits(self, 'BREG-Model') ) { - obj <- self$get_data('fit_best') - # Summarize the beta coefficients from the posterior - ms <- posterior::summarise_draws(obj$beta) |> - subset(select = c('variable', 'mean')) - # Reorder - ms <- ms[order(ms$mean,decreasing = TRUE),] # Sort - - message(paste0( - 'Trained ',class(self)[1],' (',self$show(),')', - '\n \033[2mStrongest summary effects:\033[22m', - '\n \033[34mPositive:\033[39m ', name_atomic(ms$variable[ms$mean>0]), - '\n \033[31mNegative:\033[39m ', name_atomic(ms$variable[ms$mean<0]), - ifelse(has_prediction, - paste0("\n Prediction fitted: ",text_green("yes")), - ""), - ifelse(!is.na(has_threshold), - paste0("\n Threshold created: ",text_green("yes")), - "") - )) - } else if(inherits(self, 'GLMNET-Model')) { - obj <- self$get_data('fit_best') - - # Summarise coefficients within 1 standard deviation - ms <- tidy_glmnet_summary(obj) - - message(paste0( - 'Trained ',class(self)[1],' (',self$show(),')', - '\n \033[1mStrongest summary effects:\033[22m', - '\n \033[34mPositive:\033[39m ', name_atomic(ms$variable[ms$mean>0]), - '\n \033[31mNegative:\033[39m ', name_atomic(ms$variable[ms$mean<0]), - ifelse(has_prediction, - paste0("\n Prediction fitted: ",text_green("yes")), - ""), - ifelse(!is.na(has_threshold), - paste0("\n Threshold created: ",text_green("yes")), - "") - )) - - } else { - message(paste0( - 'Trained distribution model (',self$show(),')', - text_red('\n No fitted model found!'), - ifelse(has_prediction, - paste0("\n Prediction fitted: ",text_green("yes")), - ""), - ifelse(!is.na(has_threshold), - paste0("\n Threshold created: ",text_green("yes")), - "") - )) - } - }, - # Show the name of the Model - show = function(self) { - self$model$runname - }, - # Plot the prediction - plot = function(self, what = 'mean'){ - if( length( self$fits ) != 0 && !is.null( self$fits$prediction ) ){ - pred <- self$get_data('prediction') - assertthat::assert_that(is.Raster(pred)) - # Check if median is requested but not present, change to q50 - if(what == "median" && !(what %in% names(pred))) { what <- "q50" } - - # Match argument - what <- match.arg(what, names(pred), several.ok = FALSE) - 
assertthat::assert_that( what %in% names(pred),msg = paste0('Prediction type not found. Available: ', paste0(names(pred),collapse = '|'))) - raster::plot(pred[[what]], - main = paste0(self$model$runname, ' prediction (',what,')'), - box = FALSE, - axes = TRUE, - colNA = NA, col = ibis_colours[['sdm_colour']] - ) - } else { - message( - paste0('No model predictions found.') - ) - } - }, - # Plot threshold - plot_threshold = function(self, what = 1){ - assertthat::assert_that(is.numeric(what) || is.character(what)) - # Determines whether a threshold exists and plots it - rl <- self$show_rasters() - if(length(grep('threshold',rl))>0){ - - # Get stack of computed thresholds - ras <- raster::stack( self$get_data( grep('threshold',rl,value = TRUE)[[what]] ) ) - suppressWarnings( - ras <- raster::deratify(ras, complete = TRUE) - ) - # Get colour palette - format <- attr(ras[[1]], 'format') # Format attribute - if(format == "normalize"){ - col <- colorRampPalette(c("grey","#EB072F","#FFE900","#5A94DD","black"))(100) - } else if(format == "percentile") { - col <- colorRampPalette(c("grey","#EB072F","#FFE900","#5A94DD","black"))(length(unique(ras))) - } else { - # Binary - col <- c("grey", "black") - } - raster::plot(ras, - box = FALSE, - axes = TRUE, - colNA = NA, col = col - ) - } else { - message("No computed threshold was found!") - invisible() - } - }, - # Show model run time if settings exist - show_duration = function(self){ - if(!is.Waiver(self$settings)) self$settings$duration() - }, - # Get effects or importance tables from model - summary = function(self, obj = 'fit_best'){ - # Distinguishing between model types - if(inherits(self, 'GDB-Model')){ - clean_mboost_summary( self$get_data(obj) ) - } else if(inherits(self, 'INLA-Model') || inherits(self, 'INLABRU-Model')){ - tidy_inla_summary(self$get_data(obj)) - } else if(inherits(self, 'BART-Model')){ - # Number of times each variable is used by a tree split - # Tends to become less informative with higher numbers of splits - varimp.bart(self$get_data(obj)) %>% tibble::remove_rownames() - } else if(inherits(self, 'STAN-Model')){ - vi <- rstan::summary(self$get_data(obj))$summary |> as.data.frame() |> - tibble::rownames_to_column(var = "parameter") |> as.data.frame() - # Get beta coefficients only - vi <- vi[grep("beta", vi$parameter,ignore.case = TRUE),] - # FIXME: This might not work for all possible modelling objects. For instance - model <- self$model - assertthat::assert_that(nrow(vi) == length(model$predictors_names), - length(vi$parameter) == length(model$predictors_names)) - vi$parameter <- model$predictors_names - names(vi) <- make.names(names(vi)) - return( tibble::as_tibble( vi ) ) - } else if(inherits(self, 'BREG-Model')){ - posterior::summarise_draws(self$get_data(obj)$beta) - } else if(inherits(self, "XGBOOST-Model")){ - xgboost::xgb.importance(model = self$get_data(obj)) - } else if(inherits(self, 'GLMNET-Model')){ - tidy_glmnet_summary(self$get_data(obj)) - } - }, - # Dummy partial response calculation. To be overwritten per engine - partial = function(self){ - new_waiver() - }, - # Dummy spartial response calculation. 
To be overwritten per engine - spartial = function(self){ - new_waiver() - }, - # Generic plotting function for effect plots - effects = function(self, x = 'fit_best', what = 'fixed', ...){ - assertthat::assert_that(is.character(what)) - if(inherits(self, 'GDB-Model')){ - # How many effects - n <- length( coef( self$get_data(x) )) - # Use the base plotting - par.ori <- par(no.readonly = TRUE) - par(mfrow = c(ceiling(n/3),3)) - - mboost:::plot.mboost(x = self$get_data(x), - type = 'b',cex.axis=1.5, cex.lab=1.5) - - par(par.ori)#dev.off() - } else if(inherits(self, 'INLA-Model')) { - plot_inla_marginals(self$get_data(x),what = what) - } else if(inherits(self, 'GLMNET-Model')) { - if(what == "fixed"){ - glmnet:::plot.glmnet(self$get_data(x)$glmnet.fit, xvar = "lambda") # Deviance explained - } else{ plot(self$get_data(x)) } - } else if(inherits(self, 'STAN-Model')) { - # Get true beta parameters - ra <- grep("beta", names(self$get_data(x)),value = TRUE) # Get range - rstan::stan_plot(self$get_data(x), pars = ra) - } else if(inherits(self, 'INLABRU-Model')) { - # Use inlabru effect plot - ggplot2::ggplot() + - inlabru:::gg(self$get_data(x)$summary.fixed, bar = TRUE) - } else if(inherits(self, 'BART-Model')){ - message('Calculating partial dependence plots') - self$partial(self$get_data(x), x.vars = what, ...) - } else if(inherits(self, 'BREG-Model')){ - obj <- self$get_data(x) - if(what == "fixed") what <- "coefficients" - what <- match.arg(what, choices = c("coefficients", "scaled.coefficients","residuals", - "size", "fit", "help", "inclusion"), several.ok = FALSE) - if( length( grep("poisson", obj$call) ) > 0 ){ - BoomSpikeSlab::plot.poisson.spike(obj, y = what) - } else if( length( grep("binomial", obj$call) ) > 0 ){ - BoomSpikeSlab::plot.logit.spike(obj, y = what) - } else { - BoomSpikeSlab::plot.lm.spike(obj, y = what) - } - } else if(inherits(self, "XGBOOST-Model")){ - # Check whether linear model was fitted, otherwise plot tree - if( self$settings$get("only_linear") ){ - vi <- self$summary(x) - xgboost::xgb.ggplot.importance(vi) - } else { - obj <- self$get_data(x) - xgboost::xgb.plot.multi.trees(obj) - } - } else { - self$partial(self$get_data(x), x.vars = NULL) - } - }, - # Get equation - get_equation = function(self){ - self$get_data("fit_best_equation") - }, - # Get specific fit from this Model - get_data = function(self, x = "prediction") { - if (!x %in% names(self$fits)) - return(new_waiver()) - return(self$fits[[x]]) - }, - # Set fit for this Model - set_data = function(self, x, value) { - # Get biodiversity dataset collection - ff <- self$fits - # Set the object - ff[[x]] <- value - bdproto(NULL, self, fits = ff ) - }, - # Get the threshold value if calculated - get_thresholdvalue = function(self){ - # Determines whether a threshold exists and plots it - rl <- self$show_rasters() - if(length(grep('threshold',rl))==0) return( new_waiver() ) - - # Get the thresholded layer and return the respective attribute - obj <- self$get_data( grep('threshold',rl,value = TRUE) ) - assertthat::assert_that(assertthat::has_attr(obj, "threshold")) - return( - attr(obj, "threshold") - ) - }, - # List all rasters in object - show_rasters = function(self){ - rn <- names(self$fits) - rn <- rn[ which( sapply(rn, function(x) is.Raster(self$get_data(x)) ) ) ] - return(rn) - }, - # Get projection - get_projection = function(self){ - sf::st_crs(self$model$background) - }, - # Get resolution - get_resolution = function(self){ - if(!is.Waiver(self$get_data())){ - raster::res( self$get_data() ) - } else { - 
# Try to get it from the modelling object - self$model$predictors_object$get_resolution() - } - }, - # Remove calculated thresholds - rm_threshold = function(self){ - rl <- self$show_rasters() - if(length(grep('threshold',rl))>0){ - for(val in grep('threshold',rl,value = TRUE)){ - self$fits[[val]] <- NULL - } - } - invisible() - }, - # Save object - save = function(self, fname, type = 'gtif', dt = 'FLT4S'){ - assertthat::assert_that( - is.character(fname), - type %in% c('gtif','gtiff','tif','nc','ncdf'), - 'fits' %in% self$ls(), - dt %in% c('LOG1S','INT1S','INT1U','INT2S','INT2U','INT4S','INT4U','FLT4S','FLT8S') - ) - type <- tolower(type) - - # Get raster file in fitted object - cl <- sapply(self$fits, class) - ras <- self$fits[[grep('raster', cl,ignore.case = T)]] - - # Check that no-data value is not present in ras - assertthat::assert_that(any(!cellStats(ras,min) <= -9999),msg = 'No data value -9999 is potentially in prediction!') - - if(file.exists(fname)) warning('Overwritting existing file...') - if(type %in% c('gtif','gtiff','tif')){ - # Save as geotiff - writeGeoTiff(ras, fname = fname, dt = dt) - } else if(type %in% c('nc','ncdf')) { - # Save as netcdf - # TODO: Potentially change the unit descriptions - writeNetCDF(ras, fname = fname, varName = 'iSDM prediction', varUnit = "",varLong = "") - } - invisible() - } -) +#' @include utils.R bdproto.R identifier.R +NULL + +#' @export +if (!methods::isClass("DistributionModel")) methods::setOldClass("DistributionModel") +NULL + +#' Prototype for the trained Model object +#' +#' All trained Models should inherit the options here +#' +#' @name DistributionModel-class +#' @aliases DistributionModel +#' @family bdproto +#' @keywords bdproto +NULL + +#' @export +DistributionModel <- bdproto( + "DistributionModel", + id = character(), # An id for any trained model + model = list(), + fits = list(), # List of fits with data + # Print message with summary of model + print = function(self) { + # TODO: Have a lot more information in here and to be prettified + + # Check whether prediction exists and number of layers + has_prediction <- "prediction" %in% self$show_rasters() + # Check whether threshold has been calculated + has_threshold <- grep('threshold',self$show_rasters(),value = TRUE)[1] + + # FIXME: Have engine-specific code moved to engine + if(inherits(self, 'INLA-Model') || inherits(self, 'INLABRU-Model') ){ + if( length( self$fits ) != 0 ){ + # Get strongest effects + ms <- subset(tidy_inla_summary(self$get_data('fit_best')), + select = c('variable', 'mean')) + ms <- ms[order(ms$mean,decreasing = TRUE),] # Sort + + message(paste0( + 'Trained ',class(self)[1],' (',self$show(),')', + '\n \033[2mStrongest summary effects:\033[22m', + '\n \033[34mPositive:\033[39m ', name_atomic(ms$variable[ms$mean>0]), + '\n \033[31mNegative:\033[39m ', name_atomic(ms$variable[ms$mean<0]), + ifelse(has_prediction, + paste0("\n Prediction fitted: ",text_green("yes")), + ""), + ifelse(!is.na(has_threshold), + paste0("\n Threshold created: ",text_green("yes")), + "") + )) + } + } else if( inherits(self, 'GDB-Model') ) { + + # Get Variable importance + vi <- mboost::varimp( + self$get_data('fit_best') + ) + vi <- sort( vi[which(vi>0)],decreasing = TRUE ) + + message(paste0( + 'Trained ',class(self)[1],' (',self$show(),')', + '\n \033[2mStrongest effects:\033[22m', + '\n ', name_atomic(names(vi)), + ifelse(has_prediction, + paste0("\n Prediction fitted: ",text_green("yes")), + ""), + ifelse(!is.na(has_threshold), + paste0("\n Threshold created: 
",text_green("yes")), + "") + )) + } else if( inherits(self, 'BART-Model') ) { + # Calculate variable importance from the posterior trees + vi <- varimp.bart(self$get_data('fit_best')) + + message(paste0( + 'Trained ',class(self)[1],' (',self$show(),')', + '\n \033[2mStrongest effects:\033[22m', + '\n ', name_atomic(vi$names), + ifelse(has_prediction, + paste0("\n Prediction fitted: ",text_green("yes")), + ""), + ifelse(!is.na(has_threshold), + paste0("\n Threshold created: ",text_green("yes")), + "") + )) + } else if( inherits(self, 'STAN-Model') ) { + # Calculate variable importance from the posterior + vi <- rstan::summary(self$get_data('fit_best'))$summary |> as.data.frame() |> + tibble::rownames_to_column(var = "parameter") |> as.data.frame() + # Get beta coefficients only + vi <- vi[grep("beta", vi$parameter,ignore.case = TRUE),] + + # Get variable names from model object + # FIXME: This might not work for all possible modelling objects. For instance + model <- self$model + assertthat::assert_that(nrow(vi) == length(model$predictors_names), + length(vi$parameter) == length(model$predictors_names)) + vi$parameter <- model$predictors_names + + vi <- vi[order(abs(vi$mean),decreasing = TRUE),] + message(paste0( + 'Trained ',class(self)[1],' (',self$show(),')', + '\n \033[2mStrongest summary effects:\033[22m', + '\n \033[34mPositive:\033[39m ', name_atomic(vi$parameter[vi$mean>0]), + '\n \033[31mNegative:\033[39m ', name_atomic(vi$parameter[vi$mean<0]) + )) + } else if( inherits(self, 'XGBOOST-Model') ) { + vi <- xgboost::xgb.importance(model = self$get_data('fit_best'),) + + message(paste0( + 'Trained ',class(self)[1],' (',self$show(),')', + '\n \033[2mStrongest effects:\033[22m', + '\n ', name_atomic(vi$Feature), + ifelse(has_prediction, + paste0("\n Prediction fitted: ",text_green("yes")), + ""), + ifelse(!is.na(has_threshold), + paste0("\n Threshold created: ",text_green("yes")), + "") + )) + } else if( inherits(self, 'BREG-Model') ) { + obj <- self$get_data('fit_best') + # Summarize the beta coefficients from the posterior + ms <- posterior::summarise_draws(obj$beta) |> + subset(select = c('variable', 'mean')) + # Reorder + ms <- ms[order(ms$mean,decreasing = TRUE),] # Sort + + message(paste0( + 'Trained ',class(self)[1],' (',self$show(),')', + '\n \033[2mStrongest summary effects:\033[22m', + '\n \033[34mPositive:\033[39m ', name_atomic(ms$variable[ms$mean>0]), + '\n \033[31mNegative:\033[39m ', name_atomic(ms$variable[ms$mean<0]), + ifelse(has_prediction, + paste0("\n Prediction fitted: ",text_green("yes")), + ""), + ifelse(!is.na(has_threshold), + paste0("\n Threshold created: ",text_green("yes")), + "") + )) + } else if(inherits(self, 'GLMNET-Model')) { + obj <- self$get_data('fit_best') + + # Summarise coefficients within 1 standard deviation + ms <- tidy_glmnet_summary(obj) + + message(paste0( + 'Trained ',class(self)[1],' (',self$show(),')', + '\n \033[1mStrongest summary effects:\033[22m', + '\n \033[34mPositive:\033[39m ', name_atomic(ms$variable[ms$mean>0]), + '\n \033[31mNegative:\033[39m ', name_atomic(ms$variable[ms$mean<0]), + ifelse(has_prediction, + paste0("\n Prediction fitted: ",text_green("yes")), + ""), + ifelse(!is.na(has_threshold), + paste0("\n Threshold created: ",text_green("yes")), + "") + )) + + } else { + message(paste0( + 'Trained distribution model (',self$show(),')', + text_red('\n No fitted model found!'), + ifelse(has_prediction, + paste0("\n Prediction fitted: ",text_green("yes")), + ""), + ifelse(!is.na(has_threshold), + paste0("\n Threshold 
created: ",text_green("yes")), + "") + )) + } + }, + # Show the name of the Model + show = function(self) { + self$model$runname + }, + # Plot the prediction + plot = function(self, what = 'mean'){ + if( length( self$fits ) != 0 && !is.null( self$fits$prediction ) ){ + pred <- self$get_data('prediction') + assertthat::assert_that(is.Raster(pred)) + # Check if median is requested but not present, change to q50 + if(what == "median" && !(what %in% names(pred))) { what <- "q50" } + + # Match argument + what <- match.arg(what, names(pred), several.ok = FALSE) + assertthat::assert_that( what %in% names(pred),msg = paste0('Prediction type not found. Available: ', paste0(names(pred),collapse = '|'))) + raster::plot(pred[[what]], + main = paste0(self$model$runname, ' prediction (',what,')'), + box = FALSE, + axes = TRUE, + colNA = NA, col = ibis_colours[['sdm_colour']] + ) + } else { + message( + paste0('No model predictions found.') + ) + } + }, + # Plot threshold + plot_threshold = function(self, what = 1){ + assertthat::assert_that(is.numeric(what) || is.character(what)) + # Determines whether a threshold exists and plots it + rl <- self$show_rasters() + if(length(grep('threshold',rl))>0){ + + # Get stack of computed thresholds + ras <- raster::stack( self$get_data( grep('threshold',rl,value = TRUE)[[what]] ) ) + suppressWarnings( + ras <- raster::deratify(ras, complete = TRUE) + ) + # Get colour palette + format <- attr(ras[[1]], 'format') # Format attribute + if(format == "normalize"){ + col <- colorRampPalette(c("grey","#EB072F","#FFE900","#5A94DD","black"))(100) + } else if(format == "percentile") { + col <- colorRampPalette(c("grey","#EB072F","#FFE900","#5A94DD","black"))(length(unique(ras))) + } else { + # Binary + col <- c("grey", "black") + } + raster::plot(ras, + box = FALSE, + axes = TRUE, + colNA = NA, col = col + ) + } else { + message("No computed threshold was found!") + invisible() + } + }, + # Show model run time if settings exist + show_duration = function(self){ + if(!is.Waiver(self$settings)) self$settings$duration() + }, + # Get effects or importance tables from model + summary = function(self, obj = 'fit_best'){ + # Distinguishing between model types + if(inherits(self, 'GDB-Model')){ + clean_mboost_summary( self$get_data(obj) ) + } else if(inherits(self, 'INLA-Model') || inherits(self, 'INLABRU-Model')){ + tidy_inla_summary(self$get_data(obj)) + } else if(inherits(self, 'BART-Model')){ + # Number of times each variable is used by a tree split + # Tends to become less informative with higher numbers of splits + varimp.bart(self$get_data(obj)) |> tibble::remove_rownames() + } else if(inherits(self, 'STAN-Model')){ + vi <- rstan::summary(self$get_data(obj))$summary |> as.data.frame() |> + tibble::rownames_to_column(var = "parameter") |> as.data.frame() + # Get beta coefficients only + vi <- vi[grep("beta", vi$parameter,ignore.case = TRUE),] + # FIXME: This might not work for all possible modelling objects. 
For instance + model <- self$model + assertthat::assert_that(nrow(vi) == length(model$predictors_names), + length(vi$parameter) == length(model$predictors_names)) + vi$parameter <- model$predictors_names + names(vi) <- make.names(names(vi)) + return( tibble::as_tibble( vi ) ) + } else if(inherits(self, 'BREG-Model')){ + posterior::summarise_draws(self$get_data(obj)$beta) + } else if(inherits(self, "XGBOOST-Model")){ + xgboost::xgb.importance(model = self$get_data(obj)) + } else if(inherits(self, 'GLMNET-Model')){ + tidy_glmnet_summary(self$get_data(obj)) + } + }, + # Dummy partial response calculation. To be overwritten per engine + partial = function(self){ + new_waiver() + }, + # Dummy spartial response calculation. To be overwritten per engine + spartial = function(self){ + new_waiver() + }, + # Generic plotting function for effect plots + effects = function(self, x = 'fit_best', what = 'fixed', ...){ + assertthat::assert_that(is.character(what)) + if(inherits(self, 'GDB-Model')){ + # How many effects + n <- length( stats::coef( self$get_data(x) )) + # Use the base plotting + par.ori <- graphics::par(no.readonly = TRUE) + graphics::par(mfrow = c(ceiling(n/3),3)) + + mboost:::plot.mboost(x = self$get_data(x), + type = 'b',cex.axis=1.5, cex.lab=1.5) + + graphics::par(par.ori)#dev.off() + } else if(inherits(self, 'INLA-Model')) { + plot_inla_marginals(self$get_data(x),what = what) + } else if(inherits(self, 'GLMNET-Model')) { + if(what == "fixed"){ + glmnet:::plot.glmnet(self$get_data(x)$glmnet.fit, xvar = "lambda") # Deviance explained + } else{ plot(self$get_data(x)) } + } else if(inherits(self, 'STAN-Model')) { + # Get true beta parameters + ra <- grep("beta", names(self$get_data(x)),value = TRUE) # Get range + rstan::stan_plot(self$get_data(x), pars = ra) + } else if(inherits(self, 'INLABRU-Model')) { + # Use inlabru effect plot + ggplot2::ggplot() + + inlabru::gg(self$get_data(x)$summary.fixed, bar = TRUE) + } else if(inherits(self, 'BART-Model')){ + message('Calculating partial dependence plots') + self$partial(self$get_data(x), x.vars = what, ...) 
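# Sketch of the beta extraction used in the STAN branch above, on a
# hypothetical 'stanfit' object: rstan::summary() returns a matrix whose
# rownames are the parameter names, so the betas can be pulled by pattern.
vi <- as.data.frame(rstan::summary(stanfit)$summary)
vi <- tibble::rownames_to_column(vi, var = "parameter")
vi[grep("beta", vi$parameter, ignore.case = TRUE), c("parameter", "mean", "sd")]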
+ } else if(inherits(self, 'BREG-Model')){
+ obj <- self$get_data(x)
+ if(what == "fixed") what <- "coefficients"
+ what <- match.arg(what, choices = c("coefficients", "scaled.coefficients","residuals",
+ "size", "fit", "help", "inclusion"), several.ok = FALSE)
+ if( length( grep("poisson", obj$call) ) > 0 ){
+ BoomSpikeSlab::plot.poisson.spike(obj, y = what)
+ } else if( length( grep("binomial", obj$call) ) > 0 ){
+ BoomSpikeSlab::plot.logit.spike(obj, y = what)
+ } else {
+ BoomSpikeSlab::plot.lm.spike(obj, y = what)
+ }
+ } else if(inherits(self, "XGBOOST-Model")){
+ # Check whether linear model was fitted, otherwise plot tree
+ if( self$settings$get("only_linear") ){
+ vi <- self$summary(x)
+ xgboost::xgb.ggplot.importance(vi)
+ } else {
+ obj <- self$get_data(x)
+ xgboost::xgb.plot.multi.trees(obj)
+ }
+ } else {
+ self$partial(self$get_data(x), x.vars = NULL)
+ }
+ },
+ # Get equation
+ get_equation = function(self){
+ self$get_data("fit_best_equation")
+ },
+ # Get specific fit from this Model
+ get_data = function(self, x = "prediction") {
+ if (!x %in% names(self$fits))
+ return(new_waiver())
+ return(self$fits[[x]])
+ },
+ # Set fit for this Model
+ set_data = function(self, x, value) {
+ # Get the current list of model fits
+ ff <- self$fits
+ # Set the object
+ ff[[x]] <- value
+ bdproto(NULL, self, fits = ff )
+ },
+ # Get the threshold value if calculated
+ get_thresholdvalue = function(self){
+ # Determine whether a threshold exists and return its value
+ rl <- self$show_rasters()
+ if(length(grep('threshold',rl))==0) return( new_waiver() )
+
+ # Get the thresholded layer and return the respective attribute
+ obj <- self$get_data( grep('threshold',rl,value = TRUE) )
+ assertthat::assert_that(assertthat::has_attr(obj, "threshold"))
+ return(
+ attr(obj, "threshold")
+ )
+ },
+ # List all rasters in object
+ show_rasters = function(self){
+ rn <- names(self$fits)
+ rn <- rn[ which( sapply(rn, function(x) is.Raster(self$get_data(x)) ) ) ]
+ return(rn)
+ },
+ # Get projection
+ get_projection = function(self){
+ sf::st_crs(self$model$background)
+ },
+ # Get resolution
+ get_resolution = function(self){
+ if(!is.Waiver(self$get_data())){
+ raster::res( self$get_data() )
+ } else {
+ # Try to get it from the modelling object
+ self$model$predictors_object$get_resolution()
+ }
+ },
+ # Remove calculated thresholds
+ rm_threshold = function(self){
+ rl <- self$show_rasters()
+ if(length(grep('threshold',rl))>0){
+ for(val in grep('threshold',rl,value = TRUE)){
+ self$fits[[val]] <- NULL
+ }
+ }
+ invisible()
+ },
+ # Save object
+ save = function(self, fname, type = 'gtif', dt = 'FLT4S'){
+ assertthat::assert_that(
+ is.character(fname),
+ type %in% c('gtif','gtiff','tif','nc','ncdf'),
+ 'fits' %in% self$ls(),
+ dt %in% c('LOG1S','INT1S','INT1U','INT2S','INT2U','INT4S','INT4U','FLT4S','FLT8S')
+ )
+ type <- tolower(type)
+
+ # Get raster file in fitted object
+ cl <- sapply(self$fits, class)
+ ras <- self$fits[[grep('raster', cl, ignore.case = TRUE)]]
+
+ # Check that the no-data value is not present in ras
+ assertthat::assert_that(all(raster::cellStats(ras, min) > -9999), msg = 'No data value -9999 is potentially in prediction!')
+
+ if(file.exists(fname)) warning('Overwriting existing file...')
+ if(type %in% c('gtif','gtiff','tif')){
+ # Save as geotiff
+ writeGeoTiff(ras, fname = fname, dt = dt)
+ } else if(type %in% c('nc','ncdf')) {
+ # Save as netcdf
+ # TODO: Potentially change the unit descriptions
+ writeNetCDF(ras, fname = fname, varName = 'iSDM prediction', varUnit = "",varLong = "")
+ }
+ 
invisible() + } +) diff --git a/R/bdproto-predictors.R b/R/bdproto-predictors.R index eb4fa23b..aedf4a6c 100644 --- a/R/bdproto-predictors.R +++ b/R/bdproto-predictors.R @@ -1,178 +1,178 @@ -#' @include utils.R waiver.R bdproto.R -NULL - -#' @export -if (!methods::isClass("PredictorDataset")) methods::setOldClass("PredictorDataset") -NULL - -#' PredictorDataset prototype description -#' -#' @name PredictorDataset-class -#' @aliases PredictorDataset -#' @family bdproto -#' @keywords bdproto -NULL - -#' @export -PredictorDataset <- bdproto( - "PredictorDataset", - id = character(0), - data = new_waiver(), - # Printing function - print = function(self){ - # Getting names and time periods if set - nn <- name_atomic(self$get_names(), "predictors") - # Get Time dimension if existing - tt <- self$get_time() - if(!(is.Waiver(tt) || is.null(tt))) tt <- paste0(range(tt),collapse = " <> ") else tt <- NULL - message(paste0(self$name(),':', - '\n Name(s): ',nn, - ifelse(!is.null(tt), paste0("\n Timeperiod: ", tt), "") - ) - ) - }, - # Return name - name = function(self){ - 'Predictor dataset' - }, - # Get Id - id = function(self){ - self$id - }, - # Get names of data - get_names = function(self){ - names(self$get_data()) - }, - # Get data - get_data = function(self, df = FALSE, na.rm = TRUE, ...){ - if(df) { - if(any(is.factor(self$data))){ - # Bugs for factors, so need - out <- self$data[] |> as.data.frame() - out[,which(is.factor(self$data))] <- factor( out[,which(is.factor(self$data))] ) # Reformat factors variables - cbind(raster::coordinates(self$data), out ) # Attach coordinates and return - } else { - raster::as.data.frame(self$data, xy = TRUE, na.rm = na.rm, ...) - } - } else self$data - }, - # Get time dimension - get_time = function(self, ...){ - # Get data - d <- self$get_data() - if(is.Waiver(d)) return(new_waiver()) - if(!inherits(d, 'stars')){ - # Try and get a z dimension from the raster object - raster::getZ(d) - } else { - # Get dimensions - o <- stars::st_dimensions(d) - # Take third entry as the one likely to be the time variable - return( - to_POSIXct( - stars::st_get_dimension_values(d, names(o)[3], center = TRUE) - ) - ) - } - }, - # Get Projection - get_projection = function(self){ - assertthat::assert_that(is.Raster(self$data) || inherits(self$data,'stars')) - sf::st_crs(self$data) - }, - # Get Resolution - get_resolution = function(self){ - assertthat::assert_that(is.Raster(self$data) || inherits(self$data,'stars')) - if(is.Raster(self$data)){ - raster::res(self$data) - } else { - stars::st_res(self$data) - } - }, - # Clip the predictor dataset by another dataset - crop_data = function(self, pol){ - assertthat::assert_that(is.Raster(self$data) || inherits(self$data,'stars'), - inherits(pol, 'sf'), - all( unique(sf::st_geometry_type(pol)) %in% c("POLYGON","MULTIPOLYGON") ) - ) - self$data <- raster::crop(self$data, pol) - invisible() - }, - # Add a new Predictor dataset to this collection - set_data = function(self, x, value){ - assertthat::assert_that(assertthat::is.string(x), - is.Raster(value), - is_comparable_raster(self$get_data(), value)) - bdproto(NULL, self, data = addLayer(self$get_data(), value)) - }, - # Remove a specific Predictor by name - rm_data = function(self, x) { - assertthat::assert_that(is.vector(x) || is.character(x), - all(x %in% names(self$get_data())) - ) - # Match indices - ind <- match(x, self$get_names()) - if(is.Raster(self$get_data() )){ - # Overwrite predictor dataset - self$data <- raster::dropLayer(self$get_data(), ind) - } else { - 
suppressWarnings(
-        self$data <- stars:::select.stars(self$data, -ind)
-      )
-    }
-    invisible()
-  },
-  # Print input messages
-  show = function(self) {
-    self$print()
-  },
-  # Collect info statistics with optional decimals
-  summary = function(self, digits = 2) {
-    # Get data
-    d <- self$get_data()
-    if(is.Waiver(d)) return(NULL)
-    # Need special handling if there any factors
-    if(any(is.factor(self$get_data()))){
-      out <- self$get_data()[] |> as.data.frame()
-      out[,which(is.factor(self$data))] <- factor( out[,which(is.factor(self$data))] ) # Reformat factors variables
-      summary(out, digits = digits)
-    } else {
-      if(inherits(d, 'stars')){
-        return(
-          summary(stars:::as.data.frame.stars(d))
-        )
-      } else {
-        # Assume raster
-        return(
-          round(
-            raster::summary( d ), digits = digits
-          )
-        )
-      }
-    }
-    rm(d)
-  },
-  # Has derivates?
-  has_derivates = function(self){
-    if(inherits(self$get_data(),'Raster'))
-      return(
-        length( grep("hinge__|bin__|quad__|thresh__", self$get_names() ) ) > 0
-      )
-    else
-      return( NULL )
-  },
-  # Number of Predictors in object
-  length = function(self) {
-    if(inherits(self$get_data(),'Raster'))
-      raster::nlayers(self$get_data())
-    else
-      ncol(self$get_data)
-  },
-  # Basic Plotting function
-  plot = function(self){
-    # Plot the predictors
-    par.ori <- par(no.readonly = TRUE)
-    raster::plot( self$data, col = ibis_colours[['viridis_cividis']] )
-    par(par.ori)
-  }
-)
+#' @include utils.R waiver.R bdproto.R
+NULL
+
+#' @export
+if (!methods::isClass("PredictorDataset")) methods::setOldClass("PredictorDataset")
+NULL
+
+#' PredictorDataset prototype description
+#'
+#' @name PredictorDataset-class
+#' @aliases PredictorDataset
+#' @family bdproto
+#' @keywords bdproto
+NULL
+
+#' @export
+PredictorDataset <- bdproto(
+  "PredictorDataset",
+  id = character(0),
+  data = new_waiver(),
+  # Printing function
+  print = function(self){
+    # Getting names and time periods if set
+    nn <- name_atomic(self$get_names(), "predictors")
+    # Get Time dimension if existing
+    tt <- self$get_time()
+    if(!(is.Waiver(tt) || is.null(tt))) tt <- paste0(range(tt),collapse = " <> ") else tt <- NULL
+    message(paste0(self$name(),':',
+                   '\n Name(s): ',nn,
+                   ifelse(!is.null(tt), paste0("\n Timeperiod: ", tt), "")
+    )
+    )
+  },
+  # Return name
+  name = function(self){
+    'Predictor dataset'
+  },
+  # Get Id
+  id = function(self){
+    self$id
+  },
+  # Get names of data
+  get_names = function(self){
+    names(self$get_data())
+  },
+  # Get data
+  get_data = function(self, df = FALSE, na.rm = TRUE, ...){
+    if(df) {
+      if(any(is.factor(self$data))){
+        # raster's as.data.frame handling of factors is buggy, so convert manually
+        out <- self$data[] |> as.data.frame()
+        out[,which(is.factor(self$data))] <- factor( out[,which(is.factor(self$data))] ) # Reformat factor variables
+        cbind(raster::coordinates(self$data), out ) # Attach coordinates and return
+      } else {
+        raster::as.data.frame(self$data, xy = TRUE, na.rm = na.rm, ...)
+      }
+    } else self$data
+  },
+  # Get time dimension
+  get_time = function(self, ...){
+    # Get data
+    d <- self$get_data()
+    if(is.Waiver(d)) return(new_waiver())
+    if(!inherits(d, 'stars')){
+      # Try and get a z dimension from the raster object
+      raster::getZ(d)
+    } else {
+      # Get dimensions
+      o <- stars::st_dimensions(d)
+      # Take the third entry as the one most likely to be the time variable
+      return(
+        to_POSIXct(
+          stars::st_get_dimension_values(d, names(o)[3], center = TRUE)
+        )
+      )
+    }
+  },
+  # Get Projection
+  get_projection = function(self){
+    assertthat::assert_that(is.Raster(self$data) || inherits(self$data,'stars'))
+    sf::st_crs(self$data)
+  },
+  # Get Resolution
+  get_resolution = function(self){
+    assertthat::assert_that(is.Raster(self$data) || inherits(self$data,'stars'))
+    if(is.Raster(self$data)){
+      raster::res(self$data)
+    } else {
+      stars::st_res(self$data)
+    }
+  },
+  # Crop the predictor dataset to a polygon
+  crop_data = function(self, pol){
+    assertthat::assert_that(is.Raster(self$data) || inherits(self$data,'stars'),
+                            inherits(pol, 'sf'),
+                            all( unique(sf::st_geometry_type(pol)) %in% c("POLYGON","MULTIPOLYGON") )
+    )
+    self$data <- raster::crop(self$data, pol)
+    invisible()
+  },
+  # Add a new Predictor dataset to this collection
+  set_data = function(self, x, value){
+    assertthat::assert_that(assertthat::is.string(x),
+                            is.Raster(value),
+                            is_comparable_raster(self$get_data(), value))
+    bdproto(NULL, self, data = raster::addLayer(self$get_data(), value))
+  },
+  # Remove a specific Predictor by name
+  rm_data = function(self, x) {
+    assertthat::assert_that(is.vector(x) || is.character(x),
+                            all(x %in% names(self$get_data()))
+    )
+    # Match indices
+    ind <- match(x, self$get_names())
+    if(is.Raster(self$get_data() )){
+      # Overwrite predictor dataset
+      self$data <- raster::dropLayer(self$get_data(), ind)
+    } else {
+      suppressWarnings(
+        self$data <- stars:::select.stars(self$data, -ind)
+      )
+    }
+    invisible()
+  },
+  # Print input messages
+  show = function(self) {
+    self$print()
+  },
+  # Collect info statistics with optional decimals
+  summary = function(self, digits = 2) {
+    # Get data
+    d <- self$get_data()
+    if(is.Waiver(d)) return(NULL)
+    # Need special handling if there are any factors
+    if(any(is.factor(self$get_data()))){
+      out <- self$get_data()[] |> as.data.frame()
+      out[,which(is.factor(self$data))] <- factor( out[,which(is.factor(self$data))] ) # Reformat factor variables
+      summary(out, digits = digits)
+    } else {
+      if(inherits(d, 'stars')){
+        return(
+          summary(stars:::as.data.frame.stars(d))
+        )
+      } else {
+        # Assume raster
+        return(
+          round(
+            raster::summary( d ), digits = digits
+          )
+        )
+      }
+    }
+    rm(d)
+  },
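+  # Usage sketch (hedged; assumes `pd` is a PredictorDataset, e.g. the
+  # predictors_object of a model set up via add_predictors()):
+  #   pd$get_names()               # predictor/layer names
+  #   head(pd$get_data(df = TRUE)) # as data.frame incl. x/y coordinates
+  #   pd$summary()                 # per-variable summary statistics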
+  # Has derivates (derived predictor variables such as hinge or quadratic transforms)?
+  has_derivates = function(self){
+    if(inherits(self$get_data(),'Raster'))
+      return(
+        length( grep("hinge__|bin__|quad__|thresh__", self$get_names() ) ) > 0
+      )
+    else
+      return( NULL )
+  },
+  # Number of Predictors in object
+  length = function(self) {
+    if(inherits(self$get_data(),'Raster'))
+      raster::nlayers(self$get_data())
+    else
+      ncol(self$get_data()) # Call get_data(); ncol() on the unevaluated function would error
+  },
+  # Basic Plotting function
+  plot = function(self){
+    # Plot the predictors
+    par.ori <- graphics::par(no.readonly = TRUE)
+    raster::plot( self$data, col = ibis_colours[['viridis_cividis']] )
+    graphics::par(par.ori)
+  }
+)
diff --git a/R/effects.R b/R/effects.R
index e9c83d2d..cbe67bd7 100644
--- a/R/effects.R
+++ b/R/effects.R
@@ -9,7 +9,8 @@ NULL
 #' @note
 #' For some models, where default coefficients plots are not available,
 #' this function will attempt to generate [partial] dependency plots instead.
-#' @param x Any fitted [distribution] object.
+#' @param object Any fitted [distribution] object.
+#' @param ... Not used.
 #'
 #' @examples
 #' \dontrun{
@@ -18,7 +19,6 @@ NULL
 #' }
 #' @return None.
 #' @keywords partial
-#' @importFrom stats effects
 #' @name effects
 NULL
@@ -26,4 +26,4 @@ NULL
 #' @method effects DistributionModel
 #' @keywords partial
 #' @export
-effects.DistributionModel <- function(x) x$effects()
+effects.DistributionModel <- function(object, ...) object$effects()
diff --git a/R/engine_bart.R b/R/engine_bart.R
index cf94dc97..e035f6e1 100644
--- a/R/engine_bart.R
+++ b/R/engine_bart.R
@@ -1,615 +1,615 @@
-#' @include bdproto-engine.R utils-spatial.R bdproto-distributionmodel.R
-NULL
-
-#' Engine for use of Bayesian Additive Regression Trees (BART)
-#'
-#' @description The Bayesian regression approach to a sum of complementary trees is to shrink
-#' the said fit of each tree through a regularization prior. BART models provide
-#' non-linear highly flexible estimation and have been shown to compare favourable among machine learning
-#' algorithms (Dorie et al. 2019). Default prior preference is for trees to be small (few terminal nodes)
-#' and shrinkage towards \code{0}.
-#'
-#' This package requires the [dbarts] R-package to be installed.
-#' Many of the functionalities of this [engine] have been inspired by the [embarcadero] R-package. Users
-#' are therefore advised to cite if they make heavy use of BART.
-#' @details
-#' Prior distributions can furthermore be set for:
-#' * probability that a tree stops at a node of a given depth (Not yet implemented)
-#' * probability that a given variable is chosen for a splitting rule
-#' * probability of splitting that variable at a particular value (Not yet implemented)
-#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object.
-#' @param iter A [`numeric`] estimate of the number of trees to be used in the sum-of-trees formulation.
-#' @param nburn A [`numeric`] estimate of the burn in samples.
-#' @param chains A number of the number of chains to be used (Default: \code{4}).
-#' @param type The mode used for creating posterior predictions. Either \code{"link"} or \code{"response"} (Default: \code{"response"}).
-#' @param ... Other options.
-#' @references
-#' * Carlson, CJ. embarcadero: Species distribution modelling with Bayesian additive regression trees in r. Methods Ecol Evol. 2020; 11: 850– 858. https://doi.org/10.1111/2041-210X.13389
-#' * Dorie, V., Hill, J., Shalit, U., Scott, M., & Cervone, D. (2019). Automated versus do-it-yourself methods for causal inference: Lessons learned from a data analysis competition.
Statistical Science, 34(1), 43-68. -#' * Vincent Dorie (2020). dbarts: Discrete Bayesian Additive Regression Trees Sampler. R package version 0.9-19. https://CRAN.R-project.org/package=dbarts -#' @importFrom foreach %do% %dopar% -#' @examples -#' \dontrun{ -#' # Add BART as an engine -#' x <- distribution(background) |> engine_bart(iter = 100) -#' } -#' @family engine -#' @name engine_bart -NULL -#' @rdname engine_bart -#' @export - -engine_bart <- function(x, - iter = 1000, - nburn = 250, - chains = 4, - type = "response", - ...) { - - # Check whether dbarts package is available - check_package('dbarts') - if(!("dbarts" %in% loadedNamespaces()) || ('dbarts' %notin% sessionInfo()$otherPkgs) ) { - try({requireNamespace('dbarts');attachNamespace("dbarts")},silent = TRUE) - } - - # assert that arguments are valid - assertthat::assert_that(inherits(x, "BiodiversityDistribution"), - inherits(x$background,'sf'), - is.numeric(nburn), - is.numeric(iter), - is.character(type), - is.numeric(chains) - ) - type <- match.arg(type, choices = c("link", "predictor", "response", "ppd"), several.ok = FALSE) - if(type == "predictor") type <- "link" - if(nburn > iter) nburn <- floor( iter / 4) - - # Create a background raster - if(is.Waiver(x$predictors)){ - # Create from background - template <- raster::raster( - ext = raster::extent(x$background), - crs = raster::projection(x$background), - res = c(diff( (sf::st_bbox(x$background)[c(1,3)]) ) / 100, # Simplified assumption for resolution - diff( (sf::st_bbox(x$background)[c(1,3)]) ) / 100 - ) - ) - } else { - # If predictor existing, use them - template <- emptyraster(x$predictors$get_data() ) - } - - # Burn in the background - template <- raster::rasterize(x$background, template, field = 0) - - # Set up dbarts control with some parameters, rest default - dc <- dbarts::dbartsControl(keepTrees = TRUE, # Keep trees - n.burn = nburn, - n.trees = iter, - n.chains = chains, - n.threads = ifelse( dbarts::guessNumCores() < getOption('ibis.nthread'),dbarts::guessNumCores(),getOption('ibis.nthread')) - ) - # Other parameters - # Set up the parameter list - params <- list( - type = type, - ... - ) - - # Print a message in case there is already an engine object - if(!is.Waiver(x$engine)) myLog('[Setup]','yellow','Replacing currently selected engine.') - - # Set engine in distribution object - x$set_engine( - bdproto( - "BART-Engine", - Engine, - name = "", - data = list( - 'template' = template, - 'dc' = dc, - 'params' = params - ), - # Dummy function for spatial latent effects - calc_latent_spatial = function(self, type = NULL, priors = NULL){ - new_waiver() - }, - # Dummy function for getting the equation of latent effects - get_equation_latent_spatial = function(self, method){ - new_waiver() - }, - # Function to respecify the control parameters - set_control = function(self, - iter = 1000, - nburn = 250, - chains = 4, - cores = dbarts::guessNumCores(), - verbose = TRUE, - ... - ){ - # Set up boosting control - dc <- dbarts::dbartsControl(verbose = verbose, - n.burn = nburn, - n.trees = iter, - n.chains = chains, - n.threads = cores, - ... 
- ) - # Overwrite existing - self$data$dc <- dc - }, - # Setup function - setup = function(self, model, settings = NULL, ...){ - # Simple security checks - assertthat::assert_that( - assertthat::has_name(model, 'background'), - assertthat::has_name(model, 'biodiversity'), - inherits(settings,'Settings') || is.null(settings), - nrow(model$predictors) == ncell(self$get_data('template')), - length(model$biodiversity) == 1 # Only works with single likelihood. To be processed separately - ) - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Engine setup.') - - # Add pseudo-absence points if necessary - if('poipo' == model$biodiversity[[1]]$type && model$biodiversity[[1]]$family == 'poisson') { - # Warning since PPMs are not really performing / correctly set up in bart - if(getOption('ibis.setupmessages')) myLog('[Estimation]','yellow','Engine BART prone to overfit Poisson-distributed occurrence data.\nConsider non-linear xgboost as alternative!') - - # Get background layer - bg <- self$get_data('template') - assertthat::assert_that(!is.na(raster::cellStats(bg, min))) - - # Add pseudo-absence points - presabs <- add_pseudoabsence(df = model$biodiversity[[1]]$observations, - field_occurrence = 'observed', - template = bg, - settings = model$biodiversity[[1]]$pseudoabsence_settings) - if(inherits(presabs, 'sf')) presabs <- presabs %>% sf::st_drop_geometry() - # Sample environmental points for absence only points - abs <- subset(presabs, observed == 0) - # Re-extract environmental information for absence points - envs <- get_rastervalue(coords = abs[,c('x','y')], - env = model$predictors_object$get_data(df = FALSE), - rm.na = FALSE) - if(assertthat::has_name(model$biodiversity[[1]]$predictors, "Intercept")){ envs$Intercept <- 1} - - # Format out - df <- rbind(model$biodiversity[[1]]$predictors[,c('x','y','Intercept', model$biodiversity[[1]]$predictors_names)], - envs[,c('x','y','Intercept', model$biodiversity[[1]]$predictors_names)] ) - any_missing <- which(apply(df, 1, function(x) any(is.na(x)))) - if(length(any_missing)>0){ - presabs <- presabs[-any_missing,] # This works as they are in the same order - model$biodiversity[[1]]$expect <- model$biodiversity[[1]]$expect[-any_missing] - # Fill the absences with 1 as multiplier. This works since absences follow the presences - model$biodiversity[[1]]$expect <- c( model$biodiversity[[1]]$expect, - rep(1, nrow(presabs)-length(model$biodiversity[[1]]$expect) )) - } - df <- subset(df, complete.cases(df)) - assertthat::assert_that(nrow(presabs) == nrow(df)) - - # Overwrite observation data - model$biodiversity[[1]]$observations <- presabs - - # Preprocessing security checks - assertthat::assert_that( all( model$biodiversity[[1]]$observations[['observed']] >= 0 ), - any(!is.na(presabs[['observed']])), - length(model$biodiversity[[1]]$expect)==nrow(model$biodiversity[[1]]$observations), - nrow(df) == nrow(model$biodiversity[[1]]$observations) - ) - - # Add offset if existent - if(!is.Waiver(model$offset)){ - ofs <- get_rastervalue(coords = df[,c('x','y')], - env = model$offset_object, - rm.na = FALSE) - # ofs <- get_ngbvalue(coords = df[,c('x','y')], - # env = model$offset, - # longlat = raster::isLonLat(bg), - # field_space = c('x','y') - # ) - model$biodiversity[[1]]$offset <- ofs - } - - # Define expectation as very small vector following Renner et al. 
- w <- ppm_weights(df = df, - pa = model$biodiversity[[1]]$observations[['observed']], - bg = bg, - weight = 1 # Set those to 1 so that absences become ratio of pres/abs - ) - df$w <- w # Also add as column - - model$biodiversity[[1]]$predictors <- df - model$biodiversity[[1]]$expect <- w * (1/model$biodiversity[[1]]$expect) # Multiply with provided weights - } else { - # If family is not poisson, assume factor distribution for response - assertthat::assert_that( length( unique(model$biodiversity[[1]]$observations[['observed']])) == 2) - # calculating the case weights (equal weights) - # the order of weights should be the same as presences and backgrounds in the training data - prNum <- as.numeric(table(model$biodiversity[[1]]$observations[['observed']])["1"]) # number of presences - bgNum <- as.numeric(table(model$biodiversity[[1]]$observations[['observed']])["0"]) # number of backgrounds - w <- ifelse(model$biodiversity[[1]]$observations[['observed']] == 1, 1, prNum / bgNum) - model$biodiversity[[1]]$expect <- w * model$biodiversity[[1]]$expect # Multiply with provided weights - model$biodiversity[[1]]$observations[['observed']] <- factor(model$biodiversity[[1]]$observations[['observed']]) - } - - # Split up factors as this is done anyway during the fitting! - # Check for factors and split them up - train_cov <- model$biodiversity[[1]]$predictors[, c(model$biodiversity[[1]]$predictors_names)] - # Check if there any factors, if yes split up - if(any(model$biodiversity[[1]]$predictors_types$type=='factor')){ - vf <- model$biodiversity[[1]]$predictors_types$predictors[which(model$biodiversity[[1]]$predictors_types$type == "factor")] - # Get factors - z <- explode_factor(train_cov[[vf]], name = vf) - # Remove variables from train_cov and append - train_cov[[vf]] <- NULL - train_cov <- cbind(train_cov, z) - model$biodiversity[[1]]$predictors <- train_cov # Save new in model object - model$biodiversity[[1]]$predictors_types <- rbind(model$biodiversity[[1]]$predictors_types, data.frame(predictors = colnames(z), type = "numeric")) - - # Also update the formula - model$biodiversity[[1]]$equation <- update.formula(model$biodiversity[[1]]$equation, paste0(". ~ . -", vf)) - model$biodiversity[[1]]$equation <- update.formula(model$biodiversity[[1]]$equation, paste0(". ~ . 
+", paste0(colnames(z),collapse = "+"))) - } - - # Prediction container - pred_cov <- model$predictors[,c('x','y',model$biodiversity[[1]]$predictors_names)] - if(any(model$predictors_types$type=='factor')){ - vf <- model$predictors_types$predictors[which(model$predictors_types$type == "factor")] - # Get factors - z <- explode_factor(pred_cov[[vf]], name = vf) - # Remove variables from train_cov and append - pred_cov[[vf]] <- NULL - pred_cov <- cbind(pred_cov, z) - pred_cov <- pred_cov[,c("x", "y", colnames(train_cov))] - model$predictors <- pred_cov # Save new in model object - model$predictors_types <- rbind(model$predictors_types, data.frame(predictors = colnames(z), type = "numeric")) - model$biodiversity[[1]]$predictors_names <- colnames(train_cov) - model$predictors_names <- colnames(pred_cov) - assertthat::assert_that(all( colnames(train_cov) %in% colnames(pred_cov) )) - } - rm(train_cov, pred_cov) - - # Process and add priors if set - params <- self$get_data("params") - if(!is.Waiver(model$priors)){ - assertthat::assert_that( - all( model$priors$varnames() %in% model$biodiversity[[1]]$predictors_names ) - ) - # Match position of variables with monotonic constrains - mc <- rep(1 / length( model$biodiversity[[1]]$predictors_names ), length( model$biodiversity[[1]]$predictors_names ) ) - names(mc) <- model$biodiversity[[1]]$predictors_names - for(v in model$priors$varnames()){ - mc[v] <- model$priors$get(v) - } - # Save the priors in the model parameters - params[["priors"]] <- mc - } else { params[["priors"]] <- new_waiver() } - self$set_data("params", params) - - # Instead of invisible return the model object - return( model ) - }, - # Training function - train = function(self, model, settings, ...){ - assertthat::assert_that( - inherits(settings,'Settings'), - is.list(model),length(model) > 1, - # Check that model id and setting id are identical - settings$modelid == model$id - ) - # Get name - name <- model$biodiversity[[1]]$name - - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Estimation]','green',paste0( 'Starting fitting: ', name)) - - # Get output raster - prediction <- self$get_data('template') - - # Get dbarts control and params - dc <- self$get_data('dc') - params <- self$get_data('params') - - # All other needed data for model fitting - equation <- model$biodiversity[[1]]$equation - data <- cbind(model$biodiversity[[1]]$predictors, data.frame(observed = model$biodiversity[[1]]$observations[,'observed']) ) - # Subset to predictor names - data <- subset(data, select = c('observed', model$biodiversity[[1]]$predictors_names) ) - if(model$biodiversity[[1]]$family=='binomial') data$observed <- factor(data$observed) - w <- model$biodiversity[[1]]$expect # The expected weight - full <- model$predictors # All predictors - - # Select predictors - full <- subset(full, select = c('x','y', model$biodiversity[[1]]$predictors_names)) - full$cellid <- rownames(full) # Add rownames - full <- subset(full, complete.cases(full)) - - # Clamp? 
- if( settings$get("clamp") ) full <- clamp_predictions(model, full) - - assertthat::assert_that( - is.null(w) || length(w) == nrow(data), - is.formula(equation), - all( model$biodiversity[[1]]$predictors_names %in% names(full) ) - ) - - if(!is.Waiver(model$offset)){ - # Add offset to full prediction and load vector - if(model$biodiversity[[1]]$family == "poisson"){ - # Offsets are only supported for binary dbarts models, but maybe there is an option - if(getOption('ibis.setupmessages')) myLog('[Estimation]','yellow','Offsets are not supported for poisson models. Trying to modify weights.') - w <- w + model$biodiversity[[1]]$offset[,"spatial_offset"] - # Check and correct for issues - if(any(w < 0, na.rm = TRUE)) { - w <- scales::rescale(w, to = c(1e-6, 1)) - } - if(anyNA(w)){ - w[is.na(w)] <- 1e-6 - } - } else if(model$biodiversity[[1]]$family == "binomial"){ - # Set the created ranges and binaryOffset - off <- model$biodiversity[[1]]$offset[,"spatial_offset"] - } - } else { off = 0.0 } - - # Specify splitprobs depending on whether priors have been set - if( !is.Waiver(params[["priors"]]) ){ - splitprobs = params[["priors"]] - } else { splitprobs = NULL } - - # --- # - # Parameter tuning # - if(settings$get('varsel') == "reg"){ - if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Starting hyperparameters search.') - - cv_bart <- dbarts::xbart( - formula = equation, data = data, - n.samples = round( nrow(data) * 0.1 ), # Draw posterior samples (10% of dataset) - n.test = 5, # Number of folds - method = "k-fold", - n.reps = 4L, # For replications - control = dc, - offset = off, - loss = ifelse(is.factor(data$observed), "log", "rmse"), - n.trees = dc@n.trees, - k = c(1, 2, 4), # Prior for node-mean SD - power = c(1.5, 2), # Prior growth probability - base = c(0.75, 0.8, 0.95), # Tree growth probability - drop = TRUE, # Drop those with only one record - n.threads = dc@n.threads, - verbose = settings$get('verbose') - ) - # An array of dimensions n.reps * length(n.trees) * length(k) * length(power) * length(base) - # Convert to data.frame - cv_bart <- as.data.frame.table(cv_bart) - best <- which.min(cv_bart$Freq) # Get the setting with lowest loss/error - k <- as.numeric( as.character(cv_bart$k[best]) ) - power <- as.numeric( as.character(cv_bart$power[best]) ) - base <- as.numeric( as.character(cv_bart$base[best]) ) - - } else { - # Pick default hyperparameters for bart - # k = 2 implies that the maximum and minimum are each approximately - # 2 standard deviations from the mean, or ≈ 95 percent prior probability - # in the interval (ymin, ymax) when Y is continuous and symmetrically distributed. - # Source: https://journals.sagepub.com/doi/10.1177/2378023119825886 - k <- 2.0; power = 2.0 ; base = 0.95 - } - # --- # - # Fit the model. Little hack to work correctly with binomials... 
- if(is.factor(data$observed)){ - fit_bart <- dbarts::bart(y.train = data[,'observed'], - x.train = data[,model$biodiversity[[1]]$predictors_names], - # To make partial plots faster - keeptrees = dc@keepTrees, - keepevery = 10, - # weights = w, - binaryOffset = off, - # Hyper parameters - k = k, power = power, base = base, - splitprobs = splitprobs, - ntree = dc@n.trees, - nthread = dc@n.threads, - nchain = dc@n.chains, - nskip = dc@n.burn, - verbose = settings$get('verbose') - ) - } else { - fit_bart <- dbarts::bart(y.train = data[,'observed'], - x.train = data[,model$biodiversity[[1]]$predictors_names], - # To make partial plots faster - keeptrees = dc@keepTrees, - keepevery = 10, - weights = w, - ntree = dc@n.trees, - # Hyper parameters - k = k, power = power, base = base, - splitprobs = splitprobs, - nthread = dc@n.threads, - nchain = dc@n.chains, - nskip = dc@n.burn, - verbose = settings$get('verbose') - ) - } - - # Predict spatially - if(!settings$get('inference_only')){ - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Starting prediction...') - - # Set target variables to bias_value for prediction if specified - if(!is.Waiver(settings$get('bias_variable'))){ - for(i in 1:length(settings$get('bias_variable'))){ - if(settings$get('bias_variable')[i] %notin% names(full)){ - if(getOption('ibis.setupmessages')) myLog('[Estimation]','red','Did not find bias variable in prediction object!') - next() - } - full[[settings$get('bias_variable')[i]]] <- settings$get('bias_value')[i] - } - } - check_package("foreach") - params <- self$get_data("params") - - full$rowid <- 1:nrow(full) - - # Tile the problem - splits <- cut(1:nrow(full), nrow(full) / min(nrow(full) / 4, 5000) ) - - # Get offset if existing - if(is.Waiver(model$offset)) of <- NULL else of <- scales::rescale(model$offset[full$cellid, "spatial_offset"], to = c(1e-6, 1)) - - # Make a prediction - ms <- foreach::foreach(s = unique(splits), - .inorder = TRUE, - .combine = rbind, - .errorhandling = "stop", - .multicombine = TRUE, - .export = c("splits", "fit_bart", "full", "model", "params", "of"), - .packages = c("dbarts", "matrixStats")) %do% { - i <- which(splits == s) - - pred_bart <- dbarts:::predict.bart(object = fit_bart, - newdata = full[i, model$biodiversity[[1]]$predictors_names], - type = params$type, - offset = of[i] - ) - # Summarize quantiles and sd from posterior - ms <- as.data.frame( - cbind( apply(pred_bart, 2, function(x) mean(x, na.rm = TRUE)), - matrixStats::colSds(pred_bart), - matrixStats::colQuantiles(pred_bart, probs = c(.05,.5,.95)), - apply(pred_bart, 2, mode) - ) - ) - names(ms) <- c("mean","sd", "q05", "q50", "q95", "mode") - ms$cv <- ms$sd / ms$mean - rm(pred_bart) - return( ms ) - } # End of processing - assertthat::assert_that(nrow(ms)>0, - nrow(ms) == nrow(full)) - - # Add them through a loop since the cellid changed - prediction <- raster::stack() - for(post in names(ms)){ - prediction2 <- self$get_data('template') - prediction2[as.numeric(full$cellid)] <- ms[[post]]; names(prediction2) <- post - prediction <- raster::addLayer(prediction, prediction2) - rm(prediction2) - } - # plot(prediction$mean, col = ibis_colours$sdm_colour) - try({rm(ms, full)},silent = TRUE) - } else { - # No prediction done - prediction <- NULL - } - # Compute end of computation time - settings$set('end.time', Sys.time()) - # Also append boosting control option to settings - for(entry in slotNames(dc)) settings$set(entry, slot(dc,entry)) - for(entry in names(params)) settings$set(entry, 
params[[entry]]) - # Create output - out <- bdproto( - "BART-Model", - DistributionModel, - id = model$id, - model = model, - settings = settings, - fits = list( - "fit_best" = fit_bart, - "params" = params, - "fit_best_equation" = equation, - "prediction" = prediction - ), - # Partial effects - partial = function(self, x.var = NULL, constant = NULL, variable_length = 100, values = NULL, plot = FALSE, type = NULL, ...){ - model <- self$get_data('fit_best') - assertthat::assert_that(x.var %in% attr(model$fit$data@x,'term.labels') || is.null(x.var), - msg = 'Variable not in predicted model' ) - bart_partial_effect(model, x.vars = x.var, - transform = self$settings$data$binary, values = values, ... ) - }, - # Spatial partial dependence plot option from embercardo - spartial = function(self, predictors, x.var = NULL, equal = FALSE, smooth = 1, transform = TRUE, type = NULL){ - model <- self$get_data('fit_best') - assertthat::assert_that(x.var %in% attr(model$fit$data@x,'term.labels'), - msg = 'Variable not in predicted model' ) - - if( self$model$biodiversity[[1]]$family != 'binomial' && transform) warning('Check whether transform should not be set to False!') - - # Calculate - p <- bart_partial_space(model, predictors, x.var, equal, smooth, transform) - - raster::plot(p, col = ibis_colours$viridis_plasma, main = paste0(x.var, collapse ='|')) - # Also return spatial - return(p) - }, - get_coefficients = function(self){ - # Returns a vector of the coefficients with direction/importance - cofs <- self$summary() - cofs$Sigma <- NA - names(cofs) <- c("Feature", "Weights") - return(cofs) - }, - # Engine-specific projection function - project = function(self, newdata, type = "response", layer = 'mean'){ - assertthat::assert_that(!missing(newdata), - is.data.frame(newdata)) - - # get model data - model <- self$model - - # Define rowids as those with no missing data - rownames(newdata) <- 1:nrow(newdata) - newdata$rowid <- as.numeric( rownames(newdata) ) - newdata <- subset(newdata, complete.cases(newdata)) - - # Also get settings for bias values - settings <- self$settings - - # Clamp? 
-        if( settings$get("clamp") ) newdata <- clamp_predictions(model, newdata)
-
-        if(!is.Waiver(settings$get('bias_variable'))){
-          for(i in 1:length(settings$get('bias_variable'))){
-            if(settings$get('bias_variable')[i] %notin% names(newdata)){
-              if(getOption('ibis.setupmessages')) myLog('[Estimation]','red','Did not find bias variable in prediction object!')
-              next()
-            }
-            newdata[[settings$get('bias_variable')[i]]] <- settings$get('bias_value')[i]
-          }
-        }
-
-        # Make a prediction
-        suppressWarnings(
-          pred_bart <- dbarts:::predict.bart(object = self$get_data('fit_best'),
-                                             newdata = newdata,
-                                             type = type) |> t()
-        )
-        assertthat::assert_that(nrow(pred_bart) == nrow(newdata))
-        # Fill output with summaries of the posterior
-        prediction <- emptyraster( model$predictors_object$get_data()[[1]] ) # Background
-        if(layer == "mean"){
-          prediction[newdata$rowid] <- matrixStats::rowMeans2(pred_bart)
-        } else if(layer == "sd"){
-          prediction[newdata$rowid] <- matrixStats::rowSds(pred_bart)
-        } else if(layer == "q05"){
-          prediction[newdata$rowid] <- matrixStats::rowQuantiles(pred_bart, probs = c(.05))
-        } else if(layer == "q50" || layer == "median"){
-          prediction[newdata$rowid] <- matrixStats::rowQuantiles(pred_bart, probs = c(.5))
-        } else if(layer == "q95"){
-          prediction[newdata$rowid] <- matrixStats::rowQuantiles(pred_bart, probs = c(.95))
-        } else if(layer == "mode"){
-          prediction[newdata$rowid] <- apply(pred_bart, 1, mode)
-        } else if(layer == "cv"){
-          prediction[newdata$rowid] <- matrixStats::rowSds(pred_bart) / matrixStats::rowMeans2(pred_bart)
-        } else { message("Custom posterior summary not yet implemented.")}
-        return(prediction)
-      }
-    )
-    return(out)
-    }
-  )
-  ) # End of bdproto object
-} # End of function
+#' @include bdproto-engine.R utils-spatial.R bdproto-distributionmodel.R
+NULL
+
+#' Engine for use of Bayesian Additive Regression Trees (BART)
+#'
+#' @description The Bayesian regression approach to a sum of complementary trees is to shrink
+#' the fit of each tree through a regularization prior. BART models provide
+#' non-linear, highly flexible estimation and have been shown to compare favourably among machine learning
+#' algorithms (Dorie et al. 2019). The default prior preference is for trees to be small (few terminal nodes)
+#' and for shrinkage towards \code{0}.
+#'
+#' This engine requires the [dbarts] R-package to be installed.
+#' Many of the functionalities of this [engine] have been inspired by the [embarcadero] R-package. Users
+#' are therefore advised to cite it if they make heavy use of BART.
+#' @details
+#' Prior distributions can furthermore be set for:
+#' * probability that a tree stops at a node of a given depth (Not yet implemented)
+#' * probability that a given variable is chosen for a splitting rule
+#' * probability of splitting that variable at a particular value (Not yet implemented)
+#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object.
+#' @param iter A [`numeric`] estimate of the number of trees to be used in the sum-of-trees formulation (Default: \code{1000}).
+#' @param nburn A [`numeric`] estimate of the burn-in samples (Default: \code{250}).
+#' @param chains A [`numeric`] giving the number of chains to be used (Default: \code{4}).
+#' @param type The mode used for creating posterior predictions. Either \code{"link"} or \code{"response"} (Default: \code{"response"}).
+#' @param ... Other options.
+#' @references
+#' * Carlson, CJ. embarcadero: Species distribution modelling with Bayesian additive regression trees in r. Methods Ecol Evol.
2020; 11: 850– 858. https://doi.org/10.1111/2041-210X.13389 +#' * Dorie, V., Hill, J., Shalit, U., Scott, M., & Cervone, D. (2019). Automated versus do-it-yourself methods for causal inference: Lessons learned from a data analysis competition. Statistical Science, 34(1), 43-68. +#' * Vincent Dorie (2020). dbarts: Discrete Bayesian Additive Regression Trees Sampler. R package version 0.9-19. https://CRAN.R-project.org/package=dbarts +#' @returns An [engine]. +#' @examples +#' \dontrun{ +#' # Add BART as an engine +#' x <- distribution(background) |> engine_bart(iter = 100) +#' } +#' @family engine +#' @name engine_bart +NULL +#' @rdname engine_bart +#' @export + +engine_bart <- function(x, + iter = 1000, + nburn = 250, + chains = 4, + type = "response", + ...) { + + # Check whether dbarts package is available + check_package('dbarts') + if(!("dbarts" %in% loadedNamespaces()) || ('dbarts' %notin% utils::sessionInfo()$otherPkgs) ) { + try({requireNamespace('dbarts');attachNamespace("dbarts")},silent = TRUE) + } + + # assert that arguments are valid + assertthat::assert_that(inherits(x, "BiodiversityDistribution"), + inherits(x$background,'sf'), + is.numeric(nburn), + is.numeric(iter), + is.character(type), + is.numeric(chains) + ) + type <- match.arg(type, choices = c("link", "predictor", "response", "ppd"), several.ok = FALSE) + if(type == "predictor") type <- "link" + if(nburn > iter) nburn <- floor( iter / 4) + + # Create a background raster + if(is.Waiver(x$predictors)){ + # Create from background + template <- raster::raster( + ext = raster::extent(x$background), + crs = raster::projection(x$background), + res = c(diff( (sf::st_bbox(x$background)[c(1,3)]) ) / 100, # Simplified assumption for resolution + diff( (sf::st_bbox(x$background)[c(1,3)]) ) / 100 + ) + ) + } else { + # If predictor existing, use them + template <- emptyraster(x$predictors$get_data() ) + } + + # Burn in the background + template <- raster::rasterize(x$background, template, field = 0) + + # Set up dbarts control with some parameters, rest default + dc <- dbarts::dbartsControl(keepTrees = TRUE, # Keep trees + n.burn = nburn, + n.trees = iter, + n.chains = chains, + n.threads = ifelse( dbarts::guessNumCores() < getOption('ibis.nthread'),dbarts::guessNumCores(),getOption('ibis.nthread')) + ) + # Other parameters + # Set up the parameter list + params <- list( + type = type, + ... + ) + + # Print a message in case there is already an engine object + if(!is.Waiver(x$engine)) myLog('[Setup]','yellow','Replacing currently selected engine.') + + # Set engine in distribution object + x$set_engine( + bdproto( + "BART-Engine", + Engine, + name = "", + data = list( + 'template' = template, + 'dc' = dc, + 'params' = params + ), + # Dummy function for spatial latent effects + calc_latent_spatial = function(self, type = NULL, priors = NULL){ + new_waiver() + }, + # Dummy function for getting the equation of latent effects + get_equation_latent_spatial = function(self, method){ + new_waiver() + }, + # Function to respecify the control parameters + set_control = function(self, + iter = 1000, + nburn = 250, + chains = 4, + cores = dbarts::guessNumCores(), + verbose = TRUE, + ... + ){ + # Set up boosting control + dc <- dbarts::dbartsControl(verbose = verbose, + n.burn = nburn, + n.trees = iter, + n.chains = chains, + n.threads = cores, + ... 
+        )
+        # Overwrite existing
+        self$data$dc <- dc
+      },
+      # Setup function
+      setup = function(self, model, settings = NULL, ...){
+        # Simple security checks
+        assertthat::assert_that(
+          assertthat::has_name(model, 'background'),
+          assertthat::has_name(model, 'biodiversity'),
+          inherits(settings,'Settings') || is.null(settings),
+          nrow(model$predictors) == ncell(self$get_data('template')),
+          length(model$biodiversity) == 1 # Only works with single likelihood. To be processed separately
+        )
+        # Messenger
+        if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Engine setup.')
+
+        # Add pseudo-absence points if necessary
+        if('poipo' == model$biodiversity[[1]]$type && model$biodiversity[[1]]$family == 'poisson') {
+          # Warn since PPMs do not perform well / are not correctly set up in BART
+          if(getOption('ibis.setupmessages')) myLog('[Estimation]','yellow','Engine BART is prone to overfit Poisson-distributed occurrence data.\nConsider non-linear xgboost as an alternative!')
+
+          # Get background layer
+          bg <- self$get_data('template')
+          assertthat::assert_that(!is.na(raster::cellStats(bg, min)))
+
+          # Add pseudo-absence points
+          presabs <- add_pseudoabsence(df = model$biodiversity[[1]]$observations,
+                                       field_occurrence = 'observed',
+                                       template = bg,
+                                       settings = model$biodiversity[[1]]$pseudoabsence_settings)
+          if(inherits(presabs, 'sf')) presabs <- presabs |> sf::st_drop_geometry()
+          # Sample environmental points for absence only points
+          abs <- subset(presabs, observed == 0)
+          # Re-extract environmental information for absence points
+          envs <- get_rastervalue(coords = abs[,c('x','y')],
+                                  env = model$predictors_object$get_data(df = FALSE),
+                                  rm.na = FALSE)
+          if(assertthat::has_name(model$biodiversity[[1]]$predictors, "Intercept")){ envs$Intercept <- 1}
+
+          # Format out
+          df <- rbind(model$biodiversity[[1]]$predictors[,c('x','y','Intercept', model$biodiversity[[1]]$predictors_names)],
+                      envs[,c('x','y','Intercept', model$biodiversity[[1]]$predictors_names)] )
+          any_missing <- which(apply(df, 1, function(x) any(is.na(x))))
+          if(length(any_missing)>0){
+            presabs <- presabs[-any_missing,] # This works as they are in the same order
+            model$biodiversity[[1]]$expect <- model$biodiversity[[1]]$expect[-any_missing]
+            # Fill the absences with 1 as multiplier. This works since absences follow the presences
+            model$biodiversity[[1]]$expect <- c( model$biodiversity[[1]]$expect,
+                                                 rep(1, nrow(presabs)-length(model$biodiversity[[1]]$expect) ))
+          }
+          df <- subset(df, stats::complete.cases(df))
+          assertthat::assert_that(nrow(presabs) == nrow(df))
+
+          # Overwrite observation data
+          model$biodiversity[[1]]$observations <- presabs
+
+          # Preprocessing security checks
+          assertthat::assert_that( all( model$biodiversity[[1]]$observations[['observed']] >= 0 ),
+                                   any(!is.na(presabs[['observed']])),
+                                   length(model$biodiversity[[1]]$expect)==nrow(model$biodiversity[[1]]$observations),
+                                   nrow(df) == nrow(model$biodiversity[[1]]$observations)
+          )
+
+          # Add offset if existent
+          if(!is.Waiver(model$offset)){
+            ofs <- get_rastervalue(coords = df[,c('x','y')],
+                                   env = model$offset_object,
+                                   rm.na = FALSE)
+            # ofs <- get_ngbvalue(coords = df[,c('x','y')],
+            #                     env = model$offset,
+            #                     longlat = raster::isLonLat(bg),
+            #                     field_space = c('x','y')
+            #                     )
+            model$biodiversity[[1]]$offset <- ofs
+          }
+
+          # Define the expectation/weight vector following Renner et al.
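+          # Background note (sketch): ppm_weights() is expected to assign
+          # quadrature weights in the spirit of Renner et al. (2015), so that a
+          # weighted Poisson regression over presences plus pseudo-absences
+          # approximates the point-process likelihood over the background;
+          # roughly, background points carry weights proportional to the area
+          # they represent while presence points receive very small weights.
+          # The exact construction is an implementation detail of ppm_weights().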
+ w <- ppm_weights(df = df, + pa = model$biodiversity[[1]]$observations[['observed']], + bg = bg, + weight = 1 # Set those to 1 so that absences become ratio of pres/abs + ) + df$w <- w # Also add as column + + model$biodiversity[[1]]$predictors <- df + model$biodiversity[[1]]$expect <- w * (1/model$biodiversity[[1]]$expect) # Multiply with provided weights + } else { + # If family is not poisson, assume factor distribution for response + assertthat::assert_that( length( unique(model$biodiversity[[1]]$observations[['observed']])) == 2) + # calculating the case weights (equal weights) + # the order of weights should be the same as presences and backgrounds in the training data + prNum <- as.numeric(table(model$biodiversity[[1]]$observations[['observed']])["1"]) # number of presences + bgNum <- as.numeric(table(model$biodiversity[[1]]$observations[['observed']])["0"]) # number of backgrounds + w <- ifelse(model$biodiversity[[1]]$observations[['observed']] == 1, 1, prNum / bgNum) + model$biodiversity[[1]]$expect <- w * model$biodiversity[[1]]$expect # Multiply with provided weights + model$biodiversity[[1]]$observations[['observed']] <- factor(model$biodiversity[[1]]$observations[['observed']]) + } + + # Split up factors as this is done anyway during the fitting! + # Check for factors and split them up + train_cov <- model$biodiversity[[1]]$predictors[, c(model$biodiversity[[1]]$predictors_names)] + # Check if there any factors, if yes split up + if(any(model$biodiversity[[1]]$predictors_types$type=='factor')){ + vf <- model$biodiversity[[1]]$predictors_types$predictors[which(model$biodiversity[[1]]$predictors_types$type == "factor")] + # Get factors + z <- explode_factor(train_cov[[vf]], name = vf) + # Remove variables from train_cov and append + train_cov[[vf]] <- NULL + train_cov <- cbind(train_cov, z) + model$biodiversity[[1]]$predictors <- train_cov # Save new in model object + model$biodiversity[[1]]$predictors_types <- rbind(model$biodiversity[[1]]$predictors_types, data.frame(predictors = colnames(z), type = "numeric")) + + # Also update the formula + model$biodiversity[[1]]$equation <- stats::update.formula(model$biodiversity[[1]]$equation, paste0(". ~ . -", vf)) + model$biodiversity[[1]]$equation <- stats::update.formula(model$biodiversity[[1]]$equation, paste0(". ~ . 
+", paste0(colnames(z),collapse = "+"))) + } + + # Prediction container + pred_cov <- model$predictors[,c('x','y',model$biodiversity[[1]]$predictors_names)] + if(any(model$predictors_types$type=='factor')){ + vf <- model$predictors_types$predictors[which(model$predictors_types$type == "factor")] + # Get factors + z <- explode_factor(pred_cov[[vf]], name = vf) + # Remove variables from train_cov and append + pred_cov[[vf]] <- NULL + pred_cov <- cbind(pred_cov, z) + pred_cov <- pred_cov[,c("x", "y", colnames(train_cov))] + model$predictors <- pred_cov # Save new in model object + model$predictors_types <- rbind(model$predictors_types, data.frame(predictors = colnames(z), type = "numeric")) + model$biodiversity[[1]]$predictors_names <- colnames(train_cov) + model$predictors_names <- colnames(pred_cov) + assertthat::assert_that(all( colnames(train_cov) %in% colnames(pred_cov) )) + } + rm(train_cov, pred_cov) + + # Process and add priors if set + params <- self$get_data("params") + if(!is.Waiver(model$priors)){ + assertthat::assert_that( + all( model$priors$varnames() %in% model$biodiversity[[1]]$predictors_names ) + ) + # Match position of variables with monotonic constrains + mc <- rep(1 / length( model$biodiversity[[1]]$predictors_names ), length( model$biodiversity[[1]]$predictors_names ) ) + names(mc) <- model$biodiversity[[1]]$predictors_names + for(v in model$priors$varnames()){ + mc[v] <- model$priors$get(v) + } + # Save the priors in the model parameters + params[["priors"]] <- mc + } else { params[["priors"]] <- new_waiver() } + self$set_data("params", params) + + # Instead of invisible return the model object + return( model ) + }, + # Training function + train = function(self, model, settings, ...){ + assertthat::assert_that( + inherits(settings,'Settings'), + is.list(model),length(model) > 1, + # Check that model id and setting id are identical + settings$modelid == model$id + ) + # Get name + name <- model$biodiversity[[1]]$name + + # Messenger + if(getOption('ibis.setupmessages')) myLog('[Estimation]','green',paste0( 'Starting fitting: ', name)) + + # Get output raster + prediction <- self$get_data('template') + + # Get dbarts control and params + dc <- self$get_data('dc') + params <- self$get_data('params') + + # All other needed data for model fitting + equation <- model$biodiversity[[1]]$equation + data <- cbind(model$biodiversity[[1]]$predictors, data.frame(observed = model$biodiversity[[1]]$observations[,'observed']) ) + # Subset to predictor names + data <- subset(data, select = c('observed', model$biodiversity[[1]]$predictors_names) ) + if(model$biodiversity[[1]]$family=='binomial') data$observed <- factor(data$observed) + w <- model$biodiversity[[1]]$expect # The expected weight + full <- model$predictors # All predictors + + # Select predictors + full <- subset(full, select = c('x','y', model$biodiversity[[1]]$predictors_names)) + full$cellid <- rownames(full) # Add rownames + full <- subset(full, stats::complete.cases(full)) + + # Clamp? 
+ if( settings$get("clamp") ) full <- clamp_predictions(model, full) + + assertthat::assert_that( + is.null(w) || length(w) == nrow(data), + is.formula(equation), + all( model$biodiversity[[1]]$predictors_names %in% names(full) ) + ) + + if(!is.Waiver(model$offset)){ + # Add offset to full prediction and load vector + if(model$biodiversity[[1]]$family == "poisson"){ + # Offsets are only supported for binary dbarts models, but maybe there is an option + if(getOption('ibis.setupmessages')) myLog('[Estimation]','yellow','Offsets are not supported for poisson models. Trying to modify weights.') + w <- w + model$biodiversity[[1]]$offset[,"spatial_offset"] + # Check and correct for issues + if(any(w < 0, na.rm = TRUE)) { + w <- scales::rescale(w, to = c(1e-6, 1)) + } + if(anyNA(w)){ + w[is.na(w)] <- 1e-6 + } + } else if(model$biodiversity[[1]]$family == "binomial"){ + # Set the created ranges and binaryOffset + off <- model$biodiversity[[1]]$offset[,"spatial_offset"] + } + } else { off = 0.0 } + + # Specify splitprobs depending on whether priors have been set + if( !is.Waiver(params[["priors"]]) ){ + splitprobs = params[["priors"]] + } else { splitprobs = NULL } + + # --- # + # Parameter tuning # + if(settings$get('optim_hyperparam')){ + if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Starting hyperparameters search.') + + cv_bart <- dbarts::xbart( + formula = equation, data = data, + n.samples = round( nrow(data) * 0.1 ), # Draw posterior samples (10% of dataset) + n.test = 5, # Number of folds + method = "k-fold", + n.reps = 4L, # For replications + control = dc, + offset = off, + loss = ifelse(is.factor(data$observed), "log", "rmse"), + n.trees = dc@n.trees, + k = c(1, 2, 4), # Prior for node-mean SD + power = c(1.5, 2), # Prior growth probability + base = c(0.75, 0.8, 0.95), # Tree growth probability + drop = TRUE, # Drop those with only one record + n.threads = dc@n.threads, + verbose = settings$get('verbose') + ) + # An array of dimensions n.reps * length(n.trees) * length(k) * length(power) * length(base) + # Convert to data.frame + cv_bart <- as.data.frame.table(cv_bart) + best <- which.min(cv_bart$Freq) # Get the setting with lowest loss/error + k <- as.numeric( as.character(cv_bart$k[best]) ) + power <- as.numeric( as.character(cv_bart$power[best]) ) + base <- as.numeric( as.character(cv_bart$base[best]) ) + + } else { + # Pick default hyperparameters for bart + # k = 2 implies that the maximum and minimum are each approximately + # 2 standard deviations from the mean, or ≈ 95 percent prior probability + # in the interval (ymin, ymax) when Y is continuous and symmetrically distributed. + # Source: https://journals.sagepub.com/doi/10.1177/2378023119825886 + k <- 2.0; power = 2.0 ; base = 0.95 + } + # --- # + # Fit the model. Little hack to work correctly with binomials... 
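+          # Note on the hack (assumption, not verified against dbarts internals):
+          # passing a factor response makes dbarts::bart() fit a binary probit
+          # model, for which case weights appear unsupported; hence `weights` is
+          # only supplied in the continuous branch below and the offset enters
+          # via `binaryOffset` instead.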
+ if(is.factor(data$observed)){ + fit_bart <- dbarts::bart(y.train = data[,'observed'], + x.train = data[,model$biodiversity[[1]]$predictors_names], + # To make partial plots faster + keeptrees = dc@keepTrees, + keepevery = 10, + # weights = w, + binaryOffset = off, + # Hyper parameters + k = k, power = power, base = base, + splitprobs = splitprobs, + ntree = dc@n.trees, + nthread = dc@n.threads, + nchain = dc@n.chains, + nskip = dc@n.burn, + verbose = settings$get('verbose') + ) + } else { + fit_bart <- dbarts::bart(y.train = data[,'observed'], + x.train = data[,model$biodiversity[[1]]$predictors_names], + # To make partial plots faster + keeptrees = dc@keepTrees, + keepevery = 10, + weights = w, + ntree = dc@n.trees, + # Hyper parameters + k = k, power = power, base = base, + splitprobs = splitprobs, + nthread = dc@n.threads, + nchain = dc@n.chains, + nskip = dc@n.burn, + verbose = settings$get('verbose') + ) + } + + # Predict spatially + if(!settings$get('inference_only')){ + # Messenger + if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Starting prediction...') + + # Set target variables to bias_value for prediction if specified + if(!is.Waiver(settings$get('bias_variable'))){ + for(i in 1:length(settings$get('bias_variable'))){ + if(settings$get('bias_variable')[i] %notin% names(full)){ + if(getOption('ibis.setupmessages')) myLog('[Estimation]','red','Did not find bias variable in prediction object!') + next() + } + full[[settings$get('bias_variable')[i]]] <- settings$get('bias_value')[i] + } + } + check_package("foreach") + params <- self$get_data("params") + + full$rowid <- 1:nrow(full) + + # Tile the problem + splits <- cut(1:nrow(full), nrow(full) / min(nrow(full) / 4, 5000) ) + + # Get offset if existing + if(is.Waiver(model$offset)) of <- NULL else of <- scales::rescale(model$offset[full$cellid, "spatial_offset"], to = c(1e-6, 1)) + + # Make a prediction + ms <- foreach::foreach(s = unique(splits), + .inorder = TRUE, + .combine = rbind, + .errorhandling = "stop", + .multicombine = TRUE, + .export = c("splits", "fit_bart", "full", "model", "params", "of"), + .packages = c("dbarts", "matrixStats")) %do% { + i <- which(splits == s) + + pred_bart <- dbarts:::predict.bart(object = fit_bart, + newdata = full[i, model$biodiversity[[1]]$predictors_names], + type = params$type, + offset = of[i] + ) + # Summarize quantiles and sd from posterior + ms <- as.data.frame( + cbind( apply(pred_bart, 2, function(x) mean(x, na.rm = TRUE)), + matrixStats::colSds(pred_bart), + matrixStats::colQuantiles(pred_bart, probs = c(.05,.5,.95)), + apply(pred_bart, 2, mode) + ) + ) + names(ms) <- c("mean","sd", "q05", "q50", "q95", "mode") + ms$cv <- ms$sd / ms$mean + rm(pred_bart) + return( ms ) + } # End of processing + assertthat::assert_that(nrow(ms)>0, + nrow(ms) == nrow(full)) + + # Add them through a loop since the cellid changed + prediction <- raster::stack() + for(post in names(ms)){ + prediction2 <- self$get_data('template') + prediction2[as.numeric(full$cellid)] <- ms[[post]]; names(prediction2) <- post + prediction <- raster::addLayer(prediction, prediction2) + rm(prediction2) + } + # plot(prediction$mean, col = ibis_colours$sdm_colour) + try({rm(ms, full)},silent = TRUE) + } else { + # No prediction done + prediction <- NULL + } + # Compute end of computation time + settings$set('end.time', Sys.time()) + # Also append boosting control option to settings + for(entry in methods::slotNames(dc)) settings$set(entry, methods::slot(dc,entry)) + for(entry in names(params)) 
settings$set(entry, params[[entry]]) + # Create output + out <- bdproto( + "BART-Model", + DistributionModel, + id = model$id, + model = model, + settings = settings, + fits = list( + "fit_best" = fit_bart, + "params" = params, + "fit_best_equation" = equation, + "prediction" = prediction + ), + # Partial effects + partial = function(self, x.var = NULL, constant = NULL, variable_length = 100, values = NULL, plot = FALSE, type = NULL, ...){ + model <- self$get_data('fit_best') + assertthat::assert_that(x.var %in% attr(model$fit$data@x,'term.labels') || is.null(x.var), + msg = 'Variable not in predicted model' ) + bart_partial_effect(model, x.vars = x.var, + transform = self$settings$data$binary, values = values, ... ) + }, + # Spatial partial dependence plot option from embercardo + spartial = function(self, predictors, x.var = NULL, equal = FALSE, smooth = 1, transform = TRUE, type = NULL){ + model <- self$get_data('fit_best') + assertthat::assert_that(x.var %in% attr(model$fit$data@x,'term.labels'), + msg = 'Variable not in predicted model' ) + + if( self$model$biodiversity[[1]]$family != 'binomial' && transform) warning('Check whether transform should not be set to False!') + + # Calculate + p <- bart_partial_space(model, predictors, x.var, equal, smooth, transform) + + raster::plot(p, col = ibis_colours$viridis_plasma, main = paste0(x.var, collapse ='|')) + # Also return spatial + return(p) + }, + get_coefficients = function(self){ + # Returns a vector of the coefficients with direction/importance + cofs <- self$summary() + cofs$Sigma <- NA + names(cofs) <- c("Feature", "Weights") + return(cofs) + }, + # Engine-specific projection function + project = function(self, newdata, type = "response", layer = 'mean'){ + assertthat::assert_that(!missing(newdata), + is.data.frame(newdata)) + + # get model data + model <- self$model + + # Define rowids as those with no missing data + rownames(newdata) <- 1:nrow(newdata) + newdata$rowid <- as.numeric( rownames(newdata) ) + newdata <- subset(newdata, stats::complete.cases(newdata)) + + # Also get settings for bias values + settings <- self$settings + + # Clamp? 
+ if( settings$get("clamp") ) newdata <- clamp_predictions(model, newdata) + + if(!is.Waiver(settings$get('bias_variable'))){ + for(i in 1:length(settings$get('bias_variable'))){ + if(settings$get('bias_variable')[i] %notin% names(newdata)){ + if(getOption('ibis.setupmessages')) myLog('[Estimation]','red','Did not find bias variable in prediction object!') + next() + } + newdata[[settings$get('bias_variable')[i]]] <- settings$get('bias_value')[i] + } + } + + # Make a prediction + suppressWarnings( + pred_bart <- dbarts:::predict.bart(object = self$get_data('fit_best'), + newdata = newdata, + type = type) |> t() + ) + assertthat::assert_that(nrow(pred_bart) == nrow(newdata)) + # Fill output with summaries of the posterior + prediction <- emptyraster( model$predictors_object$get_data()[[1]] ) # Background + if(layer == "mean"){ + prediction[newdata$rowid] <- matrixStats::rowMeans2(pred_bart) + } else if(layer == "sd"){ + prediction[newdata$rowid] <- matrixStats::rowSds(pred_bart) + } else if(layer == "q05"){ + prediction[newdata$rowid] <- matrixStats::rowQuantiles(pred_bart, probs = c(.05)) + } else if(layer == "q50" || layer == "median"){ + prediction[newdata$rowid] <- matrixStats::rowQuantiles(pred_bart, probs = c(.5)) + } else if(layer == "q95"){ + prediction[newdata$rowid] <- matrixStats::rowQuantiles(pred_bart, probs = c(.95)) + } else if(layer == "mode"){ + prediction[newdata$rowid] <- apply(pred_bart, 1, mode) + } else if(layer == "cv"){ + prediction[newdata$rowid] <- matrixStats::rowSds(pred_bart) / matrixStats::rowMeans2(pred_bart) + } else { message("Custom posterior summary not yet implemented.")} + return(prediction) + } + ) + return(out) + } + ) + ) # End of bdproto object +} # End of function diff --git a/R/engine_breg.R b/R/engine_breg.R index 3aecd6d8..5466eaba 100644 --- a/R/engine_breg.R +++ b/R/engine_breg.R @@ -1,793 +1,799 @@ -#' @include bdproto-engine.R utils-spatial.R bdproto-distributionmodel.R -NULL - -#' Engine for Bayesian regularized regression models -#' -#' @description -#' Efficient MCMC algorithm for linear regression models that makes use of -#' 'spike-and-slab' priors for some modest regularization on the amount of posterior -#' probability for a subset of the coefficients. -#' @details -#' This engine provides efficient Bayesian predictions through the \pkg{Boom} R-package. However note -#' that not all link and models functions are supported and certain functionalities such as offsets are generally -#' not available. -#' This engines allows the estimation of linear and non-linear effects via the \code{"only_linear"} option -#' specified in [train]. -#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object. -#' @param iter [`numeric`] on the number of MCMC iterations to run (Default: \code{10000}). -#' @param nthread [`numeric`] on the number of CPU-threads to use for data augmentation. -#' @param type The mode used for creating posterior predictions. Either making \code{"link"} or \code{"response"} (Default: \code{"response"}). -#' @param ... Other none specified parameters passed on to the model. -#' @references -#' * Nguyen, K., Le, T., Nguyen, V., Nguyen, T., & Phung, D. (2016, November). Multiple kernel learning with data augmentation. In Asian Conference on Machine Learning (pp. 49-64). PMLR. -#' * Steven L. Scott (2021). BoomSpikeSlab: MCMC for Spike and Slab Regression. R package version 1.2.4. 
https://CRAN.R-project.org/package=BoomSpikeSlab -#' @family engine -#' @name engine_breg -NULL -#' @rdname engine_breg -#' @export - -engine_breg <- function(x, - iter = 10000, - nthread = getOption('ibis.nthread'), - type = "response", - ...) { - - # Check whether xgboost package is available - check_package('BoomSpikeSlab') - if(!("BoomSpikeSlab" %in% loadedNamespaces()) || ('BoomSpikeSlab' %notin% sessionInfo()$otherPkgs) ) { - try({requireNamespace('BoomSpikeSlab');attachNamespace("BoomSpikeSlab")},silent = TRUE) - } - - # assert that arguments are valid - assertthat::assert_that(inherits(x, "BiodiversityDistribution"), - inherits(x$background,'sf'), - is.numeric(iter), - is.character(type), - is.numeric(nthread) - ) - type <- match.arg(type, choices = c("predictor","link", "response"),several.ok = FALSE) - if(type=="predictor") type <- "link" # Convenience conversion - - # Create a background raster - if(is.Waiver(x$predictors)){ - # Create from background - template <- raster::raster( - ext = raster::extent(x$background), - crs = raster::projection(x$background), - res = c(diff( (sf::st_bbox(x$background)[c(1,3)]) ) / 100, # Simplified assumption for resolution - diff( (sf::st_bbox(x$background)[c(1,3)]) ) / 100 - ) - ) - } else { - # If predictor existing, use them - template <- emptyraster(x$predictors$get_data() ) - } - - # Burn in the background - template <- raster::rasterize(x$background, template, field = 0) - - # Set up the parameter list - params <- list( - iter = iter, - nthread = nthread, - type = type, - ... - ) - - # Print a message in case there is already an engine object - if(!is.Waiver(x$engine)) myLog('[Setup]','yellow','Replacing currently selected engine.') - - # Set engine in distribution object - x$set_engine( - bdproto( - "BREG-Engine", - Engine, - name = "", - data = list( - 'template' = template, - 'params' = params - ), - # Dummy function for spatial latent effects - calc_latent_spatial = function(self, type = NULL, priors = NULL){ - new_waiver() - }, - # Dummy function for getting the equation of latent effects - get_equation_latent_spatial = function(self, method){ - new_waiver() - }, - # Function to respecify the control parameters - set_control = function(self, - params - ){ - assertthat::assert_that(is.list(params)) - # Overwrite existing - self$data$params <- params - invisible() - }, - # Setup function - setup = function(self, model, settings = NULL, ...){ - # Simple security checks - assertthat::assert_that( - assertthat::has_name(model, 'background'), - assertthat::has_name(model, 'biodiversity'), - inherits(settings,'Settings') || is.null(settings), - nrow(model$predictors) == ncell(self$get_data('template')), - !is.Waiver(self$get_data("params")), - length(model$biodiversity) == 1 # Only works with single likelihood. 
To be processed separately - ) - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Engine setup.') - - # Get parameters - params <- self$data$params - settings$set('iter', params$iter) - settings$set('type', params$type) - - # Distribution specific procedure - fam <- model$biodiversity[[1]]$family - - # Check whether regularization parameter is set to none, if yes, raise message - if(settings$get("varsel") != "none"){ - if(getOption('ibis.setupmessages')) myLog('[Estimation]','yellow','Note: Engine_breg always applies regularization.') - } - - # -- # - # Expand predictors if specified in settings - if(settings$get('only_linear') == FALSE){ - if(getOption('ibis.setupmessages')) myLog('[Estimation]','yellow','Non-linear estimation not added to engine. Suggest to create variable derivatives externally.') - } - - # Check if offset present and fam binomial, Raise warning - if(fam == "binomial" && !is.Waiver(model$offset)){ - if(getOption('ibis.setupmessages')) myLog('[Estimation]','red','Binomial models fitted with BREG do not support offsets. Offsets were ignored!') - } - # -- # - - # If a poisson family is used, weight the observations by their exposure - if(fam == "poisson"){ - # Get background layer - bg <- self$get_data("template") - assertthat::assert_that(!is.na(cellStats(bg,min))) - - # Add pseudo-absence points - presabs <- add_pseudoabsence(df = model$biodiversity[[1]]$observations, - field_occurrence = 'observed', - template = bg, - settings = model$biodiversity[[1]]$pseudoabsence_settings) - if(inherits(presabs, 'sf')) presabs <- presabs %>% sf::st_drop_geometry() - # Sample environmental points for absence only points - abs <- subset(presabs, observed == 0) - # Re-extract environmental information for absence points - envs <- get_rastervalue(coords = abs[,c('x','y')], - env = model$predictors_object$get_data(df = FALSE), - rm.na = FALSE) - if(assertthat::has_name(model$biodiversity[[1]]$predictors, "Intercept")){ envs$Intercept <- 1} - - # Format out - df <- rbind(model$biodiversity[[1]]$predictors[,c('x','y','Intercept', model$biodiversity[[1]]$predictors_names)], - envs[,c('x','y','Intercept', model$biodiversity[[1]]$predictors_names)] ) - any_missing <- which(apply(df, 1, function(x) any(is.na(x)))) - if(length(any_missing)>0){ - presabs <- presabs[-any_missing,] # This works as they are in the same order - model$biodiversity[[1]]$expect <- model$biodiversity[[1]]$expect[-any_missing] - # Fill the absences with 1 as multiplier. 
This works since absences follow the presences - model$biodiversity[[1]]$expect <- c( model$biodiversity[[1]]$expect, - rep(1, nrow(presabs)-length(model$biodiversity[[1]]$expect) )) - } - df <- subset(df, complete.cases(df)) - assertthat::assert_that(nrow(presabs) == nrow(df)) - - # Overwrite observation data - model$biodiversity[[1]]$observations <- presabs - - # Preprocessing security checks - assertthat::assert_that( all( model$biodiversity[[1]]$observations[['observed']] >= 0 ), - any(!is.na(presabs[['observed']])), - length(model$biodiversity[[1]]$expect)==nrow(model$biodiversity[[1]]$observations), - nrow(df) == nrow(model$biodiversity[[1]]$observations) - ) - - # Add offset if existent - if(!is.Waiver(model$offset)){ - ofs <- get_rastervalue(coords = df[,c('x','y')], - env = model$offset_object, - rm.na = FALSE) - # Rename to spatial offset - names(ofs)[which(names(ofs)==names(model$offset_object))] <- "spatial_offset" - # ofs <- get_ngbvalue(coords = df[,c('x','y')], - # env = model$offset, - # longlat = raster::isLonLat(bg), - # field_space = c('x','y') - # ) - model$biodiversity[[1]]$offset <- ofs - } - - # Define expectation as very small vector following Renner et al. - w <- ppm_weights(df = df, - pa = model$biodiversity[[1]]$observations[['observed']], - bg = bg, - weight = 1e-6 - ) - assertthat::assert_that(length(w) == nrow(df)) - - model$biodiversity[[1]]$predictors <- df - model$biodiversity[[1]]$expect <- w * (1/model$biodiversity[[1]]$expect) - - # Rasterize observed presences - pres <- raster::rasterize(model$biodiversity[[1]]$observations[,c("x","y")], - bg, fun = 'count', background = 0) - # Get for the full dataset - w_full <- ppm_weights(df = model$predictors, - pa = pres[], - bg = bg, - weight = 1 # Set those to 1 so that absences become ratio of pres/abs - ) - - # Add exposure to full model predictor - model$exposure <- w_full * (1/ unique(model$biodiversity[[1]]$expect)[1]) - - } else if(fam == "binomial"){ - # calculating the case weights (equal weights) - # the order of weights should be the same as presences and backgrounds in the training data - prNum <- as.numeric(table(model$biodiversity[[1]]$observations[['observed']])["1"]) # number of presences - bgNum <- as.numeric(table(model$biodiversity[[1]]$observations[['observed']])["0"]) # number of backgrounds - w <- ifelse(model$biodiversity[[1]]$observations[['observed']] == 1, 1, prNum / bgNum) - model$biodiversity[[1]]$expect <- w * model$biodiversity[[1]]$expect - # Convert to numeric - model$biodiversity[[1]]$observations$observed <- as.numeric( model$biodiversity[[1]]$observations$observed ) - } - - # Check for factors and split them up - train_cov <- model$biodiversity[[1]]$predictors[,model$biodiversity[[1]]$predictors_names] - # Check if there any factors, if yes split up - if(any(model$biodiversity[[1]]$predictors_types$type=='factor')){ - vf <- model$biodiversity[[1]]$predictors_types$predictors[which(model$biodiversity[[1]]$predictors_types$type == "factor")] - # Get factors - z <- explode_factor(train_cov[[vf]], name = vf) - # Remove variables from train_cov and append - train_cov[[vf]] <- NULL - train_cov <- cbind(train_cov, z) - model$biodiversity[[1]]$predictors <- train_cov # Save new in model object - model$biodiversity[[1]]$predictors_types <- rbind(model$biodiversity[[1]]$predictors_types, data.frame(predictors = colnames(z), type = "numeric")) - - # Also update the formula - model$biodiversity[[1]]$equation <- update.formula(model$biodiversity[[1]]$equation, paste0(". ~ . 
-", vf)) - model$biodiversity[[1]]$equation <- update.formula(model$biodiversity[[1]]$equation, paste0(". ~ . +", paste0(colnames(z),collapse = "+"))) - } - - # Prediction container - pred_cov <- model$predictors[,model$biodiversity[[1]]$predictors_names] - if(any(model$predictors_types$type=='factor')){ - vf <- model$predictors_types$predictors[which(model$predictors_types$type == "factor")] - # Get factors - z <- explode_factor(pred_cov[[vf]], name = vf) - # Remove variables from train_cov and append - pred_cov[[vf]] <- NULL - pred_cov <- cbind(pred_cov, z) - pred_cov <- pred_cov[,colnames(train_cov)] - model$predictors <- pred_cov# Save new in model object - model$predictors_types <- rbind(model$predictors_types, data.frame(predictors = colnames(z), type = "numeric")) - model$biodiversity[[1]]$predictors_names <- colnames(train_cov) - model$predictors_names <- colnames(pred_cov) - assertthat::assert_that(all( colnames(train_cov) %in% colnames(pred_cov) )) - } - rm(train_cov, pred_cov) - - # Instead of invisible return the model object - return( model ) - }, - # Training function - train = function(self, model, settings, ...){ - assertthat::assert_that( - inherits(settings,'Settings'), - is.list(model),length(model)>1, - # Check that model id and setting id are identical - settings$modelid == model$id - ) - # Get name - name <- model$biodiversity[[1]]$name - - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Estimation]','green',paste0('Starting fitting: ', name)) - - # Verbosity - verbose <- settings$get("verbose") - - # Set prediction type also for later - settings$set('type', self$get_data("params")$type) - - # seed - seed <- settings$get("seed") - if(is.Waiver(seed)) { seed <- 1337; settings$set('seed', 1337) } - - # Get output raster - prediction <- self$get_data('template') - - # Get parameters control - params <- self$get_data('params') - - # All other needed data for model fitting - fam <- model$biodiversity[[1]]$family - li <- model$biodiversity[[1]]$link - if(!is.null(li)) if(getOption('ibis.setupmessages')) myLog('[Estimation]','red',paste0("Package does not support custom link functions. Ignored!")) - form <- model$biodiversity[[1]]$equation - df <- cbind(model$biodiversity[[1]]$predictors, - data.frame(observed = model$biodiversity[[1]]$observations[,'observed']) - ) - df <- subset(df, select = c(model$biodiversity[[1]]$predictors_names, "observed")) - w <- model$biodiversity[[1]]$expect # The expected exposure - # Get full prediction container - full <- model$predictors - w_full <- model$exposure - - # Priors - if(!is.Waiver(model$priors)){ - # Define a family specific Boom prior - pp <- setup_prior_boom(form = form, - data = df, - priors = model$priors, - family = fam, - exposure = w - ) - } else { pp <- NULL } - - # Get offset and add it to exposure - if(!is.Waiver(model$offset)){ - # Add offset to full prediction and load vector - w <- w + model$biodiversity[[1]]$offset[, 'spatial_offset'] - w_full <- w_full + model$offset[,'spatial_offset'] - # negative exposure does not work, so normalize again to range of 1e-6 to 1 - if(any(w < 0,na.rm = TRUE)) { - check_package('scales') - w <- scales::rescale(w, to = c(1e-6, 1)) - w_full <- scales::rescale(w_full, to = c(1e-6, 1)) - } - if(anyNA(w)){ - w[is.na(w)] <- 1e-6 - w_full[is.na(w_full)] <- 1e-6 - } - } - - # Clamp? 
- if( settings$get("clamp") ) full <- clamp_predictions(model, full) - - assertthat::assert_that( - is.null(w) || length(w) == nrow(df), - all(w >= 0,na.rm = TRUE) # Required for engine_breg - ) - # --- # - # Fit the model depending on the family - if(fam == "poisson"){ - # Fitting poisson model - fit_breg <- BoomSpikeSlab::poisson.spike( - formula = form, - exposure = w, - niter = params$iter, - data = df, - prior = pp, - nthreads = params$nthread, - ping = ifelse( settings$get("verbose"), params$iter / 10 , 0), - seed = seed - ) - } else if(fam == "binomial"){ - fit_breg <- BoomSpikeSlab::logit.spike( - formula = form, - niter = params$iter, - data = df, - prior = pp, - nthreads = params$nthread, - ping = ifelse( settings$get("verbose"), params$iter / 10 , 0), - seed = seed - ) - } else { - if(getOption('ibis.setupmessages')) myLog('[Estimation]','yellow','Non supported family: ', fam) - fit_breg <- BoomSpikeSlab::lm.spike( - formula = form, - niter = params$iter, - data = df, - prior = pp, - nthreads = params$nthread, - ping = ifelse( settings$get("verbose"), params$iter / 10 , 0), - seed = seed - ) - } - # --- # - # Call garbage collector to save memory - invisible(gc()) - - # Predict spatially - if(!settings$get('inference_only')){ - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Starting prediction...') - # Set target variables to bias_value for prediction if specified - if(!is.Waiver(settings$get('bias_variable'))){ - for(i in 1:length(settings$get('bias_variable'))){ - if(settings$get('bias_variable')[i] %notin% names(full)){ - if(getOption('ibis.setupmessages')) myLog('[Estimation]','red','Did not find bias variable in prediction object!') - next() - } - full[[settings$get('bias_variable')[i]]] <- settings$get('bias_value')[i] - } - } - - # Make a prediction, but do in parallel so as to not overuse memory - full$rowid <- 1:nrow(full) - full_sub <- subset(full, complete.cases(full)) - w_full_sub <- w_full[full_sub$rowid] - assertthat::assert_that((nrow(full_sub) == length(w_full_sub)) || is.null(w_full_sub) ) - - # Tile the problem - splits <- cut(1:nrow(full_sub), nrow(full_sub) / (min(100, nrow(full_sub) / 10)) ) - - # Now depending on parallization setting use foreach - if(getOption("ibis.runparallel")){ - # Check that future is registered - if(!foreach:::getDoParRegistered()) ibis_future(cores = getOption("ibis.nthread"), - strategy = getOption("ibis.futurestrategy")) - - # Run the outgoing command - # out <- foreach::foreach(s = unique(splits), - # .combine = rbind, - # .export = c("splits", "fit_breg", "full_sub", - # "w_full_sub", "fam", "params"), - # .packages = c("matrixStats"), - # .multicombine = TRUE, - # .inorder = TRUE, - # verbose = settings$get("verbose") ) %do% { - out <- parallel::mclapply(unique(splits), function(s) { - i <- which(splits == s) - # -> external code in utils-boom - pred_breg <- ibis.iSDM:::predict_boom( - obj = fit_breg, - newdata = full_sub[i,], - w = w_full_sub[i], - fam = fam, - params = params - ) - # Summarize the posterior - preds <- base::as.data.frame( - cbind( - matrixStats::rowMeans2(pred_breg, na.rm = TRUE), - matrixStats::rowSds(pred_breg, na.rm = TRUE), - matrixStats::rowQuantiles(pred_breg, probs = c(.05,.5,.95), na.rm = TRUE), - apply(pred_breg, 1, mode) - ) - ) - names(preds) <- c("mean", "sd", "q05", "q50", "q95", "mode") - preds$cv <- preds$sd / preds$mean - return(preds) - }) - out <- do.call(rbind, out) - } else { - out <- data.frame() - pb <- progress::progress_bar$new(total = 
length(levels(unique(splits))), - format = "Creating model prediction (:spin) [:bar] :percent") - for(s in unique(splits)){ - pb$tick() - i <- which(splits == s) - # -> external code in utils-boom - pred_breg <- predict_boom( - obj = fit_breg, - newdata = full_sub[i,], - w = w_full_sub[i], - fam = fam, - params = params - ) - # Summarize the posterior - preds <- cbind( - matrixStats::rowMeans2(pred_breg, na.rm = TRUE), - matrixStats::rowSds(pred_breg, na.rm = TRUE), - matrixStats::rowQuantiles(pred_breg, probs = c(.05,.5,.95), na.rm = TRUE), - apply(pred_breg, 1, mode) - ) |> as.data.frame() - names(preds) <- c("mean", "sd", "q05", "q50", "q95", "mode") - preds$cv <- preds$sd / preds$mean - out <- rbind(out, preds) - rm(preds, pred_breg) - } - } - assertthat::assert_that(is.data.frame(out), nrow(out)>0, - msg = "Something went wrong withe prediction. Output empty!") - # Fill output with summaries of the posterior - stk <- raster::stack() - for(v in colnames(out)){ - temp <- emptyraster(prediction) - temp[full_sub$rowid] <- out[,v] - names(temp) <- v - stk <- raster::addLayer(stk, temp) - } - prediction <- stk;rm(stk) - prediction <- raster::mask(prediction, self$get_data("template")) - try({rm(out, full, full_sub)},silent = TRUE) - } else { - # No prediction done - prediction <- NULL - } - # Compute end of computation time - settings$set('end.time', Sys.time()) - - # Definition of BREG Model object ---- - # Create output - out <- bdproto( - "BREG-Model", - DistributionModel, - id = model$id, - model = model, - settings = settings, - fits = list( - "fit_best" = fit_breg, - "fit_best_equation" = form, - "prediction" = prediction - ), - # Partial effects - partial = function(self, x.var = NULL, constant = NULL, variable_length = 100, values = NULL, plot = FALSE, type = NULL, ...){ - assertthat::assert_that(is.character(x.var) || is.null(x.var), - is.null(constant) || is.numeric(constant), - is.null(type) || is.character(type), - is.numeric(variable_length) - ) - # Settings - settings <- self$settings - # Set type - if(is.null(type)) type <- self$settings$get("type") - type <- match.arg(type, c("link", "response"), several.ok = FALSE) - settings$set("type", type) - - mod <- self$get_data('fit_best') - model <- self$model - df <- model$biodiversity[[1]]$predictors - df <- subset(df, select = attr(mod$terms, "term.labels")) - w <- model$biodiversity[[1]]$expect # Also get exposure variable - - # Match x.var to argument - if(is.null(x.var)){ - x.var <- colnames(df) - } else { - x.var <- match.arg(x.var, names(df), several.ok = FALSE) - } - - # Calculate range of predictors - if(any(model$predictors_types$type=="factor")){ - rr <- sapply(df[model$predictors_types$predictors[model$predictors_types$type=="numeric"]], - function(x) range(x, na.rm = TRUE)) |> as.data.frame() - } else { - rr <- sapply(df, function(x) range(x, na.rm = TRUE)) |> as.data.frame() - } - - df_partial <- list() - if(!is.null(values)){ assertthat::assert_that(length(values) >= 1) } - - # Add all others as constant - if(is.null(constant)){ - for(n in names(rr)) df_partial[[n]] <- rep( mean(df[[n]], na.rm = TRUE), variable_length ) - } else { - for(n in names(rr)) df_partial[[n]] <- rep( constant, variable_length ) - } - if(!is.null(values)){ - df_partial[[x.var]] <- values - } else { - df_partial[[x.var]] <- seq(rr[1,x.var], rr[2,x.var], length.out = variable_length) - } - - df_partial <- df_partial %>% as.data.frame() - if(any(model$predictors_types$type=="factor")){ - lvl <- 
levels(model$predictors[[model$predictors_types$predictors[model$predictors_types$type=="factor"]]]) - df_partial[model$predictors_types$predictors[model$predictors_types$type=="factor"]] <- - factor(lvl[1], levels = lvl) - } - - # For Integrated model, take the last one - fam <- model$biodiversity[[length(model$biodiversity)]]$family - - pred_breg <- predict_boom( - obj = mod, - newdata = df_partial, - w = unique(w)[2], # The second entry of unique contains the non-observed variables - fam = fam, - params = settings$data # Use the settings as list - ) # Also attach the partial variable - - # Summarize the partial effect - pred_part <- cbind( - matrixStats::rowMeans2(pred_breg, na.rm = TRUE), - matrixStats::rowSds(pred_breg, na.rm = TRUE), - matrixStats::rowQuantiles(pred_breg, probs = c(.05,.5,.95), na.rm = TRUE), - apply(pred_breg, 1, mode) - ) %>% as.data.frame() - names(pred_part) <- c("mean", "sd", "q05", "q50", "q95", "mode") - pred_part$cv <- pred_part$sd / pred_part$mean - # And attach the variable - pred_part <- cbind("partial_effect" = df_partial[[x.var]], pred_part) - - if(plot){ - # Make a plot - g <- ggplot2::ggplot(data = pred_part, ggplot2::aes(x = partial_effect, y = q50, ymin = q05, ymax = q95)) + - ggplot2::theme_classic(base_size = 18) + - ggplot2::geom_ribbon(fill = 'grey90') + - ggplot2::geom_line() + - ggplot2::labs(x = paste0("partial of ",x.var), y = expression(hat(y))) - print(g) - } - # Return the data - return(pred_part) - }, - # Spatial partial dependence plot - spartial = function(self, x.var, constant = NULL, plot = TRUE, type = NULL){ - assertthat::assert_that(is.character(x.var) || is.null(x.var), - "model" %in% names(self), - is.null(constant) || is.numeric(constant), - is.logical(plot), - is.character(type) || is.null(type) - ) - - # Settings - settings <- self$settings - # Set type - if(is.null(type)) type <- self$settings$get("type") - type <- match.arg(type, c("link", "response"), several.ok = FALSE) - settings$set("type", type) - - mod <- self$get_data('fit_best') - model <- self$model - df <- model$biodiversity[[length( model$biodiversity )]]$predictors - df <- subset(df, select = attr(mod$terms, "term.labels")) - w <- model$biodiversity[[1]]$expect # Also get exposure variable - - # Match x.var to argument - if(is.null(x.var)){ - x.var <- colnames(df) - } else { - x.var <- match.arg(x.var, names(df), several.ok = FALSE) - } - - # Make spatial container for prediction - suppressWarnings( - df_partial <- sp::SpatialPointsDataFrame(coords = model$predictors[,c('x', 'y')], - data = model$predictors[, names(model$predictors) %notin% c('x','y')], - proj4string = sp::CRS( sp::proj4string(as(model$background, "Spatial")) ) - ) - ) - df_partial <- as(df_partial, 'SpatialPixelsDataFrame') - - # Add all others as constant - if(is.null(constant)){ - for(n in names(df_partial)) if(n != x.var) df_partial[[n]] <- suppressWarnings( mean(model$predictors[[n]], na.rm = TRUE) ) - } else { - for(n in names(df_partial)) if(n != x.var) df_partial[[n]] <- constant - } - if(any(model$predictors_types$type=="factor")){ - lvl <- levels(model$predictors[[model$predictors_types$predictors[model$predictors_types$type=="factor"]]]) - df_partial[[model$predictors_types$predictors[model$predictors_types$type=="factor"]]] <- - factor(lvl[1], levels = lvl) - # FIXME: Assigning the first level (usually reference) for now. 
But ideally find a way to skip factors from partial predictions - } - - # For Integrated model, take the last one - fam <- model$biodiversity[[length(model$biodiversity)]]$family - - pred_breg <- predict_boom( - obj = mod, - newdata = df_partial@data, - w = unique(w)[2], # The second entry of unique contains the non-observed variables - fam = fam, - params = settings$data # Use the settings as list - ) - - # Summarize the partial effect - pred_part <- cbind( - matrixStats::rowMeans2(pred_breg, na.rm = TRUE), - matrixStats::rowSds(pred_breg, na.rm = TRUE), - matrixStats::rowQuantiles(pred_breg, probs = c(.05,.5,.95), na.rm = TRUE), - apply(pred_breg, 1, mode) - ) %>% as.data.frame() - names(pred_part) <- c("mean", "sd", "q05", "q50", "q95", "mode") - pred_part$cv <- pred_part$sd / pred_part$mean - - # Now create spatial prediction - prediction <- emptyraster( self$get_data('prediction') ) # Background - prediction <- fill_rasters(pred_part, prediction) - - # Do plot and return result - if(plot){ - plot(prediction, col = ibis_colours$viridis_orig) - } - return(prediction) - }, - # Get coefficients from breg - get_coefficients = function(self){ - # Returns a vector of the coefficients with direction/importance - obj <- self$get_data("fit_best") - cofs <- posterior::summarise_draws(obj$beta) - cofs <- subset(cofs, select = c("variable", "mean", "sd")) - names(cofs) <- c("Feature", "Beta", "Sigma") - # Remove intercept(s) - int <- grep("Intercept",cofs$Feature,ignore.case = TRUE) - if(length(int)>0) cofs <- cofs[-int,] - return(cofs) - }, - # Engine-specific projection function - project = function(self, newdata, type = NULL, layer = "mean"){ - assertthat::assert_that("model" %in% names(self), - nrow(newdata) > 0, - all( c("x", "y") %in% names(newdata) ), - is.character(type) || is.null(type) - ) - - # Settings - settings <- self$settings - # Set type - if(is.null(type)) type <- self$settings$get("type") - type <- match.arg(type, c("link", "response"), several.ok = FALSE) - settings$set("type", type) - - mod <- self$get_data('fit_best') - model <- self$model - df <- newdata - df <- subset(df, select = attr(mod$terms, "term.labels")) - - # Clamp? 
- if( settings$get("clamp") ) df <- clamp_predictions(model, df) - - if(!is.Waiver(settings$get('bias_variable'))){ - for(i in 1:length(settings$get('bias_variable'))){ - if(settings$get('bias_variable')[i] %notin% colnames(df)){ - if(getOption('ibis.setupmessages')) myLog('[Estimation]','red','Did not find bias variable in prediction object!') - next() - } - df[,settings$get('bias_variable')[i]] <- settings$get('bias_value')[i] - } - } - - df$rowid <- 1:nrow(df) - df_sub <- subset(df, complete.cases(df)) - w <- model$biodiversity[[1]]$expect # Also get exposure variable - - # For Integrated model, take the last one - fam <- model$biodiversity[[length(model$biodiversity)]]$family - - # Rather predict in steps than for the whole thing - out <- data.frame() - - # Tile the problem - splits <- cut(1:nrow(df_sub), nrow(df_sub) / (min(100, nrow(df_sub) / 10)) ) - - pb <- progress::progress_bar$new(total = length(levels(unique(splits))), - format = "Projecting on new data (:spin) [:bar] :percent") - for(s in unique(splits)){ - pb$tick() - i <- which(splits == s) - # -> external code in utils-boom - pred_breg <- predict_boom( - obj = mod, - newdata = df_sub[i,], - w = unique(w)[2], - fam = fam, - params = settings$data - ) - # Summarize the posterior - preds <- cbind( - matrixStats::rowMeans2(pred_breg, na.rm = TRUE), - matrixStats::rowSds(pred_breg, na.rm = TRUE), - matrixStats::rowQuantiles(pred_breg, probs = c(.05,.5,.95), na.rm = TRUE), - apply(pred_breg, 1, mode) - ) |> as.data.frame() - names(preds) <- c("mean", "sd", "q05", "q50", "q95", "mode") - preds$cv <- preds$sd / preds$mean - out <- rbind(out, preds) - rm(preds, pred_breg) - } - - # Now create spatial prediction - prediction <- emptyraster( self$model$predictors_object$get_data()[[1]] ) # Background - prediction[df_sub$rowid] <- out[,layer] - names(prediction) <- layer - - return(prediction) - } - ) - return(out) - } - ) - ) # End of bdproto object -} # End of function +#' @include bdproto-engine.R utils-spatial.R bdproto-distributionmodel.R +NULL + +#' Engine for Bayesian regularized regression models +#' +#' @description +#' Efficient MCMC algorithm for linear regression models that makes use of +#' 'spike-and-slab' priors for some modest regularization on the amount of posterior +#' probability for a subset of the coefficients. +#' @details +#' This engine provides efficient Bayesian predictions through the \pkg{Boom} R-package. However note +#' that not all link and models functions are supported and certain functionalities such as offsets are generally +#' not available. +#' This engines allows the estimation of linear and non-linear effects via the \code{"only_linear"} option +#' specified in [train]. +#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object. +#' @param iter [`numeric`] on the number of MCMC iterations to run (Default: \code{10000}). +#' @param nthread [`numeric`] on the number of CPU-threads to use for data augmentation. +#' @param type The mode used for creating posterior predictions. Either making \code{"link"} or \code{"response"} (Default: \code{"response"}). +#' @param ... Other none specified parameters passed on to the model. +#' @references +#' * Nguyen, K., Le, T., Nguyen, V., Nguyen, T., & Phung, D. (2016, November). Multiple kernel learning with data augmentation. In Asian Conference on Machine Learning (pp. 49-64). PMLR. +#' * Steven L. Scott (2021). BoomSpikeSlab: MCMC for Spike and Slab Regression. R package version 1.2.4. 
https://CRAN.R-project.org/package=BoomSpikeSlab +#' @family engine +#' @returns An [engine]. +#' @examples +#' \dontrun{ +#' # Add BREG as an engine +#' x <- distribution(background) |> engine_breg(iter = 1000) +#' } +#' @name engine_breg +NULL +#' @rdname engine_breg +#' @export + +engine_breg <- function(x, + iter = 10000, + nthread = getOption('ibis.nthread'), + type = "response", + ...) { + + # Check whether xgboost package is available + check_package('BoomSpikeSlab') + if(!("BoomSpikeSlab" %in% loadedNamespaces()) || ('BoomSpikeSlab' %notin% utils::sessionInfo()$otherPkgs) ) { + try({requireNamespace('BoomSpikeSlab');attachNamespace("BoomSpikeSlab")},silent = TRUE) + } + + # assert that arguments are valid + assertthat::assert_that(inherits(x, "BiodiversityDistribution"), + inherits(x$background,'sf'), + is.numeric(iter), + is.character(type), + is.numeric(nthread) + ) + type <- match.arg(type, choices = c("predictor","link", "response"),several.ok = FALSE) + if(type=="predictor") type <- "link" # Convenience conversion + + # Create a background raster + if(is.Waiver(x$predictors)){ + # Create from background + template <- raster::raster( + ext = raster::extent(x$background), + crs = raster::projection(x$background), + res = c(diff( (sf::st_bbox(x$background)[c(1,3)]) ) / 100, # Simplified assumption for resolution + diff( (sf::st_bbox(x$background)[c(1,3)]) ) / 100 + ) + ) + } else { + # If predictor existing, use them + template <- emptyraster(x$predictors$get_data() ) + } + + # Burn in the background + template <- raster::rasterize(x$background, template, field = 0) + + # Set up the parameter list + params <- list( + iter = iter, + nthread = nthread, + type = type, + ... + ) + + # Print a message in case there is already an engine object + if(!is.Waiver(x$engine)) myLog('[Setup]','yellow','Replacing currently selected engine.') + + # Set engine in distribution object + x$set_engine( + bdproto( + "BREG-Engine", + Engine, + name = "", + data = list( + 'template' = template, + 'params' = params + ), + # Dummy function for spatial latent effects + calc_latent_spatial = function(self, type = NULL, priors = NULL){ + new_waiver() + }, + # Dummy function for getting the equation of latent effects + get_equation_latent_spatial = function(self, method){ + new_waiver() + }, + # Function to respecify the control parameters + set_control = function(self, + params + ){ + assertthat::assert_that(is.list(params)) + # Overwrite existing + self$data$params <- params + invisible() + }, + # Setup function + setup = function(self, model, settings = NULL, ...){ + # Simple security checks + assertthat::assert_that( + assertthat::has_name(model, 'background'), + assertthat::has_name(model, 'biodiversity'), + inherits(settings,'Settings') || is.null(settings), + nrow(model$predictors) == ncell(self$get_data('template')), + !is.Waiver(self$get_data("params")), + length(model$biodiversity) == 1 # Only works with single likelihood. 
To be processed separately
+ )
+ # Messenger
+ if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Engine setup.')
+
+ # Get parameters
+ params <- self$data$params
+ settings$set('iter', params$iter)
+ settings$set('type', params$type)
+
+ # Distribution specific procedure
+ fam <- model$biodiversity[[1]]$family
+
+ # This engine always applies regularization; raise a note if hyperparameter optimisation was requested
+ if(settings$get('optim_hyperparam')){
+ if(getOption('ibis.setupmessages')) myLog('[Estimation]','yellow','Note: Engine_breg always applies regularization.')
+ }
+
+ # -- #
+ # Expand predictors if specified in settings
+ if(settings$get('only_linear') == FALSE){
+ if(getOption('ibis.setupmessages')) myLog('[Estimation]','yellow','Non-linear estimation is not supported by this engine. Consider creating variable derivatives externally.')
+ }
+
+ # If an offset is present with a binomial family, raise a warning as offsets are not supported
+ if(fam == "binomial" && !is.Waiver(model$offset)){
+ if(getOption('ibis.setupmessages')) myLog('[Estimation]','red','Binomial models fitted with BREG do not support offsets. Offsets were ignored!')
+ }
+ # -- #
+
+ # If a poisson family is used, weight the observations by their exposure
+ if(fam == "poisson"){
+ # Get background layer
+ bg <- self$get_data("template")
+ assertthat::assert_that(!is.na(cellStats(bg,min)))
+
+ # Add pseudo-absence points
+ presabs <- add_pseudoabsence(df = model$biodiversity[[1]]$observations,
+ field_occurrence = 'observed',
+ template = bg,
+ settings = model$biodiversity[[1]]$pseudoabsence_settings)
+ if(inherits(presabs, 'sf')) presabs <- presabs |> sf::st_drop_geometry()
+ # Sample environmental points for absence-only points
+ abs <- subset(presabs, observed == 0)
+ # Re-extract environmental information for absence points
+ envs <- get_rastervalue(coords = abs[,c('x','y')],
+ env = model$predictors_object$get_data(df = FALSE),
+ rm.na = FALSE)
+ if(assertthat::has_name(model$biodiversity[[1]]$predictors, "Intercept")){ envs$Intercept <- 1}
+
+ # Format out
+ df <- rbind(model$biodiversity[[1]]$predictors[,c('x','y','Intercept', model$biodiversity[[1]]$predictors_names)],
+ envs[,c('x','y','Intercept', model$biodiversity[[1]]$predictors_names)] )
+ any_missing <- which(apply(df, 1, function(x) any(is.na(x))))
+ if(length(any_missing)>0){
+ presabs <- presabs[-any_missing,] # This works as they are in the same order
+ model$biodiversity[[1]]$expect <- model$biodiversity[[1]]$expect[-any_missing]
+ # Fill the absences with 1 as multiplier.
This works since absences follow the presences + model$biodiversity[[1]]$expect <- c( model$biodiversity[[1]]$expect, + rep(1, nrow(presabs)-length(model$biodiversity[[1]]$expect) )) + } + df <- subset(df, stats::complete.cases(df)) + assertthat::assert_that(nrow(presabs) == nrow(df)) + + # Overwrite observation data + model$biodiversity[[1]]$observations <- presabs + + # Preprocessing security checks + assertthat::assert_that( all( model$biodiversity[[1]]$observations[['observed']] >= 0 ), + any(!is.na(presabs[['observed']])), + length(model$biodiversity[[1]]$expect)==nrow(model$biodiversity[[1]]$observations), + nrow(df) == nrow(model$biodiversity[[1]]$observations) + ) + + # Add offset if existent + if(!is.Waiver(model$offset)){ + ofs <- get_rastervalue(coords = df[,c('x','y')], + env = model$offset_object, + rm.na = FALSE) + # Rename to spatial offset + names(ofs)[which(names(ofs)==names(model$offset_object))] <- "spatial_offset" + # ofs <- get_ngbvalue(coords = df[,c('x','y')], + # env = model$offset, + # longlat = raster::isLonLat(bg), + # field_space = c('x','y') + # ) + model$biodiversity[[1]]$offset <- ofs + } + + # Define expectation as very small vector following Renner et al. + w <- ppm_weights(df = df, + pa = model$biodiversity[[1]]$observations[['observed']], + bg = bg, + weight = 1e-6 + ) + assertthat::assert_that(length(w) == nrow(df)) + + model$biodiversity[[1]]$predictors <- df + model$biodiversity[[1]]$expect <- w * (1/model$biodiversity[[1]]$expect) + + # Rasterize observed presences + pres <- raster::rasterize(model$biodiversity[[1]]$observations[,c("x","y")], + bg, fun = 'count', background = 0) + # Get for the full dataset + w_full <- ppm_weights(df = model$predictors, + pa = pres[], + bg = bg, + weight = 1 # Set those to 1 so that absences become ratio of pres/abs + ) + + # Add exposure to full model predictor + model$exposure <- w_full * (1/ unique(model$biodiversity[[1]]$expect)[1]) + + } else if(fam == "binomial"){ + # calculating the case weights (equal weights) + # the order of weights should be the same as presences and backgrounds in the training data + prNum <- as.numeric(table(model$biodiversity[[1]]$observations[['observed']])["1"]) # number of presences + bgNum <- as.numeric(table(model$biodiversity[[1]]$observations[['observed']])["0"]) # number of backgrounds + w <- ifelse(model$biodiversity[[1]]$observations[['observed']] == 1, 1, prNum / bgNum) + model$biodiversity[[1]]$expect <- w * model$biodiversity[[1]]$expect + # Convert to numeric + model$biodiversity[[1]]$observations$observed <- as.numeric( model$biodiversity[[1]]$observations$observed ) + } + + # Check for factors and split them up + train_cov <- model$biodiversity[[1]]$predictors[,model$biodiversity[[1]]$predictors_names] + # Check if there any factors, if yes split up + if(any(model$biodiversity[[1]]$predictors_types$type=='factor')){ + vf <- model$biodiversity[[1]]$predictors_types$predictors[which(model$biodiversity[[1]]$predictors_types$type == "factor")] + # Get factors + z <- explode_factor(train_cov[[vf]], name = vf) + # Remove variables from train_cov and append + train_cov[[vf]] <- NULL + train_cov <- cbind(train_cov, z) + model$biodiversity[[1]]$predictors <- train_cov # Save new in model object + model$biodiversity[[1]]$predictors_types <- rbind(model$biodiversity[[1]]$predictors_types, data.frame(predictors = colnames(z), type = "numeric")) + + # Also update the formula + model$biodiversity[[1]]$equation <- stats::update.formula(model$biodiversity[[1]]$equation, paste0(". ~ . 
-", vf)) + model$biodiversity[[1]]$equation <- stats::update.formula(model$biodiversity[[1]]$equation, paste0(". ~ . +", paste0(colnames(z),collapse = "+"))) + } + + # Prediction container + pred_cov <- model$predictors[,model$biodiversity[[1]]$predictors_names] + if(any(model$predictors_types$type=='factor')){ + vf <- model$predictors_types$predictors[which(model$predictors_types$type == "factor")] + # Get factors + z <- explode_factor(pred_cov[[vf]], name = vf) + # Remove variables from train_cov and append + pred_cov[[vf]] <- NULL + pred_cov <- cbind(pred_cov, z) + pred_cov <- pred_cov[,colnames(train_cov)] + model$predictors <- pred_cov# Save new in model object + model$predictors_types <- rbind(model$predictors_types, data.frame(predictors = colnames(z), type = "numeric")) + model$biodiversity[[1]]$predictors_names <- colnames(train_cov) + model$predictors_names <- colnames(pred_cov) + assertthat::assert_that(all( colnames(train_cov) %in% colnames(pred_cov) )) + } + rm(train_cov, pred_cov) + + # Instead of invisible return the model object + return( model ) + }, + # Training function + train = function(self, model, settings, ...){ + assertthat::assert_that( + inherits(settings,'Settings'), + is.list(model),length(model)>1, + # Check that model id and setting id are identical + settings$modelid == model$id + ) + # Get name + name <- model$biodiversity[[1]]$name + + # Messenger + if(getOption('ibis.setupmessages')) myLog('[Estimation]','green',paste0('Starting fitting: ', name)) + + # Verbosity + verbose <- settings$get("verbose") + + # Set prediction type also for later + settings$set('type', self$get_data("params")$type) + + # seed + seed <- settings$get("seed") + if(is.Waiver(seed)) { seed <- 1337; settings$set('seed', 1337) } + + # Get output raster + prediction <- self$get_data('template') + + # Get parameters control + params <- self$get_data('params') + + # All other needed data for model fitting + fam <- model$biodiversity[[1]]$family + li <- model$biodiversity[[1]]$link + if(!is.null(li)) if(getOption('ibis.setupmessages')) myLog('[Estimation]','red',paste0("Package does not support custom link functions. Ignored!")) + form <- model$biodiversity[[1]]$equation + df <- cbind(model$biodiversity[[1]]$predictors, + data.frame(observed = model$biodiversity[[1]]$observations[,'observed']) + ) + df <- subset(df, select = c(model$biodiversity[[1]]$predictors_names, "observed")) + w <- model$biodiversity[[1]]$expect # The expected exposure + # Get full prediction container + full <- model$predictors + w_full <- model$exposure + + # Priors + if(!is.Waiver(model$priors)){ + # Define a family specific Boom prior + pp <- setup_prior_boom(form = form, + data = df, + priors = model$priors, + family = fam, + exposure = w + ) + } else { pp <- NULL } + + # Get offset and add it to exposure + if(!is.Waiver(model$offset)){ + # Add offset to full prediction and load vector + w <- w + model$biodiversity[[1]]$offset[, 'spatial_offset'] + w_full <- w_full + model$offset[,'spatial_offset'] + # negative exposure does not work, so normalize again to range of 1e-6 to 1 + if(any(w < 0,na.rm = TRUE)) { + check_package('scales') + w <- scales::rescale(w, to = c(1e-6, 1)) + w_full <- scales::rescale(w_full, to = c(1e-6, 1)) + } + if(anyNA(w)){ + w[is.na(w)] <- 1e-6 + w_full[is.na(w_full)] <- 1e-6 + } + } + + # Clamp? 
+ if( settings$get("clamp") ) full <- clamp_predictions(model, full) + + assertthat::assert_that( + is.null(w) || length(w) == nrow(df), + all(w >= 0,na.rm = TRUE) # Required for engine_breg + ) + # --- # + # Fit the model depending on the family + if(fam == "poisson"){ + # Fitting poisson model + fit_breg <- BoomSpikeSlab::poisson.spike( + formula = form, + exposure = w, + niter = params$iter, + data = df, + prior = pp, + nthreads = params$nthread, + ping = ifelse( settings$get("verbose"), params$iter / 10 , 0), + seed = seed + ) + } else if(fam == "binomial"){ + fit_breg <- BoomSpikeSlab::logit.spike( + formula = form, + niter = params$iter, + data = df, + prior = pp, + nthreads = params$nthread, + ping = ifelse( settings$get("verbose"), params$iter / 10 , 0), + seed = seed + ) + } else { + if(getOption('ibis.setupmessages')) myLog('[Estimation]','yellow','Non supported family: ', fam) + fit_breg <- BoomSpikeSlab::lm.spike( + formula = form, + niter = params$iter, + data = df, + prior = pp, + nthreads = params$nthread, + ping = ifelse( settings$get("verbose"), params$iter / 10 , 0), + seed = seed + ) + } + # --- # + # Call garbage collector to save memory + invisible(gc()) + + # Predict spatially + if(!settings$get('inference_only')){ + # Messenger + if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Starting prediction...') + # Set target variables to bias_value for prediction if specified + if(!is.Waiver(settings$get('bias_variable'))){ + for(i in 1:length(settings$get('bias_variable'))){ + if(settings$get('bias_variable')[i] %notin% names(full)){ + if(getOption('ibis.setupmessages')) myLog('[Estimation]','red','Did not find bias variable in prediction object!') + next() + } + full[[settings$get('bias_variable')[i]]] <- settings$get('bias_value')[i] + } + } + + # Make a prediction, but do in parallel so as to not overuse memory + full$rowid <- 1:nrow(full) + full_sub <- subset(full, stats::complete.cases(full)) + w_full_sub <- w_full[full_sub$rowid] + assertthat::assert_that((nrow(full_sub) == length(w_full_sub)) || is.null(w_full_sub) ) + + # Tile the problem + splits <- cut(1:nrow(full_sub), nrow(full_sub) / (min(100, nrow(full_sub) / 10)) ) + + # Now depending on parallization setting use foreach + if(getOption("ibis.runparallel")){ + # Check that future is registered + if(!foreach::getDoParRegistered()) ibis_future(cores = getOption("ibis.nthread"), + strategy = getOption("ibis.futurestrategy")) + + # Run the outgoing command + # out <- foreach::foreach(s = unique(splits), + # .combine = rbind, + # .export = c("splits", "fit_breg", "full_sub", + # "w_full_sub", "fam", "params"), + # .packages = c("matrixStats"), + # .multicombine = TRUE, + # .inorder = TRUE, + # verbose = settings$get("verbose") ) %do% { + out <- parallel::mclapply(unique(splits), function(s) { + i <- which(splits == s) + # -> external code in utils-boom + pred_breg <- predict_boom( + obj = fit_breg, + newdata = full_sub[i,], + w = w_full_sub[i], + fam = fam, + params = params + ) + # Summarize the posterior + preds <- as.data.frame( + cbind( + matrixStats::rowMeans2(pred_breg, na.rm = TRUE), + matrixStats::rowSds(pred_breg, na.rm = TRUE), + matrixStats::rowQuantiles(pred_breg, probs = c(.05,.5,.95), na.rm = TRUE), + apply(pred_breg, 1, mode) + ) + ) + names(preds) <- c("mean", "sd", "q05", "q50", "q95", "mode") + preds$cv <- preds$sd / preds$mean + return(preds) + }) + out <- do.call(rbind, out) + } else { + out <- data.frame() + pb <- progress::progress_bar$new(total = 
length(levels(unique(splits))), + format = "Creating model prediction (:spin) [:bar] :percent") + for(s in unique(splits)){ + pb$tick() + i <- which(splits == s) + # -> external code in utils-boom + pred_breg <- predict_boom( + obj = fit_breg, + newdata = full_sub[i,], + w = w_full_sub[i], + fam = fam, + params = params + ) + # Summarize the posterior + preds <- cbind( + matrixStats::rowMeans2(pred_breg, na.rm = TRUE), + matrixStats::rowSds(pred_breg, na.rm = TRUE), + matrixStats::rowQuantiles(pred_breg, probs = c(.05,.5,.95), na.rm = TRUE), + apply(pred_breg, 1, mode) + ) |> as.data.frame() + names(preds) <- c("mean", "sd", "q05", "q50", "q95", "mode") + preds$cv <- preds$sd / preds$mean + out <- rbind(out, preds) + rm(preds, pred_breg) + } + } + assertthat::assert_that(is.data.frame(out), nrow(out)>0, + msg = "Something went wrong withe prediction. Output empty!") + # Fill output with summaries of the posterior + stk <- raster::stack() + for(v in colnames(out)){ + temp <- emptyraster(prediction) + temp[full_sub$rowid] <- out[,v] + names(temp) <- v + stk <- raster::addLayer(stk, temp) + } + prediction <- stk;rm(stk) + prediction <- raster::mask(prediction, self$get_data("template")) + try({rm(out, full, full_sub)},silent = TRUE) + } else { + # No prediction done + prediction <- NULL + } + # Compute end of computation time + settings$set('end.time', Sys.time()) + + # Definition of BREG Model object ---- + # Create output + out <- bdproto( + "BREG-Model", + DistributionModel, + id = model$id, + model = model, + settings = settings, + fits = list( + "fit_best" = fit_breg, + "fit_best_equation" = form, + "prediction" = prediction + ), + # Partial effects + partial = function(self, x.var = NULL, constant = NULL, variable_length = 100, values = NULL, plot = FALSE, type = NULL, ...){ + assertthat::assert_that(is.character(x.var) || is.null(x.var), + is.null(constant) || is.numeric(constant), + is.null(type) || is.character(type), + is.numeric(variable_length) + ) + # Settings + settings <- self$settings + # Set type + if(is.null(type)) type <- self$settings$get("type") + type <- match.arg(type, c("link", "response"), several.ok = FALSE) + settings$set("type", type) + + mod <- self$get_data('fit_best') + model <- self$model + df <- model$biodiversity[[1]]$predictors + df <- subset(df, select = attr(mod$terms, "term.labels")) + w <- model$biodiversity[[1]]$expect # Also get exposure variable + + # Match x.var to argument + if(is.null(x.var)){ + x.var <- colnames(df) + } else { + x.var <- match.arg(x.var, names(df), several.ok = FALSE) + } + + # Calculate range of predictors + if(any(model$predictors_types$type=="factor")){ + rr <- sapply(df[model$predictors_types$predictors[model$predictors_types$type=="numeric"]], + function(x) range(x, na.rm = TRUE)) |> as.data.frame() + } else { + rr <- sapply(df, function(x) range(x, na.rm = TRUE)) |> as.data.frame() + } + + df_partial <- list() + if(!is.null(values)){ assertthat::assert_that(length(values) >= 1) } + + # Add all others as constant + if(is.null(constant)){ + for(n in names(rr)) df_partial[[n]] <- rep( mean(df[[n]], na.rm = TRUE), variable_length ) + } else { + for(n in names(rr)) df_partial[[n]] <- rep( constant, variable_length ) + } + if(!is.null(values)){ + df_partial[[x.var]] <- values + } else { + df_partial[[x.var]] <- seq(rr[1,x.var], rr[2,x.var], length.out = variable_length) + } + + df_partial <- df_partial |> as.data.frame() + if(any(model$predictors_types$type=="factor")){ + lvl <- 
levels(model$predictors[[model$predictors_types$predictors[model$predictors_types$type=="factor"]]]) + df_partial[model$predictors_types$predictors[model$predictors_types$type=="factor"]] <- + factor(lvl[1], levels = lvl) + } + + # For Integrated model, take the last one + fam <- model$biodiversity[[length(model$biodiversity)]]$family + + pred_breg <- predict_boom( + obj = mod, + newdata = df_partial, + w = unique(w)[2], # The second entry of unique contains the non-observed variables + fam = fam, + params = settings$data # Use the settings as list + ) # Also attach the partial variable + + # Summarize the partial effect + pred_part <- cbind( + matrixStats::rowMeans2(pred_breg, na.rm = TRUE), + matrixStats::rowSds(pred_breg, na.rm = TRUE), + matrixStats::rowQuantiles(pred_breg, probs = c(.05,.5,.95), na.rm = TRUE), + apply(pred_breg, 1, mode) + ) |> as.data.frame() + names(pred_part) <- c("mean", "sd", "q05", "q50", "q95", "mode") + pred_part$cv <- pred_part$sd / pred_part$mean + # And attach the variable + pred_part <- cbind("partial_effect" = df_partial[[x.var]], pred_part) + + if(plot){ + # Make a plot + g <- ggplot2::ggplot(data = pred_part, ggplot2::aes(x = partial_effect, y = q50, ymin = q05, ymax = q95)) + + ggplot2::theme_classic(base_size = 18) + + ggplot2::geom_ribbon(fill = 'grey90') + + ggplot2::geom_line() + + ggplot2::labs(x = paste0("partial of ",x.var), y = expression(hat(y))) + print(g) + } + # Return the data + return(pred_part) + }, + # Spatial partial dependence plot + spartial = function(self, x.var, constant = NULL, plot = TRUE, type = NULL){ + assertthat::assert_that(is.character(x.var) || is.null(x.var), + "model" %in% names(self), + is.null(constant) || is.numeric(constant), + is.logical(plot), + is.character(type) || is.null(type) + ) + + # Settings + settings <- self$settings + # Set type + if(is.null(type)) type <- self$settings$get("type") + type <- match.arg(type, c("link", "response"), several.ok = FALSE) + settings$set("type", type) + + mod <- self$get_data('fit_best') + model <- self$model + df <- model$biodiversity[[length( model$biodiversity )]]$predictors + df <- subset(df, select = attr(mod$terms, "term.labels")) + w <- model$biodiversity[[1]]$expect # Also get exposure variable + + # Match x.var to argument + if(is.null(x.var)){ + x.var <- colnames(df) + } else { + x.var <- match.arg(x.var, names(df), several.ok = FALSE) + } + + # Make spatial container for prediction + suppressWarnings( + df_partial <- sp::SpatialPointsDataFrame(coords = model$predictors[,c('x', 'y')], + data = model$predictors[, names(model$predictors) %notin% c('x','y')], + proj4string = sp::CRS(sp::proj4string(methods::as(model$background, "Spatial"))) + ) + ) + df_partial <- methods::as(df_partial, 'SpatialPixelsDataFrame') + + # Add all others as constant + if(is.null(constant)){ + for(n in names(df_partial)) if(n != x.var) df_partial[[n]] <- suppressWarnings( mean(model$predictors[[n]], na.rm = TRUE) ) + } else { + for(n in names(df_partial)) if(n != x.var) df_partial[[n]] <- constant + } + if(any(model$predictors_types$type=="factor")){ + lvl <- levels(model$predictors[[model$predictors_types$predictors[model$predictors_types$type=="factor"]]]) + df_partial[[model$predictors_types$predictors[model$predictors_types$type=="factor"]]] <- + factor(lvl[1], levels = lvl) + # FIXME: Assigning the first level (usually reference) for now. 
But ideally find a way to skip factors from partial predictions + } + + # For Integrated model, take the last one + fam <- model$biodiversity[[length(model$biodiversity)]]$family + + pred_breg <- predict_boom( + obj = mod, + newdata = df_partial@data, + w = unique(w)[2], # The second entry of unique contains the non-observed variables + fam = fam, + params = settings$data # Use the settings as list + ) + + # Summarize the partial effect + pred_part <- cbind( + matrixStats::rowMeans2(pred_breg, na.rm = TRUE), + matrixStats::rowSds(pred_breg, na.rm = TRUE), + matrixStats::rowQuantiles(pred_breg, probs = c(.05,.5,.95), na.rm = TRUE), + apply(pred_breg, 1, mode) + ) |> as.data.frame() + names(pred_part) <- c("mean", "sd", "q05", "q50", "q95", "mode") + pred_part$cv <- pred_part$sd / pred_part$mean + + # Now create spatial prediction + prediction <- emptyraster( self$get_data('prediction') ) # Background + prediction <- fill_rasters(pred_part, prediction) + + # Do plot and return result + if(plot){ + plot(prediction, col = ibis_colours$viridis_orig) + } + return(prediction) + }, + # Get coefficients from breg + get_coefficients = function(self){ + # Returns a vector of the coefficients with direction/importance + obj <- self$get_data("fit_best") + cofs <- posterior::summarise_draws(obj$beta) + cofs <- subset(cofs, select = c("variable", "mean", "sd")) + names(cofs) <- c("Feature", "Beta", "Sigma") + # Remove intercept(s) + int <- grep("Intercept",cofs$Feature,ignore.case = TRUE) + if(length(int)>0) cofs <- cofs[-int,] + return(cofs) + }, + # Engine-specific projection function + project = function(self, newdata, type = NULL, layer = "mean"){ + assertthat::assert_that("model" %in% names(self), + nrow(newdata) > 0, + all( c("x", "y") %in% names(newdata) ), + is.character(type) || is.null(type) + ) + + # Settings + settings <- self$settings + # Set type + if(is.null(type)) type <- self$settings$get("type") + type <- match.arg(type, c("link", "response"), several.ok = FALSE) + settings$set("type", type) + + mod <- self$get_data('fit_best') + model <- self$model + df <- newdata + df <- subset(df, select = attr(mod$terms, "term.labels")) + + # Clamp? 
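+ # Clamping again truncates newdata covariates to training ranges before
+ # projection. Usage sketch for this method (hypothetical objects): given a
+ # fitted DistributionModel `fit` and a scenario data.frame carrying x/y
+ # coordinates plus all model terms, a new surface could be obtained via
+ #   fit$project(newdata = scenario_df, type = "response", layer = "mean")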
+ if( settings$get("clamp") ) df <- clamp_predictions(model, df)
+
+ if(!is.Waiver(settings$get('bias_variable'))){
+ for(i in 1:length(settings$get('bias_variable'))){
+ if(settings$get('bias_variable')[i] %notin% colnames(df)){
+ if(getOption('ibis.setupmessages')) myLog('[Estimation]','red','Did not find bias variable in prediction object!')
+ next()
+ }
+ df[,settings$get('bias_variable')[i]] <- settings$get('bias_value')[i]
+ }
+ }
+
+ df$rowid <- 1:nrow(df)
+ df_sub <- subset(df, stats::complete.cases(df))
+ w <- model$biodiversity[[1]]$expect # Also get exposure variable
+
+ # For Integrated model, take the last one
+ fam <- model$biodiversity[[length(model$biodiversity)]]$family
+
+ # Predict in chunks rather than all at once to limit memory use
+ out <- data.frame()
+
+ # Tile the problem
+ splits <- cut(1:nrow(df_sub), nrow(df_sub) / (min(100, nrow(df_sub) / 10)) )
+
+ pb <- progress::progress_bar$new(total = length(levels(unique(splits))),
+ format = "Projecting on new data (:spin) [:bar] :percent")
+ for(s in unique(splits)){
+ pb$tick()
+ i <- which(splits == s)
+ # -> external code in utils-boom
+ pred_breg <- predict_boom(
+ obj = mod,
+ newdata = df_sub[i,],
+ w = unique(w)[2],
+ fam = fam,
+ params = settings$data
+ )
+ # Summarize the posterior
+ preds <- cbind(
+ matrixStats::rowMeans2(pred_breg, na.rm = TRUE),
+ matrixStats::rowSds(pred_breg, na.rm = TRUE),
+ matrixStats::rowQuantiles(pred_breg, probs = c(.05,.5,.95), na.rm = TRUE),
+ apply(pred_breg, 1, mode)
+ ) |> as.data.frame()
+ names(preds) <- c("mean", "sd", "q05", "q50", "q95", "mode")
+ preds$cv <- preds$sd / preds$mean
+ out <- rbind(out, preds)
+ rm(preds, pred_breg)
+ }
+
+ # Now create spatial prediction
+ prediction <- emptyraster( self$model$predictors_object$get_data()[[1]] ) # Background
+ prediction[df_sub$rowid] <- out[,layer]
+ names(prediction) <- layer
+
+ return(prediction)
+ }
+ )
+ return(out)
+ }
+ )
+ ) # End of bdproto object
+} # End of function
diff --git a/R/engine_gdb.R b/R/engine_gdb.R
index 7e40c113..ed07fb48 100644
--- a/R/engine_gdb.R
+++ b/R/engine_gdb.R
@@ -30,6 +30,12 @@ NULL
#' * Hofner, B., Müller, J., Hothorn, T., (2011). Monotonicity-constrained species distribution models. Ecology 92, 1895–901.
#' * Mayr, A., Hofner, B. and Schmid, M. (2012). The importance of knowing when to stop - a sequential stopping rule for component-wise gradient boosting. Methods of Information in Medicine, 51, 178–186.
#' @family engine
+#' @returns An [engine].
+#' @examples
+#' \dontrun{
+#' # Add GDB as an engine
+#' x <- distribution(background) |> engine_gdb(iter = 1000)
+#' }
#' @name engine_gdb
NULL
#' @rdname engine_gdb
@@ -42,7 +48,7 @@ engine_gdb <- function(x, ...)
{ # Check whether mboost package is available check_package('mboost') - if(!("mboost" %in% loadedNamespaces()) || ('mboost' %notin% sessionInfo()$otherPkgs) ) { + if(!("mboost" %in% loadedNamespaces()) || ('mboost' %notin% utils::sessionInfo()$otherPkgs) ) { try({requireNamespace('mboost');attachNamespace("mboost")},silent = TRUE) } @@ -164,7 +170,7 @@ engine_gdb <- function(x, field_occurrence = 'observed', template = bg, settings = model$biodiversity[[1]]$pseudoabsence_settings) - if(inherits(presabs, 'sf')) presabs <- presabs %>% sf::st_drop_geometry() + if(inherits(presabs, 'sf')) presabs <- presabs |> sf::st_drop_geometry() # Sample environmental points for absence only points abs <- subset(presabs, observed == 0) # Re-extract environmental information for absence points @@ -184,7 +190,7 @@ engine_gdb <- function(x, model$biodiversity[[1]]$expect <- c( model$biodiversity[[1]]$expect, rep(1, nrow(presabs)-length(model$biodiversity[[1]]$expect) )) } - df <- subset(df, complete.cases(df)) + df <- subset(df, stats::complete.cases(df)) assertthat::assert_that(nrow(presabs) == nrow(df)) # Check that factors have been correctly set if any @@ -269,8 +275,8 @@ engine_gdb <- function(x, fam <- switch (fam, "poisson" = mboost::Poisson(), "binomial" = mboost::Binomial(type = "glm", link = li), - "gaussian" = Gaussian(), - "hurdle" = Hurdle(nuirange = c(0,100)) + "gaussian" = mboost::Gaussian(), + "hurdle" = mboost::Hurdle(nuirange = c(0,100)) ) self$data[['family']] <- fam assertthat::assert_that(inherits(fam,'boost_family'),msg = 'Family misspecified.') @@ -312,7 +318,7 @@ engine_gdb <- function(x, full$cellid <- rownames(full) # Add row.names full$w <- model$exposure full$Intercept <- 1 - full <- subset(full, complete.cases(full)) + full <- subset(full, stats::complete.cases(full)) # Clamp? if( settings$get("clamp") ) full <- clamp_predictions(model, full) @@ -377,7 +383,7 @@ engine_gdb <- function(x, ) },silent = FALSE) # If error, decrease step size by a factor of 10 and try again. 
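A minimal, self-contained sketch of the retry pattern named in the comment above, i.e. refit with a much smaller learning rate when boosting fails or selects almost no terms; the hunk that follows shows the package's own version. Toy data only; `fit_once()` and the control list `bc` are illustrative stand-ins, not objects from this package:

    library(mboost)
    set.seed(1)
    d <- data.frame(y = rnorm(100), x = rnorm(100))
    bc <- mboost::boost_control(mstop = 100, nu = 0.1)  # nu is the boosting step size
    fit_once <- function(ctrl) try(mboost::glmboost(y ~ x, data = d, control = ctrl), silent = TRUE)
    fit <- fit_once(bc)
    if (inherits(fit, "try-error") || length(names(stats::coef(fit))) < 2) {
      bc$nu <- bc$nu * 0.01  # reduce the learning rate and try once more
      fit <- fit_once(bc)
    }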
- if(inherits(fit_gdb, "try-error") || length(names(coef(fit_gdb)))< 2){ + if(inherits(fit_gdb, "try-error") || length(names(stats::coef(fit_gdb)))< 2){ if(getOption('ibis.setupmessages')) myLog('[Estimation]','red','Reducing learning rate by 1/100.') bc$nu <- bc$nu * 0.01 fit_gdb <- try({ @@ -396,13 +402,13 @@ engine_gdb <- function(x, } } - if(settings$get('varsel') == "reg"){ + if(settings$get('optim_hyperparam')){ - if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Starting cross.validation.') + if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Starting parameter search for optimal stopping.') # 5 fold Cross validation to prevent overfitting if(getOption("ibis.runparallel")){ grs <- seq(from = 10, to = max( bc$mstop *5), by = 10) - cvf <- mboost::cv(model.weights(fit_gdb),B = 5, type = "kfold") + cvf <- mboost::cv(stats::model.weights(fit_gdb),B = 5, type = "kfold") # Start cluster # cl <- parallel::makeCluster( getOption('ibis.nthread') ) @@ -416,7 +422,7 @@ engine_gdb <- function(x, } else { grs <- seq(from = 10, to = max( bc$mstop *5), by = 10) - cvf <- mboost::cv(model.weights(fit_gdb),B = 5, type = "kfold") + cvf <- mboost::cv(stats::model.weights(fit_gdb),B = 5, type = "kfold") try({cvm <- mboost::cvrisk(fit_gdb, folds = cvf, grid = grs, papply = pbapply::pblapply ) @@ -424,9 +430,9 @@ engine_gdb <- function(x, rm(cvf, grs) } # Check whether crossvalidation has run through successfully - if(exists('cvm') && mstop(cvm) > 0){ + if(exists('cvm') && mboost::mstop(cvm) > 0){ # Set the model to the optimal mstop to limit overfitting - fit_gdb[mstop(cvm)] + fit_gdb[mboost::mstop(cvm)] } else {cvm <- new_waiver()} } else { cvm <- new_waiver() @@ -516,7 +522,7 @@ engine_gdb <- function(x, # Add rowid newdata$rowid <- 1:nrow(newdata) # Subset to non-missing data - newdata <- subset(newdata, complete.cases(newdata)) + newdata <- subset(newdata, stats::complete.cases(newdata)) # Make empty template temp <- emptyraster( model$predictors_object$get_data()[[1]] ) # Background # Predict @@ -645,11 +651,11 @@ engine_gdb <- function(x, if(plot){ # Plot both partial spatial partial - par.ori <- par(no.readonly = TRUE) - par(mfrow = c(1,3)) + par.ori <- graphics::par(no.readonly = TRUE) + graphics::par(mfrow = c(1,3)) raster::plot(temp, main = expression(f[partial]), col = ibis_colours$divg_bluegreen) mboost::plot.mboost(mod,which = x.var) - par(par.ori) + graphics::par(par.ori) } return(temp) }, diff --git a/R/engine_glmnet.R b/R/engine_glmnet.R index e06473a6..c0c5b6a4 100644 --- a/R/engine_glmnet.R +++ b/R/engine_glmnet.R @@ -1,695 +1,701 @@ -#' @include bdproto-engine.R utils-spatial.R bdproto-distributionmodel.R -NULL - -#' Engine for regularized regression models -#' -#' @description -#' This engine allows the estimation of linear coefficients using either ridge, lasso or elastic net regressions techniques. -#' Backbone of this engine is the \pkg{glmnet} R-package which is commonly used in SDMs, -#' including the popular \code{'maxnet'} (e.g. Maxent) package. -#' Ultimately this engine is an equivalent of [engine_breg], but in a "frequentist" setting. -#' If user aim to emulate a model that most closely resembles maxent within the ibis.iSDM modelling framework, -#' then this package is the best way of doing so. Compared to the \code{'maxnet'} R-package, -#' a number of efficiency settings are implemented in particular for cross-validation of alpha and lambda values. 
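Tying together the engine_gdb cross-validation hunks above: the optimal stopping iteration (mstop) is found by k-fold cross-validation with mboost's own tooling. A compact sketch on toy data, assuming only mboost; names mirror, but are not, the package's internals:

    library(mboost)
    set.seed(1)
    d <- data.frame(y = rnorm(200), x1 = rnorm(200), x2 = rnorm(200))
    fit <- glmboost(y ~ x1 + x2, data = d, control = boost_control(mstop = 500, nu = 0.1))
    cvf <- cv(stats::model.weights(fit), B = 5, type = "kfold")    # 5-fold CV folds
    cvm <- cvrisk(fit, folds = cvf, grid = seq(10, 500, by = 10))  # out-of-sample risk per mstop
    fit[mstop(cvm)]  # set the model to the CV-optimal stopping iteration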
-#' -#' Limited amount of prior information can be specified for this engine, specifically via offsets or as -#' [`GLMNETPrior`], which allow to specify priors as regularization constants. -#' @details -#' Regularized regressions are effectively GLMs that are fitted with ridge, lasso or elastic-net regularization. -#' Which of them is chosen is critical dependent on the alpha value: -#' [*] For \code{alpha} equal to \code{0} a ridge regularization is used. Ridge regularization has the property that -#' it doesn't remove variables entirely, but instead sets their coefficients to \code{0}. -#' [*] For \code{alpha} equal to \code{1} a lasso regularization is used. Lassos tend to remove those coefficients -#' fully from the final model that do not improve the loss function. -#' [*] For \code{alpha} values between \code{0} and \code{1} a elastic-net regularization is used, which is essentially a combination -#' of the two. -#' The optimal lambda parameter can be determined via cross-validation. For this option set \code{"varsel"} in `train()` -#' to \code{"reg"}. -#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object. -#' @param alpha A [`numeric`] giving the elasticnet mixing parameter, which has to be between \code{0} and \code{1}. -#' \code{alpha=1} is the lasso penalty, and \code{alpha=0} the ridge penalty (Default: \code{0}). -#' @param nlambda A [`numeric`] giving the number of lambda values to be used (Default: \code{100}). -#' @param lambda A [`numeric`] with a user supplied estimate of lambda. Usually best to let this parameter be -#' determined deterministically (Default: \code{NULL}). -#' @param type The mode used for creating posterior predictions. Either making \code{"link"} or \code{"response"} (Default: \code{"response"}). -#' @param ... Other parameters passed on to glmnet. -#' @references -#' * Jerome Friedman, Trevor Hastie, Robert Tibshirani (2010). Regularization Paths for Generalized Linear Models via Coordinate Descent. Journal of Statistical Software, 33(1), 1-22. URL https://www.jstatsoft.org/v33/i01/. -#' * Renner, I.W., Elith, J., Baddeley, A., Fithian, W., Hastie, T., Phillips, S.J., Popovic, G. and Warton, D.I., 2015. Point process models for presence‐only analysis. Methods in Ecology and Evolution, 6(4), pp.366-379. -#' * Fithian, W. & Hastie, T. (2013) Finite-sample equivalence in statistical models for presence-only data. The Annals of Applied Statistics 7, 1917–1939 -#' @family engine -#' @name engine_glmnet -NULL -#' @rdname engine_glmnet -#' @export - -engine_glmnet <- function(x, - alpha = 0, - nlambda = 100, - lambda = NULL, - type = "response", - ...) 
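To make the alpha and lambda roles described in the engine documentation above (and re-added later in this patch) concrete, here is a self-contained toy glmnet illustration, independent of this package; only \pkg{glmnet} is assumed and `X`/`y` are simulated. alpha = 0 yields ridge, alpha = 1 lasso, intermediate values elastic net, and cv.glmnet selects lambda by k-fold cross-validation:

    library(glmnet)
    set.seed(42)
    X <- matrix(rnorm(200 * 6), 200, 6)
    y <- rbinom(200, 1, stats::plogis(X[, 1] - 0.5 * X[, 3]))
    cv <- cv.glmnet(X, y, family = "binomial", alpha = 0.5, nfolds = 10)  # elastic net
    coef(cv, s = "lambda.min")   # coefficients at the CV-optimal lambda
    p <- predict(cv, newx = X, s = "lambda.1se", type = "response")  # sparser alternative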
{ - - # Check whether glmnet package is available - check_package('glmnet') - if(!("glmnet" %in% loadedNamespaces()) || ('glmnet' %notin% sessionInfo()$otherPkgs) ) { - try({requireNamespace('glmnet');attachNamespace("glmnet")},silent = TRUE) - } - check_package('glmnetUtils') # glmnetUtils is a helper functions for formulas - if(!("glmnetUtils" %in% loadedNamespaces()) || ('glmnetUtils' %notin% sessionInfo()$otherPkgs) ) { - try({requireNamespace('glmnetUtils');attachNamespace("glmnetUtils")},silent = TRUE) - } - - # assert that arguments are valid - assertthat::assert_that(inherits(x, "BiodiversityDistribution"), - inherits(x$background,'sf'), - is.numeric(alpha), - is.numeric(lambda) || is.null(lambda), - is.numeric(nlambda), - is.character(type) - ) - assertthat::assert_that(alpha>=0, alpha <=1) - type <- match.arg(type, choices = c("predictor","link", "response"),several.ok = FALSE) - if(type=="predictor") type <- "link" # Convenience conversion - - # Create a background raster - if(is.Waiver(x$predictors)){ - # Create from background - template <- raster::raster( - ext = raster::extent(x$background), - crs = raster::projection(x$background), - res = c(diff( (sf::st_bbox(x$background)[c(1,3)]) ) / 100, # Simplified assumption for resolution - diff( (sf::st_bbox(x$background)[c(1,3)]) ) / 100 - ) - ) - } else { - # If predictor existing, use them - template <- emptyraster(x$predictors$get_data() ) - } - - # Burn in the background - template <- raster::rasterize(x$background, template, field = 0) - - # Set up the parameter list - params <- list( - alpha = alpha, - lambda = lambda, - nlambda = nlambda, - type = type, - ... - ) - - # Print a message in case there is already an engine object - if(!is.Waiver(x$engine)) myLog('[Setup]','yellow','Replacing currently selected engine.') - - # Set engine in distribution object - x$set_engine( - bdproto( - "GLMNET-Engine", - Engine, - name = "", - data = list( - 'template' = template, - 'params' = params - ), - # Dummy function for spatial latent effects - calc_latent_spatial = function(self, type = NULL, priors = NULL){ - new_waiver() - }, - # Dummy function for getting the equation of latent effects - get_equation_latent_spatial = function(self, method){ - new_waiver() - }, - # Function to respecify the control parameters - set_control = function(self, - params - ){ - assertthat::assert_that(is.list(params)) - # Overwrite existing - self$data$params <- params - invisible() - }, - # Setup function - setup = function(self, model, settings = NULL, ...){ - # Simple security checks - assertthat::assert_that( - assertthat::has_name(model, 'background'), - assertthat::has_name(model, 'biodiversity'), - inherits(settings,'Settings') || is.null(settings), - nrow(model$predictors) == raster::ncell(self$get_data('template')), - !is.Waiver(self$get_data("params")), - length(model$biodiversity) == 1 # Only works with single likelihood. 
To be processed separately - ) - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Engine setup.') - - # Get parameters - params <- self$data$params - settings$set('type', params$type) - - # Distribution specific procedure - fam <- model$biodiversity[[1]]$family - - # -- # - - # If a poisson family is used, weight the observations by their exposure - if(fam == "poisson"){ - # Get background layer - bg <- self$get_data("template") - assertthat::assert_that(!is.na(raster::cellStats(bg, min))) - - # Add pseudo-absence points - presabs <- add_pseudoabsence(df = model$biodiversity[[1]]$observations, - field_occurrence = 'observed', - template = bg, - settings = model$biodiversity[[1]]$pseudoabsence_settings) - if(inherits(presabs, 'sf')) presabs <- presabs %>% sf::st_drop_geometry() - # Sample environmental points for absence only points - abs <- subset(presabs, observed == 0) - # Re-extract environmental information for absence points - envs <- get_rastervalue(coords = abs[,c('x','y')], - env = model$predictors_object$get_data(df = FALSE), - rm.na = FALSE) - if(assertthat::has_name(model$biodiversity[[1]]$predictors, "Intercept")){ envs$Intercept <- 1} - - # Format out - df <- rbind(model$biodiversity[[1]]$predictors[,c('x','y','Intercept', model$biodiversity[[1]]$predictors_names)], - envs[,c('x','y','Intercept', model$biodiversity[[1]]$predictors_names)] ) - any_missing <- which(apply(df, 1, function(x) any(is.na(x)))) - if(length(any_missing)>0) { - presabs <- presabs[-any_missing,] # This works as they are in the same order - model$biodiversity[[1]]$expect <- model$biodiversity[[1]]$expect[-any_missing] - # Fill the absences with 1 as multiplier. This works since absences follow the presences - model$biodiversity[[1]]$expect <- c( model$biodiversity[[1]]$expect, - rep(1, nrow(presabs)-length(model$biodiversity[[1]]$expect) )) - } - df <- subset(df, complete.cases(df)) - assertthat::assert_that(nrow(presabs) == nrow(df)) - - # Overwrite observation data - model$biodiversity[[1]]$observations <- presabs - - # Preprocessing security checks - assertthat::assert_that( all( model$biodiversity[[1]]$observations[['observed']] >= 0 ), - any(!is.na(presabs[['observed']])), - length(model$biodiversity[[1]]$expect)==nrow(model$biodiversity[[1]]$observations), - nrow(df) == nrow(model$biodiversity[[1]]$observations) - ) - - # Add offset if existent - if(!is.Waiver(model$offset)){ - ofs <- get_rastervalue(coords = df[,c('x','y')], - env = model$offset_object, - rm.na = FALSE) - # Rename to spatial offset - names(ofs)[which(names(ofs)==names(model$offset_object))] <- "spatial_offset" - model$biodiversity[[1]]$offset <- ofs - } - - # Define expectation as very small vector following Renner et al. 
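The "very small vector" expectation mentioned in the comment above is the down-weighted Poisson regression (DWPR) scheme of Renner et al. (2015), which the ppm_weights() call that follows implements. A hedged sketch of the idea with toy stand-ins; `A` and `obs` are illustrative, not package objects:

    # DWPR: background/quadrature points carry the area they jointly represent,
    # presence points get a vanishingly small weight
    A   <- 1000                          # total area of the study region (toy value)
    obs <- c(rep(1, 50), rep(0, 500))    # 50 presences, 500 background points
    w   <- ifelse(obs == 1, 1e-6, A / sum(obs == 0))
    # In the weighted Poisson fit the response becomes obs / w with weights w,
    # so the likelihood approximates an inhomogeneous point process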
- w <- ppm_weights(df = df, - pa = model$biodiversity[[1]]$observations[['observed']], - bg = bg, - weight = 1e-6, # Arbitrary small weight - type = "DWPR" # Weights for down-weighted Poisson regression - ) - assertthat::assert_that(length(w) == nrow(df)) - - model$biodiversity[[1]]$predictors <- df - model$biodiversity[[1]]$expect <- w * (1/model$biodiversity[[1]]$expect) # Multiply with prior weight - - # Rasterize observed presences - pres <- raster::rasterize(model$biodiversity[[1]]$observations[,c("x","y")], - bg, fun = 'count', background = 0) - # Get for the full dataset - w_full <- ppm_weights(df = model$predictors, - pa = pres[], - bg = bg, - weight = 1 # Set those to 1 so that absences become ratio of pres/abs - ) - - # Add exposure to full model predictor - model$exposure <- w_full * (1/unique(model$biodiversity[[1]]$expect)[1]) # Multiply with prior weight (first value) - - } else if(fam == "binomial"){ - # Check that observations are all <=1 - model$biodiversity[[1]]$observations[['observed']] <- ifelse(model$biodiversity[[1]]$observations[['observed']]>=1,1,0) - # calculating the case weights (equal weights) - # the order of weights should be the same as presences and backgrounds in the training data - prNum <- as.numeric(table(model$biodiversity[[1]]$observations[['observed']])["1"]) # number of presences - bgNum <- as.numeric(table(model$biodiversity[[1]]$observations[['observed']])["0"]) # number of backgrounds - w <- ifelse(model$biodiversity[[1]]$observations[['observed']] == 1, 1, prNum / bgNum) - model$biodiversity[[1]]$expect <- w * model$biodiversity[[1]]$expect - model$biodiversity[[1]]$observations$observed <- as.factor( model$biodiversity[[1]]$observations$observed ) - } - - # Instead of invisible return the model object - return( model ) - }, - # Training function - train = function(self, model, settings, ...){ - assertthat::assert_that( - inherits(settings,'Settings'), - is.list(model),length(model)>1, - # Check that model id and setting id are identical - settings$modelid == model$id - ) - # Get name - name <- model$biodiversity[[1]]$name - - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Estimation]','green',paste0('Starting fitting: ', name)) - - # Verbosity - verbose <- settings$get("verbose") - - # Set prediction type also for later - settings$set('type', self$get_data("params")$type) - - # seed - seed <- settings$get("seed") - if(is.Waiver(seed)) { seed <- 1337; settings$set('seed', 1337) } - - # Get output raster - prediction <- self$get_data('template') - - # Get parameters control - params <- self$get_data('params') - # Set glmnet control - glmnet::glmnet.control(factory = TRUE) # Reset to default - - # All other needed data for model fitting - fam <- model$biodiversity[[1]]$family - li <- model$biodiversity[[1]]$link - if(!is.null(li)){ - if(li %in% c("cloglog", "logit", "probit")){ - fam <- binomial(link = li) - } else { - if(getOption('ibis.setupmessages')) myLog('[Estimation]','red',paste0("Custom link functions not supported!")) - } - } - - form <- model$biodiversity[[1]]$equation - df <- cbind(model$biodiversity[[1]]$predictors, - data.frame(observed = model$biodiversity[[1]]$observations[,'observed']) - ) - df <- subset(df, select = c(model$biodiversity[[1]]$predictors_names, "observed")) - w <- df$w <- model$biodiversity[[1]]$expect # The expected exposure - # Get full prediction container - full <- model$predictors - w_full <- model$exposure - - # Get offset and add it to exposure - if(!is.Waiver(model$offset)){ - # Add offset to 
full prediction and load vector - ofs <- model$biodiversity[[1]]$offset[, 'spatial_offset'] - ofs_pred <- model$offset[,'spatial_offset'] - } else { ofs <- NULL; ofs_pred <- NULL } - - # Check priors, e.g penalty factors - p.fac <- rep(1, sum( model$biodiversity[[1]]$predictors_types$type=="numeric") ) # Added plus 1 for the weight? - names(p.fac) <- model$biodiversity[[1]]$predictors_names[which(model$biodiversity[[1]]$predictors_types$type=="numeric")] - # Then add each factor level if set - if(any(model$predictors_types$type=="factor")){ - fac <- model$biodiversity[[1]]$predictors_names[which(model$biodiversity[[1]]$predictors_types$type=="factor")] - p.fac <- c(p.fac, rep(1, length( unique(df[,fac]) ) )) - } - # Duplicate p.fac container for lower and upper limits - lowlim <- rep(-Inf, length(p.fac)) |> setNames(names(p.fac)) - upplim <- rep(Inf, length(p.fac)) |> setNames(names(p.fac)) - - # Trick for creation for some default lambda values for the regularization multiplier - if(is.null(params$lambda)){ - reg <- default.regularization(p = df$observed, m = model.matrix(form, df)) * c(1, p.fac) # add 1 for the intercept - params$lambda <- 10^(seq(4, 0, length.out = 200)) * sum(p.fac)/length(p.fac) * sum(p.fac)/sum(w) - if(anyNA(params$lambda)) params$lambda <- NULL - } - - if(!is.Waiver(model$priors)){ - # Reset those contained in the prior object - for(v in model$priors$varnames()){ - p.fac[v] <- model$priors$get(v, what = "value") - lowlim[v] <- model$priors$get(v, what = "lims")[1] - upplim[v] <- model$priors$get(v, what = "lims")[2] - } - } - - # Clamp? - if( settings$get("clamp") ) full <- clamp_predictions(model, full) - - # -- # - # Expand predictors if non-linear is specified in settings - if(settings$get('only_linear') == FALSE){ - if(getOption('ibis.setupmessages')) myLog('[Estimation]','yellow', - 'Non-linearity to glmnet is best introduced by adding derivates. Ignored!') - # linear_predictors <- attr(terms.formula(form), "term.labels") - # m <- outer(linear_predictors, linear_predictors, function(x, y) paste(x, y, sep = ":")) - # - # form <- update.formula(form, - # paste0( - # ". ~ . 
+", - # paste0("I(", linear_predictors,"^2)",collapse = " + "), - # " + ", - # paste0(m[lower.tri(m)], collapse = " + ") - # )) - # # Update penalty factors and limits - # for(var in attr(terms.formula(form), "term.labels")){ - # if(!(var %in% p.fac)){ - # v <- 1 # Take the maximum regularization penalty by default - # vlow <- -Inf; vupp <- Inf - # names(v) <- var; names(vlow) <- var; names(vupp) <- var - # p.fac <- append(p.fac, v) - # lowlim <- append(lowlim, vlow); upplim <- append(upplim, vupp) - # } - # } - } - - assertthat::assert_that( - is.null(w) || length(w) == nrow(df), - is.null(ofs) || is.vector(ofs), - is.null(ofs_pred) || is.vector(ofs_pred), - length(p.fac) == length(lowlim), - all(w >= 0,na.rm = TRUE) - ) - - # --- # - # Determine the optimal lambda through k-fold cross-validation - if(getOption("ibis.runparallel")){ - if(!foreach:::getDoParRegistered()) ibis_future(cores = getOption("ibis.nthread"), - strategy = getOption("ibis.futurestrategy")) - } - # Depending if regularized should be set, specify this separately - # if( (settings$get('varsel') == "reg") ){ - # if(getOption('ibis.setupmessages')) myLog('[Estimation]','green', - # 'Finding optimal combinations of alpha and lambda.') - # cv_gn <- try({ - # cva.glmnet(formula = form, - # data = df, - # alpha = params$alpha, # Elastic net mixing parameter - # lambda = params$lambda, # Overwrite lambda - # weights = w, # Case weights - # offset = ofs, - # family = fam, - # penalty.factor = p.fac, - # # Option for limiting the coefficients - # lower.limits = lowlim, - # upper.limits = upplim, - # standardize = FALSE, # Don't standardize to avoid doing anything to weights - # maxit = (10^5)*2, # Increase the maximum number of passes for lambda - # parallel = getOption("ibis.runparallel"), - # trace.it = settings$get("verbose"), - # nfolds = 10 # number of folds for cross-validation - # ) - # },silent = FALSE) - # } else { - cv_gn <- try({ - cv.glmnet(formula = form, - data = df, - alpha = params$alpha, # Elastic net mixing parameter - lambda = params$lambda, # Overwrite lambda - weights = w, # Case weights - offset = ofs, - family = fam, - penalty.factor = p.fac, - # Option for limiting the coefficients - lower.limits = lowlim, - upper.limits = upplim, - standardize = FALSE, # Don't standardize to avoid doing anything to weights - maxit = (10^5)*2, # Increase the maximum number of passes for lambda - parallel = getOption("ibis.runparallel"), - trace.it = settings$get("verbose"), - nfolds = 10 # number of folds for cross-validation - ) - },silent = FALSE) - # } - if(inherits(cv_gn, "try-error")) stop("Model failed to converge with provided input data!") - - # --- # - # Predict spatially - if(!settings$get('inference_only')){ - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Starting prediction...') - # Set target variables to bias_value for prediction if specified - if(!is.Waiver(settings$get('bias_variable'))){ - for(i in 1:length(settings$get('bias_variable'))){ - if(settings$get('bias_variable')[i] %notin% names(full)){ - if(getOption('ibis.setupmessages')) myLog('[Estimation]','red','Did not find bias variable in prediction object!') - next() - } - full[[settings$get('bias_variable')[i]]] <- settings$get('bias_value')[i] - } - } - # Make a subset of non-na values - full$rowid <- 1:nrow(full) - full_sub <- subset(full, complete.cases(full)) - w_full_sub <- w_full[full_sub$rowid] - assertthat::assert_that((nrow(full_sub) == length(w_full_sub)) || is.null(w_full_sub) ) - - # Attempt prediction 
- out <- predict(object = cv_gn, - newdata = full_sub, - weights = w_full_sub, - newoffset = ofs_pred[full_sub$rowid], - s = determine_lambda(cv_gn), # Determine the best lambda value - type = params$type - ) - - # Fill output with summaries of the posterior - prediction[full_sub$rowid] <- out[,1] - names(prediction) <- "mean" - prediction <- raster::mask(prediction, self$get_data("template")) - try({rm(out, full, full_sub)},silent = TRUE) - } else { - # No prediction done - prediction <- NULL - } - # Compute end of computation time - settings$set('end.time', Sys.time()) - - # Definition of GLMNET Model object ---- - # Create output - out <- bdproto( - "GLMNET-Model", - DistributionModel, - id = model$id, - model = model, - settings = settings, - fits = list( - "fit_best" = cv_gn, - "fit_best_equation" = form, - "prediction" = prediction - ), - # Partial effects - partial = function(self, x.var = NULL, constant = NULL, variable_length = 100, values = NULL, plot = FALSE, type = NULL, ...){ - assertthat::assert_that(is.character(x.var) || is.null(x.var), - is.null(constant) || is.numeric(constant), - is.null(type) || is.character(type), - is.numeric(variable_length) - ) - check_package("pdp") - # Settings - settings <- self$settings - - mod <- self$get_data('fit_best') - model <- self$model - df <- model$biodiversity[[length( model$biodiversity )]]$predictors - co <- coef(mod) |> row.names() # Get model coefficient names - # Set type - if(is.null(type)) type <- self$settings$get("type") - - # Match x.var to argument - if(is.null(x.var)){ - x.var <- colnames(df) - } else { - x.var <- match.arg(x.var, names(df), several.ok = FALSE) - } - - # Calculate range of predictors - if(any(model$predictors_types$type=="factor")){ - rr <- sapply(df[model$predictors_types$predictors[model$predictors_types$type=="numeric"]], - function(x) range(x, na.rm = TRUE)) |> as.data.frame() - } else { - rr <- sapply(df, function(x) range(x, na.rm = TRUE)) |> as.data.frame() - } - # if values are set, make sure that they cover the data.frame - if(!is.null(values)){ - assertthat::assert_that(length(x.var) == 1) - df2 <- list() - df2[[x.var]] <- values - # Then add the others - for(var in colnames(df)){ - if(var == x.var) next() - df2[[var]] <- mean(df[[var]], na.rm = TRUE) - } - df2 <- df2 |> as.data.frame() - } else { - df2 <- as.data.frame(seq(rr[1,x.var],rr[2,x.var], length.out = variable_length)) - names(df2) <- x.var - } - - # Check that variables are in - assertthat::assert_that(all( x.var %in% colnames(df) ), - msg = 'Variable not in predicted model.') - - pp <- data.frame() - pb <- progress::progress_bar$new(total = length(x.var)) - for(v in x.var){ - p1 <- pdp::partial(mod, pred.var = v, pred.grid = df2, ice = FALSE, center = FALSE, - type = "regression", - plot = FALSE, rug = TRUE, train = df) - p1 <- p1[,c(x.var, "yhat")] - names(p1) <- c("partial_effect", "mean") - p1$variable <- v - pp <- rbind(pp, p1) - if(length(x.var) > 1) pb$tick() - } - - if(plot){ - # Make a plot - g <- ggplot2::ggplot(data = pp, ggplot2::aes(x = partial_effect, y = mean)) + - ggplot2::theme_classic(base_size = 18) + - ggplot2::geom_line() + - ggplot2::labs(x = "", y = expression(hat(y))) + - ggplot2::facet_wrap(~variable,scales = 'free') - print(g) - } - return(pp) - }, - # Spatial partial dependence plot - spartial = function(self, x.var, constant = NULL, plot = TRUE, type = NULL){ - assertthat::assert_that(is.character(x.var), - "model" %in% names(self), - is.null(constant) || is.numeric(constant), - is.logical(plot), - 
is.character(type) || is.null(type) - ) - # Settings - settings <- self$settings - # Set type - if(is.null(type)) type <- self$settings$get("type") - type <- match.arg(type, c("link", "response"), several.ok = FALSE) - settings$set("type", type) - - mod <- self$get_data('fit_best') - model <- self$model - # For Integrated model, take the last one - fam <- model$biodiversity[[length(model$biodiversity)]]$family - df <- model$predictors - df$w <- model$exposure - df$rowid <- 1:nrow(df) - # Match x.var to argument - x.var <- match.arg(x.var, names(df), several.ok = FALSE) - df_sub <- subset(df, complete.cases(df)) - if(!is.Waiver(model$offset)) ofs <- model$offset[df_sub$rowid] else ofs <- NULL - assertthat::assert_that(nrow(df_sub)>0) - - # Add all others as constant - if(is.null(constant)){ - for(n in names(df_sub)) if(!n %in% c(x.var, "rowid", "w")) df_sub[[n]] <- suppressWarnings( mean(model$predictors[[n]], na.rm = TRUE) ) - } else { - for(n in names(df_sub)) if(!n %in% c(x.var, "rowid", "w")) df_sub[[n]] <- constant - } - # Reclassify factor levels - if(any(model$predictors_types$type=="factor")){ - lvl <- levels(model$predictors[[model$predictors_types$predictors[model$predictors_types$type=="factor"]]]) - df_sub[[model$predictors_types$predictors[model$predictors_types$type=="factor"]]] <- - factor(lvl[1], levels = lvl) - } - # Predict with lambda - pred_gn <- predict( - object = mod, - newdata = df_sub, - weights = df_sub$w, # The second entry of unique contains the non-observed variables - newoffset = ofs, - s = determine_lambda(mod), # Determine best available lambda - fam = fam, - type = type - ) |> as.data.frame() - - # Now create spatial prediction - prediction <- emptyraster( self$model$predictors_object$get_data()[[1]] ) # Background - prediction[df_sub$rowid] <- pred_gn[,1] - names(prediction) <- paste0("spartial_",x.var) - - # Do plot and return result - if(plot){ - plot(prediction, col = ibis_colours$viridis_orig) - } - return(prediction) - }, - # Get coefficients from breg - get_coefficients = function(self){ - # Returns a vector of the coefficients with direction/importance - obj <- self$get_data("fit_best") - cofs <- tidy_glmnet_summary(obj) - names(cofs) <- c("Feature", "Beta") - # Remove intercept(s) - int <- grep("Intercept",cofs$Feature,ignore.case = TRUE) - if(length(int)>0) cofs <- cofs[-int,] - return(cofs) - }, - # Engine-specific projection function - project = function(self, newdata, type = NULL, layer = "mean"){ - assertthat::assert_that("model" %in% names(self), - nrow(newdata) > 0, - all( c("x", "y") %in% names(newdata) ), - is.character(type) || is.null(type) - ) - - # Settings - settings <- self$settings - # Set type - if(is.null(type)) type <- self$settings$get("type") - type <- match.arg(type, c("link", "response"), several.ok = FALSE) - settings$set("type", type) - - mod <- self$get_data('fit_best') - model <- self$model - # For Integrated model, take the last one - fam <- model$biodiversity[[length(model$biodiversity)]]$family - - # Clamp? 
-        if( settings$get("clamp") ) newdata <- clamp_predictions(model, newdata)
-
-        # Set target variables to bias_value for prediction if specified
-        if(!is.Waiver(settings$get('bias_variable'))){
-          for(i in 1:length(settings$get('bias_variable'))){
-            if(settings$get('bias_variable')[i] %notin% names(newdata)){
-              if(getOption('ibis.setupmessages')) myLog('[Estimation]','red','Did not find bias variable in prediction object!')
-              next()
-            }
-            newdata[[settings$get('bias_variable')[i]]] <- settings$get('bias_value')[i]
-          }
-        }
-
-        df <- newdata
-        df$w <- model$exposure # Also get exposure variable
-        # Make a subset of non-na values
-        df$rowid <- 1:nrow(df)
-        df_sub <- subset(df, complete.cases(df))
-        if(!is.Waiver(model$offset)) ofs <- model$offset[df_sub$rowid] else ofs <- NULL
-        assertthat::assert_that(nrow(df_sub)>0)
-
-        pred_gn <- predict(
-          object = mod,
-          newdata = df_sub,
-          weights = df_sub$w, # The second entry of unique contains the non-observed variables
-          newoffset = ofs,
-          s = determine_lambda(mod), # Determine best available lambda
-          fam = fam,
-          type = type
-        ) |> as.data.frame()
-        names(pred_gn) <- layer
-
-        # Now create spatial prediction
-        prediction <- emptyraster( self$model$predictors_object$get_data()[[1]] ) # Background
-        prediction[df_sub$rowid] <- pred_gn[,1]
-
-        return(prediction)
-      }
-    )
-    return(out)
-    }
-  )
-  ) # End of bdproto object
-} # End of function
+#' @include bdproto-engine.R utils-spatial.R bdproto-distributionmodel.R
+NULL
+
+#' Engine for regularized regression models
+#'
+#' @description
+#' This engine allows the estimation of linear coefficients using ridge, lasso or elastic-net regression techniques.
+#' The backbone of this engine is the \pkg{glmnet} R-package, which is commonly used in SDMs,
+#' including the popular \code{'maxnet'} (e.g. Maxent) package.
+#' Ultimately this engine is an equivalent of [engine_breg], but in a "frequentist" setting.
+#' If users aim to emulate a model that most closely resembles maxent within the ibis.iSDM modelling framework,
+#' then this engine is the best way of doing so. Compared to the \code{'maxnet'} R-package,
+#' a number of efficiency settings are implemented, in particular for cross-validation of alpha and lambda values.
+#'
+#' A limited amount of prior information can be specified for this engine, specifically via offsets or as
+#' [`GLMNETPrior`], which allows priors to be specified as regularization constants.
+#' @details
+#' Regularized regressions are effectively GLMs that are fitted with ridge, lasso or elastic-net regularization.
+#' Which of them is chosen depends critically on the alpha value:
+#' [*] For \code{alpha} equal to \code{0} a ridge regularization is used. Ridge regularization has the property that
+#' it doesn't remove variables entirely, but instead shrinks their coefficients towards \code{0}.
+#' [*] For \code{alpha} equal to \code{1} a lasso regularization is used. Lasso regularization tends to fully remove
+#' those coefficients from the final model that do not improve the loss function.
+#' [*] For \code{alpha} values between \code{0} and \code{1} an elastic-net regularization is used, which is essentially a combination
+#' of the two.
+#' The optimal lambda parameter can be determined via cross-validation. For this option set \code{"optim_hyperparam"} in `train()`
+#' to \code{TRUE}.
+#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object.
+#' @param alpha A [`numeric`] giving the elasticnet mixing parameter, which has to be between \code{0} and \code{1}.
+#' \code{alpha=1} is the lasso penalty, and \code{alpha=0} the ridge penalty (Default: \code{0}).
+#' @param nlambda A [`numeric`] giving the number of lambda values to be used (Default: \code{100}).
+#' @param lambda A [`numeric`] with a user supplied estimate of lambda. Usually best to let this parameter be
+#' determined automatically (Default: \code{NULL}).
+#' @param type The mode used for creating posterior predictions. Either \code{"link"} or \code{"response"} (Default: \code{"response"}).
+#' @param ... Other parameters passed on to glmnet.
+#' @references
+#' * Jerome Friedman, Trevor Hastie, Robert Tibshirani (2010). Regularization Paths for Generalized Linear Models via Coordinate Descent. Journal of Statistical Software, 33(1), 1-22. URL https://www.jstatsoft.org/v33/i01/.
+#' * Renner, I.W., Elith, J., Baddeley, A., Fithian, W., Hastie, T., Phillips, S.J., Popovic, G. and Warton, D.I., 2015. Point process models for presence‐only analysis. Methods in Ecology and Evolution, 6(4), pp.366-379.
+#' * Fithian, W. & Hastie, T. (2013) Finite-sample equivalence in statistical models for presence-only data. The Annals of Applied Statistics 7, 1917–1939
+#' @family engine
+#' @returns An [engine].
+#' @examples
+#' \dontrun{
+#' # Add GLMNET as an engine
+#' x <- distribution(background) |> engine_glmnet(alpha = 0)
+#' }
+#' @name engine_glmnet
+NULL
+#' @rdname engine_glmnet
+#' @export
+
+engine_glmnet <- function(x,
+                          alpha = 0,
+                          nlambda = 100,
+                          lambda = NULL,
+                          type = "response",
+                          ...) {
+
+  # Check whether glmnet package is available
+  check_package('glmnet')
+  if(!("glmnet" %in% loadedNamespaces()) || ('glmnet' %notin% utils::sessionInfo()$otherPkgs) ) {
+    try({requireNamespace('glmnet');attachNamespace("glmnet")},silent = TRUE)
+  }
+  check_package('glmnetUtils') # glmnetUtils provides helper functions for formula-based interfaces
+  if(!("glmnetUtils" %in% loadedNamespaces()) || ('glmnetUtils' %notin% utils::sessionInfo()$otherPkgs) ) {
+    try({requireNamespace('glmnetUtils');attachNamespace("glmnetUtils")},silent = TRUE)
+  }
+
+  # assert that arguments are valid
+  assertthat::assert_that(inherits(x, "BiodiversityDistribution"),
+                          inherits(x$background,'sf'),
+                          is.numeric(alpha),
+                          is.numeric(lambda) || is.null(lambda),
+                          is.numeric(nlambda),
+                          is.character(type)
+  )
+  assertthat::assert_that(alpha>=0, alpha <=1)
+  type <- match.arg(type, choices = c("predictor","link", "response"),several.ok = FALSE)
+  if(type=="predictor") type <- "link" # Convenience conversion
+
+  # Create a background raster
+  if(is.Waiver(x$predictors)){
+    # Create from background
+    template <- raster::raster(
+      ext = raster::extent(x$background),
+      crs = raster::projection(x$background),
+      res = c(diff( (sf::st_bbox(x$background)[c(1,3)]) ) / 100, # Simplified assumption for resolution
+              diff( (sf::st_bbox(x$background)[c(1,3)]) ) / 100
+      )
+    )
+  } else {
+    # If predictors exist, use them
+    template <- emptyraster(x$predictors$get_data() )
+  }
+
+  # Burn in the background
+  template <- raster::rasterize(x$background, template, field = 0)
+
+  # Set up the parameter list
+  params <- list(
+    alpha = alpha,
+    lambda = lambda,
+    nlambda = nlambda,
+    type = type,
+    ...
+ ) + + # Print a message in case there is already an engine object + if(!is.Waiver(x$engine)) myLog('[Setup]','yellow','Replacing currently selected engine.') + + # Set engine in distribution object + x$set_engine( + bdproto( + "GLMNET-Engine", + Engine, + name = "", + data = list( + 'template' = template, + 'params' = params + ), + # Dummy function for spatial latent effects + calc_latent_spatial = function(self, type = NULL, priors = NULL){ + new_waiver() + }, + # Dummy function for getting the equation of latent effects + get_equation_latent_spatial = function(self, method){ + new_waiver() + }, + # Function to respecify the control parameters + set_control = function(self, + params + ){ + assertthat::assert_that(is.list(params)) + # Overwrite existing + self$data$params <- params + invisible() + }, + # Setup function + setup = function(self, model, settings = NULL, ...){ + # Simple security checks + assertthat::assert_that( + assertthat::has_name(model, 'background'), + assertthat::has_name(model, 'biodiversity'), + inherits(settings,'Settings') || is.null(settings), + nrow(model$predictors) == raster::ncell(self$get_data('template')), + !is.Waiver(self$get_data("params")), + length(model$biodiversity) == 1 # Only works with single likelihood. To be processed separately + ) + # Messenger + if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Engine setup.') + + # Get parameters + params <- self$data$params + settings$set('type', params$type) + + # Distribution specific procedure + fam <- model$biodiversity[[1]]$family + + # -- # + + # If a poisson family is used, weight the observations by their exposure + if(fam == "poisson"){ + # Get background layer + bg <- self$get_data("template") + assertthat::assert_that(!is.na(raster::cellStats(bg, min))) + + # Add pseudo-absence points + presabs <- add_pseudoabsence(df = model$biodiversity[[1]]$observations, + field_occurrence = 'observed', + template = bg, + settings = model$biodiversity[[1]]$pseudoabsence_settings) + if(inherits(presabs, 'sf')) presabs <- presabs |> sf::st_drop_geometry() + # Sample environmental points for absence only points + abs <- subset(presabs, observed == 0) + # Re-extract environmental information for absence points + envs <- get_rastervalue(coords = abs[,c('x','y')], + env = model$predictors_object$get_data(df = FALSE), + rm.na = FALSE) + if(assertthat::has_name(model$biodiversity[[1]]$predictors, "Intercept")){ envs$Intercept <- 1} + + # Format out + df <- rbind(model$biodiversity[[1]]$predictors[,c('x','y','Intercept', model$biodiversity[[1]]$predictors_names)], + envs[,c('x','y','Intercept', model$biodiversity[[1]]$predictors_names)] ) + any_missing <- which(apply(df, 1, function(x) any(is.na(x)))) + if(length(any_missing)>0) { + presabs <- presabs[-any_missing,] # This works as they are in the same order + model$biodiversity[[1]]$expect <- model$biodiversity[[1]]$expect[-any_missing] + # Fill the absences with 1 as multiplier. 
This works since absences follow the presences + model$biodiversity[[1]]$expect <- c( model$biodiversity[[1]]$expect, + rep(1, nrow(presabs)-length(model$biodiversity[[1]]$expect) )) + } + df <- subset(df, stats::complete.cases(df)) + assertthat::assert_that(nrow(presabs) == nrow(df)) + + # Overwrite observation data + model$biodiversity[[1]]$observations <- presabs + + # Preprocessing security checks + assertthat::assert_that( all( model$biodiversity[[1]]$observations[['observed']] >= 0 ), + any(!is.na(presabs[['observed']])), + length(model$biodiversity[[1]]$expect)==nrow(model$biodiversity[[1]]$observations), + nrow(df) == nrow(model$biodiversity[[1]]$observations) + ) + + # Add offset if existent + if(!is.Waiver(model$offset)){ + ofs <- get_rastervalue(coords = df[,c('x','y')], + env = model$offset_object, + rm.na = FALSE) + # Rename to spatial offset + names(ofs)[which(names(ofs)==names(model$offset_object))] <- "spatial_offset" + model$biodiversity[[1]]$offset <- ofs + } + + # Define expectation as very small vector following Renner et al. + w <- ppm_weights(df = df, + pa = model$biodiversity[[1]]$observations[['observed']], + bg = bg, + weight = 1e-6, # Arbitrary small weight + type = "DWPR" # Weights for down-weighted Poisson regression + ) + assertthat::assert_that(length(w) == nrow(df)) + + model$biodiversity[[1]]$predictors <- df + model$biodiversity[[1]]$expect <- w * (1/model$biodiversity[[1]]$expect) # Multiply with prior weight + + # Rasterize observed presences + pres <- raster::rasterize(model$biodiversity[[1]]$observations[,c("x","y")], + bg, fun = 'count', background = 0) + # Get for the full dataset + w_full <- ppm_weights(df = model$predictors, + pa = pres[], + bg = bg, + weight = 1 # Set those to 1 so that absences become ratio of pres/abs + ) + + # Add exposure to full model predictor + model$exposure <- w_full * (1/unique(model$biodiversity[[1]]$expect)[1]) # Multiply with prior weight (first value) + + } else if(fam == "binomial"){ + # Check that observations are all <=1 + model$biodiversity[[1]]$observations[['observed']] <- ifelse(model$biodiversity[[1]]$observations[['observed']]>=1,1,0) + # calculating the case weights (equal weights) + # the order of weights should be the same as presences and backgrounds in the training data + prNum <- as.numeric(table(model$biodiversity[[1]]$observations[['observed']])["1"]) # number of presences + bgNum <- as.numeric(table(model$biodiversity[[1]]$observations[['observed']])["0"]) # number of backgrounds + w <- ifelse(model$biodiversity[[1]]$observations[['observed']] == 1, 1, prNum / bgNum) + model$biodiversity[[1]]$expect <- w * model$biodiversity[[1]]$expect + model$biodiversity[[1]]$observations$observed <- as.factor( model$biodiversity[[1]]$observations$observed ) + } + + # Instead of invisible return the model object + return( model ) + }, + # Training function + train = function(self, model, settings, ...){ + assertthat::assert_that( + inherits(settings,'Settings'), + is.list(model),length(model)>1, + # Check that model id and setting id are identical + settings$modelid == model$id + ) + # Get name + name <- model$biodiversity[[1]]$name + + # Messenger + if(getOption('ibis.setupmessages')) myLog('[Estimation]','green',paste0('Starting fitting: ', name)) + + # Verbosity + verbose <- settings$get("verbose") + + # Set prediction type also for later + settings$set('type', self$get_data("params")$type) + + # seed + seed <- settings$get("seed") + if(is.Waiver(seed)) { seed <- 1337; settings$set('seed', 1337) } + + # Get 
output raster
+          prediction <- self$get_data('template')
+
+          # Get parameter controls
+          params <- self$get_data('params')
+          # Set glmnet control
+          glmnet::glmnet.control(factory = TRUE) # Reset to default
+
+          # All other needed data for model fitting
+          fam <- model$biodiversity[[1]]$family
+          li <- model$biodiversity[[1]]$link
+          if(!is.null(li)){
+            if(li %in% c("cloglog", "logit", "probit")){
+              fam <- stats::binomial(link = li)
+            } else {
+              if(getOption('ibis.setupmessages')) myLog('[Estimation]','red',paste0("Custom link functions not supported!"))
+            }
+          }
+
+          form <- model$biodiversity[[1]]$equation
+          df <- cbind(model$biodiversity[[1]]$predictors,
+                      data.frame(observed = model$biodiversity[[1]]$observations[,'observed'])
+          )
+          df <- subset(df, select = c(model$biodiversity[[1]]$predictors_names, "observed"))
+          w <- df$w <- model$biodiversity[[1]]$expect # The expected exposure
+          # Get full prediction container
+          full <- model$predictors
+          w_full <- model$exposure
+
+          # Get offset and add it to exposure
+          if(!is.Waiver(model$offset)){
+            # Add offset to full prediction and load vector
+            ofs <- model$biodiversity[[1]]$offset[, 'spatial_offset']
+            ofs_pred <- model$offset[,'spatial_offset']
+          } else { ofs <- NULL; ofs_pred <- NULL }
+
+          # Check priors, e.g. penalty factors
+          p.fac <- rep(1, sum( model$biodiversity[[1]]$predictors_types$type=="numeric") ) # Added plus 1 for the weight?
+          names(p.fac) <- model$biodiversity[[1]]$predictors_names[which(model$biodiversity[[1]]$predictors_types$type=="numeric")]
+          # Then add each factor level if set
+          if(any(model$predictors_types$type=="factor")){
+            fac <- model$biodiversity[[1]]$predictors_names[which(model$biodiversity[[1]]$predictors_types$type=="factor")]
+            p.fac <- c(p.fac, rep(1, length( unique(df[,fac]) ) ))
+          }
+          # Duplicate p.fac container for lower and upper limits
+          lowlim <- rep(-Inf, length(p.fac)) |> stats::setNames(names(p.fac))
+          upplim <- rep(Inf, length(p.fac)) |> stats::setNames(names(p.fac))
+
+          # Trick to create some default lambda values for the regularization multiplier
+          if(is.null(params$lambda)){
+            reg <- default.regularization(p = df$observed, m = stats::model.matrix(form, df)) * c(1, p.fac) # add 1 for the intercept
+            params$lambda <- 10^(seq(4, 0, length.out = 200)) * sum(p.fac)/length(p.fac) * sum(p.fac)/sum(w)
+            if(anyNA(params$lambda)) params$lambda <- NULL
+          }
+
+          if(!is.Waiver(model$priors)){
+            # Reset those contained in the prior object
+            for(v in model$priors$varnames()){
+              p.fac[v] <- model$priors$get(v, what = "value")
+              lowlim[v] <- model$priors$get(v, what = "lims")[1]
+              upplim[v] <- model$priors$get(v, what = "lims")[2]
+            }
+          }
+
+          # Clamp?
+          if( settings$get("clamp") ) full <- clamp_predictions(model, full)
+
+          # -- #
+          # Expand predictors if non-linear is specified in settings
+          if(settings$get('only_linear') == FALSE){
+            if(getOption('ibis.setupmessages')) myLog('[Estimation]','yellow',
+                                                      'Non-linearity to glmnet is best introduced by adding derivates. Ignored!')
+            # linear_predictors <- attr(terms.formula(form), "term.labels")
+            # m <- outer(linear_predictors, linear_predictors, function(x, y) paste(x, y, sep = ":"))
+            #
+            # form <- stats::update.formula(form,
+            #                               paste0(
+            #                                 ". ~ . 
+", + # paste0("I(", linear_predictors,"^2)",collapse = " + "), + # " + ", + # paste0(m[lower.tri(m)], collapse = " + ") + # )) + # # Update penalty factors and limits + # for(var in attr(terms.formula(form), "term.labels")){ + # if(!(var %in% p.fac)){ + # v <- 1 # Take the maximum regularization penalty by default + # vlow <- -Inf; vupp <- Inf + # names(v) <- var; names(vlow) <- var; names(vupp) <- var + # p.fac <- append(p.fac, v) + # lowlim <- append(lowlim, vlow); upplim <- append(upplim, vupp) + # } + # } + } + + assertthat::assert_that( + is.null(w) || length(w) == nrow(df), + is.null(ofs) || is.vector(ofs), + is.null(ofs_pred) || is.vector(ofs_pred), + length(p.fac) == length(lowlim), + all(w >= 0,na.rm = TRUE) + ) + + # --- # + # Determine the optimal lambda through k-fold cross-validation + if(getOption("ibis.runparallel")){ + if(!foreach::getDoParRegistered()) ibis_future(cores = getOption("ibis.nthread"), + strategy = getOption("ibis.futurestrategy")) + } + # Depending if regularized should be set, specify this separately + if( (settings$get('optim_hyperparam')) ){ + if(getOption('ibis.setupmessages')) myLog('[Estimation]','green', + 'Finding optimal hyper parameters alpha and lambda.') + cv_gn <- try({ + glmnetUtils::cva.glmnet(formula = form, + data = df, + alpha = params$alpha, # Elastic net mixing parameter + lambda = params$lambda, # Overwrite lambda + weights = w, # Case weights + offset = ofs, + family = fam, + penalty.factor = p.fac, + # Option for limiting the coefficients + lower.limits = lowlim, + upper.limits = upplim, + standardize = FALSE, # Don't standardize to avoid doing anything to weights + maxit = (10^5)*2, # Increase the maximum number of passes for lambda + parallel = getOption("ibis.runparallel"), + trace.it = settings$get("verbose"), + nfolds = 10 # number of folds for cross-validation + ) + },silent = FALSE) + } else { + cv_gn <- try({ + glmnetUtils::cv.glmnet(formula = form, + data = df, + alpha = params$alpha, # Elastic net mixing parameter + lambda = params$lambda, # Overwrite lambda + weights = w, # Case weights + offset = ofs, + family = fam, + penalty.factor = p.fac, + # Option for limiting the coefficients + lower.limits = lowlim, + upper.limits = upplim, + standardize = FALSE, # Don't standardize to avoid doing anything to weights + maxit = (10^5)*2, # Increase the maximum number of passes for lambda + parallel = getOption("ibis.runparallel"), + trace.it = settings$get("verbose"), + nfolds = 10 # number of folds for cross-validation + ) + },silent = FALSE) + } + if(inherits(cv_gn, "try-error")) stop("Model failed to converge with provided input data!") + + # --- # + # Predict spatially + if(!settings$get('inference_only')){ + # Messenger + if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Starting prediction...') + # Set target variables to bias_value for prediction if specified + if(!is.Waiver(settings$get('bias_variable'))){ + for(i in 1:length(settings$get('bias_variable'))){ + if(settings$get('bias_variable')[i] %notin% names(full)){ + if(getOption('ibis.setupmessages')) myLog('[Estimation]','red','Did not find bias variable in prediction object!') + next() + } + full[[settings$get('bias_variable')[i]]] <- settings$get('bias_value')[i] + } + } + # Make a subset of non-na values + full$rowid <- 1:nrow(full) + full_sub <- subset(full, stats::complete.cases(full)) + w_full_sub <- w_full[full_sub$rowid] + assertthat::assert_that((nrow(full_sub) == length(w_full_sub)) || is.null(w_full_sub) ) + + # Attempt prediction + out <- 
predict(object = cv_gn, + newdata = full_sub, + weights = w_full_sub, + newoffset = ofs_pred[full_sub$rowid], + s = determine_lambda(cv_gn), # Determine the best lambda value + type = params$type + ) + + # Fill output with summaries of the posterior + prediction[full_sub$rowid] <- out[,1] + names(prediction) <- "mean" + prediction <- raster::mask(prediction, self$get_data("template")) + try({rm(out, full, full_sub)},silent = TRUE) + } else { + # No prediction done + prediction <- NULL + } + # Compute end of computation time + settings$set('end.time', Sys.time()) + + # Definition of GLMNET Model object ---- + # Create output + out <- bdproto( + "GLMNET-Model", + DistributionModel, + id = model$id, + model = model, + settings = settings, + fits = list( + "fit_best" = cv_gn, + "fit_best_equation" = form, + "prediction" = prediction + ), + # Partial effects + partial = function(self, x.var = NULL, constant = NULL, variable_length = 100, values = NULL, plot = FALSE, type = NULL, ...){ + assertthat::assert_that(is.character(x.var) || is.null(x.var), + is.null(constant) || is.numeric(constant), + is.null(type) || is.character(type), + is.numeric(variable_length) + ) + check_package("pdp") + # Settings + settings <- self$settings + + mod <- self$get_data('fit_best') + model <- self$model + df <- model$biodiversity[[length( model$biodiversity )]]$predictors + co <- stats::coef(mod) |> row.names() # Get model coefficient names + # Set type + if(is.null(type)) type <- self$settings$get("type") + + # Match x.var to argument + if(is.null(x.var)){ + x.var <- colnames(df) + } else { + x.var <- match.arg(x.var, names(df), several.ok = FALSE) + } + + # Calculate range of predictors + if(any(model$predictors_types$type=="factor")){ + rr <- sapply(df[model$predictors_types$predictors[model$predictors_types$type=="numeric"]], + function(x) range(x, na.rm = TRUE)) |> as.data.frame() + } else { + rr <- sapply(df, function(x) range(x, na.rm = TRUE)) |> as.data.frame() + } + # if values are set, make sure that they cover the data.frame + if(!is.null(values)){ + assertthat::assert_that(length(x.var) == 1) + df2 <- list() + df2[[x.var]] <- values + # Then add the others + for(var in colnames(df)){ + if(var == x.var) next() + df2[[var]] <- mean(df[[var]], na.rm = TRUE) + } + df2 <- df2 |> as.data.frame() + } else { + df2 <- as.data.frame(seq(rr[1,x.var],rr[2,x.var], length.out = variable_length)) + names(df2) <- x.var + } + + # Check that variables are in + assertthat::assert_that(all( x.var %in% colnames(df) ), + msg = 'Variable not in predicted model.') + + pp <- data.frame() + pb <- progress::progress_bar$new(total = length(x.var)) + for(v in x.var){ + p1 <- pdp::partial(mod, pred.var = v, pred.grid = df2, ice = FALSE, center = FALSE, + type = "regression", + plot = FALSE, rug = TRUE, train = df) + p1 <- p1[,c(x.var, "yhat")] + names(p1) <- c("partial_effect", "mean") + p1$variable <- v + pp <- rbind(pp, p1) + if(length(x.var) > 1) pb$tick() + } + + if(plot){ + # Make a plot + g <- ggplot2::ggplot(data = pp, ggplot2::aes(x = partial_effect, y = mean)) + + ggplot2::theme_classic(base_size = 18) + + ggplot2::geom_line() + + ggplot2::labs(x = "", y = expression(hat(y))) + + ggplot2::facet_wrap(~variable,scales = 'free') + print(g) + } + return(pp) + }, + # Spatial partial dependence plot + spartial = function(self, x.var, constant = NULL, plot = TRUE, type = NULL){ + assertthat::assert_that(is.character(x.var), + "model" %in% names(self), + is.null(constant) || is.numeric(constant), + is.logical(plot), + 
is.character(type) || is.null(type) + ) + # Settings + settings <- self$settings + # Set type + if(is.null(type)) type <- self$settings$get("type") + type <- match.arg(type, c("link", "response"), several.ok = FALSE) + settings$set("type", type) + + mod <- self$get_data('fit_best') + model <- self$model + # For Integrated model, take the last one + fam <- model$biodiversity[[length(model$biodiversity)]]$family + df <- model$predictors + df$w <- model$exposure + df$rowid <- 1:nrow(df) + # Match x.var to argument + x.var <- match.arg(x.var, names(df), several.ok = FALSE) + df_sub <- subset(df, stats::complete.cases(df)) + if(!is.Waiver(model$offset)) ofs <- model$offset[df_sub$rowid] else ofs <- NULL + assertthat::assert_that(nrow(df_sub)>0) + + # Add all others as constant + if(is.null(constant)){ + for(n in names(df_sub)) if(!n %in% c(x.var, "rowid", "w")) df_sub[[n]] <- suppressWarnings( mean(model$predictors[[n]], na.rm = TRUE) ) + } else { + for(n in names(df_sub)) if(!n %in% c(x.var, "rowid", "w")) df_sub[[n]] <- constant + } + # Reclassify factor levels + if(any(model$predictors_types$type=="factor")){ + lvl <- levels(model$predictors[[model$predictors_types$predictors[model$predictors_types$type=="factor"]]]) + df_sub[[model$predictors_types$predictors[model$predictors_types$type=="factor"]]] <- + factor(lvl[1], levels = lvl) + } + # Predict with lambda + pred_gn <- predict( + object = mod, + newdata = df_sub, + weights = df_sub$w, # The second entry of unique contains the non-observed variables + newoffset = ofs, + s = determine_lambda(mod), # Determine best available lambda + fam = fam, + type = type + ) |> as.data.frame() + + # Now create spatial prediction + prediction <- emptyraster( self$model$predictors_object$get_data()[[1]] ) # Background + prediction[df_sub$rowid] <- pred_gn[,1] + names(prediction) <- paste0("spartial_",x.var) + + # Do plot and return result + if(plot){ + plot(prediction, col = ibis_colours$viridis_orig) + } + return(prediction) + }, + # Get coefficients from breg + get_coefficients = function(self){ + # Returns a vector of the coefficients with direction/importance + obj <- self$get_data("fit_best") + cofs <- tidy_glmnet_summary(obj) + names(cofs) <- c("Feature", "Beta") + # Remove intercept(s) + int <- grep("Intercept",cofs$Feature,ignore.case = TRUE) + if(length(int)>0) cofs <- cofs[-int,] + return(cofs) + }, + # Engine-specific projection function + project = function(self, newdata, type = NULL, layer = "mean"){ + assertthat::assert_that("model" %in% names(self), + nrow(newdata) > 0, + all( c("x", "y") %in% names(newdata) ), + is.character(type) || is.null(type) + ) + + # Settings + settings <- self$settings + # Set type + if(is.null(type)) type <- self$settings$get("type") + type <- match.arg(type, c("link", "response"), several.ok = FALSE) + settings$set("type", type) + + mod <- self$get_data('fit_best') + model <- self$model + # For Integrated model, take the last one + fam <- model$biodiversity[[length(model$biodiversity)]]$family + + # Clamp? 
+ if( settings$get("clamp") ) newdata <- clamp_predictions(model, newdata) + + # Set target variables to bias_value for prediction if specified + if(!is.Waiver(settings$get('bias_variable'))){ + for(i in 1:length(settings$get('bias_variable'))){ + if(settings$get('bias_variable')[i] %notin% names(newdata)){ + if(getOption('ibis.setupmessages')) myLog('[Estimation]','red','Did not find bias variable in prediction object!') + next() + } + newdata[[settings$get('bias_variable')[i]]] <- settings$get('bias_value')[i] + } + } + + df <- newdata + df$w <- model$exposure # Also get exposure variable + # Make a subset of non-na values + df$rowid <- 1:nrow(df) + df_sub <- subset(df, stats::complete.cases(df)) + if(!is.Waiver(model$offset)) ofs <- model$offset[df_sub$rowid] else ofs <- NULL + assertthat::assert_that(nrow(df_sub)>0) + + pred_gn <- predict( + object = mod, + newdata = df_sub, + weights = df_sub$w, # The second entry of unique contains the non-observed variables + newoffset = ofs, + s = determine_lambda(mod), # Determine best available lambda + fam = fam, + type = type + ) |> as.data.frame() + names(pred_gn) <- layer + + # Now create spatial prediction + prediction <- emptyraster( self$model$predictors_object$get_data()[[1]] ) # Background + prediction[df_sub$rowid] <- pred_gn[,1] + + return(prediction) + } + ) + return(out) + } + ) + ) # End of bdproto object +} # End of function diff --git a/R/engine_inla.R b/R/engine_inla.R index 5f914739..3501a782 100644 --- a/R/engine_inla.R +++ b/R/engine_inla.R @@ -1,1020 +1,1026 @@ -#' @include bdproto-engine.R utils-inla.R bdproto-distributionmodel.R -NULL - -#' Use INLA as engine -#' -#' @description -#' Allows a full Bayesian analysis of linear and additive models using Integrated Nested Laplace approximation. -#' Engine has been largely superceded by the [engine_bru] package and users are advised to us this one, -#' unless specific options are required. -#' -#' @details -#' All \code{INLA} engines require the specification of a mesh that needs to be provided to the -#' \code{"optional_mesh"} parameter. Otherwise the mesh will be created based on best guesses of the -#' data spread. A good mesh needs to have triangles as regular as possible in size and shape: equilateral. -#' -#' [*] \code{"max.edge"}: The largest allowed triangle edge length, must be in the same scale units as the coordinates -#' Lower bounds affect the density of triangles -#' [*] \code{"offset"}: The automatic extension distance of the mesh -#' If positive: same scale units. If negative, interpreted as a factor relative to the approximate data diameter -#' i.e., a value of -0.10 will add a 10% of the data diameter as outer extension. -#' [*] \code{"cutoff"}: The minimum allowed distance between points, -#' it means that points at a closer distance than the supplied value are replaced by a single vertex. -#' it is critical when there are some points very close to each other, either for point locations or in the -#' domain boundary. -#' [*] \code{"proj_stepsize"}: The stepsize for spatial predictions, which affects the spatial grain of any outputs -#' created. -#' -#' Priors can be set via [INLAPrior]. -#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object. -#' @param optional_mesh A directly supplied [`INLA`] mesh (Default: \code{NULL}) -#' @param optional_projstk A directly supplied projection stack. 
Useful if projection stack is identical for multiple species (Default: \code{NULL}) -#' @param max.edge The largest allowed triangle edge length, must be in the same scale units as the coordinates. -#' Default is an educated guess (Default: \code{NULL}). -#' @param offset interpreted as a numeric factor relative to the approximate data diameter. -#' Default is an educated guess (Default: \code{NULL}). -#' @param cutoff The minimum allowed distance between points on the mesh. -#' Default is an educated guess (Default: \code{NULL}). -#' @param proj_stepsize The stepsize in coordinate units between cells of the projection grid (Default: \code{NULL}). -#' @param timeout Specify a timeout for INLA models in sec. Afterwards it passed. -#' @param strategy Which approximation to use for the joint posterior. Options are \code{"auto"} ("default"), \code{"adaptative"}, -#' \code{"gaussian"}, \code{"simplified.laplace"} & \code{"laplace"}. -#' @param int.strategy Integration strategy. Options are \code{"auto"},\code{"grid"}, \code{"eb"} ("default") & \code{"ccd"}. -#' See also https://groups.google.com/g/r-inla-discussion-group/c/hDboQsJ1Mls -#' @param barrier Should a barrier model be added to the model? -#' @param type The mode used for creating posterior predictions. -#' Either summarizing the linear \code{"predictor"} or \code{"response"} (Default: \code{"response"}). -#' @param area Accepts a [`character`] denoting the type of area calculation to be done on the mesh (Default: \code{'gpc2'}). -#' @param nonconvex.bdry Create a non-convex boundary hulls instead (Default: \code{FALSE}) **Not yet implemented** -#' @param nonconvex.convex Non-convex minimal extension radius for convex curvature **Not yet implemented** -#' @param nonconvex.concave Non-convex minimal extension radius for concave curvature **Not yet implemented** -#' @param nonconvex.res Computation resolution for nonconvex.hulls **Not yet implemented** -#' @param ... Other options. -#' @references -#' * Havard Rue, Sara Martino, and Nicholas Chopin (2009), Approximate Bayesian Inference for Latent Gaussian Models Using Integrated Nested Laplace Approximations (with discussion), Journal of the Royal Statistical Society B, 71, 319-392. -#' * Finn Lindgren, Havard Rue, and Johan Lindstrom (2011). An Explicit Link Between Gaussian Fields and Gaussian Markov Random Fields: The Stochastic Partial Differential Equation Approach (with discussion), Journal of the Royal Statistical Society B, 73(4), 423-498. -#' * Simpson, Daniel, Janine B. Illian, S. H. Sørbye, and Håvard Rue. 2016. “Going Off Grid: Computationally Efficient Inference for Log-Gaussian Cox Processes.” Biometrika 1 (103): 49–70. -#' @family engine -#' @name engine_inla -NULL -#' @rdname engine_inla -#' @export -engine_inla <- function(x, - optional_mesh = NULL, - optional_projstk = NULL, - max.edge = NULL, - offset = NULL, - cutoff = NULL, - proj_stepsize = NULL, - timeout = NULL, - strategy = "auto", - int.strategy = "eb", - barrier = FALSE, - type = "response", - area = "gpc2", - # Not yet implemented. - nonconvex.bdry = FALSE, - nonconvex.convex = -0.15, - nonconvex.concave = -0.05, - nonconvex.res = 40, - ...) 
{ - - # Check whether INLA package is available - check_package('INLA') - if(!isNamespaceLoaded("INLA")) { attachNamespace("INLA");requireNamespace('INLA') } - - myLog('[Deprecation]','yellow','Consider using engine_inlabru as engine with better prediction support.') - - # assert that arguments are valid - assertthat::assert_that(inherits(x, "BiodiversityDistribution"), - inherits(x$background,'sf'), - inherits(optional_mesh,'inla.mesh') || is.null(optional_mesh), - is.list(optional_projstk) || is.null(optional_projstk), - is.vector(max.edge) || is.null(max.edge), - (is.vector(offset) || is.numeric(offset)) || is.null(offset), - is.numeric(cutoff) || is.null(cutoff), - is.null(timeout) || is.numeric(timeout), - is.character(type), - is.character(area), - is.character(strategy), - is.character(int.strategy), - is.null(proj_stepsize) || is.numeric(proj_stepsize) - ) - type <- match.arg(type, c("predictor", "response"), several.ok = FALSE) - area <- match.arg(area, c("gpc", "gpc2", "km"), several.ok = FALSE) - # Check strategy settings - strategy <- match.arg(strategy, c("auto", "adaptative", "gaussian", "simplified.laplace", "laplace"), several.ok = FALSE) - int.strategy <- match.arg(int.strategy, c("auto", "grid", "eb", "ccd"), several.ok = FALSE) - - # Set the projection mesh - if(inherits(optional_mesh,'inla.mesh')) { - # Load a provided on - mesh <- optional_mesh - # Convert the study region - region.poly <- as(sf::st_geometry(x$background), "Spatial") - - # Security check for projection and if not set, use the one from background - if(is.null(mesh$crs)) mesh$crs <- sp::CRS( proj4string(region.poly) ) - - # Calculate area - ar <- suppressWarnings( - mesh_area(mesh = mesh, region.poly = region.poly, variant = area) - ) - } else { - mesh <- new_waiver() - ar <- new_waiver() - } - - # If time out is specified - if(!is.null(timeout)) INLA::inla.setOption(fmesher.timeout = timeout) - - # Get barrier from the region polygon - # TODO: Add this in addition to spatial field below, possibly specify an option to calculate this - if(barrier && !is.Waiver(mesh)){ - mesh_bar <- mesh_barrier(mesh, region.poly) - } else { mesh_bar <- new_waiver() } - - # --- # - # Create other parameters object - params <- list( - max.edge = max.edge, - offset = offset, - cutoff = cutoff, - proj_stepsize = proj_stepsize, - type = type, - area = area, - strategy = strategy, - int.strategy = int.strategy, - ... 
- ) - - # Print a message in case there is already an engine object - if(!is.Waiver(x$engine)) myLog('[Setup]','yellow','Replacing currently selected engine.') - - # Set engine in distribution object - x$set_engine( - bdproto( - "INLA-Engine", - Engine, - name = "", - data = list( - 'mesh' = mesh, - 'mesh.area' = ar, - 'mesh.bar' = mesh_bar, - 'stk_pred' = optional_projstk, - 'params' = params - ), - # Function to create a mesh - create_mesh = function(self, model){ - assertthat::assert_that(is.list(model), - "background" %in% names(model)) - # Check if mesh is already present, if so use it - if(!is.Waiver(self$get_data("mesh"))) return() - # Create a new mesh based on the available data - - # Get parameters - params <- self$get_data("params") - - # Convert the study region - region.poly <- as(sf::st_geometry(model$background), "Spatial") - - # Convert to boundary object for later - suppressWarnings( - bdry <- INLA::inla.sp2segment( - sp = region.poly, - join = TRUE, - crs = INLA::inla.CRS(projargs = sp::proj4string(region.poly)) - ) - ) - bdry$loc <- INLA::inla.mesh.map(bdry$loc) - - # Try and infer mesh parameters if not set - - # Get all coordinates of observations - locs <- collect_occurrencepoints(model, include_absences = FALSE) - - assertthat::assert_that( - nrow(locs)>0, - ncol(locs)==2 - ) - - if(is.null(params$max.edge)){ - # A good guess here is usally a max.edge of between 1/3 to 1/5 of the spatial range. - max.edge <- c(diff(range(locs[,1]))/(3*5) , diff(range(locs[,1]))/(3*5) * 2) - params$max.edge <- max.edge - } - if(is.null(params$offset)){ - # Check whether the coordinate system is longlat - if( sf::st_is_longlat(bdry$crs) ){ - # Specify offset as 1/100 of the boundary distance - offset <- c( diff(range(bdry$loc[,1]))*0.01, - diff(range(bdry$loc[,1]))*0.01) - } else { - offset <- c( diff(range(bdry$loc[,1]))*0.01, - diff(range(bdry$loc[,1]))*0.01) - } - params$offset <- offset - } - if(is.null(params$cutoff)){ - # Specify as minimum distance between y coordinates - # Thus capturing most points on this level - # otherwise set to default - val <- min(abs(diff(locs[,2]))) - cutoff <- ifelse(val == 0, 1e-12, val) - params$cutoff <- cutoff - } - - suppressWarnings( - mesh <- INLA::inla.mesh.2d( - # Point localities - loc = locs, - # Boundary object - boundary = bdry, - # Mesh Parameters - max.edge = params$max.edge, - offset = params$offset, - cutoff = params$cutoff, - # Define the CRS - crs = bdry$crs - ) - ) - # Calculate area - # ar <- suppressMessages( - # suppressWarnings( - # mesh_area(mesh = mesh, region.poly = region.poly, variant = params$area) - # ) - # ) - # 06/01/2023: This should work and is identical to inlabru::ipoints - ar <- suppressWarnings( - diag( INLA::inla.mesh.fem(mesh = mesh)[[1]] ) - ) - assertthat::assert_that(length(ar) == mesh$n) - - # Now set the output - self$set_data("mesh", mesh) - self$set_data("mesh.area", ar) - - invisible() - }, - # Generic plotting function for the mesh - plot = function(self, assess = FALSE){ - if(is.Waiver(self$get_data('mesh'))) stop("No mesh found!") - - if(assess){ - # For an INLA mesh assessment - out <- INLA:::inla.mesh.assessment( - mesh = self$get_data('mesh'), - spatial.range = 3, - alpha = 2, - dims = c(300, 300) - ) - # Convert to raster stack - out <- raster::stack( - sp::SpatialPixelsDataFrame( sp::coordinates(out), data = as.data.frame(out), - proj4string = self$get_data('mesh')$crs ) - ) - - raster::plot(out[[c('sd','sd.dev','edge.len')]], - col = 
c("#00204D","#00336F","#39486B","#575C6D","#707173","#8A8779","#A69D75","#C4B56C","#E4CF5B","#FFEA46") - ) - } else { - INLA:::plot.inla.mesh( self$get_data('mesh') ) - } - }, - # Spatial latent function - # https://groups.google.com/g/r-inla-discussion-group/c/eqMhlbwChkQ/m/m0b0PuzL-PsJ - # Default SPDE prior - # It computes the approximate diameter of the mesh, multiplies by 0.2 to get a value for the prior median range, and then transforms it to log-kappa scale by the formula - # log(sqrt(8*nu)/range) where nu is alpha-dim/2. - calc_latent_spatial = function(self,type = 'spde', alpha = 2, - priors = NULL, - polynames = NULL, - varname = "spatial.field1", - ...){ - # Catch prior objects - if(is.null(priors) || is.Waiver(priors)) priors <- NULL - - # For calculating iCAR process - if(type == 'car'){ - # convert mesh to sf object - ns <- mesh_as_sf(self$data$mesh) - # Create adjacency matrix with queen's case - nc.nb <- spdep::poly2nb(ns, queen = TRUE) - #Convert the adjacency matrix into a file in the INLA format - adjmat <- spdep::nb2mat(nc.nb,style = "B") - adjmat <- as(adjmat, "dgTMatrix") - # adjmat <- INLA::inla.graph2matrix(nc.nb) - # Save the adjaceny matrix as output - self$data$latentspatial <- adjmat - self$data$s.index <- as.numeric(attr(nc.nb,varname)) - } else if(type=='spde'){ - # Check that everything is correctly specified - if(!is.null(priors)) if('spde' %notin% priors$varnames() ) priors <- NULL - - # Use default spde - if(is.null(priors) || is.Waiver(priors)){ - # Define PC Matern SPDE model and save - self$data$latentspatial <- INLA::inla.spde2.matern( - mesh = self$data$mesh, - alpha = alpha - ) - } else { - # Get priors - pr <- if(is.null(priors)) c(0.01, 0.05) else priors$get('spde','prior.range') - ps <- if(is.null(priors)) c(10, 0.05) else priors$get('spde','prior.sigma') - - # Define PC Matern SPDE model and save - self$data$latentspatial <- INLA::inla.spde2.pcmatern( - mesh = self$data$mesh, - alpha = alpha, - # P(Range < 1°) = 0.001 and P(sigma > 0.5) = 0.05 - prior.range = pr, prior.sigma = ps - ) - } - # Make index for spatial field - self$data$s.index <- INLA::inla.spde.make.index(name = varname, - n.spde = self$data$latentspatial$n.spde, - n.group = 1, - n.repl = 1) - # Security checks - assertthat::assert_that( - inherits(self$data$latentspatial,'inla.spde'), - length(self$data$s.index[[1]]) == self$data$mesh$n - ) - } else if(type == 'poly'){ - # Save column names of polynomial transformed coordinates - assertthat::assert_that(!is.null(polynames)) - self$data$latentspatial <- polynames - } - invisible() - }, - # Get latent spatial equation bit - # Set vars to 2 or larger to get copied spde's - get_equation_latent_spatial = function(self, method, vars = 1, separate_spde = FALSE){ - assertthat::assert_that(is.numeric(vars)) - if(method == 'spde'){ - assertthat::assert_that(inherits(self$data$latentspatial, 'inla.spde'), - msg = 'Latent spatial has not been calculated.') - # SPDE string - if(separate_spde){ - ss <- paste0("f(spatial.field",vars,", model = ",method,")") - } else { - if(vars >1){ - ss <- paste0("f(spatial.field",vars,", copy = \'spatial.field1\', model = ",method,", fixed = TRUE)") - } else { - ss <- paste0("f(spatial.field",vars,", model = ",method,")") - } - } - return(ss) - - } else if(method == 'car'){ - assertthat::assert_that(inherits(self$data$latentspatial,'dgTMatrix'), - msg = 'Neighborhood matrix has not been calculated.') - return( - # BESAG model or BYM model to specify - # BYM found to be largely similar to SPDE 
https://onlinelibrary.wiley.com/doi/pdf/10.1002/ece3.3081 - paste0('f(','spatial.field',', model = "bym", graph = ','adjmat',')') - ) - } - }, - # Configure stack - make_stack = function(self, model, id, intercept = TRUE, joint = FALSE) { - assertthat::assert_that( - is.list(model), - is.character(id) - ) - # Get Environment records - env <- model$predictors - - # Include intercept in here - # TODO: Note that this sets intercepts by type and not by dataset id - if(intercept) { - env$Intercept <- 1 # Overall Intercept - env[[paste0('Intercept', - ifelse(joint,paste0('_', - make.names(tolower(model$name)),'_', - model$type),''))]] <- 1 # Setting Intercept to common type, thus sharing with similar types - } - # Set up projection matrix for the data - suppressWarnings( - mat_proj <- INLA::inla.spde.make.A( - mesh = self$get_data('mesh'), - loc = as.matrix(env[,c('x','y')]) - ) - ) - # Create INLA stack - # The three main inla.stack() arguments are a vector list with the data (data), - # a list of projector matrices (each related to one block effect, - # A) and the list of effects (effects). - - # Response for inla stack - ll_resp <- list() - # Add the expected estimate and observed note - # FIXME: Currently only two likelihoods are supported (binomial/poisson) with the NA order being the determining factor - if(model$family == 'poisson') { - if(joint) ll_resp[[ 'observed' ]] <- cbind(model$observations[['observed']], NA ) - if(!joint) ll_resp[[ 'observed' ]] <- cbind(model$observations[['observed']] ) - ll_resp[[ 'e' ]] <- model$expect - } - if(model$family == 'binomial') { - if(joint) ll_resp[[ 'observed' ]] <- cbind(NA, model$observations[['observed']] ) - if(!joint) ll_resp[[ 'observed' ]] <- cbind( model$observations[['observed']] ) - ll_resp[[ 'Ntrials' ]] <- model$expect - } - - # Effects matrix - ll_effects <- list() - # Note, order adding this is important and matches the A matrix below - # ll_effects[['Intercept']] <- rep(1, nrow(model$observations)) - # ll_effects[['Intercept']][[paste0('Intercept',ifelse(joint,paste0('_',make.names(tolower(model$name)),'_',model$type),''))]] <- seq(1, self$get_data('mesh')$n) # Old code - ll_effects[['predictors']] <- env - ll_effects[['spatial.field1']] <- seq(1, self$get_data('mesh')$n) - - # Add offset if specified - if(!is.null(model$offset)){ - ll_effects[['predictors']] <- cbind( ll_effects[['predictors']], - subset(model[['offset']],select = "spatial_offset") - ) - } - - # Check whether equation has spatial field and otherwise add - # MJ 13/06: Spatial.field now set directly to effects - # if( 'spde' %in% all.vars(model$equation) ){ - # # Get Index Objects - # iset <- self$get_data('s.index') - # ll_effects[['spatial.field']] <- c(ll_effects[['spatial.field']], iset) - # } else if ( 'adjmat' %in% all.vars(model$equation) ){ - # iset <- self$get_data('s.index') - # ll_effects[['spatial.field']] <- c(ll_effects[['spatial.field']], data.frame(spatial.index = iset) ) - # } - # Define A - A <- list(1, mat_proj) - - # Define stack - stk <- INLA::inla.stack( - data = ll_resp, - A = A, - effects = ll_effects, - tag = paste0('stk_',as.character(model$type),'_',id) - ) - # Set the stack - self$set_data(paste0('stk_',as.character(model$type),'_',id), stk) - invisible() - }, - # Main INLA training function ---- - # Setup computation function - setup = function(self, model, settings,...){ - assertthat::assert_that( - 'background' %in% names(model), - 'biodiversity' %in% names(model), - all( model$biodiversity[[1]]$predictors_names %in% 
model$predictors_names ), - all( sapply(model$biodiversity, function(x) is.formula(x$equation)) ), - length(model$biodiversity)>=1, - msg = 'Some internal checks failed while setting up the model.' - ) - # Messager - if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Engine setup.') - - # Set number of threads via set.Options - INLA::inla.setOption(num.threads = getOption('ibis.nthread'), - blas.num.threads = getOption('ibis.nthread')) - - # --- Prepare general inputs --- - # Check whether spatial latent effects were added - if( 'spde' %in% all.vars(model$biodiversity[[1]]$equation) ){ - # Get spatial index - spde <- self$get_data('s.index') - } else { spde <- NULL } - - # Check for existence of specified offset and use the full one in this case - if(!is.Waiver(model$offset)) offset <- subset(model[['offset']],select = "spatial_offset") else offset <- NULL - - # Projection stepsize - params <- self$get_data('params') - if(is.null( params$proj_stepsize )){ - # Set to stepsize equivalent of the resolution of the grid - val <- max(diff(model[['predictors']]$x)) # TODO: Check that it works when dummy variable is used - params$proj_stepsize <- val - self$set_data('params', params ) - rm(val) - } - - # Number of types to determine if a joint model is necessary - nty <- length( unique( as.character(sapply(model$biodiversity, function(z) z$type)) ) ) - - # Clean up previous data and integration stacks - chk <- grep('stk_int|stk_poipo|stk_poipa|stk_polpo|stk_polpa|stk_pred|stk_full', self$list_data()) - if(length(chk)>0) self$data[chk] <- NULL - - # Re-format the full predictors if there are any factor variables - # FIXME: Potentially outsource? - if(any(model$predictors_types$type=="factor")){ - vf <- model$predictors_types$predictors[model$predictors_types$type=="factor"] - for(k in vf){ - o <- explode_factor(model$predictors[[k]],name = k) - model$predictors <- cbind(model$predictors, o) - model$predictors_names <- c(model$predictors_names, colnames(o)) - model$predictors_types <- rbind(model$predictors_types, - data.frame(predictors = colnames(o), type = "numeric") ) - # Finally remove the original column from the predictor object - model$predictors[[k]] <- NULL - model$predictors_names <- model$predictors_names[-which( model$predictors_names == k )] - model$predictors_types <- subset(model$predictors_types, subset = predictors != k) - # FIXME: Hacky solution as to not overwrite predictor object - ras_back <- model$predictors_object$data - # Explode the columns in the raster object - model$predictors_object$data <- raster::addLayer( - model$predictors_object$data, - explode_factorized_raster(model$predictors_object$data[[k]]) - ) - model$predictors_object$data <- raster::dropLayer(model$predictors_object$data, k) - } - } else { ras_back <- new_waiver() } - - # Now for each dataset create a INLA stack - for(id in 1:length(model$biodiversity) ){ - - # If there any factor variables split them per type and explode them - if(any(model$biodiversity[[id]]$predictors_types$type=="factor")){ - vf <- model$biodiversity[[id]]$predictors_types$predictors[model$biodiversity[[id]]$predictors_types$type=="factor"] - fv <- model$biodiversity[[id]]$predictors[vf] - for(k in 1:ncol(fv)){ - o <- explode_factor(fv[,k],name = colnames(fv)[k]) - # Add - model$biodiversity[[id]]$predictors <- cbind(model$biodiversity[[id]]$predictors, o) - model$biodiversity[[id]]$predictors_names <- c(model$biodiversity[[id]]$predictors_names, colnames(o)) - model$biodiversity[[id]]$predictors_types <- 
rbind(model$biodiversity[[id]]$predictors_types, - data.frame(predictors = colnames(o), type = "numeric") ) - # Finally remove the original column from the predictor object - model$biodiversity[[id]]$predictors[[colnames(fv)[k]]] <- NULL - model$biodiversity[[id]]$predictors_names <- model$biodiversity[[id]]$predictors_names[-which( model$biodiversity[[id]]$predictors_names == colnames(fv)[k] )] - model$biodiversity[[id]]$predictors_types <- subset(model$biodiversity[[id]]$predictors_types, subset = predictors != colnames(fv)[k]) - - } - } - # Calculate observation stack INLA stack - # Save stacks by id instead of type - self$make_stack(model = model$biodiversity[[id]], - id = names(model$biodiversity)[id], - intercept = TRUE, - joint = ifelse(nty > 1, TRUE, FALSE) - ) - - # Define mesh.area dependent on whether a single variable only is used or not - if(model$biodiversity[[id]]$family == 'poisson'){ - # Only create on if not already existing - chk <- grep('stk_int', self$list_data()) - if(length(chk)==0){ - # Make integration stack for given poisson model - stk_int <- inla_make_integration_stack( - mesh = self$get_data('mesh'), - mesh.area = self$get_data('mesh.area'), - model = model, - id = names(model$biodiversity)[id], - joint = ifelse(nty > 1, TRUE, FALSE) - ) - # Save integration stack - self$set_data(paste0('stk_int_',names(model$biodiversity)[id]),stk_int) - } - } - } - - # ------------------ # - # Get all stacks defined so far and join them - stk_inference <- lapply( - self$list_data()[grep(paste(names(model$biodiversity),collapse = '|'), self$list_data())], - function(x) self$get_data(x) - ) - stk_inference <- do.call(INLA::inla.stack, stk_inference) - - # Clamp? - if( settings$get("clamp") ) model$predictors <- clamp_predictions(model, model$predictors) - - # Make projection stack if not directly supplied - if(is.null(self$data$stk_pred)){ - - stk_pred <- inla_make_projection_stack( - stk_resp = stk_inference, - model = model, - mesh = self$get_data('mesh'), - mesh.area = self$get_data('mesh.area'), - res = self$get_data('params')$proj_stepsize, - type = model$biodiversity[[id]]$type, - spde = spde, - settings = settings, - joint = ifelse(nty > 1, TRUE, FALSE) - ) - self$set_data('stk_pred', stk_pred) - } else { - # FIXME: Add some basic assertthat tests for when a prediction stack is directly supplied - stk_pred <- self$get_data('stk_pred') - } - - # Now join all stacks and save in full - # Note: If integrated stack is included, E must be set to relative area (in mesh.area). - self$set_data('stk_full', - INLA::inla.stack(stk_inference, stk_pred$stk_proj) - ) - if(!is.Waiver(ras_back)) model$predictors_object$data # Overwrite model object back to avoid issues with other engines. Hacky! - return(model) - }, - train = function(self, model, settings) { - # Check that all inputs are there - assertthat::assert_that( - inherits(settings,'Settings'), - is.list(model),length(model)>1, - # Check that model id and setting id are identical - settings$modelid == model$id, - any( (c('stk_full','stk_pred') %in% names(self$data)) ), - inherits(self$get_data('stk_full'),'inla.data.stack') - ) - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Starting fitting...') - - # Get all datasets with id. 
This includes the data stacks and integration stacks - stk_inference <- lapply( - self$list_data()[grep(paste(names(model$biodiversity),collapse = '|'), self$list_data())], - function(x) self$get_data(x)) - stk_inference <- do.call(INLA::inla.stack, stk_inference) - - # Get full stack and projection grid - stk_full <- self$get_data('stk_full') - predcoords <- self$get_data('stk_pred')$predcoords - - # Get parameters - params <- self$get_data("params") - - # Get families and links - fam <- unique( as.character( sapply(model$biodiversity, function(x) x$family) ) ) - lin <- sapply(model$biodiversity, function(x) x$link) - # Define control family - cf <- list() - for(i in 1:length(fam)) cf[[i]] <- list(link = ifelse(fam[i] == 'poisson','log','cloglog' )) - if(length(fam)==1 && fam == 'binomial') cf[[1]]$link <- 'logit' - - # Shared link? Set to - if(length(fam)==1) {li <- 1} else { li <- NULL} # FIXME: Check whether links have to be set individually per observation - - if('spde' %in% all.vars(model$biodiversity[[1]]$equation) ){ - spde <- self$get_data('latentspatial') - stack_data_resp <- INLA::inla.stack.data(stk_inference, spde = self$get_data('latentspatial')) - stack_data_full <- INLA::inla.stack.data(stk_full, spde = self$get_data('latentspatial')) - } else { - adjmat <- spde <- self$get_data('latentspatial') - stack_data_resp <- INLA::inla.stack.data(stk_inference) - stack_data_full <- INLA::inla.stack.data(stk_full) - } - # ----------- # - # Provided or default formula - master_form <- as.formula( - paste0("observed ~ ", - # # If multiple datasets, remove intercept - ifelse(length(model$biodiversity)>1,"0 + ", ""), - paste0(sapply(model$biodiversity, function(x){ - attr(terms.formula(x$equation),"term.labels") - }) %>% c %>% unique(),collapse = " + ") - ) - ) - - # Perform variable selection - if( settings$get(what='varsel') == "reg"){ - if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Performing variable selection...') - - k <- NULL - # Specify offsets and spde to be retained - # FIXME: Also set priors and offsets here? - if(is.Waiver(spde)) k <- NULL else k <- self$get_equation_latent_spatial('spde') - - # Use backward variable elimination - vs <- inla.backstep(master_form = master_form, - stack_data_resp = stack_data_resp, - stk_inference = stk_inference, - fam = fam, - cf = cf,li = li, - response = 'observed', - keep = k - ) - master_form <- to_formula(vs$form) - } - - # ------------------------------------------ # - # Train the model on the response - fit_resp <- INLA::inla(formula = master_form, # The specified formula - data = stack_data_resp, # The data stack - quantiles = c(0.05, 0.5, 0.95), - E = INLA::inla.stack.data(stk_inference)$e, # Expectation (Eta) for Poisson model - Ntrials = INLA::inla.stack.data(stk_inference)$Ntrials, - family = fam, # Family the data comes from - control.family = cf, # Control options - control.predictor = list(A = INLA::inla.stack.A(stk_inference), - link = li, # Link to NULL for multiple likelihoods! - compute = TRUE), # Compute for marginals of the predictors. 
- control.compute = list(cpo = FALSE, waic = TRUE, config = TRUE), #model diagnostics and config = TRUE gives you the GMRF - # control.fixed = list(mean = 0),# prec = list( initial = log(0.000001), fixed = TRUE)), # Added to see whether this changes GMRFlib convergence issues - verbose = settings$get(what='verbose'), # To see the log of the model runs - control.inla = INLA::control.inla(strategy = params$strategy, - int.strategy = params$int.strategy), - num.threads = getOption('ibis.nthread') - ) - - # Predict spatially - if(!settings$get(what='inference_only')){ - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Starting prediction...') - - # Predict on full - fit_pred <- try({INLA::inla(formula = master_form, # The specified formula - data = stack_data_full, # The data stack - quantiles = c(0.05, 0.5, 0.95), - E = INLA::inla.stack.data(stk_full)$e, - Ntrials = INLA::inla.stack.data(stk_full)$Ntrials, - family= fam, # Family the data comes from - control.family = cf, # Control options - control.predictor = list(A = INLA::inla.stack.A(stk_full), - link = li, # Link to NULL for multiple likelihoods! - compute = TRUE), # Compute for marginals of the predictors. - control.compute = list(cpo = FALSE, waic = TRUE, config = TRUE, openmp.strategy = 'huge' ), - # control.mode = list(theta = thetas, restart = FALSE), # To speed up use previous thetas - verbose = settings$get(what='verbose'), # To see the log of the model runs - # control.results = list(return.marginals.random = FALSE, - # return.marginals.predictor = FALSE), # Don't predict marginals to save speed - # control.fixed = INLA::control.fixed(mean = 0),#, prec = list( initial = log(0.000001), fixed = TRUE)), # Added to see whether this changes GMRFlib convergence issues - control.inla = INLA::control.inla(strategy = params$strategy, - int.strategy = params$int.strategy), - num.threads = getOption('ibis.nthread') - ) - },silent = FALSE) - if(inherits(fit_pred,'try-error')) { print(fit_pred); stop('Model did not converge. Try to simplify structure and check priors!') } - # Create a spatial prediction - index.pred <- INLA::inla.stack.index(stk_full, 'stk_pred')$data - # Which type of prediction (linear predictor or response scale) - # The difference between both is that response applies the (inverse of the) link function, - # so it doesn't include the observation distribution part (measurement noise) of posterior predictions. - if(params$type == "predictor"){ - post <- fit_pred$summary.linear.predictor[index.pred, ] - } else { - post <- fit_pred$summary.fitted.values[index.pred, ] - } - assertthat::assert_that(nrow(post)>0, - nrow(post) == nrow(predcoords) ) # Check with cells in projection - # Back-transform for predictor - if(params$type == "predictor"){ - if(length(fam)==1){ - if(fam == 'poisson') post[,c('mean','0.05quant','0.5quant','0.95quant','mode')] <- exp( post[,c('mean','0.05quant','0.5quant','0.95quant','mode')] ) - if(fam == 'binomial') post[,c('mean','0.05quant','0.5quant','0.95quant','mode')] <- logistic(post[,c('mean','0.05quant','0.5quant','0.95quant','mode')]) - } else { - # Joint likelihood of Poisson log and binomial cloglog following Simpson et al. 
- post[,c('mean','0.05quant','0.5quant','0.95quant','mode')] <- exp( post[,c('mean','0.05quant','0.5quant','0.95quant','mode')] ) - } - } - post <- subset(post, select = c('mean','sd','0.05quant','0.5quant','0.95quant','mode') ) - post$cv <- post$sd / post$mean - # Rename - names(post) <- c("mean", "sd", "q05", "q50", "q95", "mode","cv") - - # Fill prediction - suppressWarnings( - prediction <- raster::stack( - sp::SpatialPixelsDataFrame( - points = predcoords, - data = post, - proj4string = sp::CRS( self$get_data('mesh')$crs@projargs ) # x$engine$data$mesh$crs@projargs - ) - ) - ) - prediction <- raster::mask(prediction, model$background) # Mask with background - # Align with background - temp <- raster::raster( - sp::SpatialPixelsDataFrame( - points = model$predictors[,c('x','y')], - data = model$predictors[,c('x','y')], - proj4string = sp::CRS( self$get_data('mesh')$crs@projargs ) # x$engine$data$mesh$crs@projargs - ) - ) - prediction <- raster::resample(prediction, temp, method = 'bilinear') - - } else { - # No prediction to be conducted - fit_pred <- NULL - prediction <- NULL - } - - # Compute end of computation time - settings$set('end.time', Sys.time()) - - # Definition of INLA Model object ---- - out <- bdproto( - "INLA-Model", - DistributionModel, - id = model$id, - model = model, - settings = settings, - fits = list( - "fit_best" = fit_resp, - "fit_pred" = fit_pred, - "fit_best_equation" = master_form, - "mesh" = self$get_data('mesh'), - "spde" = self$get_data('latentspatial'), - "prediction" = prediction - ), - # Projection function - project = function(self, newdata, mode = 'coef', backtransf = NULL, layer = "mean"){ - assertthat::assert_that('fit_best' %in% names(self$fits), - is.data.frame(newdata) || is.matrix(newdata), - mode %in% c('coef','sim','full'), - assertthat::has_name(newdata,c('x','y')) - ) - stop("Projection using engine INLA is deprecated. Use engine_inlabru !") - - # Try and guess backtransformation - if(is.null(backtransf)){ - fam <- self$get_data('fit_best')$.args$family - backtransf <- ifelse(fam == 'poisson', exp, logistic) - } - - if(mode == 'coef'){ - # We use the coefficient prediction - out <- coef_prediction(mesh = self$get_data('mesh'), - mod = self, - type = 'mean', - backtransf = backtransf - ) - } else if(mode == 'sim'){ - # Simulate from posterior. Not yet coded - stop('Simulation from posterior not yet implemented. Use inlabru instead!') - } else { - stop('Full prediction not yet added.') - } - # Return result - return(out) - }, - # Partial response - # FIXME: Create external function - partial = function(self, x, x.var, constant = NULL, variable_length = 100, values = NULL, plot = FALSE, type = "response"){ - # Goal is to create a sequence of value and constant and append to existing stack - # Alternative is to create a model-matrix through INLA::inla.make.lincomb() and - # model.matrix(~ vars, data = newDummydata) fed to make.lincomb - # provide via lincomb = M to an INLA call. - # Both should be identical - stop("Partial function not implemented. 
Consider using inlabru instead!") - # Check that provided model exists and variable exist in model - mod <- self$get_data('fit_best') - assertthat::assert_that(inherits(mod,'inla'), - 'model' %in% names(self), - inherits(x,'BiodiversityDistribution'), - length(x.var) == 1, is.character(x.var), - is.null(constant) || is.numeric(constant) - ) - varn <- mod$names.fixed - variable <- match.arg(x.var, varn, several.ok = FALSE) - assertthat::assert_that(variable %in% varn, length(variable)==1,!is.null(variable)) - - # ------------------ # - # Get all datasets with id in model. This includes the data stacks and integration stacks - stk_inference <- lapply( - x$engine$list_data()[grep(paste(names(model$biodiversity),collapse = '|'), x$engine$list_data())], - function(z) x$engine$get_data(z)) - stk_inference <- do.call(INLA::inla.stack, stk_inference) - # FIXME: Test that this works with SPDE present - stack_data_resp <- INLA::inla.stack.data(stk_inference) - # ------------------ # - - # If constant is null, calculate average across other values - if(is.null(constant)){ - constant <- lapply(stack_data_resp, function(x) mean(x,na.rm = T))[varn[varn %notin% variable]] - } - # For target variable calculate range - variable_range <- range(stack_data_resp[[variable]],na.rm = TRUE) - - # Create dummy data.frame - dummy <- data.frame(observed = rep(NA, variable_length)) - - seq(variable_range[1],variable_range[2],length.out = variable_length) - - # # add sequence of data and na to data.frame. predict those - # control.predictor = list(A = INLA::inla.stack.A(stk_full), - # link = li, # Link to NULL for multiple likelihoods! - # compute = TRUE), # Compute for marginals of the predictors. - - print('Refitting model for partial effect') - ufit <- INLA::inla(formula = as.formula(mod$.args$formula), # The specified formula - data = stk_inference, # The data stack - quantiles = c(0.05, 0.5, 0.95), - E = INLA::inla.stack.data(stk_inference)$e, # Expectation (Eta) for Poisson model - Ntrials = INLA::inla.stack.data(stk_inference)$Ntrials, - family = mod$.args$family, # Family the data comes from - control.family = mod$.args$control.family, # Control options - control.predictor = mod$.args$control.predictor, # Compute for marginals of the predictors. 
- control.compute = mod$.args$control.compute, - control.fixed = mod$.args$control.fixed, - verbose = FALSE, # To see the log of the model runs - control.inla = mod$.args$control.inla, - num.threads = getOption('ibis.nthread') - ) - control.predictor = list(A = INLA::inla.stack.A(stk_inference)) - - # Plot and return result - }, - # Get coefficients - get_coefficients = function(self){ - # Returns a vector of the coefficients with direction/importance - cofs <- self$summary() - cofs <- subset(cofs, select = c("variable", "mean", "sd")) - names(cofs) <- c("Feature", "Beta", "Sigma") - # Remove intercept(s) - int <- grep("Intercept",cofs$Feature,ignore.case = TRUE) - if(length(int)>0) cofs <- cofs[-int,] - return(cofs) - }, - # Function to plot SPDE if existing - plot_spatial = function(self, dim = c(300,300), kappa_cor = FALSE, what = "spatial.field1", ...){ - assertthat::assert_that(is.vector(dim), - is.character(what)) - - if( length( self$fits$fit_best$size.spde2.blc ) == 1) - { - # Get spatial projections from model - # FIXME: Potentially make the plotting of this more flexible - gproj <- INLA::inla.mesh.projector(self$get_data('mesh'), dims = dim) - g.mean <- INLA::inla.mesh.project(gproj, - self$get_data('fit_best')$summary.random[[what]]$mean) - g.sd <- INLA::inla.mesh.project(gproj, self$get_data('fit_best')$summary.random[[what]]$sd) - - # Convert to rasters - g.mean <- t(g.mean) - g.mean <- g.mean[rev(1:length(g.mean[,1])),] - r.m <- raster::raster(g.mean, - xmn = range(gproj$x)[1], xmx = range(gproj$x)[2], - ymn = range(gproj$y)[1], ymx = range(gproj$y)[2], - crs = self$get_data('mesh')$crs - ) - g.sd <- t(g.sd) - g.sd <- g.sd[rev(1:length(g.sd[,1])),] - r.sd <- raster::raster(g.sd, - xmn = range(gproj$x)[1], xmx = range(gproj$x)[2], - ymn = range(gproj$y)[1], ymx = range(gproj$y)[2], - crs = self$get_data('mesh')$crs - ) - - spatial_field <- raster::stack(r.m, r.sd);names(spatial_field) <- c('SPDE_mean','SPDE_sd') - # Mask with prediction if exists - if(!is.null(self$get_data('prediction'))){ - spatial_field <- raster::resample(spatial_field, self$get_data('prediction')[[1]]) - spatial_field <- raster::mask(spatial_field, self$get_data('prediction')[[1]]) - } - - # -- # - if(kappa_cor){ - # Also build correlation fun - # Get SPDE results - spde_results <- INLA::inla.spde2.result( - inla = self$get_data('fit_best'), - name = what, - spde = self$get_data('spde'), - do.transfer = TRUE) - - # Large kappa (inverse range) equals a quick parameter change in space. - # Small kappa parameter have much longer, slower gradients. - Kappa <- INLA::inla.emarginal(function(x) x, spde_results$marginals.kappa[[1]]) - sigmau <- INLA::inla.emarginal(function(x) sqrt(x), spde_results$marginals.variance.nominal[[1]]) - r <- INLA::inla.emarginal(function(x) x, spde_results$marginals.range.nominal[[1]]) - - # Get Mesh and distance between points - mesh <- self$get_data('mesh') - D <- as.matrix( dist(mesh$loc[, 1:2]) ) - - # Distance vector. 
-      dis.cor <- data.frame(distance = seq(0, max(D), length = 100))
-      # Maximum distance by quarter of extent
-      dis.max <- abs((xmin(self$get_data('prediction')) - xmax(self$get_data('prediction')) ) / 2) # Take a quarter of the max distance
-
-      # Modified Bessel function to get correlation strength
-      dis.cor$cor <- as.numeric((Kappa * dis.cor$distance) * base::besselK(Kappa * dis.cor$distance, 1))
-      dis.cor$cor[1] <- 1
-      # ---
-      # Build plot
-      layout(matrix(c(1,1,2,3), 2, 2, byrow = TRUE))
-      plot(dis.cor$cor ~ dis.cor$distance, type = 'l', lwd = 3,
-           xlab = 'Distance (proj. unit)', ylab = 'Correlation', main = paste0('Kappa: ', round(Kappa,2) ) )
-      abline(v = dis.max,lty = 'dotted')
-      plot(spatial_field[[1]],col = ibis_colours[['viridis_cividis']], main = 'mean spatial effect')
-      plot(spatial_field[[2]], main = 'sd spatial effect')
-    } else {
-      # Just plot the SPDE
-      par(mfrow=c(1,2))
-      plot(spatial_field[[1]],col = ibis_colours[['viridis_cividis']], main = 'mean spatial effect')
-      plot(spatial_field[[2]], main = 'sd spatial effect')
-      # And return
-      return(spatial_field)
-    }
-
-
-  } else {
-    message(text_red('No spatial covariance in model specified.'))
-  }
-  }
-  )
-  return(out)
-  }
-  ))
-}
+#' @include bdproto-engine.R utils-inla.R bdproto-distributionmodel.R
+NULL
+
+#' Use INLA as engine
+#'
+#' @description
+#' Allows a full Bayesian analysis of linear and additive models using Integrated Nested Laplace Approximation.
+#' This engine has been largely superseded by [engine_inlabru] and users are advised to use the latter,
+#' unless specific options are required.
+#'
+#' @details
+#' All \code{INLA} engines require the specification of a mesh that needs to be provided to the
+#' \code{"optional_mesh"} parameter. Otherwise the mesh will be created based on best guesses of the
+#' data spread. A good mesh needs to have triangles as regular as possible in size and shape: equilateral.
+#'
+#' [*] \code{"max.edge"}: The largest allowed triangle edge length; must be in the same scale units as the coordinates.
+#' Lower bounds affect the density of triangles.
+#' [*] \code{"offset"}: The automatic extension distance of the mesh.
+#' If positive, in the same scale units. If negative, interpreted as a factor relative to the approximate data diameter,
+#' i.e., a value of -0.10 will add 10% of the data diameter as outer extension.
+#' [*] \code{"cutoff"}: The minimum allowed distance between points.
+#' Points at a closer distance than the supplied value are replaced by a single vertex. This
+#' is critical when some points lie very close to each other, either as point locations or in the
+#' domain boundary.
+#' [*] \code{"proj_stepsize"}: The stepsize for spatial predictions, which affects the spatial grain of any outputs
+#' created.
+#'
+#' Priors can be set via [INLAPrior].
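+#' As a worked illustration of the relative offset described above (assumed values):
+#' \code{offset = c(-0.05, -0.10)} would extend the inner and outer mesh boundary by
+#' 5% and 10% of the approximate data diameter, respectively.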
+#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object.
+#' @param optional_mesh A directly supplied [`INLA`] mesh (Default: \code{NULL})
+#' @param optional_projstk A directly supplied projection stack. Useful if the projection stack is identical for multiple species (Default: \code{NULL})
+#' @param max.edge The largest allowed triangle edge length, must be in the same scale units as the coordinates.
+#' Default is an educated guess (Default: \code{NULL}).
+#' @param offset interpreted as a numeric factor relative to the approximate data diameter.
+#' Default is an educated guess (Default: \code{NULL}).
+#' @param cutoff The minimum allowed distance between points on the mesh.
+#' Default is an educated guess (Default: \code{NULL}).
+#' @param proj_stepsize The stepsize in coordinate units between cells of the projection grid (Default: \code{NULL}).
+#' @param timeout Specify a timeout for INLA models in seconds, after which the computation is interrupted.
+#' @param strategy Which approximation to use for the joint posterior. Options are \code{"auto"} ("default"), \code{"adaptative"},
+#' \code{"gaussian"}, \code{"simplified.laplace"} & \code{"laplace"}.
+#' @param int.strategy Integration strategy. Options are \code{"auto"}, \code{"grid"}, \code{"eb"} ("default") & \code{"ccd"}.
+#' See also https://groups.google.com/g/r-inla-discussion-group/c/hDboQsJ1Mls
+#' @param barrier Should a barrier model be added to the model?
+#' @param type The mode used for creating posterior predictions.
+#' Either summarizing the linear \code{"predictor"} or \code{"response"} (Default: \code{"response"}).
+#' @param area Accepts a [`character`] denoting the type of area calculation to be done on the mesh (Default: \code{'gpc2'}).
+#' @param nonconvex.bdry Create a non-convex boundary hull instead (Default: \code{FALSE}) **Not yet implemented**
+#' @param nonconvex.convex Non-convex minimal extension radius for convex curvature **Not yet implemented**
+#' @param nonconvex.concave Non-convex minimal extension radius for concave curvature **Not yet implemented**
+#' @param nonconvex.res Computation resolution for nonconvex.hulls **Not yet implemented**
+#' @param ... Other options.
+#' @references
+#' * Havard Rue, Sara Martino, and Nicholas Chopin (2009), Approximate Bayesian Inference for Latent Gaussian Models Using Integrated Nested Laplace Approximations (with discussion), Journal of the Royal Statistical Society B, 71, 319-392.
+#' * Finn Lindgren, Havard Rue, and Johan Lindstrom (2011). An Explicit Link Between Gaussian Fields and Gaussian Markov Random Fields: The Stochastic Partial Differential Equation Approach (with discussion), Journal of the Royal Statistical Society B, 73(4), 423-498.
+#' * Simpson, Daniel, Janine B. Illian, S. H. Sørbye, and Håvard Rue. 2016. “Going Off Grid: Computationally Efficient Inference for Log-Gaussian Cox Processes.” Biometrika 1 (103): 49–70.
+#' @family engine
+#' @returns An [engine].
+#' @examples
+#' \dontrun{
+#' # Add INLA as an engine (with a custom mesh)
+#' x <- distribution(background) |> engine_inla(optional_mesh = my_mesh)
+#' }
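+#' # A hedged sketch of how such a custom mesh could be built beforehand
+#' # ('locs' is a placeholder for a two-column matrix of coordinates):
+#' # my_mesh <- INLA::inla.mesh.2d(loc = locs, max.edge = c(0.5, 1), cutoff = 0.1)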
{ + + # Check whether INLA package is available + check_package('INLA') + if(!isNamespaceLoaded("INLA")) { attachNamespace("INLA");requireNamespace('INLA') } + + myLog('[Deprecation]','yellow','Consider using engine_inlabru as engine with better prediction support.') + + # assert that arguments are valid + assertthat::assert_that(inherits(x, "BiodiversityDistribution"), + inherits(x$background,'sf'), + inherits(optional_mesh,'inla.mesh') || is.null(optional_mesh), + is.list(optional_projstk) || is.null(optional_projstk), + is.vector(max.edge) || is.null(max.edge), + (is.vector(offset) || is.numeric(offset)) || is.null(offset), + is.numeric(cutoff) || is.null(cutoff), + is.null(timeout) || is.numeric(timeout), + is.character(type), + is.character(area), + is.character(strategy), + is.character(int.strategy), + is.null(proj_stepsize) || is.numeric(proj_stepsize) + ) + type <- match.arg(type, c("predictor", "response"), several.ok = FALSE) + area <- match.arg(area, c("gpc", "gpc2", "km"), several.ok = FALSE) + # Check strategy settings + strategy <- match.arg(strategy, c("auto", "adaptative", "gaussian", "simplified.laplace", "laplace"), several.ok = FALSE) + int.strategy <- match.arg(int.strategy, c("auto", "grid", "eb", "ccd"), several.ok = FALSE) + + # Set the projection mesh + if(inherits(optional_mesh,'inla.mesh')) { + # Load a provided on + mesh <- optional_mesh + # Convert the study region + region.poly <- methods::as(sf::st_geometry(x$background), "Spatial") + + # Security check for projection and if not set, use the one from background + if(is.null(mesh$crs)) mesh$crs <- sp::CRS( proj4string(region.poly) ) + + # Calculate area + ar <- suppressWarnings( + mesh_area(mesh = mesh, region.poly = region.poly, variant = area) + ) + } else { + mesh <- new_waiver() + ar <- new_waiver() + } + + # If time out is specified + if(!is.null(timeout)) INLA::inla.setOption(fmesher.timeout = timeout) + + # Get barrier from the region polygon + # TODO: Add this in addition to spatial field below, possibly specify an option to calculate this + if(barrier && !is.Waiver(mesh)){ + mesh_bar <- mesh_barrier(mesh, region.poly) + } else { mesh_bar <- new_waiver() } + + # --- # + # Create other parameters object + params <- list( + max.edge = max.edge, + offset = offset, + cutoff = cutoff, + proj_stepsize = proj_stepsize, + type = type, + area = area, + strategy = strategy, + int.strategy = int.strategy, + ... 
+
+  # --- #
+  # Create other parameters object
+  params <- list(
+    max.edge = max.edge,
+    offset = offset,
+    cutoff = cutoff,
+    proj_stepsize = proj_stepsize,
+    type = type,
+    area = area,
+    strategy = strategy,
+    int.strategy = int.strategy,
+    ...
+  )
+
+  # Print a message in case there is already an engine object
+  if(!is.Waiver(x$engine)) myLog('[Setup]','yellow','Replacing currently selected engine.')
+
+  # Set engine in distribution object
+  x$set_engine(
+    bdproto(
+      "INLA-Engine",
+      Engine,
+      name = "",
+      data = list(
+        'mesh' = mesh,
+        'mesh.area' = ar,
+        'mesh.bar' = mesh_bar,
+        'stk_pred' = optional_projstk,
+        'params' = params
+      ),
+      # Function to create a mesh
+      create_mesh = function(self, model){
+        assertthat::assert_that(is.list(model),
+                                "background" %in% names(model))
+        # Check if mesh is already present, if so use it
+        if(!is.Waiver(self$get_data("mesh"))) return()
+        # Create a new mesh based on the available data
+
+        # Get parameters
+        params <- self$get_data("params")
+
+        # Convert the study region
+        region.poly <- methods::as(sf::st_geometry(model$background), "Spatial")
+
+        # Convert to boundary object for later
+        suppressWarnings(
+          bdry <- INLA::inla.sp2segment(
+            sp = region.poly,
+            join = TRUE,
+            crs = INLA::inla.CRS(projargs = sp::proj4string(region.poly))
+          )
+        )
+        bdry$loc <- INLA::inla.mesh.map(bdry$loc)
+
+        # Try and infer mesh parameters if not set
+
+        # Get all coordinates of observations
+        locs <- collect_occurrencepoints(model, include_absences = FALSE)
+
+        assertthat::assert_that(
+          nrow(locs)>0,
+          ncol(locs)==2
+        )
+
+        if(is.null(params$max.edge)){
+          # A good guess here is usually a max.edge of between 1/3 and 1/5 of the spatial range.
+          max.edge <- c(diff(range(locs[,1]))/(3*5) , diff(range(locs[,1]))/(3*5) * 2)
+          params$max.edge <- max.edge
+        }
+        if(is.null(params$offset)){
+          # Check whether the coordinate system is longlat
+          if( sf::st_is_longlat(bdry$crs) ){
+            # Specify offset as 1/100 of the boundary distance
+            offset <- c( diff(range(bdry$loc[,1]))*0.01,
+                         diff(range(bdry$loc[,1]))*0.01)
+          } else {
+            offset <- c( diff(range(bdry$loc[,1]))*0.01,
+                         diff(range(bdry$loc[,1]))*0.01)
+          }
+          params$offset <- offset
+        }
+        if(is.null(params$cutoff)){
+          # Specify as minimum distance between y coordinates,
+          # thus capturing most points on this level,
+          # otherwise set to default
+          val <- min(abs(diff(locs[,2])))
+          cutoff <- ifelse(val == 0, 1e-12, val)
+          params$cutoff <- cutoff
+        }
+
+        suppressWarnings(
+          mesh <- INLA::inla.mesh.2d(
+            # Point localities
+            loc = locs,
+            # Boundary object
+            boundary = bdry,
+            # Mesh Parameters
+            max.edge = params$max.edge,
+            offset = params$offset,
+            cutoff = params$cutoff,
+            # Define the CRS
+            crs = bdry$crs
+          )
+        )
+        # Calculate area
+        # ar <- suppressMessages(
+        #   suppressWarnings(
+        #     mesh_area(mesh = mesh, region.poly = region.poly, variant = params$area)
+        #   )
+        # )
+        # 06/01/2023: This should work and is identical to inlabru::ipoints
+        ar <- suppressWarnings(
+          diag( INLA::inla.mesh.fem(mesh = mesh)[[1]] )
+        )
+        assertthat::assert_that(length(ar) == mesh$n)
+
+        # Now set the output
+        self$set_data("mesh", mesh)
+        self$set_data("mesh.area", ar)
+
+        invisible()
+      },
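+      # A worked illustration of the guesses in create_mesh() above: for point
+      # data spanning 100 map units in x, max.edge defaults to
+      # c(100/15, 100/15 * 2), i.e. roughly c(6.7, 13.3); the offset to 1% of
+      # the boundary x-range (1 unit here, for inner and outer extension); and
+      # the cutoff to the smallest absolute spacing between successive y
+      # coordinates (or 1e-12 if that is zero). Supplying max.edge, offset or
+      # cutoff in engine_inla() overrides these guesses.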
c("#00204D","#00336F","#39486B","#575C6D","#707173","#8A8779","#A69D75","#C4B56C","#E4CF5B","#FFEA46") + ) + } else { + INLA:::plot.inla.mesh( self$get_data('mesh') ) + } + }, + # Spatial latent function + # https://groups.google.com/g/r-inla-discussion-group/c/eqMhlbwChkQ/m/m0b0PuzL-PsJ + # Default SPDE prior + # It computes the approximate diameter of the mesh, multiplies by 0.2 to get a value for the prior median range, and then transforms it to log-kappa scale by the formula + # log(sqrt(8*nu)/range) where nu is alpha-dim/2. + calc_latent_spatial = function(self,type = 'spde', alpha = 2, + priors = NULL, + polynames = NULL, + varname = "spatial.field1", + ...){ + # Catch prior objects + if(is.null(priors) || is.Waiver(priors)) priors <- NULL + + # For calculating iCAR process + if(type == 'car'){ + # convert mesh to sf object + ns <- mesh_as_sf(self$data$mesh) + # Create adjacency matrix with queen's case + nc.nb <- spdep::poly2nb(ns, queen = TRUE) + #Convert the adjacency matrix into a file in the INLA format + adjmat <- spdep::nb2mat(nc.nb,style = "B") + adjmat <- methods::as(adjmat, "dgTMatrix") + # adjmat <- INLA::inla.graph2matrix(nc.nb) + # Save the adjaceny matrix as output + self$data$latentspatial <- adjmat + self$data$s.index <- as.numeric(attr(nc.nb,varname)) + } else if(type=='spde'){ + # Check that everything is correctly specified + if(!is.null(priors)) if('spde' %notin% priors$varnames() ) priors <- NULL + + # Use default spde + if(is.null(priors) || is.Waiver(priors)){ + # Define PC Matern SPDE model and save + self$data$latentspatial <- INLA::inla.spde2.matern( + mesh = self$data$mesh, + alpha = alpha + ) + } else { + # Get priors + pr <- if(is.null(priors)) c(0.01, 0.05) else priors$get('spde','prior.range') + ps <- if(is.null(priors)) c(10, 0.05) else priors$get('spde','prior.sigma') + + # Define PC Matern SPDE model and save + self$data$latentspatial <- INLA::inla.spde2.pcmatern( + mesh = self$data$mesh, + alpha = alpha, + # P(Range < 1°) = 0.001 and P(sigma > 0.5) = 0.05 + prior.range = pr, prior.sigma = ps + ) + } + # Make index for spatial field + self$data$s.index <- INLA::inla.spde.make.index(name = varname, + n.spde = self$data$latentspatial$n.spde, + n.group = 1, + n.repl = 1) + # Security checks + assertthat::assert_that( + inherits(self$data$latentspatial,'inla.spde'), + length(self$data$s.index[[1]]) == self$data$mesh$n + ) + } else if(type == 'poly'){ + # Save column names of polynomial transformed coordinates + assertthat::assert_that(!is.null(polynames)) + self$data$latentspatial <- polynames + } + invisible() + }, + # Get latent spatial equation bit + # Set vars to 2 or larger to get copied spde's + get_equation_latent_spatial = function(self, method, vars = 1, separate_spde = FALSE){ + assertthat::assert_that(is.numeric(vars)) + if(method == 'spde'){ + assertthat::assert_that(inherits(self$data$latentspatial, 'inla.spde'), + msg = 'Latent spatial has not been calculated.') + # SPDE string + if(separate_spde){ + ss <- paste0("f(spatial.field",vars,", model = ",method,")") + } else { + if(vars >1){ + ss <- paste0("f(spatial.field",vars,", copy = \'spatial.field1\', model = ",method,", fixed = TRUE)") + } else { + ss <- paste0("f(spatial.field",vars,", model = ",method,")") + } + } + return(ss) + + } else if(method == 'car'){ + assertthat::assert_that(inherits(self$data$latentspatial,'dgTMatrix'), + msg = 'Neighborhood matrix has not been calculated.') + return( + # BESAG model or BYM model to specify + # BYM found to be largely similar to SPDE 
https://onlinelibrary.wiley.com/doi/pdf/10.1002/ece3.3081
+            paste0('f(','spatial.field',', model = "bym", graph = ','adjmat',')')
+          )
+        }
+      },
+      # Configure stack
+      make_stack = function(self, model, id, intercept = TRUE, joint = FALSE) {
+        assertthat::assert_that(
+          is.list(model),
+          is.character(id)
+        )
+        # Get Environment records
+        env <- model$predictors
+
+        # Include intercept in here
+        # TODO: Note that this sets intercepts by type and not by dataset id
+        if(intercept) {
+          env$Intercept <- 1 # Overall Intercept
+          env[[paste0('Intercept',
+                      ifelse(joint,paste0('_',
+                                          make.names(tolower(model$name)),'_',
+                                          model$type),''))]] <- 1 # Setting Intercept to common type, thus sharing with similar types
+        }
+        # Set up projection matrix for the data
+        suppressWarnings(
+          mat_proj <- INLA::inla.spde.make.A(
+            mesh = self$get_data('mesh'),
+            loc = as.matrix(env[,c('x','y')])
+          )
+        )
+        # Create INLA stack
+        # The three main inla.stack() arguments are a vector list with the data (data),
+        # a list of projector matrices (each related to one block effect,
+        # A) and the list of effects (effects).
+
+        # Response for inla stack
+        ll_resp <- list()
+        # Add the expected estimate and observed data
+        # FIXME: Currently only two likelihoods are supported (binomial/poisson) with the NA order being the determining factor
+        if(model$family == 'poisson') {
+          if(joint) ll_resp[[ 'observed' ]] <- cbind(model$observations[['observed']], NA )
+          if(!joint) ll_resp[[ 'observed' ]] <- cbind(model$observations[['observed']] )
+          ll_resp[[ 'e' ]] <- model$expect
+        }
+        if(model$family == 'binomial') {
+          if(joint) ll_resp[[ 'observed' ]] <- cbind(NA, model$observations[['observed']] )
+          if(!joint) ll_resp[[ 'observed' ]] <- cbind( model$observations[['observed']] )
+          ll_resp[[ 'Ntrials' ]] <- model$expect
+        }
+
+        # Effects matrix
+        ll_effects <- list()
+        # Note: the order of adding these is important and matches the A matrix below
+        # ll_effects[['Intercept']] <- rep(1, nrow(model$observations))
+        # ll_effects[['Intercept']][[paste0('Intercept',ifelse(joint,paste0('_',make.names(tolower(model$name)),'_',model$type),''))]] <- seq(1, self$get_data('mesh')$n) # Old code
+        ll_effects[['predictors']] <- env
+        ll_effects[['spatial.field1']] <- seq(1, self$get_data('mesh')$n)
+
+        # Add offset if specified
+        if(!is.null(model$offset)){
+          ll_effects[['predictors']] <- cbind( ll_effects[['predictors']],
+                                               subset(model[['offset']],select = "spatial_offset")
+          )
+        }
+
+        # Check whether equation has spatial field and otherwise add
+        # MJ 13/06: Spatial.field now set directly to effects
+        # if( 'spde' %in% all.vars(model$equation) ){
+        #   # Get Index Objects
+        #   iset <- self$get_data('s.index')
+        #   ll_effects[['spatial.field']] <- c(ll_effects[['spatial.field']], iset)
+        # } else if ( 'adjmat' %in% all.vars(model$equation) ){
+        #   iset <- self$get_data('s.index')
+        #   ll_effects[['spatial.field']] <- c(ll_effects[['spatial.field']], data.frame(spatial.index = iset) )
+        # }
+        # Define A
+        A <- list(1, mat_proj)
+
+        # Define stack
+        stk <- INLA::inla.stack(
+          data = ll_resp,
+          A = A,
+          effects = ll_effects,
+          tag = paste0('stk_',as.character(model$type),'_',id)
+        )
+        # Set the stack
+        self$set_data(paste0('stk_',as.character(model$type),'_',id), stk)
+        invisible()
+      },
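+      # Note on the NA padding in make_stack() above: with several likelihoods,
+      # INLA expects the response as a multi-column matrix with one column per
+      # likelihood, where rows belonging to the other likelihood are set to NA.
+      # Illustrative sketch for two datasets (Poisson first, binomial second;
+      # 'y_pois' and 'y_binom' are placeholder vectors):
+      #   observed <- rbind(cbind(y_pois, NA), cbind(NA, y_binom))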
model$predictors_names ), + all( sapply(model$biodiversity, function(x) is.formula(x$equation)) ), + length(model$biodiversity)>=1, + msg = 'Some internal checks failed while setting up the model.' + ) + # Messager + if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Engine setup.') + + # Set number of threads via set.Options + INLA::inla.setOption(num.threads = getOption('ibis.nthread'), + blas.num.threads = getOption('ibis.nthread')) + + # --- Prepare general inputs --- + # Check whether spatial latent effects were added + if( 'spde' %in% all.vars(model$biodiversity[[1]]$equation) ){ + # Get spatial index + spde <- self$get_data('s.index') + } else { spde <- NULL } + + # Check for existence of specified offset and use the full one in this case + if(!is.Waiver(model$offset)) offset <- subset(model[['offset']],select = "spatial_offset") else offset <- NULL + + # Projection stepsize + params <- self$get_data('params') + if(is.null( params$proj_stepsize )){ + # Set to stepsize equivalent of the resolution of the grid + val <- max(diff(model[['predictors']]$x)) # TODO: Check that it works when dummy variable is used + params$proj_stepsize <- val + self$set_data('params', params ) + rm(val) + } + + # Number of types to determine if a joint model is necessary + nty <- length( unique( as.character(sapply(model$biodiversity, function(z) z$type)) ) ) + + # Clean up previous data and integration stacks + chk <- grep('stk_int|stk_poipo|stk_poipa|stk_polpo|stk_polpa|stk_pred|stk_full', self$list_data()) + if(length(chk)>0) self$data[chk] <- NULL + + # Re-format the full predictors if there are any factor variables + # FIXME: Potentially outsource? + if(any(model$predictors_types$type=="factor")){ + vf <- model$predictors_types$predictors[model$predictors_types$type=="factor"] + for(k in vf){ + o <- explode_factor(model$predictors[[k]],name = k) + model$predictors <- cbind(model$predictors, o) + model$predictors_names <- c(model$predictors_names, colnames(o)) + model$predictors_types <- rbind(model$predictors_types, + data.frame(predictors = colnames(o), type = "numeric") ) + # Finally remove the original column from the predictor object + model$predictors[[k]] <- NULL + model$predictors_names <- model$predictors_names[-which( model$predictors_names == k )] + model$predictors_types <- subset(model$predictors_types, subset = predictors != k) + # FIXME: Hacky solution as to not overwrite predictor object + ras_back <- model$predictors_object$data + # Explode the columns in the raster object + model$predictors_object$data <- raster::addLayer( + model$predictors_object$data, + explode_factorized_raster(model$predictors_object$data[[k]]) + ) + model$predictors_object$data <- raster::dropLayer(model$predictors_object$data, k) + } + } else { ras_back <- new_waiver() } + + # Now for each dataset create a INLA stack + for(id in 1:length(model$biodiversity) ){ + + # If there any factor variables split them per type and explode them + if(any(model$biodiversity[[id]]$predictors_types$type=="factor")){ + vf <- model$biodiversity[[id]]$predictors_types$predictors[model$biodiversity[[id]]$predictors_types$type=="factor"] + fv <- model$biodiversity[[id]]$predictors[vf] + for(k in 1:ncol(fv)){ + o <- explode_factor(fv[,k],name = colnames(fv)[k]) + # Add + model$biodiversity[[id]]$predictors <- cbind(model$biodiversity[[id]]$predictors, o) + model$biodiversity[[id]]$predictors_names <- c(model$biodiversity[[id]]$predictors_names, colnames(o)) + model$biodiversity[[id]]$predictors_types <- 
rbind(model$biodiversity[[id]]$predictors_types,
+                data.frame(predictors = colnames(o), type = "numeric") )
+          # Finally remove the original column from the predictor object
+          model$biodiversity[[id]]$predictors[[colnames(fv)[k]]] <- NULL
+          model$biodiversity[[id]]$predictors_names <- model$biodiversity[[id]]$predictors_names[-which( model$biodiversity[[id]]$predictors_names == colnames(fv)[k] )]
+          model$biodiversity[[id]]$predictors_types <- subset(model$biodiversity[[id]]$predictors_types, subset = predictors != colnames(fv)[k])
+
+        }
+      }
+      # Create the observation INLA stack
+      # Save stacks by id instead of type
+      self$make_stack(model = model$biodiversity[[id]],
+                      id = names(model$biodiversity)[id],
+                      intercept = TRUE,
+                      joint = ifelse(nty > 1, TRUE, FALSE)
+      )
+
+      # Define mesh.area dependent on whether a single variable only is used or not
+      if(model$biodiversity[[id]]$family == 'poisson'){
+        # Only create one if not already existing
+        chk <- grep('stk_int', self$list_data())
+        if(length(chk)==0){
+          # Make integration stack for the given Poisson model
+          stk_int <- inla_make_integration_stack(
+            mesh = self$get_data('mesh'),
+            mesh.area = self$get_data('mesh.area'),
+            model = model,
+            id = names(model$biodiversity)[id],
+            joint = ifelse(nty > 1, TRUE, FALSE)
+          )
+          # Save integration stack
+          self$set_data(paste0('stk_int_',names(model$biodiversity)[id]), stk_int)
+        }
+      }
+    }
+
+    # ------------------ #
+    # Get all stacks defined so far and join them
+    stk_inference <- lapply(
+      self$list_data()[grep(paste(names(model$biodiversity),collapse = '|'), self$list_data())],
+      function(x) self$get_data(x)
+    )
+    stk_inference <- do.call(INLA::inla.stack, stk_inference)
+
+    # Clamp?
+    if( settings$get("clamp") ) model$predictors <- clamp_predictions(model, model$predictors)
+
+    # Make projection stack if not directly supplied
+    if(is.null(self$data$stk_pred)){
+
+      stk_pred <- inla_make_projection_stack(
+        stk_resp = stk_inference,
+        model = model,
+        mesh = self$get_data('mesh'),
+        mesh.area = self$get_data('mesh.area'),
+        res = self$get_data('params')$proj_stepsize,
+        type = model$biodiversity[[id]]$type,
+        spde = spde,
+        settings = settings,
+        joint = ifelse(nty > 1, TRUE, FALSE)
+      )
+      self$set_data('stk_pred', stk_pred)
+    } else {
+      # FIXME: Add some basic assertthat tests for when a prediction stack is directly supplied
+      stk_pred <- self$get_data('stk_pred')
+    }
+
+    # Now join all stacks and save in full
+    # Note: If the integration stack is included, E must be set to the relative area (in mesh.area).
+    self$set_data('stk_full',
+                  INLA::inla.stack(stk_inference, stk_pred$stk_proj)
+    )
+    # Write the unexploded raster object back to avoid issues with other engines. Hacky!
+    if(!is.Waiver(ras_back)) model$predictors_object$data <- ras_back
+    return(model)
+  },
+  train = function(self, model, settings) {
+    # Check that all inputs are there
+    assertthat::assert_that(
+      inherits(settings,'Settings'),
+      is.list(model), length(model)>1,
+      # Check that model id and settings id are identical
+      settings$modelid == model$id,
+      any( (c('stk_full','stk_pred') %in% names(self$data)) ),
+      inherits(self$get_data('stk_full'),'inla.data.stack')
+    )
+    # Messenger
+    if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Starting fitting...')
+
+    # Get all datasets with id. This includes the data stacks and integration stacks.
+    stk_inference <- lapply(
+      self$list_data()[grep(paste(names(model$biodiversity),collapse = '|'), self$list_data())],
+      function(x) self$get_data(x))
+    stk_inference <- do.call(INLA::inla.stack, stk_inference)
+
+    # Get full stack and projection grid
+    stk_full <- self$get_data('stk_full')
+    predcoords <- self$get_data('stk_pred')$predcoords
+
+    # Get parameters
+    params <- self$get_data("params")
+
+    # Get families and links
+    fam <- unique( as.character( sapply(model$biodiversity, function(x) x$family) ) )
+    lin <- sapply(model$biodiversity, function(x) x$link)
+    # Define control family
+    cf <- list()
+    for(i in 1:length(fam)) cf[[i]] <- list(link = ifelse(fam[i] == 'poisson','log','cloglog' ))
+    if(length(fam)==1 && fam == 'binomial') cf[[1]]$link <- 'logit'
+
+    # Shared link? Use a single link index only when there is one family
+    if(length(fam)==1) {li <- 1} else { li <- NULL} # FIXME: Check whether links have to be set individually per observation
+
+    if('spde' %in% all.vars(model$biodiversity[[1]]$equation) ){
+      spde <- self$get_data('latentspatial')
+      stack_data_resp <- INLA::inla.stack.data(stk_inference, spde = self$get_data('latentspatial'))
+      stack_data_full <- INLA::inla.stack.data(stk_full, spde = self$get_data('latentspatial'))
+    } else {
+      adjmat <- spde <- self$get_data('latentspatial')
+      stack_data_resp <- INLA::inla.stack.data(stk_inference)
+      stack_data_full <- INLA::inla.stack.data(stk_full)
+    }
+    # ----------- #
+    # Provided or default formula
+    master_form <- stats::as.formula(
+      paste0("observed ~ ",
+             # If multiple datasets, remove the global intercept
+             ifelse(length(model$biodiversity)>1,"0 + ", ""),
+             paste0(sapply(model$biodiversity, function(x){
+               attr(stats::terms.formula(x$equation),"term.labels")
+             }) |> c() |> unique(),collapse = " + ")
+      )
+    )
+
+    # Perform variable selection
+    if( settings$get(what='optim_hyperparam')){
+      if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Performing backstep variable selection (hacky)...')
+
+      k <- NULL
+      # Specify offsets and SPDE terms to be retained
+      # FIXME: Also set priors and offsets here?
+      if(is.Waiver(spde)) k <- NULL else k <- self$get_equation_latent_spatial('spde')
+
+      # Use backward variable elimination
+      vs <- inla.backstep(master_form = master_form,
+                          stack_data_resp = stack_data_resp,
+                          stk_inference = stk_inference,
+                          fam = fam,
+                          cf = cf, li = li,
+                          response = 'observed',
+                          keep = k
+      )
+      master_form <- to_formula(vs$form)
+    }
+
+    # ------------------------------------------ #
+    # Train the model on the response
+    fit_resp <- INLA::inla(formula = master_form, # The specified formula
+                           data = stack_data_resp, # The data stack
+                           quantiles = c(0.05, 0.5, 0.95),
+                           E = INLA::inla.stack.data(stk_inference)$e, # Exposure term (E) for the Poisson model
+                           Ntrials = INLA::inla.stack.data(stk_inference)$Ntrials,
+                           family = fam, # Family the data comes from
+                           control.family = cf, # Control options
+                           control.predictor = list(A = INLA::inla.stack.A(stk_inference),
+                                                    link = li, # Link set to NULL for multiple likelihoods!
+                                                    compute = TRUE), # Compute marginals of the predictors.
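# Editor's note on the two arguments that follow: waic = TRUE requests the
# Watanabe-Akaike information criterion for model comparison, and config = TRUE
# additionally stores the internal GMRF representation, which is what later allows
# posterior samples to be drawn, e.g. via INLA::inla.posterior.sample(n, result).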
+ control.compute = list(cpo = FALSE, waic = TRUE, config = TRUE), #model diagnostics and config = TRUE gives you the GMRF + # control.fixed = list(mean = 0),# prec = list( initial = log(0.000001), fixed = TRUE)), # Added to see whether this changes GMRFlib convergence issues + verbose = settings$get(what='verbose'), # To see the log of the model runs + control.inla = INLA::control.inla(strategy = params$strategy, + int.strategy = params$int.strategy), + num.threads = getOption('ibis.nthread') + ) + + # Predict spatially + if(!settings$get(what='inference_only')){ + # Messenger + if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Starting prediction...') + + # Predict on full + fit_pred <- try({INLA::inla(formula = master_form, # The specified formula + data = stack_data_full, # The data stack + quantiles = c(0.05, 0.5, 0.95), + E = INLA::inla.stack.data(stk_full)$e, + Ntrials = INLA::inla.stack.data(stk_full)$Ntrials, + family= fam, # Family the data comes from + control.family = cf, # Control options + control.predictor = list(A = INLA::inla.stack.A(stk_full), + link = li, # Link to NULL for multiple likelihoods! + compute = TRUE), # Compute for marginals of the predictors. + control.compute = list(cpo = FALSE, waic = TRUE, config = TRUE, openmp.strategy = 'huge' ), + # control.mode = list(theta = thetas, restart = FALSE), # To speed up use previous thetas + verbose = settings$get(what='verbose'), # To see the log of the model runs + # control.results = list(return.marginals.random = FALSE, + # return.marginals.predictor = FALSE), # Don't predict marginals to save speed + # control.fixed = INLA::control.fixed(mean = 0),#, prec = list( initial = log(0.000001), fixed = TRUE)), # Added to see whether this changes GMRFlib convergence issues + control.inla = INLA::control.inla(strategy = params$strategy, + int.strategy = params$int.strategy), + num.threads = getOption('ibis.nthread') + ) + },silent = FALSE) + if(inherits(fit_pred,'try-error')) { print(fit_pred); stop('Model did not converge. Try to simplify structure and check priors!') } + # Create a spatial prediction + index.pred <- INLA::inla.stack.index(stk_full, 'stk_pred')$data + # Which type of prediction (linear predictor or response scale) + # The difference between both is that response applies the (inverse of the) link function, + # so it doesn't include the observation distribution part (measurement noise) of posterior predictions. + if(params$type == "predictor"){ + post <- fit_pred$summary.linear.predictor[index.pred, ] + } else { + post <- fit_pred$summary.fitted.values[index.pred, ] + } + assertthat::assert_that(nrow(post)>0, + nrow(post) == nrow(predcoords) ) # Check with cells in projection + # Back-transform for predictor + if(params$type == "predictor"){ + if(length(fam)==1){ + if(fam == 'poisson') post[,c('mean','0.05quant','0.5quant','0.95quant','mode')] <- exp( post[,c('mean','0.05quant','0.5quant','0.95quant','mode')] ) + if(fam == 'binomial') post[,c('mean','0.05quant','0.5quant','0.95quant','mode')] <- logistic(post[,c('mean','0.05quant','0.5quant','0.95quant','mode')]) + } else { + # Joint likelihood of Poisson log and binomial cloglog following Simpson et al. 
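# Editor's note: the branches above and below apply the inverse link functions.
# For a single family this is exp() for the Poisson log link and
# logistic(x) = 1/(1 + exp(-x)) (equivalently stats::plogis) for the binomial
# logit link; in the joint case both likelihoods are expressed on the
# log/cloglog scale of the shared latent field, so exp() is used throughout.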
+ post[,c('mean','0.05quant','0.5quant','0.95quant','mode')] <- exp( post[,c('mean','0.05quant','0.5quant','0.95quant','mode')] ) + } + } + post <- subset(post, select = c('mean','sd','0.05quant','0.5quant','0.95quant','mode') ) + post$cv <- post$sd / post$mean + # Rename + names(post) <- c("mean", "sd", "q05", "q50", "q95", "mode","cv") + + # Fill prediction + suppressWarnings( + prediction <- raster::stack( + sp::SpatialPixelsDataFrame( + points = predcoords, + data = post, + proj4string = sp::CRS( self$get_data('mesh')$crs@projargs ) # x$engine$data$mesh$crs@projargs + ) + ) + ) + prediction <- raster::mask(prediction, model$background) # Mask with background + # Align with background + temp <- raster::raster( + sp::SpatialPixelsDataFrame( + points = model$predictors[,c('x','y')], + data = model$predictors[,c('x','y')], + proj4string = sp::CRS( self$get_data('mesh')$crs@projargs ) # x$engine$data$mesh$crs@projargs + ) + ) + prediction <- raster::resample(prediction, temp, method = 'bilinear') + + } else { + # No prediction to be conducted + fit_pred <- NULL + prediction <- NULL + } + + # Compute end of computation time + settings$set('end.time', Sys.time()) + + # Definition of INLA Model object ---- + out <- bdproto( + "INLA-Model", + DistributionModel, + id = model$id, + model = model, + settings = settings, + fits = list( + "fit_best" = fit_resp, + "fit_pred" = fit_pred, + "fit_best_equation" = master_form, + "mesh" = self$get_data('mesh'), + "spde" = self$get_data('latentspatial'), + "prediction" = prediction + ), + # Projection function + project = function(self, newdata, mode = 'coef', backtransf = NULL, layer = "mean"){ + assertthat::assert_that('fit_best' %in% names(self$fits), + is.data.frame(newdata) || is.matrix(newdata), + mode %in% c('coef','sim','full'), + assertthat::has_name(newdata,c('x','y')) + ) + stop("Projection using engine INLA is deprecated. Use engine_inlabru !") + + # Try and guess backtransformation + if(is.null(backtransf)){ + fam <- self$get_data('fit_best')$.args$family + backtransf <- ifelse(fam == 'poisson', exp, logistic) + } + + if(mode == 'coef'){ + # We use the coefficient prediction + out <- coef_prediction(mesh = self$get_data('mesh'), + mod = self, + type = 'mean', + backtransf = backtransf + ) + } else if(mode == 'sim'){ + # Simulate from posterior. Not yet coded + stop('Simulation from posterior not yet implemented. Use inlabru instead!') + } else { + stop('Full prediction not yet added.') + } + # Return result + return(out) + }, + # Partial response + # FIXME: Create external function + partial = function(self, x, x.var, constant = NULL, variable_length = 100, values = NULL, plot = FALSE, type = "response"){ + # Goal is to create a sequence of value and constant and append to existing stack + # Alternative is to create a model-matrix through INLA::inla.make.lincomb() and + # model.matrix(~ vars, data = newDummydata) fed to make.lincomb + # provide via lincomb = M to an INLA call. + # Both should be identical + stop("Partial function not implemented. 
Consider using inlabru instead!") + # Check that provided model exists and variable exist in model + mod <- self$get_data('fit_best') + assertthat::assert_that(inherits(mod,'inla'), + 'model' %in% names(self), + inherits(x,'BiodiversityDistribution'), + length(x.var) == 1, is.character(x.var), + is.null(constant) || is.numeric(constant) + ) + varn <- mod$names.fixed + variable <- match.arg(x.var, varn, several.ok = FALSE) + assertthat::assert_that(variable %in% varn, length(variable)==1,!is.null(variable)) + + # ------------------ # + # Get all datasets with id in model. This includes the data stacks and integration stacks + stk_inference <- lapply( + x$engine$list_data()[grep(paste(names(model$biodiversity),collapse = '|'), x$engine$list_data())], + function(z) x$engine$get_data(z)) + stk_inference <- do.call(INLA::inla.stack, stk_inference) + # FIXME: Test that this works with SPDE present + stack_data_resp <- INLA::inla.stack.data(stk_inference) + # ------------------ # + + # If constant is null, calculate average across other values + if(is.null(constant)){ + constant <- lapply(stack_data_resp, function(x) mean(x,na.rm = T))[varn[varn %notin% variable]] + } + # For target variable calculate range + variable_range <- range(stack_data_resp[[variable]],na.rm = TRUE) + + # Create dummy data.frame + dummy <- data.frame(observed = rep(NA, variable_length)) + + seq(variable_range[1],variable_range[2],length.out = variable_length) + + # # add sequence of data and na to data.frame. predict those + # control.predictor = list(A = INLA::inla.stack.A(stk_full), + # link = li, # Link to NULL for multiple likelihoods! + # compute = TRUE), # Compute for marginals of the predictors. + + print('Refitting model for partial effect') + ufit <- INLA::inla(formula = stats::as.formula(mod$.args$formula), # The specified formula + data = stk_inference, # The data stack + quantiles = c(0.05, 0.5, 0.95), + E = INLA::inla.stack.data(stk_inference)$e, # Expectation (Eta) for Poisson model + Ntrials = INLA::inla.stack.data(stk_inference)$Ntrials, + family = mod$.args$family, # Family the data comes from + control.family = mod$.args$control.family, # Control options + control.predictor = mod$.args$control.predictor, # Compute for marginals of the predictors. 
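# Editor's note: re-using mod$.args as done here refits the model with settings
# identical to the original call. An untested assumption worth exploring when this
# stub is completed: INLA can also warm-start from an earlier fit via
# control.mode = list(result = mod, restart = TRUE), avoiding a full re-optimisation.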
+ control.compute = mod$.args$control.compute, + control.fixed = mod$.args$control.fixed, + verbose = FALSE, # To see the log of the model runs + control.inla = mod$.args$control.inla, + num.threads = getOption('ibis.nthread') + ) + control.predictor = list(A = INLA::inla.stack.A(stk_inference)) + + # Plot and return result + }, + # Get coefficients + get_coefficients = function(self){ + # Returns a vector of the coefficients with direction/importance + cofs <- self$summary() + cofs <- subset(cofs, select = c("variable", "mean", "sd")) + names(cofs) <- c("Feature", "Beta", "Sigma") + # Remove intercept(s) + int <- grep("Intercept",cofs$Feature,ignore.case = TRUE) + if(length(int)>0) cofs <- cofs[-int,] + return(cofs) + }, + # Function to plot SPDE if existing + plot_spatial = function(self, dim = c(300,300), kappa_cor = FALSE, what = "spatial.field1", ...){ + assertthat::assert_that(is.vector(dim), + is.character(what)) + + if( length( self$fits$fit_best$size.spde2.blc ) == 1) + { + # Get spatial projections from model + # FIXME: Potentially make the plotting of this more flexible + gproj <- INLA::inla.mesh.projector(self$get_data('mesh'), dims = dim) + g.mean <- INLA::inla.mesh.project(gproj, + self$get_data('fit_best')$summary.random[[what]]$mean) + g.sd <- INLA::inla.mesh.project(gproj, self$get_data('fit_best')$summary.random[[what]]$sd) + + # Convert to rasters + g.mean <- t(g.mean) + g.mean <- g.mean[rev(1:length(g.mean[,1])),] + r.m <- raster::raster(g.mean, + xmn = range(gproj$x)[1], xmx = range(gproj$x)[2], + ymn = range(gproj$y)[1], ymx = range(gproj$y)[2], + crs = self$get_data('mesh')$crs + ) + g.sd <- t(g.sd) + g.sd <- g.sd[rev(1:length(g.sd[,1])),] + r.sd <- raster::raster(g.sd, + xmn = range(gproj$x)[1], xmx = range(gproj$x)[2], + ymn = range(gproj$y)[1], ymx = range(gproj$y)[2], + crs = self$get_data('mesh')$crs + ) + + spatial_field <- raster::stack(r.m, r.sd);names(spatial_field) <- c('SPDE_mean','SPDE_sd') + # Mask with prediction if exists + if(!is.null(self$get_data('prediction'))){ + spatial_field <- raster::resample(spatial_field, self$get_data('prediction')[[1]]) + spatial_field <- raster::mask(spatial_field, self$get_data('prediction')[[1]]) + } + + # -- # + if(kappa_cor){ + # Also build correlation fun + # Get SPDE results + spde_results <- INLA::inla.spde2.result( + inla = self$get_data('fit_best'), + name = what, + spde = self$get_data('spde'), + do.transfer = TRUE) + + # Large kappa (inverse range) equals a quick parameter change in space. + # Small kappa parameter have much longer, slower gradients. + Kappa <- INLA::inla.emarginal(function(x) x, spde_results$marginals.kappa[[1]]) + sigmau <- INLA::inla.emarginal(function(x) sqrt(x), spde_results$marginals.variance.nominal[[1]]) + r <- INLA::inla.emarginal(function(x) x, spde_results$marginals.range.nominal[[1]]) + + # Get Mesh and distance between points + mesh <- self$get_data('mesh') + D <- as.matrix( stats::dist(mesh$loc[, 1:2]) ) + + # Distance vector. 
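# Editor's note: the curve computed below is the Matern correlation for smoothness
# nu = 1, cor(d) = (kappa * d) * besselK(kappa * d, 1), with besselK() the modified
# Bessel function of the second kind; the practical range (correlation dropping
# to about 0.1) is approximately sqrt(8 * nu) / kappa.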
+            dis.cor <- data.frame(distance = seq(0, max(D), length = 100))
+            # Maximum display distance: half the extent width
+            dis.max <- abs((xmin(self$get_data('prediction')) - xmax(self$get_data('prediction')) ) / 2)
+
+            # Modified Bessel function to get correlation strength
+            dis.cor$cor <- as.numeric((Kappa * dis.cor$distance) * base::besselK(Kappa * dis.cor$distance, 1))
+            dis.cor$cor[1] <- 1
+            # ---
+            # Build plot
+            graphics::layout(matrix(c(1,1,2,3), 2, 2, byrow = TRUE))
+            plot(dis.cor$cor ~ dis.cor$distance, type = 'l', lwd = 3,
+                 xlab = 'Distance (proj. unit)', ylab = 'Correlation', main = paste0('Kappa: ', round(Kappa,2) ) )
+            graphics::abline(v = dis.max, lty = 'dotted')
+            plot(spatial_field[[1]], col = ibis_colours[['viridis_cividis']], main = 'mean spatial effect')
+            plot(spatial_field[[2]], main = 'sd spatial effect')
+          } else {
+            # Just plot the SPDE
+            graphics::par(mfrow=c(1,2))
+            plot(spatial_field[[1]], col = ibis_colours[['viridis_cividis']], main = 'mean spatial effect')
+            plot(spatial_field[[2]], main = 'sd spatial effect')
+            # And return
+            return(spatial_field)
+          }
+
+
+        } else {
+          message(text_red('No spatial covariance in model specified.'))
+        }
+      }
+    )
+    return(out)
+  }
+  ))
+}
diff --git a/R/engine_inlabru.R b/R/engine_inlabru.R
index 078a918a..9b398838 100644
--- a/R/engine_inlabru.R
+++ b/R/engine_inlabru.R
@@ -1,1247 +1,1253 @@
-#' @include bdproto-engine.R utils-inla.R bdproto-distributionmodel.R
-NULL
-
-#' Use inlabru as engine
-#'
-#' @description Model components are specified with general inputs and mapping methods to the
-#' latent variables, and the predictors are specified via general R expressions,
-#' with separate expressions for each observation likelihood model in multi-likelihood models.
-#' The inlabru engine - similar as the [`engine_inla`] function acts a wrapper for [INLA::inla],
-#' albeit [inlabru] has a number of convenience functions implemented that make in particular predictions
-#' with new data much more straight forward (e.g. via posterior simulation instead of fitting).
-#' Since more recent versions [inlabru] also supports the addition of multiple likelihoods, therefore
-#' allowing full integrated inference.
-#' @details
-#' All \code{INLA} engines require the specification of a mesh that needs to be provided to the
-#' \code{"optional_mesh"} parameter. Otherwise the mesh will be created based on best guesses of the
-#' data spread. A good mesh needs to have triangles as regular as possible in size and shape: equilateral.
-#'
-#' [*] \code{"max.edge"}: The largest allowed triangle edge length, must be in the same scale units as the coordinates
-#' Lower bounds affect the density of triangles
-#' [*] \code{"offset"}: The automatic extension distance of the mesh
-#' If positive: same scale units. If negative, interpreted as a factor relative to the approximate data diameter
-#' i.e., a value of -0.10 will add a 10% of the data diameter as outer extension.
-#' [*] \code{"cutoff"}: The minimum allowed distance between points,
-#' it means that points at a closer distance than the supplied value are replaced by a single vertex.
-#' it is critical when there are some points very close to each other, either for point locations or in the
-#' domain boundary.
-#' [*] \code{"proj_stepsize"}: The stepsize for spatial predictions, which affects the spatial grain of any outputs
-#' created.
-#'
-#' Priors can be set via [INLAPrior].
-#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object.
-#' @param optional_mesh A directly supplied [`INLA`] mesh (Default: \code{NULL}) -#' @param max.edge The largest allowed triangle edge length, must be in the same scale units as the coordinates. -#' Default is an educated guess (Default: \code{NULL}). -#' @param offset interpreted as a numeric factor relative to the approximate data diameter. -#' Default is an educated guess (Default: \code{NULL}). -#' @param cutoff The minimum allowed distance between points on the mesh. -#' Default is an educated guess (Default: \code{NULL}). -#' @param proj_stepsize The stepsize in coordinate units between cells of the projection grid (Default: \code{NULL}) -#' @param strategy Which approximation to use for the joint posterior. Options are \code{"auto"} ("default"), \code{"adaptative"}, -#' \code{"gaussian"}, \code{"simplified.laplace"} & \code{"laplace"}. -#' @param int.strategy Integration strategy. Options are \code{"auto"},\code{"grid"}, \code{"eb"} ("default") & \code{"ccd"}. -#' @param area Accepts a [`character`] denoting the type of area calculation to be done on the mesh (Default: \code{'gpc2'}). -#' @param timeout Specify a timeout for INLA models in sec. Afterwards it passed. -#' @param type The mode used for creating posterior predictions. Either summarizing the linear \code{"predictor"} or \code{"response"} (Default: \code{"response"}). -#' @param ... Other variables -#' @references -#' * Bachl, F. E., Lindgren, F., Borchers, D. L., & Illian, J. B. (2019). inlabru: an R package for Bayesian spatial modelling from ecological survey data. Methods in Ecology and Evolution, 10(6), 760-766. -#' * Simpson, Daniel, Janine B. Illian, S. H. Sørbye, and Håvard Rue. 2016. “Going Off Grid: Computationally Efficient Inference for Log-Gaussian Cox Processes.” Biometrika 1 (103): 49–70. -#' @source [https://inlabru-org.github.io/inlabru/articles/](https://inlabru-org.github.io/inlabru/articles/) -#' @family engine -#' @name engine_inlabru -NULL -#' @rdname engine_inlabru -#' @export -engine_inlabru <- function(x, - optional_mesh = NULL, - max.edge = NULL, - offset = NULL, - cutoff = NULL, - proj_stepsize = NULL, - strategy = "auto", - int.strategy = "eb", - area = "gpc2", - timeout = NULL, - type = "response", - ...) { - - # Check whether INLA package is available - check_package('inlabru') - if(!isNamespaceLoaded("inlabru")) { attachNamespace("inlabru");requireNamespace('inlabru') } - check_package('INLA') - if(!isNamespaceLoaded("INLA")) { attachNamespace("INLA");requireNamespace('INLA') } - - # assert that arguments are valid - assertthat::assert_that(inherits(x, "BiodiversityDistribution"), - inherits(x$background,'sf'), - inherits(optional_mesh,'inla.mesh') || is.null(optional_mesh), - is.vector(max.edge) || is.null(max.edge), - (is.vector(offset) || is.numeric(offset)) || is.null(offset), - is.numeric(cutoff) || is.null(cutoff), - is.null(timeout) || is.numeric(timeout), - is.character(strategy), - is.character(int.strategy), - is.character(area), - is.character(type), - is.null(proj_stepsize) || is.numeric(proj_stepsize) - ) - - # Match strategy - strategy <- match.arg(strategy, c('auto', 'gaussian', 'simplified.laplace', 'laplace', 'adaptive'), several.ok = FALSE) - int.strategy <- match.arg(int.strategy, c('auto', 'ccd', 'grid', 'eb'), several.ok = FALSE) - area <- match.arg(area, c("gpc", "gpc2", "km"), several.ok = FALSE) - type <- match.arg(type, c("response", "predictor"), several.ok = FALSE) - - # Set inlabru options for strategies. 
These are set globally - inlabru::bru_options_set(control.inla = list(strategy = strategy, - int.strategy = int.strategy)) - - # Set the projection mesh - if(inherits(optional_mesh,'inla.mesh')) { - # Load a provided on - mesh <- optional_mesh - # Security check for projection and if not set, use the one from background - if(is.null(mesh$crs)) mesh$crs <- sp::CRS( proj4string(region.poly) ) - - # Convert the study region - region.poly <- as(sf::st_geometry(x$background), "Spatial") - - # Calculate area - ar <- suppressWarnings( - mesh_area(mesh = mesh, region.poly = region.poly, variant = area) - ) - } else { - mesh <- new_waiver() - ar <- new_waiver() - } - - # Collate other parameters in a specific object - params <- list( - max.edge = max.edge, - offset = offset, - cutoff = cutoff, - proj_stepsize = proj_stepsize, - type = type, - area = area, - strategy = strategy, - int.strategy = int.strategy, - ... - ) - - # If time out is specified - if(!is.null(timeout)) INLA::inla.setOption(fmesher.timeout = timeout) - - # Print a message in case there is already an engine object - if(!is.Waiver(x$engine)) myLog('[Setup]','yellow','Replacing currently selected engine.') - - # Set engine in distribution object - x$set_engine( - bdproto( - "INLABRU-Engine", - Engine, - name = "", - data = list( - 'mesh' = mesh, - 'mesh.area' = ar, - 'proj_stepsize' = proj_stepsize, - 'params' = params - ), - # Function to create a mesh - create_mesh = function(self, model){ - assertthat::assert_that(is.list(model), - "background" %in% names(model)) - # Check if mesh is already present, if so use it - if(!is.Waiver(self$get_data("mesh"))) return() - # Create a new mesh based on the available data - - # Get parameters - params <- self$get_data("params") - - # Convert the study region - region.poly <- as(sf::st_geometry(model$background), "Spatial") - - # Convert to boundary object for later - suppressWarnings( - bdry <- INLA::inla.sp2segment( - sp = region.poly, - join = TRUE, - crs = INLA::inla.CRS(projargs = sp::proj4string(region.poly)) - ) - ) - bdry$loc <- INLA::inla.mesh.map(bdry$loc) - - # Get all coordinates of observations - locs <- collect_occurrencepoints(model, include_absences = FALSE) - - # Try and infer mesh parameters if not set - if(is.null(params$max.edge)){ - # A good guess here is usally a max.edge of between 1/3 to 1/5 of the spatial range. 
- max.edge <- c(diff(range(locs[,1]))/(3*5) , diff(range(locs[,1]))/(3*5) * 2) - params$max.edge <- max.edge - } - if(is.null(params$offset)){ - # Check whether the coordinate system is longlat - if( sf::st_is_longlat(bdry$crs) ){ - # Specify offset as 1/100 of the boundary distance - offset <- c( diff(range(bdry$loc[,1]))*0.01, - diff(range(bdry$loc[,1]))*0.01) - } else { - offset <- c( diff(range(bdry$loc[,1]))*0.01, - diff(range(bdry$loc[,1]))*0.01) - } - params$offset <- offset - } - - if(is.null(params$cutoff)){ - # Specify as minimum distance between y coordinates - # Thus capturing most points on this level - # otherwise set to default - val <- min(abs(diff(locs[,2]))) - cutoff <- ifelse(val == 0, 1e-12, val) - params$cutoff <- cutoff - } - - suppressWarnings( - mesh <- INLA::inla.mesh.2d( - # Point localities - loc = locs, - # Boundary object - boundary = bdry, - # Mesh Parameters - max.edge = params$max.edge, - offset = params$offset, - cutoff = params$cutoff, - # Define the CRS - crs = bdry$crs - ) - ) - # Calculate area - # ar <- suppressMessages( - # suppressWarnings( - # mesh_area(mesh = mesh, region.poly = region.poly, variant = params$area) - # ) - # ) - # 06/01/2023: This should work and is identical to inlabru::ipoints - ar <- suppressWarnings( - inlabru::ipoints(samplers = mesh)$weight |> as.vector() - ) - assertthat::assert_that(length(ar) == mesh$n) - - # Now set the output - self$set_data("mesh", mesh) - self$set_data("mesh.area", ar) - - invisible() - }, - # Generic plotting function for the mesh - plot = function(self, assess = FALSE){ - if(is.Waiver(self$get_data('mesh'))) stop("No mesh found!") - if(assess){ - # For an INLA mesh assessment - out <- INLA:::inla.mesh.assessment( - mesh = self$get_data('mesh'), - spatial.range = 3, - alpha = 2, - dims = c(300, 300) - ) - # Convert to raster stack - out <- raster::stack( - sp::SpatialPixelsDataFrame( sp::coordinates(out), data = as.data.frame(out), - proj4string = self$get_data('mesh')$crs ) - ) - - raster::plot(out[[c('sd','sd.dev','edge.len')]], - col = c("#00204D","#00336F","#39486B","#575C6D","#707173","#8A8779","#A69D75","#C4B56C","#E4CF5B","#FFEA46") - ) - } else { - INLA:::plot.inla.mesh( self$get_data('mesh') ) - } - }, - # Spatial latent function - # https://groups.google.com/g/r-inla-discussion-group/c/eqMhlbwChkQ/m/m0b0PuzL-PsJ - # Default SPDE prior - # It computes the approximate diameter of the mesh, multiplies by 0.2 to get a value for the prior median range, and then transforms it to log-kappa scale by the formula - # log(sqrt(8*nu)/range) where nu is alpha-dim/2. 
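# Editor's note (illustrative sketch, not part of the patch): with alpha = 2 on a
# two-dimensional mesh, nu = alpha - dim/2 = 1, so for a mesh of approximate
# diameter D the default prior described above becomes range0 = 0.2 * D and
# log(kappa0) = log(sqrt(8 * nu) / range0). For example, D = 100 map units gives
# range0 = 20 and log(kappa0) = log(sqrt(8) / 20), which is about -1.96.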
- calc_latent_spatial = function(self,type = 'spde', alpha = 2, - priors = NULL, - polynames = NULL, - varname = "spatial.field1", - ...){ - # Catch prior objects - if(is.null(priors) || is.Waiver(priors)) priors <- NULL - - # For calculating iCAR process - if(type == 'car'){ - # convert mesh to sf object - ns <- mesh_as_sf(self$data$mesh) - # Create adjacency matrix with queen's case - nc.nb <- spdep::poly2nb(ns, queen = TRUE) - #Convert the adjacency matrix into a file in the INLA format - adjmat <- spdep::nb2mat(nc.nb,style = "B") - adjmat <- as(adjmat, "dgTMatrix") - # adjmat <- INLA::inla.graph2matrix(nc.nb) - # Save the adjaceny matrix as output - self$data$latentspatial <- adjmat - self$data$s.index <- as.numeric(attr(nc.nb,varname)) - } else if(type=='spde'){ - # Check that everything is correctly specified - if(!is.null(priors)) if('spde' %notin% priors$varnames() ) priors <- NULL - - # Use default spde - if(is.null(priors) || is.Waiver(priors)){ - # Define PC Matern SPDE model and save - self$data$latentspatial <- INLA::inla.spde2.matern( - mesh = self$data$mesh, - alpha = alpha - ) - } else { - # Get priors - pr <- if(is.null(priors)) c(0.01, 0.05) else priors$get('spde','prior.range') - ps <- if(is.null(priors)) c(10, 0.05) else priors$get('spde','prior.sigma') - - # Define PC Matern SPDE model and save - self$data$latentspatial <- INLA::inla.spde2.pcmatern( - mesh = self$data$mesh, - alpha = alpha, - # P(Range < 1°) = 0.001 and P(sigma > 0.5) = 0.05 - prior.range = pr, prior.sigma = ps - ) - } - # Make index for spatial field - self$data$s.index <- INLA::inla.spde.make.index(name = varname, - n.spde = self$data$latentspatial$n.spde, - n.group = 1, - n.repl = 1) - # Security checks - assertthat::assert_that( - inherits(self$data$latentspatial,'inla.spde'), - length(self$data$s.index[[1]]) == self$data$mesh$n - ) - } else if(type == 'poly'){ - # Save column names of polynomial transformed coordinates - assertthat::assert_that(!is.null(polynames)) - self$data$latentspatial <- polynames - } - invisible() - }, - # Get latent spatial equation bit - # Set vars to 2 or larger to get copied spde's - get_equation_latent_spatial = function(self, method, vars = 1, separate_spde = FALSE){ - assertthat::assert_that(is.numeric(vars)) - if(method == 'spde'){ - assertthat::assert_that(inherits(self$data$latentspatial, 'inla.spde'), - msg = 'Latent spatial has not been calculated.') - # SPDE string - if(separate_spde){ - ss <- paste0("f(spatial.field",vars,", model = ",method,")") - } else { - if(vars >1){ - ss <- paste0("f(spatial.field",vars,", copy = \'spatial.field1\', model = ",method,", fixed = TRUE)") - } else { - ss <- paste0("f(spatial.field",vars,", model = ",method,")") - } - } - return(ss) - - } else if(method == 'car'){ - assertthat::assert_that(inherits(self$data$latentspatial,'dgTMatrix'), - msg = 'Neighborhood matrix has not been calculated.') - return( - # BESAG model or BYM model to specify - # BYM found to be largely similar to SPDE https://onlinelibrary.wiley.com/doi/pdf/10.1002/ece3.3081 - paste0('f(','spatial.field1',', model = "bym", graph = ','adjmat',')') - ) - } - }, - calc_integration_points = function(self, model, mode = 'stack'){ - # Mode cox process integration - if(mode == 'cp'){ - # Create integration points by using the mesh as sampler - suppressWarnings( - ips <- inlabru::ipoints( - samplers = self$get_data('mesh') - ) - ) - # Extract predictors add to integration point data - d <- get_rastervalue(coords = ips@coords, - env = model$predictors_object$get_data(df 
= FALSE), - rm.na = FALSE) - for (cov in model$predictors_names) ips@data[,cov] <- d[,cov] - ips@data$Intercept <- 1 - ips <- subset(ips, complete.cases(ips@data)) # Necessary as some integration points can fall outside land area - # Return results - return(ips) - } else if(mode == 'stack'){ - # Use INLA make stack function instead. Useful for poisson created data so that - # integration points are created as absence. Numerically inefficient though compared to cp - # FIXME: Ideally sample from a provided pseudo-background - istk <- inla_make_integration_stack(mesh = self$get_data('mesh'), - mesh.area = self$get_data("mesh.area"), - model = model, - id = "istack", - joint = FALSE) - ips <- cbind(istk$data$data, istk$effects$data) # Combine observations and stack - ips <- subset(ips, complete.cases(ips[,c("x", "y")])) # Remove NA coordinates - # Convert to sp - ips <- sp::SpatialPointsDataFrame(coords = ips[,c('x', 'y')], - data = ips[, names(ips) %notin% c('x','y')], - proj4string = self$get_data('mesh')$crs - ) - # Select only the predictor names - ips <- subset(ips, select = c("observed", "Intercept", "e", model$predictors_names)) - ips <- subset(ips, complete.cases(ips@data)) - abs_E <- ips$e; ips$e <- NULL - # Return list of result - return(list(ips = ips, E = abs_E)) - } - }, - # Main inlabru setup ---- - # Setup computation function - setup = function(self, model, settings, ...){ - assertthat::assert_that( - 'background' %in% names(model), - 'biodiversity' %in% names(model), - all( sapply(model$biodiversity, function(x) is.formula(x$equation)) ), - length(model$biodiversity)>=1, - msg = 'Some internal checks failed while setting up the model.' - ) - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Engine setup.') - - # Construct likelihoods for each entry in the dataset - lhl <- list() - for(j in 1:length(model$biodiversity)){ - # Combine observed and predictors for the - df <- cbind( - data.frame(observed = model$biodiversity[[j]]$observations[['observed']]), - model$biodiversity[[j]]$predictors - ) - # Convert to Spatial points - df <- sp::SpatialPointsDataFrame(coords = df[,c('x', 'y')], - data = df[, names(df) %notin% c('x','y')], - proj4string = self$get_data('mesh')$crs - ) - - # Options for specifying link function of likelihood - o <- inlabru::bru_options_get() - # Data type specific. Currently only binomial and poisson supported - # FIXME: Code below does not work as intended. Worked in earlier versions. To be debugged later! 
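# Editor's note: the Poisson/E workaround built by calc_integration_points() above
# approximates the log-Gaussian Cox process likelihood. The intensity integral
#   Lambda = int_D exp(eta(s)) ds
# is replaced by a weighted sum over mesh integration points, sum_k w_k * exp(eta_k),
# by treating those points as zero-count Poisson observations whose exposure E equals
# the dual-mesh weight w_k (cf. inlabru::ipoints(), whose output carries the weights).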
- # FIXME: Apparently the code works when an SPDE is added, thus cp seems to rely on the matter covs - # if(model$biodiversity[[j]]$type == 'poipo'){ - # ips <- self$calc_integration_points(model, mode = 'cp') - # - # # Log gaussian cox process - # lh <- inlabru::like(formula = update.formula(model$biodiversity[[j]]$equation, "coordinates ~ ."), - # # include = model$biodiversity[[j]]$predictors_names, - # family = "cp", - # data = df, - # # samplers = as(model$background,"Spatial"), - # domain = list(coordinates = self$get_data("mesh")), - # ips = ips, - # options = o - # ) - # assertthat::assert_that(sum(lh$response_data$BRU_response_cp>0)>2, - # msg = "Found issues with aligning coordinates within the domain most likely.") - # If not poipo but still poisson, prepare data as follows - # } else - if(model$biodiversity[[j]]$family == "poisson"){ - # Calculate integration points for PPMs and to estimation data.frame - ips <- self$calc_integration_points(model, mode = 'stack') - abs_E = ips$E; ips <- ips$ips - assertthat::assert_that(all(colnames(ips) %in% colnames(df))) - new <- sp:::rbind.SpatialPointsDataFrame( - df[,c('observed', 'Intercept', model$biodiversity[[j]]$predictors_names)], - ips[,c('observed', 'Intercept', model$biodiversity[[j]]$predictors_names)]) - # Formulate the likelihood - lh <- inlabru::like(formula = model$biodiversity[[j]]$equation, - family = model$biodiversity[[j]]$family, - data = new, # Combine presence and absence information - mesh = self$get_data('mesh'), - E = c(model$biodiversity[[j]]$expect, abs_E), # Combine Exposure variants - # include = include[[i]], # Don't need this as all variables included in equation - options = o - ) - } else if(model$biodiversity[[j]]$family == "binomial"){ - # Set likelihood to cloglog for binomial following Simpson 2016 if multiple likelihoods - if(length(model$biodiversity)>1){ - o[['control.family']] <- list(link = ifelse(model$biodiversity[[j]]$family=='binomial', 'cloglog', 'default')) - } - - # Formulate the likelihood - lh <- inlabru::like(formula = model$biodiversity[[j]]$equation, - family = model$biodiversity[[j]]$family, - data = df, # Combine presence and absence information - mesh = self$get_data('mesh'), - Ntrials = model$biodiversity[[j]]$expect, - # include = include[[i]], # Don't need this as all variables in equation are included - options = o - ) - } - # Add to list - lhl[[j]] <- lh - } - - # List of likelihoods - self$set_data("likelihoods", inlabru::like_list(lhl) ) - - # --- # - # Defining the component function - if(length(model$biodiversity)>1){ - comp <- as.formula( - paste(' ~ 0 + Intercept(1) ', - ifelse(model$biodiversity[[1]]$use_intercept, - paste("+",paste0('Intercept_', - make.names(tolower(sapply( model$biodiversity, function(x) x$name ))),'_', # Make intercept from name - sapply( model$biodiversity, function(x) x$type ),"(1)",collapse = ' + ') - ), - "" - ) - ) - ) - } else { - comp <- as.formula( - paste0( "~ Intercept(1)") - ) - } - - # Add Offset if set - if(!is.Waiver(model$offset)){ - ovn <- "spatial_offset" - comp <- update.formula(comp, - paste(c(' ~ . +', paste0(ovn,'(main = ', ovn, ', model = "offset")')), collapse = " ") - ) - } - - # --- # - # Get unified predictors from likelihoods - pn <- lapply(lhl, function(x) all.vars(x$formula) ) %>% do.call(c,.) %>% unique() - # pn <- lapply(model$biodiversity, function(x) x$predictors_names ) %>% do.call(c,.) 
%>% unique() - pn <- pn[grep("Intercept|coordinates", pn, invert = TRUE)] - assertthat::assert_that(length(pn)>0) - model$predictors_types <- model$predictors_types[which(model$predictors_types$predictors %in% pn),] - - # Add Predictors to component - for(i in 1:nrow(model$predictors_types)){ - # For numeric - if(model$predictors_types$type[i] == 'numeric' | model$predictors_types$type[i] == 'integer') { - # Built component - if(settings$get('only_linear') == FALSE){ - # if there are less than 50 unique values, create linear variable instead - if(length(unique(model$predictors[,i])) > 50){ - m <- paste0("rw1","__",model$predictors_types$predictors[i]) - } else m <- "linear" - } else { m <- 'linear' } - - # Specify priors if set - if(!is.Waiver(model$priors)){ - # If a prior has been specified - if(any(model$priors$varnames() == model$predictors_types$predictors[i])){ - vn <- model$priors$varnames()[which(model$priors$varnames() == model$predictors_types$predictors[i])] - ty <- model$priors$types()[names(vn)] - if(ty %in% c("gaussian", "normal")){ - pp <- paste0(c( - ', mean.linear = ', model$priors$get(vn)[1],', ', - 'prec.linear = ', model$priors$get(vn)[2],'' - ),collapse = "" ) - } else if(ty == "clinear"){ - pp <- paste0("hyper = list(theta = c(prior = \'clinear\', param = c(", - model$priors$get(vn)[1],", ",model$priors$get(vn)[2],")))") - m <- "clinear" - } - } else {pp <- "" } - } else { pp <- "" } - if( m!= "linear" ){ - # Could add RW effects with pc priors. PC priors are on the KL distance (difference between probability distributions), P(sigma >2)=0.05 - # Default is a loggamma prior with mu 1, 5e-05. Better would be 1, 0.5 following Caroll 2015, so we define it like this here - pp <- ', hyper = list(theta = list(prior = \'loggamma\', param = c(1, 0.5)))' - } - comp <- update.formula(comp, - paste(' ~ . +', paste0(model$predictors_types$predictors[i],'(main = ', model$predictors_types$predictors[i], - pp,', model = "',m,'")'), collapse = " ") - ) - } else if( model$predictors_types$type[i] == "factor"){ - # factor_full uses the full factor. fact_contrast uses the first level as reference - # Built component - comp <- update(comp, - paste(c(' ~ . + ', paste0(model$predictors_types$predictors[i],'(main = ', model$predictors_types$predictors[i], ', model = "factor_contrast")')), collapse = " ") - ) - } - } - - # Add spatial effect if set - if("latentspatial" %in% self$list_data() ){ - spde <- self$get_data("latentspatial") - assertthat::assert_that(inherits(spde, "inla.spde2")) - if(inherits(spde, "inla.spde") ){ - for(i in 1:length(model$biodiversity)){ - # Add spatial component term - comp <- update.formula(comp, - paste0(c("~ . 
+ "), - paste0("spatial.field", i, - "(main = coordinates,", - ifelse( grep('copy', self$get_equation_latent_spatial('spde', vars = i))==1, - " copy = \'spatial.field1\', fixed = TRUE,", - ""), - "model = spde)" - ) - ) - ) - } - } else { - # FIXME: Make this more generic so that other latent effects are supported - stop("Non-SPDE effects not yet implemented") - } - } - # Set component - self$set_data("components", comp) - - # Set number of threads via set.Options - inlabru::bru_safe_inla(quietly = TRUE) - INLA::inla.setOption(num.threads = getOption('ibis.nthread'), - blas.num.threads = getOption('ibis.nthread') - ) - - # Set any other bru options via verbosity of fitting - inlabru::bru_options_set(bru_verbose = settings$get('verbose')) - # Newer inlabru versions support quantiles - if(utils::packageVersion("inlabru") > '2.5.2'){ - inlabru::bru_options_set(quantiles = c(0.05, 0.5, 0.95)) - } - invisible() - }, - train = function(self, model, settings) { - # Check that all inputs are there - assertthat::assert_that( - inherits(settings,'Settings'), - is.list(model),length(model)>1, - # Check that model id and setting id are identical - settings$modelid == model$id, - inherits(self$get_data("likelihoods"), 'list') - ) - - # Get likelihood - likelihoods <- self$get_data("likelihoods") - - # Get model components - comp <- self$get_data("components") - - # Get params - params <- self$get_data("params") - - # Recreate non-linear variables in case they are set - if(settings$get('only_linear') == FALSE){ - # TODO: Bypass grouping for now until this has been figured out - m = INLA::inla.models() - m$latent$rw1$min.diff = NULL - assign("inla.models", m, INLA::inla.get.inlaEnv()) - - for(i in 1:nrow(model$predictors_types)){ - # if there are less than 50 unique values, create linear variable instead - if(length(unique(model$predictors[,i])) > 50){ - # Create a one-dimensional array - m <- INLA::inla.mesh.1d( - seq(min(model$predictors[,i],na.rm = TRUE), - max(model$predictors[,i],na.rm = TRUE), length.out = 100), - degree = 1) - m <- INLA::inla.spde2.matern(m) - # Internally assign - assign(x = paste0("rw1","__",model$predictors_types$predictors[i]), - value = m ) - rm(m) - } - } - } - - # Get spatial effect if existent - if("latentspatial" %in% self$list_data() ){ - spde <- self$get_data("latentspatial") - assertthat::assert_that(exists("spde"), - inherits(spde, "inla.spde2") - ) - } - # Get options - options <- inlabru::bru_options_get() - assertthat::assert_that(inlabru::bru_options_check(options)) - # -------- # - if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Starting fitting.') - - if( settings$get(what='varsel') == "reg" ){ - if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Performing incremental variable selection...') - - # Catch all variables with set priors and keep them! - if(!is.Waiver(model$priors)) keep <- as.character(model$priors$varnames()) else keep <- NULL - - te <- attr(stats::terms.formula(comp), 'term.label') - test_form <- comp - # Remove variables that are never removed - if(!is.null(keep)){ - test_form <- update.formula(test_form, paste0(". ~ . 
- ", - paste0( - grep(pattern = paste0(keep, collapse = '|'),x = te, value = TRUE ), - collapse = "-" - )) - ) - te <- te[grep(pattern = paste0(keep,collapse = '|'),x = te, invert = TRUE, fixed = TRUE )] - } - te <- te[grep('Intercept',te,ignore.case = T,invert = T)] - # --- # - # Iterate through unique combinations of variables backwards - pb <- progress::progress_bar$new(total = length(te),format = "Backwards eliminating variables... :spin [:elapsedfull]") - o <- options; o$bru_verbose <- FALSE - o$bru_max_iter <- 2 # Use only two iteration max for the variable selection - not_found <- TRUE - while(not_found) { - pb$tick() - # --- # - # Base Model # - fit <- try({ - inlabru::bru(components = test_form, - likelihoods, - options = o) - },silent = TRUE) - if("error" %in% names(fit)) {not_found <- FALSE;next()} - - results_base <- data.frame(form = deparse1(test_form), - converged = fit$ok, - waic = fit$waic$waic, - dic = fit$dic$dic, - mean.deviance = fit$dic$mean.deviance ) - results <- data.frame() - - # Formula terms - te <- attr(stats::terms.formula(test_form), 'term.label') - te_int <- te[grep('Intercept',te,ignore.case = T, invert = F)] # capture intercept(s) - te <- te[grep('Intercept',te,ignore.case = T, invert = T)] - assertthat::assert_that(length(te) > 0, length(te_int) > 0) - - # Now for each term in variable list - for(vars in te){ - # New formula - new_form <- update(test_form, paste0('~ . - ',vars )) - ll <- likelihoods - - try({fit <- inlabru::bru(components = new_form, - ll, - options = o) - },silent = TRUE) - if("error" %in% names(fit)){ - results <- rbind(results, - data.frame( form = deparse1(new_form), - converged = FALSE, - waic = NA, dic = NA, mean.deviance = NA ) - ) - } else { - results <- rbind(results, data.frame(form = deparse1(new_form), - converged = fit$ok, - waic = fit$waic$waic, - dic = fit$dic$dic, - mean.deviance = fit$dic$mean.deviance ) - ) - } - rm(fit) - } # End of loop - - if(!is.na(results_base$dic) || nrow(results) > 0) { - # Now check whether any of the new models are 'better' than the full model - if(results_base$dic <= min(results$dic, na.rm = TRUE)){ - not_found <- FALSE - best_found <- results_base$form - } else { - # Otherwise continue get best model - test_form <- as.formula(results$form[which.min(results$dic)]) - } - rm(results_base, results) - } else { - # Check whether formula is empty, if yes, set to not_found to FALSE - te <- attr(stats::terms.formula(test_form),'term.label') - if(length(te)<=4){ - not_found <- FALSE - best_found <- test_form - } - } - - } # End of While loop - # Make sure to add kept variables back - if(!is.null(keep)){ - te <- attr(stats::terms.formula(comp),'term.label') - best_found <- update.formula(best_found, paste0(". ~ . + ", - paste0( - grep(pattern = paste0(keep, collapse = '|'),x = te, value = TRUE ), - collapse = "+" - )) - ) - } - # Replace component to be tested with best found - comp <- as.formula(best_found) - } - - # --- # - # Fitting bru model - try({ - fit_bru <- inlabru::bru(components = comp, - likelihoods, - options = options) - }, silent = FALSE) - # --- # - - # Security checks - if(!exists("fit_bru")){ - stop('Model did not converge. Try to simplify structure and check priors!') - } - if(is.null(fit_bru$names.fixed)) stop('Model did not converge. 
Try to simplify structure and check priors!') - - if(!settings$get('inference_only')){ - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Starting prediction.') - - covs <- model$predictors_object$get_data(df = FALSE) - covs <- covs[[ which(names(covs) %in% fit_bru$names.fixed) ]] - - # Build coordinates - suppressWarnings( - preds <- inla_predpoints(mesh = self$get_data('mesh'), - background = model$background, - cov = covs, - proj_stepsize = self$get_data('proj_stepsize'), - spatial = TRUE - ) - ) - # Clamp? - if( settings$get("clamp") ) preds@data <- clamp_predictions(model, preds@data) - - # Set target variables to bias_value for prediction if specified - if(!is.Waiver(settings$get('bias_variable'))){ - for(i in 1:length(settings$get('bias_variable'))){ - if(settings$get('bias_variable')[i] %notin% names(preds)){ - if(getOption('ibis.setupmessages')) myLog('[Estimation]','red','Did not find bias variable in prediction object!') - next() - } - preds[[settings$get('bias_variable')[i]]] <- settings$get('bias_value')[i] - } - } - # --- # - # Define formula - if(params$type == "response"){ - # Transformation to use for prediction scale - # FIXME: This assumes no custom link function has been set! - fun <- ifelse(length(model$biodiversity) == 1 && model$biodiversity[[1]]$type == 'poipa', "logistic", "exp") - } else { - fun <- "" # Linear predictor - } - - # Get variables for inlabru - if(length(model$biodiversity)>1){ - vn <- lapply(model$biodiversity, function(x) x$predictors_names) %>% do.call(c, .) %>% unique() - ii <- paste("Intercept", - # # If multiple datasets, remove intercept - ifelse(length(model$biodiversity)>1,"+ 0", ""), - ifelse(model$biodiversity[[1]]$use_intercept, - paste("+",paste0('Intercept_', - make.names(tolower(sapply( model$biodiversity, function(x) x$name ))),'_', # Make intercept from name - sapply( model$biodiversity, function(x) x$type ),collapse = ' + ') - ), - "" - ) - ) - # Assert that variables are used in the likelihoods - assertthat::assert_that( - all( vn %in% (lapply(likelihoods, function(x) all.vars(x$formula) ) %>% do.call(c, .) %>% unique() ) ) - ) - } else { - # vn <- sapply(model$biodiversity, function(x) x$predictors_names) %>% unique() - vn <- fit_bru$names.fixed[grep('Intercept', fit_bru$names.fixed,invert = TRUE)] - ii <- "Intercept" - } - assertthat::assert_that(all( vn %in% names(preds) )) - preds <- subset(preds, select = vn ) - # Add offset if set - if(!is.Waiver(model$offset)){ - ovn <- "spatial_offset" - ofs <- paste0("", ovn," + ") - } else { ofs <- ""} - - pfo <- as.formula( - paste0("~",fun,"( ",ii, " + ", ofs, paste0(vn, collapse = " + "), - # Add spatial latent effects - ifelse("latentspatial" %in% self$list_data(), - paste("+",paste0("spatial.field",1:length(model$biodiversity),collapse = " + ")), - ""), - ")") - ) - # --- # - cores <- if(getOption("ibis.runparallel")) getOption("ibis.nthread") else NULL - # Make a prediction - suppressWarnings( - pred_bru <- inlabru:::predict.bru( - object = fit_bru, - num.threads = cores, - data = preds, - probs = c(0.05, 0.5, 0.95), - formula = pfo, - n.samples = 1000 # Pass as parameter? 
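# Editor's note: inlabru's predict() evaluates the supplied formula over n.samples
# joint draws from the approximated posterior and then summarises them, so the
# Monte Carlo error of the reported summaries shrinks roughly with 1/sqrt(n.samples);
# raising n.samples trades run time for more stable prediction surfaces.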
- ) - ) - pred_bru$cv <- pred_bru$sd / pred_bru$mean - # Get only the predicted variables of interest - if(utils::packageVersion("inlabru") <= '2.5.2'){ - # Older version where probs are ignored - prediction <- raster::stack( - pred_bru[,c("mean","sd","q0.025", "median", "q0.975", "cv")] - ) - names(prediction) <- c("mean","sd","q0.025", "median", "q0.975", "cv") - } else { - prediction <- raster::stack( - pred_bru[,c("mean","sd","q0.05", "q0.5", "q0.95", "cv")] - ) - names(prediction) <- c("mean", "sd", "q05", "q50", "q95", "cv") - } - - } else { - prediction <- NULL - } - - # Compute end of computation time - settings$set('end.time', Sys.time()) - - # Definition of INLA Model object ---- - out <- bdproto( - "INLABRU-Model", - DistributionModel, - id = model$id, - model = model, - settings = settings, - fits = list( - "fit_best" = fit_bru, - "fit_best_equation" = self$get_data("components"), - "mesh" = self$get_data('mesh'), - "spde" = self$get_data('latentspatial'), - "prediction" = prediction - ), - # Projection function - project = function(self, newdata, form = NULL, n.samples = 1000, layer = "mean"){ - assertthat::assert_that('fit_best' %in% names(self$fits), - is.data.frame(newdata) || is.matrix(newdata) || inherits(newdata,'SpatialPixelsDataFrame'), - is.null(form) || is.character(form) || is.formula(form) - ) - # Get model - mod <- self$get_data('fit_best') - model <- self$model - # Also get settings for bias values - settings <- self$settings - - # Clamp? - if( settings$get("clamp") ) newdata <- clamp_predictions(model, newdata) - - # Set target variables to bias_value for prediction if specified - if(!is.Waiver(settings$get('bias_variable'))){ - for(i in 1:length(settings$get('bias_variable'))){ - if(settings$get('bias_variable')[i] %notin% names(newdata)){ - if(getOption('ibis.setupmessages')) myLog('[Estimation]','red','Did not find bias variable in prediction object!') - next() - } - newdata[[settings$get('bias_variable')[i]]] <- settings$get('bias_value')[i] - } - } - - # If newdata is not yet a SpatialPixel object, transform - if(!inherits(newdata,'SpatialPixelsDataFrame')){ - assertthat::assert_that( - assertthat::has_name(newdata,c('x','y')) - ) - # Convert predictors to SpatialPixelsDataFrame as required for inlabru - newdata <- sp::SpatialPointsDataFrame(coords = newdata[,c('x', 'y')], - data = newdata[, names(newdata) %notin% c('x','y')], - proj4string = self$get_data('mesh')$crs - ) - newdata <- subset(newdata, complete.cases(newdata@data)) # Remove missing data - newdata <- as(newdata, 'SpatialPixelsDataFrame') - } - # Check that model variables are in prediction dataset - assertthat::assert_that( - all(mod$names.fixed[grep('Intercept', mod$names.fixed,invert = TRUE)] %in% names(newdata)) - ) - - if(is.null(form)){ - # Try and guess backtransformation - backtransf <- ifelse(mod$bru_info$lhoods[[1]]$family == 'poisson','exp','logistic') - - # Build the formula - if(length(model$biodiversity)>1){ - vn <- lapply(model$biodiversity, function(x) x$predictors_names) %>% do.call(c, .) %>% unique() - ii <- paste("Intercept + ",paste0('Intercept_', - make.names(tolower(sapply( model$biodiversity, function(x) x$name ))),'_', # Make intercept from name - sapply( model$biodiversity, function(x) x$type ),collapse = ' + ') - ) - # Assert that variables are used in the likelihoods - assertthat::assert_that( - all( vn %in% (lapply(likelihoods, function(x) all.vars(x$formula) ) %>% do.call(c, .) 
%>% unique() ) ) - ) - } else { - vn <- sapply(model$biodiversity, function(x) x$predictors_names) %>% unique() - # vn <- mod$names.fixed[grep('Intercept', fit_bru$names.fixed,invert = TRUE)] - assertthat::assert_that(all(vn %in% mod$names.fixed)) - ii <- "Intercept" - } - - form <- as.formula( - paste0("~",backtransf,"( ",ii, " + ", paste0(vn, collapse = " + "), - ifelse(length(mod$summary.spde2.blc)>0, "+ spatial.field", ""), - ")") - ) - } - - # Perform the projection - suppressWarnings( - out <- inlabru:::predict.bru( - object = mod, - data = newdata, - formula = form, - probs = c(0.05,0.5,0.95), - n.samples = n.samples - ) - ) - out$cv <- out$sd / out$mean - # Get only the predicted variables of interest - if(utils::packageVersion("inlabru") <= '2.5.2'){ - # Older version where probs are ignored - out <- raster::stack( - out[,c("mean","sd","q0.025", "median", "q0.975", "cv")] - ) - names(out) <- c("mean","sd","q0.025", "median", "q0.975", "cv") - } else { - out <- raster::stack( - out[,c("mean","sd","q0.05", "q0.5", "q0.95", "cv")] - ) - names(out) <- c("mean", "sd", "q05", "q50", "q95", "cv") - } - # Return result - return(out) - }, - # Partial response - partial = function(self, x.var, constant = NULL, variable_length = 100, values = NULL, plot = TRUE, type = "response"){ - # We use inlabru's functionalities to sample from the posterior - # a given variable. A prediction is made over a generated fitted data.frame - # Check that provided model exists and variable exist in model - mod <- self$get_data('fit_best') - model <- self$model - df <- model$biodiversity[[1]]$predictors - assertthat::assert_that(inherits(mod,'bru'), - 'model' %in% names(self), - is.character(x.var), - is.numeric(variable_length), variable_length >=1, - is.null(constant) || is.numeric(constant), - is.null(values) || is.numeric(values) - ) - - # Match variable name - if(!is.null(mod$summary.random)) vn <- names(mod$summary.random) else vn <- "" - x.var <- match.arg(x.var, c( mod$names.fixed, vn), several.ok = FALSE) - - # Make a prediction via inlabru - if(any(model$predictors_types$type=="factor")){ - rr <- sapply(df[model$predictors_types$predictors[model$predictors_types$type=="numeric"]], - function(x) range(x, na.rm = TRUE)) |> as.data.frame() - } else { - rr <- sapply(df, function(x) range(x, na.rm = TRUE)) |> as.data.frame() - } - assertthat::assert_that(nrow(rr)>1, ncol(rr)>=1) - - df_partial <- list() - # Set length out to value length to have equal coverage - if(!is.null(values)){ variable_length <- length(values) } - # Add all others as constant - if(is.null(constant)){ - for(n in names(rr)) df_partial[[n]] <- rep( mean(df[[n]], na.rm = TRUE), variable_length ) - } else { - for(n in names(rr)) df_partial[[n]] <- rep( constant, variable_length ) - } - if(!is.null(values)){ - df_partial[[x.var]] <- values - } else { - df_partial[[x.var]] <- seq(rr[1,x.var], rr[2,x.var], length.out = variable_length) - } - df_partial <- df_partial %>% as.data.frame() - - if(any(model$predictors_types$type=="factor")){ - lvl <- levels(model$predictors[[model$predictors_types$predictors[model$predictors_types$type=="factor"]]]) - df_partial[model$predictors_types$predictors[model$predictors_types$type=="factor"]] <- - factor(lvl[1], levels = lvl) - } - - ## plot the unique effect of the covariate - fun <- ifelse(length(model$biodiversity) == 1 && model$biodiversity[[1]]$type == 'poipa', "logistic", "exp") - pred_cov <- inlabru:::predict.bru(mod, - df_partial, - as.formula( paste("~ ",fun,"(", paste(mod$names.fixed,collapse 
= " + ") ,")") ), - n.samples = 100, - probs = c(0.05,0.5,0.95) - ) - pred_cov$cv <- pred_cov$sd / pred_cov$mean - - o <- pred_cov - names(o)[grep(x.var, names(o))] <- "partial_effect" - if(utils::packageVersion("inlabru") <= '2.5.2'){ - # Older version where probs are ignored - o <- subset(o, select = c("partial_effect", "mean", "sd", "median", "q0.025", "q0.975", "cv")) - names(o) <- c("partial_effect", "mean", "sd", "median", "lower", "upper", "cv") - } else { - o <- subset(o, select = c("partial_effect", "mean", "sd", "q0.05", "q0.5", "q0.95", "cv")) - names(o) <- c("partial_effect", "mean", "sd", "lower", "median", "upper", "cv") - } - - # Do plot and return result - if(plot){ - pm <- ggplot2::ggplot(data = o, ggplot2::aes(x = partial_effect, y = median, - ymin = lower, - ymax = upper) ) + - ggplot2::theme_classic() + - ggplot2::geom_ribbon(fill = "grey90") + - ggplot2::geom_line() + - ggplot2::labs(x = x.var, y = "Partial effect") - print(pm) - } - return(o %>% as.data.frame() ) - }, - # (S)partial effect - spartial = function(self, x.var, constant = NULL, plot = TRUE, type = "response"){ - # We use inlabru's functionalities to sample from the posterior - # a given variable. A prediction is made over a generated fitted data.frame - # Check that provided model exists and variable exist in model - mod <- self$get_data('fit_best') - model <- self$model - assertthat::assert_that(inherits(mod,'bru'), - 'model' %in% names(self), - is.character(x.var), - is.null(constant) || is.numeric(constant) - ) - - # Match variable name - x.var <- match.arg(x.var, mod$names.fixed, several.ok = FALSE) - - # Convert predictors to SpatialPixelsDataFrame as required for inlabru - df_partial <- sp::SpatialPointsDataFrame(coords = model$predictors[,c('x', 'y')], - data = model$predictors[, names(model$predictors) %notin% c('x','y')], - proj4string = self$get_data('mesh')$crs - ) - df_partial <- subset(df_partial, complete.cases(df_partial@data)) # Remove missing data - df_partial <- as(df_partial, 'SpatialPixelsDataFrame') - - # Add all others as constant - if(is.null(constant)){ - for(n in names(df_partial)) if(n != x.var) df_partial[[n]] <- suppressWarnings( mean(model$predictors[[n]], na.rm = TRUE) ) - } else { - for(n in names(df_partial)) if(n != x.var) df_partial[[n]] <- constant - } - if(any(model$predictors_types$type=="factor")){ - lvl <- levels(model$predictors[[model$predictors_types$predictors[model$predictors_types$type=="factor"]]]) - df_partial[[model$predictors_types$predictors[model$predictors_types$type=="factor"]]] <- - factor(lvl[1], levels = lvl) - # FIXME: Assigning the first level (usually reference) for now. 
But ideally find a way to skip factors from partial predictions - } - - fun <- ifelse(length(model$biodiversity) == 1 && model$biodiversity[[1]]$type == 'poipa', "logistic", "exp") - pred_cov <- inlabru:::predict.bru(mod, - df_partial, - as.formula( paste("~ ",fun,"( Intercept + ", x.var ,")") ), - n.samples = 100, - probs = c(0.05,0.5,0.95) - ) - pred_cov$cv <- pred_cov$sd / pred_cov$mean - - # Do plot and return result - if(plot){ - o <- pred_cov - ggplot2::ggplot() + - ggplot2::theme_classic(base_size = 18) + - inlabru:::gg(o, ggplot2::aes(fill = mean)) + - ggplot2::scale_fill_gradientn(colours = ibis_colours$divg_bluegreen) + - ggplot2::labs(x = "", y = "", title = paste0("Spartial of ", x.var)) - } - - # Depending on the package version return out - if(utils::packageVersion("inlabru") <= '2.5.2'){ - # Older version where probs are ignored - return( - raster::stack( - pred_cov[,c("mean","sd","q0.025", "median", "q0.975", "cv")] # Columns need to be adapted if quantiles are changed - ) - ) - } else { - return( - raster::stack( - pred_cov[,c("mean","sd","q0.05", "q0.5", "q0.95", "cv")] # Columns need to be adapted if quantiles are changed - ) - ) - } - }, - get_coefficients = function(self){ - # Returns a vector of the coefficients with direction/importance - cofs <- self$summary() - cofs <- subset(cofs, select = c("variable", "mean", "sd")) - names(cofs) <- c("Feature", "Beta", "Sigma") - # Remove intercept(s) - int <- grep("Intercept",cofs$Feature,ignore.case = TRUE) - if(length(int)>0) cofs <- cofs[-int,] - return(cofs) - }, - # Function to plot SPDE if existing - plot_spatial = function(self, spat = NULL, type = "response", what = "spatial.field1", ...){ - # Get mesh, domain and model - mesh <- self$get_data("mesh") - domain <- as(self$model$background, "Spatial") - mod <- self$get_data('fit_best') - type <- match.arg(type, c("response", "predictor"), several.ok = FALSE) - - assertthat::assert_that(!is.null(mod$model.random), - msg = "No spatial latent was estimated in the model!") - - if(mod$model.random == "SPDE2 model") { - assertthat::assert_that(inherits(mod,'bru'), - inherits(mesh, 'inla.mesh'), - is.null(spat) || inherits("SpatialPixelsDataFrame"), - 'model' %in% names(self), - is.character(what) - ) - # Check whether random variable exists, otherwise raise warning - if(!(what %in% names(mod$summary.random))){ - stop(paste0( - "Spatial random effect not found. 
Set 'what' to one of these: ",
-            paste0(names(mod$summary.random),collapse = " | ")
-          ))
-        }
-
-        # Predict the spatial intensity surface
-        if(is.null(spat)){
-          spat <- inlabru::pixels(mesh, mask = domain)
-        }
-        # FIXME: Does not work for other link functions
-        if(type == "response") fun <- 'exp' else fun <- ''
-
-        suppressWarnings(
-          lambda <- inlabru:::predict.bru(mod,
-                                          spat,
-                                          as.formula(paste0("~ ",fun,"(",what," + Intercept)"))
-          )
-        )
-
-        # Convert to raster stack
-        lambda <- raster::stack(lambda)
-
-        # Also get SPDE posteriors of the matern correlation and coveriance function
-        corplot <- inlabru:::plot.prediction(inlabru::spde.posterior(mod, what, what = "matern.correlation")) +
-          ggplot2::ggtitle("Matern correlation")
-        covplot <- inlabru:::plot.prediction(inlabru::spde.posterior(mod, what, what = "matern.covariance")) +
-          ggplot2::ggtitle("Matern covariance")
-        inlabru::multiplot(covplot, corplot)
-
-        return(lambda)
-      } else {
-        message("No SPDE effect found.")
-      }
-    }
-    )
-  }
-  ))
-}
+#' @include bdproto-engine.R utils-inla.R bdproto-distributionmodel.R
+NULL
+
+#' Use inlabru as engine
+#'
+#' @description Model components are specified with general inputs and mapping methods to the
+#' latent variables, and the predictors are specified via general R expressions,
+#' with separate expressions for each observation likelihood model in multi-likelihood models.
+#' The inlabru engine - similar to the [`engine_inla`] function - acts as a wrapper for [INLA::inla],
+#' although [inlabru] implements a number of convenience functions that make predictions
+#' with new data in particular much more straightforward (e.g. via posterior simulation instead of refitting).
+#' More recent versions of [inlabru] also support the addition of multiple likelihoods,
+#' thereby allowing fully integrated inference.
+#' @details
+#' All \code{INLA} engines require the specification of a mesh that needs to be provided to the
+#' \code{"optional_mesh"} parameter. Otherwise the mesh will be created based on best guesses of the
+#' data spread. A good mesh needs to have triangles as regular as possible in size and shape: equilateral.
+'
+#' [*] \code{"max.edge"}: The largest allowed triangle edge length; must be in the same scale units as the coordinates.
+#' Lower values result in a denser triangulation.
+#' [*] \code{"offset"}: The automatic extension distance of the mesh.
+#' If positive: same scale units. If negative, interpreted as a factor relative to the approximate data diameter,
+#' i.e. a value of -0.10 will add 10% of the data diameter as outer extension.
+#' [*] \code{"cutoff"}: The minimum allowed distance between points;
+#' points at a closer distance than the supplied value are replaced by a single vertex.
+#' This is critical when some points lie very close to each other, either among the point locations or in the
+#' domain boundary.
+#' [*] \code{"proj_stepsize"}: The stepsize for spatial predictions, which affects the spatial grain of any outputs
+#' created.
+#'
+#' Priors can be set via [INLAPrior].
+#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object.
+#' @param optional_mesh A directly supplied [`INLA`] mesh (Default: \code{NULL}).
+#' @param max.edge The largest allowed triangle edge length, must be in the same scale units as the coordinates.
+#' Default is an educated guess (Default: \code{NULL}).
+#' @param offset Interpreted as a numeric factor relative to the approximate data diameter.
+#' Default is an educated guess (Default: \code{NULL}).
+#' @param cutoff The minimum allowed distance between points on the mesh.
+#' Default is an educated guess (Default: \code{NULL}).
+#' @param proj_stepsize The stepsize in coordinate units between cells of the projection grid (Default: \code{NULL}).
+#' @param strategy Which approximation to use for the joint posterior. Options are \code{"auto"} ("default"), \code{"adaptive"},
+#' \code{"gaussian"}, \code{"simplified.laplace"} & \code{"laplace"}.
+#' @param int.strategy Integration strategy. Options are \code{"auto"}, \code{"grid"}, \code{"eb"} ("default") & \code{"ccd"}.
+#' @param area Accepts a [`character`] denoting the type of area calculation to be done on the mesh (Default: \code{'gpc2'}).
+#' @param timeout Specify a timeout for INLA models in seconds, after which the estimation is interrupted (Default: \code{NULL}).
+#' @param type The mode used for creating posterior predictions. Either summarizing the linear \code{"predictor"} or \code{"response"} (Default: \code{"response"}).
+#' @param ... Other variables.
+#' @references
+#' * Bachl, F. E., Lindgren, F., Borchers, D. L., & Illian, J. B. (2019). inlabru: an R package for Bayesian spatial modelling from ecological survey data. Methods in Ecology and Evolution, 10(6), 760-766.
+#' * Simpson, Daniel, Janine B. Illian, S. H. Sørbye, and Håvard Rue. 2016. “Going Off Grid: Computationally Efficient Inference for Log-Gaussian Cox Processes.” Biometrika 1 (103): 49–70.
+#' @source [https://inlabru-org.github.io/inlabru/articles/](https://inlabru-org.github.io/inlabru/articles/)
+#' @family engine
+#' @returns An [engine].
+#' @examples
+#' \dontrun{
+#' # Add inlabru as an engine
+#' x <- distribution(background) |> engine_inlabru()
+#' }
+#' @name engine_inlabru
+NULL
+#' @rdname engine_inlabru
+#' @export
+engine_inlabru <- function(x,
+                           optional_mesh = NULL,
+                           max.edge = NULL,
+                           offset = NULL,
+                           cutoff = NULL,
+                           proj_stepsize = NULL,
+                           strategy = "auto",
+                           int.strategy = "eb",
+                           area = "gpc2",
+                           timeout = NULL,
+                           type = "response",
+                           ...) {
+
+  # Check whether the inlabru and INLA packages are available
+  check_package('inlabru')
+  if(!isNamespaceLoaded("inlabru")) { attachNamespace("inlabru");requireNamespace('inlabru') }
+  check_package('INLA')
+  if(!isNamespaceLoaded("INLA")) { attachNamespace("INLA");requireNamespace('INLA') }
+
+  # Assert that arguments are valid
+  assertthat::assert_that(inherits(x, "BiodiversityDistribution"),
+                          inherits(x$background,'sf'),
+                          inherits(optional_mesh,'inla.mesh') || is.null(optional_mesh),
+                          is.vector(max.edge) || is.null(max.edge),
+                          (is.vector(offset) || is.numeric(offset)) || is.null(offset),
+                          is.numeric(cutoff) || is.null(cutoff),
+                          is.null(timeout) || is.numeric(timeout),
+                          is.character(strategy),
+                          is.character(int.strategy),
+                          is.character(area),
+                          is.character(type),
+                          is.null(proj_stepsize) || is.numeric(proj_stepsize)
+  )
+
+  # Match strategy
+  strategy <- match.arg(strategy, c('auto', 'gaussian', 'simplified.laplace', 'laplace', 'adaptive'), several.ok = FALSE)
+  int.strategy <- match.arg(int.strategy, c('auto', 'ccd', 'grid', 'eb'), several.ok = FALSE)
+  area <- match.arg(area, c("gpc", "gpc2", "km"), several.ok = FALSE)
+  type <- match.arg(type, c("response", "predictor"), several.ok = FALSE)
+
+  # Set inlabru options for strategies. These are set globally.
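+  # Note: inlabru::bru_options_set() changes session-wide defaults, so these
+  # choices also affect later bru() calls made outside of this engine.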
+  inlabru::bru_options_set(control.inla = list(strategy = strategy,
+                                               int.strategy = int.strategy))
+
+  # Set the projection mesh
+  if(inherits(optional_mesh,'inla.mesh')) {
+    # Load the provided one
+    mesh <- optional_mesh
+
+    # Convert the study region
+    region.poly <- methods::as(sf::st_geometry(x$background), "Spatial")
+
+    # Security check for projection and if not set, use the one from the background
+    if(is.null(mesh$crs)) mesh$crs <- sp::CRS( sp::proj4string(region.poly) )
+
+    # Calculate area
+    ar <- suppressWarnings(
+      mesh_area(mesh = mesh, region.poly = region.poly, variant = area)
+    )
+  } else {
+    mesh <- new_waiver()
+    ar <- new_waiver()
+  }
+
+  # Collate other parameters in a specific object
+  params <- list(
+    max.edge = max.edge,
+    offset = offset,
+    cutoff = cutoff,
+    proj_stepsize = proj_stepsize,
+    type = type,
+    area = area,
+    strategy = strategy,
+    int.strategy = int.strategy,
+    ...
+  )
+
+  # If a timeout is specified
+  if(!is.null(timeout)) INLA::inla.setOption(fmesher.timeout = timeout)
+
+  # Print a message in case there is already an engine object
+  if(!is.Waiver(x$engine)) myLog('[Setup]','yellow','Replacing currently selected engine.')
+
+  # Set engine in distribution object
+  x$set_engine(
+    bdproto(
+      "INLABRU-Engine",
+      Engine,
+      name = "",
+      data = list(
+        'mesh' = mesh,
+        'mesh.area' = ar,
+        'proj_stepsize' = proj_stepsize,
+        'params' = params
+      ),
+      # Function to create a mesh
+      create_mesh = function(self, model){
+        assertthat::assert_that(is.list(model),
+                                "background" %in% names(model))
+        # Check if mesh is already present, if so use it
+        if(!is.Waiver(self$get_data("mesh"))) return()
+        # Create a new mesh based on the available data
+
+        # Get parameters
+        params <- self$get_data("params")
+
+        # Convert the study region
+        region.poly <- methods::as(sf::st_geometry(model$background), "Spatial")
+
+        # Convert to boundary object for later
+        suppressWarnings(
+          bdry <- INLA::inla.sp2segment(
+            sp = region.poly,
+            join = TRUE,
+            crs = INLA::inla.CRS(projargs = sp::proj4string(region.poly))
+          )
+        )
+        bdry$loc <- INLA::inla.mesh.map(bdry$loc)
+
+        # Get all coordinates of observations
+        locs <- collect_occurrencepoints(model, include_absences = FALSE)
+
+        # Try and infer mesh parameters if not set
+        if(is.null(params$max.edge)){
+          # A good guess here is usually a max.edge of between 1/3 and 1/5 of the spatial range.
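+          # Illustration (hypothetical numbers): for occurrence data spanning
+          # 15 map units along the x-axis, the guess below yields an inner edge
+          # length of 15/(3*5) = 1 unit and an outer edge length of 2 units,
+          # i.e. a denser triangulation inside the domain than in the extension.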
+ max.edge <- c(diff(range(locs[,1]))/(3*5) , diff(range(locs[,1]))/(3*5) * 2) + params$max.edge <- max.edge + } + if(is.null(params$offset)){ + # Check whether the coordinate system is longlat + if( sf::st_is_longlat(bdry$crs) ){ + # Specify offset as 1/100 of the boundary distance + offset <- c( diff(range(bdry$loc[,1]))*0.01, + diff(range(bdry$loc[,1]))*0.01) + } else { + offset <- c( diff(range(bdry$loc[,1]))*0.01, + diff(range(bdry$loc[,1]))*0.01) + } + params$offset <- offset + } + + if(is.null(params$cutoff)){ + # Specify as minimum distance between y coordinates + # Thus capturing most points on this level + # otherwise set to default + val <- min(abs(diff(locs[,2]))) + cutoff <- ifelse(val == 0, 1e-12, val) + params$cutoff <- cutoff + } + + suppressWarnings( + mesh <- INLA::inla.mesh.2d( + # Point localities + loc = locs, + # Boundary object + boundary = bdry, + # Mesh Parameters + max.edge = params$max.edge, + offset = params$offset, + cutoff = params$cutoff, + # Define the CRS + crs = bdry$crs + ) + ) + # Calculate area + # ar <- suppressMessages( + # suppressWarnings( + # mesh_area(mesh = mesh, region.poly = region.poly, variant = params$area) + # ) + # ) + # 06/01/2023: This should work and is identical to inlabru::ipoints + ar <- suppressWarnings( + inlabru::ipoints(samplers = mesh)$weight |> as.vector() + ) + assertthat::assert_that(length(ar) == mesh$n) + + # Now set the output + self$set_data("mesh", mesh) + self$set_data("mesh.area", ar) + + invisible() + }, + # Generic plotting function for the mesh + plot = function(self, assess = FALSE){ + if(is.Waiver(self$get_data('mesh'))) stop("No mesh found!") + if(assess){ + # For an INLA mesh assessment + out <- INLA::inla.mesh.assessment( + mesh = self$get_data('mesh'), + spatial.range = 3, + alpha = 2, + dims = c(300, 300) + ) + # Convert to raster stack + out <- raster::stack( + sp::SpatialPixelsDataFrame( sp::coordinates(out), data = as.data.frame(out), + proj4string = self$get_data('mesh')$crs ) + ) + + raster::plot(out[[c('sd','sd.dev','edge.len')]], + col = c("#00204D","#00336F","#39486B","#575C6D","#707173","#8A8779","#A69D75","#C4B56C","#E4CF5B","#FFEA46") + ) + } else { + INLA:::plot.inla.mesh( self$get_data('mesh') ) + } + }, + # Spatial latent function + # https://groups.google.com/g/r-inla-discussion-group/c/eqMhlbwChkQ/m/m0b0PuzL-PsJ + # Default SPDE prior + # It computes the approximate diameter of the mesh, multiplies by 0.2 to get a value for the prior median range, and then transforms it to log-kappa scale by the formula + # log(sqrt(8*nu)/range) where nu is alpha-dim/2. 
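+      # Worked example of the default above (hypothetical numbers, assuming
+      # alpha = 2 on a two-dimensional mesh, so nu = alpha - dim/2 = 1): for an
+      # approximate mesh diameter of 10 units the prior median range becomes
+      # 0.2 * 10 = 2, giving log-kappa = log(sqrt(8 * 1) / 2) ~= 0.35.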
+ calc_latent_spatial = function(self,type = 'spde', alpha = 2, + priors = NULL, + polynames = NULL, + varname = "spatial.field1", + ...){ + # Catch prior objects + if(is.null(priors) || is.Waiver(priors)) priors <- NULL + + # For calculating iCAR process + if(type == 'car'){ + # convert mesh to sf object + ns <- mesh_as_sf(self$data$mesh) + # Create adjacency matrix with queen's case + nc.nb <- spdep::poly2nb(ns, queen = TRUE) + #Convert the adjacency matrix into a file in the INLA format + adjmat <- spdep::nb2mat(nc.nb,style = "B") + adjmat <- methods::as(adjmat, "dgTMatrix") + # adjmat <- INLA::inla.graph2matrix(nc.nb) + # Save the adjaceny matrix as output + self$data$latentspatial <- adjmat + self$data$s.index <- as.numeric(attr(nc.nb,varname)) + } else if(type=='spde'){ + # Check that everything is correctly specified + if(!is.null(priors)) if('spde' %notin% priors$varnames() ) priors <- NULL + + # Use default spde + if(is.null(priors) || is.Waiver(priors)){ + # Define PC Matern SPDE model and save + self$data$latentspatial <- INLA::inla.spde2.matern( + mesh = self$data$mesh, + alpha = alpha + ) + } else { + # Get priors + pr <- if(is.null(priors)) c(0.01, 0.05) else priors$get('spde','prior.range') + ps <- if(is.null(priors)) c(10, 0.05) else priors$get('spde','prior.sigma') + + # Define PC Matern SPDE model and save + self$data$latentspatial <- INLA::inla.spde2.pcmatern( + mesh = self$data$mesh, + alpha = alpha, + # P(Range < 1°) = 0.001 and P(sigma > 0.5) = 0.05 + prior.range = pr, prior.sigma = ps + ) + } + # Make index for spatial field + self$data$s.index <- INLA::inla.spde.make.index(name = varname, + n.spde = self$data$latentspatial$n.spde, + n.group = 1, + n.repl = 1) + # Security checks + assertthat::assert_that( + inherits(self$data$latentspatial,'inla.spde'), + length(self$data$s.index[[1]]) == self$data$mesh$n + ) + } else if(type == 'poly'){ + # Save column names of polynomial transformed coordinates + assertthat::assert_that(!is.null(polynames)) + self$data$latentspatial <- polynames + } + invisible() + }, + # Get latent spatial equation bit + # Set vars to 2 or larger to get copied spde's + get_equation_latent_spatial = function(self, method, vars = 1, separate_spde = FALSE){ + assertthat::assert_that(is.numeric(vars)) + if(method == 'spde'){ + assertthat::assert_that(inherits(self$data$latentspatial, 'inla.spde'), + msg = 'Latent spatial has not been calculated.') + # SPDE string + if(separate_spde){ + ss <- paste0("f(spatial.field",vars,", model = ",method,")") + } else { + if(vars >1){ + ss <- paste0("f(spatial.field",vars,", copy = \'spatial.field1\', model = ",method,", fixed = TRUE)") + } else { + ss <- paste0("f(spatial.field",vars,", model = ",method,")") + } + } + return(ss) + + } else if(method == 'car'){ + assertthat::assert_that(inherits(self$data$latentspatial,'dgTMatrix'), + msg = 'Neighborhood matrix has not been calculated.') + return( + # BESAG model or BYM model to specify + # BYM found to be largely similar to SPDE https://onlinelibrary.wiley.com/doi/pdf/10.1002/ece3.3081 + paste0('f(','spatial.field1',', model = "bym", graph = ','adjmat',')') + ) + } + }, + calc_integration_points = function(self, model, mode = 'stack'){ + # Mode cox process integration + if(mode == 'cp'){ + # Create integration points by using the mesh as sampler + suppressWarnings( + ips <- inlabru::ipoints( + samplers = self$get_data('mesh') + ) + ) + # Extract predictors add to integration point data + d <- get_rastervalue(coords = ips@coords, + env = 
model$predictors_object$get_data(df = FALSE), + rm.na = FALSE) + for (cov in model$predictors_names) ips@data[,cov] <- d[,cov] + ips@data$Intercept <- 1 + ips <- subset(ips, stats::complete.cases(ips@data)) # Necessary as some integration points can fall outside land area + # Return results + return(ips) + } else if(mode == 'stack'){ + # Use INLA make stack function instead. Useful for poisson created data so that + # integration points are created as absence. Numerically inefficient though compared to cp + # FIXME: Ideally sample from a provided pseudo-background + istk <- inla_make_integration_stack(mesh = self$get_data('mesh'), + mesh.area = self$get_data("mesh.area"), + model = model, + id = "istack", + joint = FALSE) + ips <- cbind(istk$data$data, istk$effects$data) # Combine observations and stack + ips <- subset(ips, stats::complete.cases(ips[,c("x", "y")])) # Remove NA coordinates + # Convert to sp + ips <- sp::SpatialPointsDataFrame(coords = ips[,c('x', 'y')], + data = ips[, names(ips) %notin% c('x','y')], + proj4string = self$get_data('mesh')$crs + ) + # Select only the predictor names + ips <- subset(ips, select = c("observed", "Intercept", "e", model$predictors_names)) + ips <- subset(ips, stats::complete.cases(ips@data)) + abs_E <- ips$e; ips$e <- NULL + # Return list of result + return(list(ips = ips, E = abs_E)) + } + }, + # Main inlabru setup ---- + # Setup computation function + setup = function(self, model, settings, ...){ + assertthat::assert_that( + 'background' %in% names(model), + 'biodiversity' %in% names(model), + all( sapply(model$biodiversity, function(x) is.formula(x$equation)) ), + length(model$biodiversity)>=1, + msg = 'Some internal checks failed while setting up the model.' + ) + # Messenger + if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Engine setup.') + + # Construct likelihoods for each entry in the dataset + lhl <- list() + for(j in 1:length(model$biodiversity)){ + # Combine observed and predictors for the + df <- cbind( + data.frame(observed = model$biodiversity[[j]]$observations[['observed']]), + model$biodiversity[[j]]$predictors + ) + # Convert to Spatial points + df <- sp::SpatialPointsDataFrame(coords = df[,c('x', 'y')], + data = df[, names(df) %notin% c('x','y')], + proj4string = self$get_data('mesh')$crs + ) + + # Options for specifying link function of likelihood + o <- inlabru::bru_options_get() + # Data type specific. Currently only binomial and poisson supported + # FIXME: Code below does not work as intended. Worked in earlier versions. To be debugged later! 
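+          # Note (sketch): while the "cp" route is being debugged, the Poisson
+          # branch below follows the weighted Poisson (Berman-Turner) device:
+          # observed points enter with observed = 1, mesh-based integration
+          # points with observed = 0, and E carries the quadrature weights.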
+ # FIXME: Apparently the code works when an SPDE is added, thus cp seems to rely on the matter covs + # if(model$biodiversity[[j]]$type == 'poipo'){ + # ips <- self$calc_integration_points(model, mode = 'cp') + # + # # Log gaussian cox process + # lh <- inlabru::like(formula = stats::update.formula(model$biodiversity[[j]]$equation, "coordinates ~ ."), + # # include = model$biodiversity[[j]]$predictors_names, + # family = "cp", + # data = df, + # # samplers = methods::as(model$background,"Spatial"), + # domain = list(coordinates = self$get_data("mesh")), + # ips = ips, + # options = o + # ) + # assertthat::assert_that(sum(lh$response_data$BRU_response_cp>0)>2, + # msg = "Found issues with aligning coordinates within the domain most likely.") + # If not poipo but still poisson, prepare data as follows + # } else + if(model$biodiversity[[j]]$family == "poisson"){ + # Calculate integration points for PPMs and to estimation data.frame + ips <- self$calc_integration_points(model, mode = 'stack') + abs_E = ips$E; ips <- ips$ips + assertthat::assert_that(all(colnames(ips) %in% colnames(df))) + new <- sp::rbind.SpatialPointsDataFrame( + df[,c('observed', 'Intercept', model$biodiversity[[j]]$predictors_names)], + ips[,c('observed', 'Intercept', model$biodiversity[[j]]$predictors_names)]) + # Formulate the likelihood + lh <- inlabru::like(formula = model$biodiversity[[j]]$equation, + family = model$biodiversity[[j]]$family, + data = new, # Combine presence and absence information + mesh = self$get_data('mesh'), + E = c(model$biodiversity[[j]]$expect, abs_E), # Combine Exposure variants + # include = include[[i]], # Don't need this as all variables included in equation + options = o + ) + } else if(model$biodiversity[[j]]$family == "binomial"){ + # Set likelihood to cloglog for binomial following Simpson 2016 if multiple likelihoods + if(length(model$biodiversity)>1){ + o[['control.family']] <- list(link = ifelse(model$biodiversity[[j]]$family=='binomial', 'cloglog', 'default')) + } + + # Formulate the likelihood + lh <- inlabru::like(formula = model$biodiversity[[j]]$equation, + family = model$biodiversity[[j]]$family, + data = df, # Combine presence and absence information + mesh = self$get_data('mesh'), + Ntrials = model$biodiversity[[j]]$expect, + # include = include[[i]], # Don't need this as all variables in equation are included + options = o + ) + } + # Add to list + lhl[[j]] <- lh + } + + # List of likelihoods + self$set_data("likelihoods", inlabru::like_list(lhl) ) + + # --- # + # Defining the component function + if(length(model$biodiversity)>1){ + comp <- stats::as.formula( + paste(' ~ 0 + Intercept(1) ', + ifelse(model$biodiversity[[1]]$use_intercept, + paste("+",paste0('Intercept_', + make.names(tolower(sapply( model$biodiversity, function(x) x$name ))),'_', # Make intercept from name + sapply( model$biodiversity, function(x) x$type ),"(1)",collapse = ' + ') + ), + "" + ) + ) + ) + } else { + comp <- stats::as.formula( + paste0( "~ Intercept(1)") + ) + } + + # Add Offset if set + if(!is.Waiver(model$offset)){ + ovn <- "spatial_offset" + comp <- stats::update.formula(comp, + paste(c(' ~ . +', paste0(ovn,'(main = ', ovn, ', model = "offset")')), collapse = " ") + ) + } + + # --- # + # Get unified predictors from likelihoods + pn <- lapply(lhl, function(x) all.vars(x$formula) ) |> (\(.) do.call(c,.))() |> unique() + # pn <- lapply(model$biodiversity, function(x) x$predictors_names ) |> do.call(c,.) 
|> unique() + pn <- pn[grep("Intercept|coordinates", pn, invert = TRUE)] + assertthat::assert_that(length(pn)>0) + model$predictors_types <- model$predictors_types[which(model$predictors_types$predictors %in% pn),] + + # Add Predictors to component + for(i in 1:nrow(model$predictors_types)){ + # For numeric + if(model$predictors_types$type[i] == 'numeric' | model$predictors_types$type[i] == 'integer') { + # Built component + if(settings$get('only_linear') == FALSE){ + # if there are less than 50 unique values, create linear variable instead + if(length(unique(model$predictors[,i])) > 50){ + m <- paste0("rw1","__",model$predictors_types$predictors[i]) + } else m <- "linear" + } else { m <- 'linear' } + + # Specify priors if set + if(!is.Waiver(model$priors)){ + # If a prior has been specified + if(any(model$priors$varnames() == model$predictors_types$predictors[i])){ + vn <- model$priors$varnames()[which(model$priors$varnames() == model$predictors_types$predictors[i])] + ty <- model$priors$types()[names(vn)] + if(ty %in% c("gaussian", "normal")){ + pp <- paste0(c( + ', mean.linear = ', model$priors$get(vn)[1],', ', + 'prec.linear = ', model$priors$get(vn)[2],'' + ),collapse = "" ) + } else if(ty == "clinear"){ + pp <- paste0("hyper = list(theta = c(prior = \'clinear\', param = c(", + model$priors$get(vn)[1],", ",model$priors$get(vn)[2],")))") + m <- "clinear" + } + } else {pp <- "" } + } else { pp <- "" } + if( m!= "linear" ){ + # Could add RW effects with pc priors. PC priors are on the KL distance (difference between probability distributions), P(sigma >2)=0.05 + # Default is a loggamma prior with mu 1, 5e-05. Better would be 1, 0.5 following Caroll 2015, so we define it like this here + pp <- ', hyper = list(theta = list(prior = \'loggamma\', param = c(1, 0.5)))' + } + comp <- stats::update.formula(comp, + paste(' ~ . +', paste0(model$predictors_types$predictors[i],'(main = ', model$predictors_types$predictors[i], + pp,', model = "',m,'")'), collapse = " ") + ) + } else if( model$predictors_types$type[i] == "factor"){ + # factor_full uses the full factor. fact_contrast uses the first level as reference + # Built component + comp <- update(comp, + paste(c(' ~ . + ', paste0(model$predictors_types$predictors[i],'(main = ', model$predictors_types$predictors[i], ', model = "factor_contrast")')), collapse = " ") + ) + } + } + + # Add spatial effect if set + if("latentspatial" %in% self$list_data() ){ + spde <- self$get_data("latentspatial") + assertthat::assert_that(inherits(spde, "inla.spde2")) + if(inherits(spde, "inla.spde") ){ + for(i in 1:length(model$biodiversity)){ + # Add spatial component term + comp <- stats::update.formula(comp, + paste0(c("~ . 
+ "), + paste0("spatial.field", i, + "(main = coordinates,", + ifelse( grep('copy', self$get_equation_latent_spatial('spde', vars = i))==1, + " copy = \'spatial.field1\', fixed = TRUE,", + ""), + "model = spde)" + ) + ) + ) + } + } else { + # FIXME: Make this more generic so that other latent effects are supported + stop("Non-SPDE effects not yet implemented") + } + } + # Set component + self$set_data("components", comp) + + # Set number of threads via set.Options + inlabru::bru_safe_inla(quietly = TRUE) + INLA::inla.setOption(num.threads = getOption('ibis.nthread'), + blas.num.threads = getOption('ibis.nthread') + ) + + # Set any other bru options via verbosity of fitting + inlabru::bru_options_set(bru_verbose = settings$get('verbose')) + # Newer inlabru versions support quantiles + if(utils::packageVersion("inlabru") > '2.5.2'){ + inlabru::bru_options_set(quantiles = c(0.05, 0.5, 0.95)) + } + invisible() + }, + train = function(self, model, settings) { + # Check that all inputs are there + assertthat::assert_that( + inherits(settings,'Settings'), + is.list(model),length(model)>1, + # Check that model id and setting id are identical + settings$modelid == model$id, + inherits(self$get_data("likelihoods"), 'list') + ) + + # Get likelihood + likelihoods <- self$get_data("likelihoods") + + # Get model components + comp <- self$get_data("components") + + # Get params + params <- self$get_data("params") + + # Recreate non-linear variables in case they are set + if(settings$get('only_linear') == FALSE){ + # TODO: Bypass grouping for now until this has been figured out + m = INLA::inla.models() + m$latent$rw1$min.diff = NULL + assign("inla.models", m, INLA::inla.get.inlaEnv()) + + for(i in 1:nrow(model$predictors_types)){ + # if there are less than 50 unique values, create linear variable instead + if(length(unique(model$predictors[,i])) > 50){ + # Create a one-dimensional array + m <- INLA::inla.mesh.1d( + seq(min(model$predictors[,i],na.rm = TRUE), + max(model$predictors[,i],na.rm = TRUE), length.out = 100), + degree = 1) + m <- INLA::inla.spde2.matern(m) + # Internally assign + assign(x = paste0("rw1","__",model$predictors_types$predictors[i]), + value = m ) + rm(m) + } + } + } + + # Get spatial effect if existent + if("latentspatial" %in% self$list_data() ){ + spde <- self$get_data("latentspatial") + assertthat::assert_that(exists("spde"), + inherits(spde, "inla.spde2") + ) + } + # Get options + options <- inlabru::bru_options_get() + assertthat::assert_that(inlabru::bru_options_check(options)) + # -------- # + if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Starting fitting.') + + if( settings$get(what='optim_hyperparam')){ + if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Performing incremental variable selection...') + + # Catch all variables with set priors and keep them! + if(!is.Waiver(model$priors)) keep <- as.character(model$priors$varnames()) else keep <- NULL + + te <- attr(stats::terms.formula(comp), 'term.label') + test_form <- comp + # Remove variables that are never removed + if(!is.null(keep)){ + test_form <- stats::update.formula(test_form, paste0(". ~ . 
- ", + paste0( + grep(pattern = paste0(keep, collapse = '|'),x = te, value = TRUE ), + collapse = "-" + )) + ) + te <- te[grep(pattern = paste0(keep,collapse = '|'),x = te, invert = TRUE, fixed = TRUE )] + } + te <- te[grep('Intercept',te,ignore.case = T,invert = T)] + # --- # + # Iterate through unique combinations of variables backwards + pb <- progress::progress_bar$new(total = length(te),format = "Backwards eliminating variables... :spin [:elapsedfull]") + o <- options; o$bru_verbose <- FALSE + o$bru_max_iter <- 2 # Use only two iteration max for the variable selection + not_found <- TRUE + while(not_found) { + pb$tick() + # --- # + # Base Model # + fit <- try({ + inlabru::bru(components = test_form, + likelihoods, + options = o) + },silent = TRUE) + if("error" %in% names(fit)) {not_found <- FALSE;next()} + + results_base <- data.frame(form = deparse1(test_form), + converged = fit$ok, + waic = fit$waic$waic, + dic = fit$dic$dic, + mean.deviance = fit$dic$mean.deviance ) + results <- data.frame() + + # Formula terms + te <- attr(stats::terms.formula(test_form), 'term.label') + te_int <- te[grep('Intercept',te,ignore.case = T, invert = F)] # capture intercept(s) + te <- te[grep('Intercept',te,ignore.case = T, invert = T)] + assertthat::assert_that(length(te) > 0, length(te_int) > 0) + + # Now for each term in variable list + for(vars in te){ + # New formula + new_form <- update(test_form, paste0('~ . - ',vars )) + ll <- likelihoods + + try({fit <- inlabru::bru(components = new_form, + ll, + options = o) + },silent = TRUE) + if("error" %in% names(fit)){ + results <- rbind(results, + data.frame( form = deparse1(new_form), + converged = FALSE, + waic = NA, dic = NA, mean.deviance = NA ) + ) + } else { + results <- rbind(results, data.frame(form = deparse1(new_form), + converged = fit$ok, + waic = fit$waic$waic, + dic = fit$dic$dic, + mean.deviance = fit$dic$mean.deviance ) + ) + } + rm(fit) + } # End of loop + + if(!is.na(results_base$dic) || nrow(results) > 0) { + # Now check whether any of the new models are 'better' than the full model + if(results_base$dic <= min(results$dic, na.rm = TRUE)){ + not_found <- FALSE + best_found <- results_base$form + } else { + # Otherwise continue get best model + test_form <- stats::as.formula(results$form[which.min(results$dic)]) + } + rm(results_base, results) + } else { + # Check whether formula is empty, if yes, set to not_found to FALSE + te <- attr(stats::terms.formula(test_form),'term.label') + if(length(te)<=4){ + not_found <- FALSE + best_found <- test_form + } + } + + } # End of While loop + # Make sure to add kept variables back + if(!is.null(keep)){ + te <- attr(stats::terms.formula(comp),'term.label') + best_found <- stats::update.formula(best_found, paste0(". ~ . + ", + paste0( + grep(pattern = paste0(keep, collapse = '|'),x = te, value = TRUE ), + collapse = "+" + )) + ) + } + # Replace component to be tested with best found + comp <- stats::as.formula(best_found) + } + + # --- # + # Fitting bru model + try({ + fit_bru <- inlabru::bru(components = comp, + likelihoods, + options = options) + }, silent = FALSE) + # --- # + + # Security checks + if(!exists("fit_bru")){ + stop('Model did not converge. Try to simplify structure and check priors!') + } + if(is.null(fit_bru$names.fixed)) stop('Model did not converge. 
Try to simplify structure and check priors!') + + if(!settings$get('inference_only')){ + # Messenger + if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Starting prediction.') + + covs <- model$predictors_object$get_data(df = FALSE) + covs <- covs[[ which(names(covs) %in% fit_bru$names.fixed) ]] + + # Build coordinates + suppressWarnings( + preds <- inla_predpoints(mesh = self$get_data('mesh'), + background = model$background, + cov = covs, + proj_stepsize = self$get_data('proj_stepsize'), + spatial = TRUE + ) + ) + # Clamp? + if( settings$get("clamp") ) preds@data <- clamp_predictions(model, preds@data) + + # Set target variables to bias_value for prediction if specified + if(!is.Waiver(settings$get('bias_variable'))){ + for(i in 1:length(settings$get('bias_variable'))){ + if(settings$get('bias_variable')[i] %notin% names(preds)){ + if(getOption('ibis.setupmessages')) myLog('[Estimation]','red','Did not find bias variable in prediction object!') + next() + } + preds[[settings$get('bias_variable')[i]]] <- settings$get('bias_value')[i] + } + } + # --- # + # Define formula + if(params$type == "response"){ + # Transformation to use for prediction scale + # FIXME: This assumes no custom link function has been set! + fun <- ifelse(length(model$biodiversity) == 1 && model$biodiversity[[1]]$type == 'poipa', "logistic", "exp") + } else { + fun <- "" # Linear predictor + } + + # Get variables for inlabru + if(length(model$biodiversity)>1){ + vn <- lapply(model$biodiversity, function(x) x$predictors_names) |> (\(.) do.call(c, .))() |> unique() + ii <- paste("Intercept", + # # If multiple datasets, remove intercept + ifelse(length(model$biodiversity)>1,"+ 0", ""), + ifelse(model$biodiversity[[1]]$use_intercept, + paste("+",paste0('Intercept_', + make.names(tolower(sapply( model$biodiversity, function(x) x$name ))),'_', # Make intercept from name + sapply( model$biodiversity, function(x) x$type ),collapse = ' + ') + ), + "" + ) + ) + # Assert that variables are used in the likelihoods + assertthat::assert_that( + all( vn %in% (lapply(likelihoods, function(x) all.vars(x$formula) ) |> (\(.) do.call(c, .))() |> unique() ) ) + ) + } else { + # vn <- sapply(model$biodiversity, function(x) x$predictors_names) |> unique() + vn <- fit_bru$names.fixed[grep('Intercept', fit_bru$names.fixed,invert = TRUE)] + ii <- "Intercept" + } + assertthat::assert_that(all( vn %in% names(preds) )) + preds <- subset(preds, select = vn ) + # Add offset if set + if(!is.Waiver(model$offset)){ + ovn <- "spatial_offset" + ofs <- paste0("", ovn," + ") + } else { ofs <- ""} + + pfo <- stats::as.formula( + paste0("~",fun,"( ",ii, " + ", ofs, paste0(vn, collapse = " + "), + # Add spatial latent effects + ifelse("latentspatial" %in% self$list_data(), + paste("+",paste0("spatial.field",1:length(model$biodiversity),collapse = " + ")), + ""), + ")") + ) + # --- # + cores <- if(getOption("ibis.runparallel")) getOption("ibis.nthread") else NULL + # Make a prediction + suppressWarnings( + pred_bru <- inlabru:::predict.bru( + object = fit_bru, + num.threads = cores, + data = preds, + probs = c(0.05, 0.5, 0.95), + formula = pfo, + n.samples = 1000 # Pass as parameter? 
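+              # Note: n.samples sets the number of posterior draws used to
+              # summarise the prediction; more draws reduce Monte-Carlo error
+              # at a roughly linear increase in runtime.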
+ ) + ) + pred_bru$cv <- pred_bru$sd / pred_bru$mean + # Get only the predicted variables of interest + if(utils::packageVersion("inlabru") <= '2.5.2'){ + # Older version where probs are ignored + prediction <- raster::stack( + pred_bru[,c("mean","sd","q0.025", "median", "q0.975", "cv")] + ) + names(prediction) <- c("mean","sd","q0.025", "median", "q0.975", "cv") + } else { + prediction <- raster::stack( + pred_bru[,c("mean","sd","q0.05", "q0.5", "q0.95", "cv")] + ) + names(prediction) <- c("mean", "sd", "q05", "q50", "q95", "cv") + } + + } else { + prediction <- NULL + } + + # Compute end of computation time + settings$set('end.time', Sys.time()) + + # Definition of INLA Model object ---- + out <- bdproto( + "INLABRU-Model", + DistributionModel, + id = model$id, + model = model, + settings = settings, + fits = list( + "fit_best" = fit_bru, + "fit_best_equation" = self$get_data("components"), + "mesh" = self$get_data('mesh'), + "spde" = self$get_data('latentspatial'), + "prediction" = prediction + ), + # Projection function + project = function(self, newdata, form = NULL, n.samples = 1000, layer = "mean"){ + assertthat::assert_that('fit_best' %in% names(self$fits), + is.data.frame(newdata) || is.matrix(newdata) || inherits(newdata,'SpatialPixelsDataFrame'), + is.null(form) || is.character(form) || is.formula(form) + ) + # Get model + mod <- self$get_data('fit_best') + model <- self$model + # Also get settings for bias values + settings <- self$settings + + # Clamp? + if( settings$get("clamp") ) newdata <- clamp_predictions(model, newdata) + + # Set target variables to bias_value for prediction if specified + if(!is.Waiver(settings$get('bias_variable'))){ + for(i in 1:length(settings$get('bias_variable'))){ + if(settings$get('bias_variable')[i] %notin% names(newdata)){ + if(getOption('ibis.setupmessages')) myLog('[Estimation]','red','Did not find bias variable in prediction object!') + next() + } + newdata[[settings$get('bias_variable')[i]]] <- settings$get('bias_value')[i] + } + } + + # If newdata is not yet a SpatialPixel object, transform + if(!inherits(newdata,'SpatialPixelsDataFrame')){ + assertthat::assert_that( + assertthat::has_name(newdata,c('x','y')) + ) + # Convert predictors to SpatialPixelsDataFrame as required for inlabru + newdata <- sp::SpatialPointsDataFrame(coords = newdata[,c('x', 'y')], + data = newdata[, names(newdata) %notin% c('x','y')], + proj4string = self$get_data('mesh')$crs + ) + newdata <- subset(newdata, stats::complete.cases(newdata@data)) # Remove missing data + newdata <- methods::as(newdata, 'SpatialPixelsDataFrame') + } + # Check that model variables are in prediction dataset + assertthat::assert_that( + all(mod$names.fixed[grep('Intercept', mod$names.fixed,invert = TRUE)] %in% names(newdata)) + ) + + if(is.null(form)){ + # Try and guess backtransformation + backtransf <- ifelse(mod$bru_info$lhoods[[1]]$family == 'poisson','exp','logistic') + + # Build the formula + if(length(model$biodiversity)>1){ + vn <- lapply(model$biodiversity, function(x) x$predictors_names) |> (\(.) do.call(c, .))() |> unique() + ii <- paste("Intercept + ",paste0('Intercept_', + make.names(tolower(sapply( model$biodiversity, function(x) x$name ))),'_', # Make intercept from name + sapply( model$biodiversity, function(x) x$type ),collapse = ' + ') + ) + # Assert that variables are used in the likelihoods + assertthat::assert_that( + all( vn %in% (lapply(likelihoods, function(x) all.vars(x$formula) ) |> (\(.) 
do.call(c, .))() |> unique() ) ) + ) + } else { + vn <- sapply(model$biodiversity, function(x) x$predictors_names) |> unique() + # vn <- mod$names.fixed[grep('Intercept', fit_bru$names.fixed,invert = TRUE)] + assertthat::assert_that(all(vn %in% mod$names.fixed)) + ii <- "Intercept" + } + + form <- stats::as.formula( + paste0("~",backtransf,"( ",ii, " + ", paste0(vn, collapse = " + "), + ifelse(length(mod$summary.spde2.blc)>0, "+ spatial.field", ""), + ")") + ) + } + + # Perform the projection + suppressWarnings( + out <- inlabru:::predict.bru( + object = mod, + data = newdata, + formula = form, + probs = c(0.05,0.5,0.95), + n.samples = n.samples + ) + ) + out$cv <- out$sd / out$mean + # Get only the predicted variables of interest + if(utils::packageVersion("inlabru") <= '2.5.2'){ + # Older version where probs are ignored + out <- raster::stack( + out[,c("mean","sd","q0.025", "median", "q0.975", "cv")] + ) + names(out) <- c("mean","sd","q0.025", "median", "q0.975", "cv") + } else { + out <- raster::stack( + out[,c("mean","sd","q0.05", "q0.5", "q0.95", "cv")] + ) + names(out) <- c("mean", "sd", "q05", "q50", "q95", "cv") + } + # Return result + return(out) + }, + # Partial response + partial = function(self, x.var, constant = NULL, variable_length = 100, values = NULL, plot = TRUE, type = "response"){ + # We use inlabru's functionalities to sample from the posterior + # a given variable. A prediction is made over a generated fitted data.frame + # Check that provided model exists and variable exist in model + mod <- self$get_data('fit_best') + model <- self$model + df <- model$biodiversity[[1]]$predictors + assertthat::assert_that(inherits(mod,'bru'), + 'model' %in% names(self), + is.character(x.var), + is.numeric(variable_length), variable_length >=1, + is.null(constant) || is.numeric(constant), + is.null(values) || is.numeric(values) + ) + + # Match variable name + if(!is.null(mod$summary.random)) vn <- names(mod$summary.random) else vn <- "" + x.var <- match.arg(x.var, c( mod$names.fixed, vn), several.ok = FALSE) + + # Make a prediction via inlabru + if(any(model$predictors_types$type=="factor")){ + rr <- sapply(df[model$predictors_types$predictors[model$predictors_types$type=="numeric"]], + function(x) range(x, na.rm = TRUE)) |> as.data.frame() + } else { + rr <- sapply(df, function(x) range(x, na.rm = TRUE)) |> as.data.frame() + } + assertthat::assert_that(nrow(rr)>1, ncol(rr)>=1) + + df_partial <- list() + # Set length out to value length to have equal coverage + if(!is.null(values)){ variable_length <- length(values) } + # Add all others as constant + if(is.null(constant)){ + for(n in names(rr)) df_partial[[n]] <- rep( mean(df[[n]], na.rm = TRUE), variable_length ) + } else { + for(n in names(rr)) df_partial[[n]] <- rep( constant, variable_length ) + } + if(!is.null(values)){ + df_partial[[x.var]] <- values + } else { + df_partial[[x.var]] <- seq(rr[1,x.var], rr[2,x.var], length.out = variable_length) + } + df_partial <- df_partial |> as.data.frame() + + if(any(model$predictors_types$type=="factor")){ + lvl <- levels(model$predictors[[model$predictors_types$predictors[model$predictors_types$type=="factor"]]]) + df_partial[model$predictors_types$predictors[model$predictors_types$type=="factor"]] <- + factor(lvl[1], levels = lvl) + } + + ## plot the unique effect of the covariate + fun <- ifelse(length(model$biodiversity) == 1 && model$biodiversity[[1]]$type == 'poipa', "logistic", "exp") + pred_cov <- inlabru:::predict.bru(mod, + df_partial, + stats::as.formula( paste("~ ",fun,"(", 
paste(mod$names.fixed,collapse = " + ") ,")") ), + n.samples = 100, + probs = c(0.05,0.5,0.95) + ) + pred_cov$cv <- pred_cov$sd / pred_cov$mean + + o <- pred_cov + names(o)[grep(x.var, names(o))] <- "partial_effect" + if(utils::packageVersion("inlabru") <= '2.5.2'){ + # Older version where probs are ignored + o <- subset(o, select = c("partial_effect", "mean", "sd", "median", "q0.025", "q0.975", "cv")) + names(o) <- c("partial_effect", "mean", "sd", "median", "lower", "upper", "cv") + } else { + o <- subset(o, select = c("partial_effect", "mean", "sd", "q0.05", "q0.5", "q0.95", "cv")) + names(o) <- c("partial_effect", "mean", "sd", "lower", "median", "upper", "cv") + } + + # Do plot and return result + if(plot){ + pm <- ggplot2::ggplot(data = o, ggplot2::aes(x = partial_effect, y = median, + ymin = lower, + ymax = upper) ) + + ggplot2::theme_classic() + + ggplot2::geom_ribbon(fill = "grey90") + + ggplot2::geom_line() + + ggplot2::labs(x = x.var, y = "Partial effect") + print(pm) + } + return(o |> as.data.frame() ) + }, + # (S)partial effect + spartial = function(self, x.var, constant = NULL, plot = TRUE, type = "response"){ + # We use inlabru's functionalities to sample from the posterior + # a given variable. A prediction is made over a generated fitted data.frame + # Check that provided model exists and variable exist in model + mod <- self$get_data('fit_best') + model <- self$model + assertthat::assert_that(inherits(mod,'bru'), + 'model' %in% names(self), + is.character(x.var), + is.null(constant) || is.numeric(constant) + ) + + # Match variable name + x.var <- match.arg(x.var, mod$names.fixed, several.ok = FALSE) + + # Convert predictors to SpatialPixelsDataFrame as required for inlabru + df_partial <- sp::SpatialPointsDataFrame(coords = model$predictors[,c('x', 'y')], + data = model$predictors[, names(model$predictors) %notin% c('x','y')], + proj4string = self$get_data('mesh')$crs + ) + df_partial <- subset(df_partial, stats::complete.cases(df_partial@data)) # Remove missing data + df_partial <- methods::as(df_partial, 'SpatialPixelsDataFrame') + + # Add all others as constant + if(is.null(constant)){ + for(n in names(df_partial)) if(n != x.var) df_partial[[n]] <- suppressWarnings( mean(model$predictors[[n]], na.rm = TRUE) ) + } else { + for(n in names(df_partial)) if(n != x.var) df_partial[[n]] <- constant + } + if(any(model$predictors_types$type=="factor")){ + lvl <- levels(model$predictors[[model$predictors_types$predictors[model$predictors_types$type=="factor"]]]) + df_partial[[model$predictors_types$predictors[model$predictors_types$type=="factor"]]] <- + factor(lvl[1], levels = lvl) + # FIXME: Assigning the first level (usually reference) for now. 
But ideally find a way to skip factors from partial predictions + } + + fun <- ifelse(length(model$biodiversity) == 1 && model$biodiversity[[1]]$type == 'poipa', "logistic", "exp") + pred_cov <- inlabru:::predict.bru(mod, + df_partial, + stats::as.formula( paste("~ ",fun,"( Intercept + ", x.var ,")") ), + n.samples = 100, + probs = c(0.05,0.5,0.95) + ) + pred_cov$cv <- pred_cov$sd / pred_cov$mean + + # Do plot and return result + if(plot){ + o <- pred_cov + ggplot2::ggplot() + + ggplot2::theme_classic(base_size = 18) + + inlabru::gg(o, ggplot2::aes(fill = mean)) + + ggplot2::scale_fill_gradientn(colours = ibis_colours$divg_bluegreen) + + ggplot2::labs(x = "", y = "", title = paste0("Spartial of ", x.var)) + } + + # Depending on the package version return out + if(utils::packageVersion("inlabru") <= '2.5.2'){ + # Older version where probs are ignored + return( + raster::stack( + pred_cov[,c("mean","sd","q0.025", "median", "q0.975", "cv")] # Columns need to be adapted if quantiles are changed + ) + ) + } else { + return( + raster::stack( + pred_cov[,c("mean","sd","q0.05", "q0.5", "q0.95", "cv")] # Columns need to be adapted if quantiles are changed + ) + ) + } + }, + get_coefficients = function(self){ + # Returns a vector of the coefficients with direction/importance + cofs <- self$summary() + cofs <- subset(cofs, select = c("variable", "mean", "sd")) + names(cofs) <- c("Feature", "Beta", "Sigma") + # Remove intercept(s) + int <- grep("Intercept",cofs$Feature,ignore.case = TRUE) + if(length(int)>0) cofs <- cofs[-int,] + return(cofs) + }, + # Function to plot SPDE if existing + plot_spatial = function(self, spat = NULL, type = "response", what = "spatial.field1", ...){ + # Get mesh, domain and model + mesh <- self$get_data("mesh") + domain <- methods::as(self$model$background, "Spatial") + mod <- self$get_data('fit_best') + type <- match.arg(type, c("response", "predictor"), several.ok = FALSE) + + assertthat::assert_that(!is.null(mod$model.random), + msg = "No spatial latent was estimated in the model!") + + if(mod$model.random == "SPDE2 model") { + assertthat::assert_that(inherits(mod,'bru'), + inherits(mesh, 'inla.mesh'), + is.null(spat) || inherits("SpatialPixelsDataFrame"), + 'model' %in% names(self), + is.character(what) + ) + # Check whether random variable exists, otherwise raise warning + if(!(what %in% names(mod$summary.random))){ + stop(paste0( + "Spatial random effect not found. 
Set 'what' to one of these: ", + paste0(names(mod$summary.random),collapse = " | ") + )) + } + + # Predict the spatial intensity surface + if(is.null(spat)){ + spat <- inlabru::pixels(mesh, mask = domain) + } + # FIXME: Does not work for other link functions + if(type == "response") fun <- 'exp' else fun <- '' + + suppressWarnings( + lambda <- inlabru:::predict.bru(mod, + spat, + stats::as.formula(paste0("~ ",fun,"(",what," + Intercept)")) + ) + ) + + # Convert to raster stack + lambda <- raster::stack(lambda) + + # Also get SPDE posteriors of the matern correlation and coveriance function + corplot <- inlabru:::plot.prediction(inlabru::spde.posterior(mod, what, what = "matern.correlation")) + + ggplot2::ggtitle("Matern correlation") + covplot <- inlabru:::plot.prediction(inlabru::spde.posterior(mod, what, what = "matern.covariance")) + + ggplot2::ggtitle("Matern covariance") + inlabru::multiplot(covplot, corplot) + + return(lambda) + } else { + message("No SPDE effect found.") + } + } + ) + } + )) +} diff --git a/R/engine_stan.R b/R/engine_stan.R index d6c0ecd6..1c3c0a69 100644 --- a/R/engine_stan.R +++ b/R/engine_stan.R @@ -1,837 +1,843 @@ -#' @include bdproto-engine.R bdproto-distributionmodel.R -NULL -#' Use Stan as engine -#' -#' @description Stan is probabilistic programming language that can be used to -#' specify most types of statistical linear and non-linear regression models. -#' Stan provides full Bayesian inference for continuous-variable models through Markov chain Monte Carlo methods -#' such as the No-U-Turn sampler, an adaptive form of Hamiltonian Monte Carlo sampling. -#' Stan code has to be written separately and this function acts as compiler to -#' build the stan-model. -#' **Requires the [cmdstanr] package to be installed!** -#' @details -#' By default the posterior is obtained through sampling, however stan also supports -#' approximate inference forms through penalized maximum likelihood estimation (see Carpenter et al. 2017). -#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object. -#' @param chains A positive [`integer`] specifying the number of Markov chains (Default: \code{4} chains). -#' @param iter A positive [`integer`] specifying the number of iterations for each chain (including warmup). (Default: \code{2000}). -#' @param warmup A positive [`integer`] specifying the number of warmup (aka burnin) iterations per chain. -#' If step-size adaptation is on (Default: \code{TRUE}), this also controls the number of iterations for which -#' adaptation is run (and hence these warmup samples should not be used for inference). -#' The number of warmup iterations should be smaller than \code{iter} and the default is \code{iter/2}. -#' @param cores If set to NULL take values from specified ibis option \code{getOption('ibis.nthread')}. -#' @param init Initial values for parameters (Default: \code{'random'}). Can also be specified as [list] (see: [`rstan::stan`]) -#' @param algorithm Mode used to sample from the posterior. Available options are \code{"sampling"}, \code{"optimize"}, -#' or \code{"variational"}. -#' See [`cmdstanr`] package for more details. (Default: \code{"sampling"}). -#' @param control See [`rstan::stan`] for more details on specifying the controls. -#' @param type The mode used for creating posterior predictions. Either summarizing the linear \code{"predictor"} or \code{"response"} (Default: \code{"response"}). -#' @param ... 
Other variables -#' @seealso [rstan], [cmdstanr] -#' @note -#' The function \code{obj$stancode()} can be used to print out the stancode of the model. -#' @references -#' * Jonah Gabry and Rok Češnovar (2021). cmdstanr: R Interface to 'CmdStan'. https://mc-stan.org/cmdstanr, https://discourse.mc-stan.org. -#' * Carpenter, B., Gelman, A., Hoffman, M. D., Lee, D., Goodrich, B., Betancourt, M., ... & Riddell, A. (2017). Stan: A probabilistic programming language. Journal of statistical software, 76(1), 1-32. -#' * Piironen, J., & Vehtari, A. (2017). Sparsity information and regularization in the horseshoe and other shrinkage priors. Electronic Journal of Statistics, 11(2), 5018-5051. -#' @family engine -#' @name engine_stan -NULL -#' @rdname engine_stan -#' @export -engine_stan <- function(x, - chains = 4, - iter = 2000, - warmup = floor(iter/2), - init = "random", - cores = getOption("ibis.nthread"), - algorithm = 'sampling', - control = list(adapt_delta = 0.95), - type = "response", - ...) { - # Check whether INLA package is available - check_package('rstan') - if(!isNamespaceLoaded("rstan")) { attachNamespace("rstan");requireNamespace('rstan') } - stan_check_cmd(install = TRUE) - check_package("cmdstanr") - assertthat::assert_that( cmdstanr::cmdstan_version()>"2.26.0") - - # assert that arguments are valid - assertthat::assert_that(inherits(x, "BiodiversityDistribution"), - inherits(x$background,'sf') - ) - # Other checks of parameters - assertthat::assert_that( - is.numeric(chains), is.numeric(iter), is.numeric(warmup), - is.null(cores) || is.numeric(cores), - is.character(init) || is.list(init), - is.null(control) || is.list(control), - is.character(algorithm), - msg = 'Input parameters wrongly specified!' - ) - # Match algorithm and posterior prediction type - algorithm <- match.arg(algorithm, c("sampling", "optimize", "variational"),several.ok = FALSE) - type <- match.arg(type, c("response", "predictor"), several.ok = FALSE) - if(is.null(cores)) cores <- getOption('ibis.nthread') - - # Create a background raster - if(is.Waiver(x$predictors)){ - # Create from background - template <- raster::raster( - ext = raster::extent(x$background), - crs = raster::projection(x$background), - res = c(diff( (sf::st_bbox(x$background)[c(1,3)]) ) / 100, # Simplified assumption for resolution - diff( (sf::st_bbox(x$background)[c(1,3)]) ) / 100 - ) - ) - } else { - # If predictor existing, use them - template <- emptyraster(x$predictors$get_data() ) - } - # Burn in the background - template <- raster::rasterize(x$background, template, field = 0) - - # Set engine in distribution object - x$set_engine( - bdproto( - "STAN-Engine", - Engine, - name = "", - data = list( - 'template' = template - ), - # Stan options - stan_param = list( - chains = chains, iter = iter, - warmup = warmup, init = init, - cores = cores, algorithm = algorithm, - control = control, - type = type - ), - # Function to respecify the control parameters - set_control = function(self, - chains = 4, - iter = 2000, - warmup = 500, - init = "random", - cores = NULL, - control = NULL){ - - # Overwrite existing - self$stan_param <- list( - chains = chains, iter = iter, - warmup = warmup, init = init, - cores = cores, algorithm = algorithm, - control = control - ) - - }, - # Spatial latent effect - get_equation_latent_spatial = function(self){ - return( NULL ) - }, - # Setup a model - setup = function(self, model, settings = NULL, ...){ - # Simple security checks - assertthat::assert_that( - assertthat::has_name(model, 'background'), - 
assertthat::has_name(model, 'biodiversity'), - inherits(settings,'Settings') || is.null(settings), - nrow(model$predictors) == ncell(self$get_data('template')) - ) - # Check that all stan parameters are appropriately set - assertthat::assert_that( - is.numeric(self$stan_param$chains), - is.numeric(self$stan_param$iter), - is.numeric(self$stan_param$warmup) - ) - # Set cores - options(mc.cores = self$stan_param$cores) - - # FIXME: Stan should handle factors directly. For now outsourced to split up - if(any(model$predictors_types$type=="factor")){ - vf <- model$predictors_types$predictors[model$predictors_types$type=="factor"] - for(k in vf){ - o <- explode_factor(model$predictors[[k]],name = k) - model$predictors <- cbind(model$predictors, o) - model$predictors_names <- c(model$predictors_names, colnames(o)) - model$predictors_types <- rbind(model$predictors_types, - data.frame(predictors = colnames(o), type = "numeric") ) - # Finally remove the original column from the predictor object - model$predictors[[k]] <- NULL - model$predictors_names <- model$predictors_names[-which( model$predictors_names == k )] - model$predictors_types <- subset(model$predictors_types, subset = predictors != k) - # Explode the columns in the raster object - model$predictors_object$data <- raster::addLayer( - model$predictors_object$data, - explode_factorized_raster(model$predictors_object$data[[k]]) - ) - model$predictors_object$data <- raster::dropLayer(model$predictors_object$data, k) - } - } - - # Stan procedure - First add integration points to all poipo datasets - # FIXME: Possibly outsoure this across methods - for(i in 1:length(model$biodiversity)){ - - # If there any factor variables split them per type and explode them - if(any(model$biodiversity[[i]]$predictors_types$type=="factor")){ - vf <- model$biodiversity[[i]]$predictors_types$predictors[model$biodiversity[[i]]$predictors_types$type=="factor"] - fv <- model$biodiversity[[i]]$predictors[vf] - for(k in 1:ncol(fv)){ - o <- explode_factor(fv[,k],name = colnames(fv)[k]) - # Add - model$biodiversity[[i]]$predictors <- cbind(model$biodiversity[[i]]$predictors, o) - model$biodiversity[[i]]$predictors_names <- c(model$biodiversity[[i]]$predictors_names, colnames(o)) - model$biodiversity[[i]]$predictors_types <- rbind(model$biodiversity[[i]]$predictors_types, - data.frame(predictors = colnames(o), type = "numeric") ) - # Finally remove the original column from the predictor object - model$biodiversity[[i]]$predictors[[colnames(fv)[k]]] <- NULL - model$biodiversity[[i]]$predictors_names <- model$biodiversity[[i]]$predictors_names[-which( model$biodiversity[[i]]$predictors_names == colnames(fv)[k] )] - model$biodiversity[[i]]$predictors_types <- subset(model$biodiversity[[i]]$predictors_types, subset = predictors != colnames(fv)[k]) - } - } - - # Add pseudo-absence points if necessary, by including nearest predictor values for each - if('poipo' == model$biodiversity[[i]]$type){ - - # Get background layer - bg <- x$engine$get_data('template') - assertthat::assert_that(!is.na(cellStats(bg,min))) - - # Add pseudo-absence points - presabs <- add_pseudoabsence(df = model$biodiversity[[i]]$observations, - field_occurrence = 'observed', - template = bg, - settings = model$biodiversity[[i]]$pseudoabsence_settings) - if(inherits(presabs, 'sf')) presabs <- presabs %>% sf::st_drop_geometry() - # Sample environmental points for absence only points - abs <- subset(presabs, observed == 0) - # Re-extract environmental information for absence points - envs <- 
get_rastervalue(coords = abs[,c('x','y')], - env = model$predictors_object$get_data(df = FALSE), - rm.na = FALSE) - if(assertthat::has_name(model$biodiversity[[i]]$predictors, "Intercept")){ envs$Intercept <- 1} - - # Format out - df <- rbind(model$biodiversity[[i]]$predictors[,c('x','y','Intercept', model$biodiversity[[i]]$predictors_names)], - envs[,c('x','y','Intercept', model$biodiversity[[i]]$predictors_names)] ) - any_missing <- which(apply(df, 1, function(x) any(is.na(x)))) - if(length(any_missing)>0){ - presabs <- presabs[-any_missing,] # This works as they are in the same order - model$biodiversity[[i]]$expect <- model$biodiversity[[i]]$expect[-any_missing] - # Fill the absences with 1 as multiplier. This works since absences follow the presences - model$biodiversity[[i]]$expect <- c( model$biodiversity[[i]]$expect, - rep(1, nrow(presabs)-length(model$biodiversity[[i]]$expect) )) - } - df <- subset(df, complete.cases(df)) - assertthat::assert_that(nrow(presabs) == nrow(df)) - - # Overwrite observation data - model$biodiversity[[i]]$observations <- presabs - - # Preprocessing security checks - assertthat::assert_that( all( model$biodiversity[[i]]$observations[['observed']] >= 0 ), - any(!is.na(presabs[['observed']])), - length(model$biodiversity[[i]]$expect)==nrow(model$biodiversity[[i]]$observations), - nrow(df) == nrow(model$biodiversity[[i]]$observations) - ) - - # Add offset if existent - if(!is.Waiver(model$offset)) { - # Respecify offset if not set - of <- model$offset; of[, "spatial_offset" ] <- ifelse(is.na(of[, "spatial_offset" ]), 1, of[, "spatial_offset"]) - of1 <- get_ngbvalue(coords = model$biodiversity[[i]]$observations[,c("x","y")], - env = of, - longlat = raster::isLonLat(bg), - field_space = c('x','y') - ) - df[["spatial_offset"]] <- of1 - } - - # Define expectation as very small vector following Renner et al. - w <- ppm_weights(df = df, - pa = model$biodiversity[[i]]$observations[['observed']], - bg = bg, - weight = 1e-6 - ) - df$w <- w * (1/model$biodiversity[[i]]$expect) # Also add as column - - model$biodiversity[[i]]$predictors <- df - model$biodiversity[[i]]$expect <- df$w - } else { - # calculating the case weights (equal weights) - # the order of weights should be the same as presences and backgrounds in the training data - prNum <- as.numeric(table(model$biodiversity[[i]]$observations[['observed']])["1"]) # number of presences - bgNum <- as.numeric(table(model$biodiversity[[i]]$observations[['observed']])["0"]) # number of backgrounds - w <- ifelse(model$biodiversity[[i]]$observations[['observed']] == 1, 1, prNum / bgNum) - model$biodiversity[[i]]$expect <- w * model$biodiversity[[i]]$expect # Multiply with provided weights - } - } - # --- # - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Building stan code.') - sm_code <- vector("list", 7) - names(sm_code) <- c("functions","data","transformed_data","parameters","transformed_parameters", - "model","generated_quantities") - - # Has intercept? - has_intercept <- attr(terms.formula(model$biodiversity[[1]]$equation), "intercept") == 1 - # Family and link function - fam <- model$biodiversity[[1]]$family - li <- model$biodiversity[[1]]$link - - # Any spatial or other functions needed? 
- if(!is.null(self$get_equation_latent_spatial())){ - # Stan functions for CAR and GP models - ir <- readLines( system.file("inst/stanfiles/spatial_functions.stan",package = "ibis.iSDM",mustWork = TRUE) ) - assertthat::assert_that(length(ir)>0) - for(i in ir) sm_code$functions <- append(sm_code$functions, i) - } - - # Load all the data parameters - ir <- readLines( system.file("stanfiles/data_parameters.stan",package = "ibis.iSDM",mustWork = TRUE) ) - assertthat::assert_that(length(ir)>0) - for(i in ir) sm_code$data <- append(sm_code$data, i) - - # Append prior to transformed parameters - sm_code$transformed_parameters <- append(sm_code$transformed_parameters," - // Prior contribution to log posterior - real lprior = 0;") - - # Equation has overall intercept - if(has_intercept){ - # Add data - sm_code$transformed_data <- append(sm_code$transformed_data," - int Kc = K - 1; - matrix[N, Kc] Xc; // centered version of X without an intercept - vector[Kc] means_X; // column means of X before centering - for (i in 2:K) { - means_X[i - 1] = mean(X[, i]); - Xc[, i - 1] = X[, i] - means_X[i - 1]; - } - ") - # Add population level effects for rest of coefficients - sm_code$parameters <- append(sm_code$parameters," - vector[Kc] b; // population-level effects - real Intercept; // temporary intercept for centered predictors - ") - # add a prior on the intercept - sm_code$transformed_parameters <- append(sm_code$transformed_parameters, - paste0(" - // priors including constants - lprior += student_t_lpdf(Intercept | 3, ", - ifelse(fam == "poisson", -2, 0), # Adapted student prior for poisson - ", 2.5); - ") - ) - # Generate actual population-level intercept - sm_code$generated_quantities <- append(sm_code$generated_quantities," - // actual population-level intercept - real b_Intercept = Intercept - dot_product(means_X, b); - ") - } - - # Transformed parameters - # Add (gaussian) priors to model likelihood if set - if((!is.Waiver(model$priors) || settings$get(what='varsel') == "none")){ - # If no intercept is specified, add beta - if(has_intercept){ - # Parameters - sm_code$parameters <- append(sm_code$parameters, " - vector[Kc] beta;") - } else { - sm_code$parameters <- append(sm_code$parameters, "vector[K] beta;") - } - - # Add priors for each variable for which it is set to the model - sm_code$transformed_parameters <- append(sm_code$transformed_parameters, "// beta priors including constants") - # Now add for each one a normal effect - for(i in 1:length(model$predictors_names)){ - if(!is.Waiver(model$priors)){ - if(model$predictors_names[i] %in% model$priors$varnames()) { - # Get prior estimats - pp <- model$priors$get(model$predictors_names[i]) - sm_code$transformed_parameters <- append(sm_code$transformed_parameters, paste0( - "lprior += normal_lpdf(beta[",i,"] | ",pp[1],", ",pp[2],");" - )) - } else { - # Default gaussian prior - sm_code$transformed_parameters <- append(sm_code$transformed_parameters, - paste0("lprior += normal_lpdf(beta[",i,"] | 0, 2);")) - } - } else { - # Default gaussian prior - sm_code$transformed_parameters <- append(sm_code$transformed_parameters, - paste0("lprior += normal_lpdf(beta[",i,"] | 0, 2);")) - } - } - } else - if( settings$get(what='varsel') == "reg" ){ - if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Adding regularized Bayesian priors.') - # Add regularized horseshoe prior - # See brms::horseshoe - ir <- readLines( system.file("stanfiles/prior_functions.stan",package = "ibis.iSDM",mustWork = TRUE) ) - assertthat::assert_that(length(ir)>0) - 
for(i in ir) sm_code$functions <- append(sm_code$functions, i) - - sm_code$data <- append(sm_code$data," - // data for the horseshoe prior - real hs_df; // local degrees of freedom - real hs_df_global; // global degrees of freedom - real hs_df_slab; // slab degrees of freedom - real hs_scale_global; // global prior scale - real hs_scale_slab; // slab prior scale" - ) - - # Parameters and transformed parameters - sm_code$parameters <- append(sm_code$parameters," - // local parameters for horseshoe prior - vector[K] zb; - vector[K] hs_local; - // horseshoe shrinkage parameters - real hs_global; // global shrinkage parameters - real hs_slab; // slab regularization parameter - ") - sm_code$transformed_parameters <- append(sm_code$transformed_parameters," - vector[K] beta; // population-level effects - // compute actual regression coefficients - beta = horseshoe(zb, hs_local, hs_global, hs_scale_slab^2 * hs_slab); - ") - - # Finally add priors to model - sm_code$model <- append(sm_code$model, " - // priors including constants - target += std_normal_lpdf(zb); - target += student_t_lpdf(hs_local | hs_df, 0, 1) - - rows(hs_local) * log(0.5); - target += student_t_lpdf(hs_global | hs_df_global, 0, hs_scale_global) - - 1 * log(0.5); - target += inv_gamma_lpdf(hs_slab | 0.5 * hs_df_slab, 0.5 * hs_df_slab); - ") - } - - # Now add the model depending on the type - if(length(model$biodiversity)>1){ - # For integrated model - stop("TBD") - - } else if(model$biodiversity[[1]]$type == "poipo" && model$biodiversity[[1]]$family == "poisson"){ - # For poisson process model add likelihood - if(has_intercept){ - ir <- readLines( system.file("stanfiles/poipo_ll_poisson_intercept.stan",package = "ibis.iSDM",mustWork = TRUE)) - } else { - ir <- readLines( system.file("stanfiles/poipo_ll_poisson.stan",package = "ibis.iSDM",mustWork = TRUE) ) - } - assertthat::assert_that(length(ir)>0) - for(i in ir) sm_code$model <- append(sm_code$model, i) - - } else if(model$biodiversity[[1]]$type == "poipa" && model$biodiversity[[1]]$family == "binomial"){ - # For logistic regression - if(has_intercept){ - ir <- readLines( system.file("stanfiles/poipa_ll_bernoulli_intercept.stan",package = "ibis.iSDM",mustWork = TRUE) ) - } else { - ir <- readLines( system.file("stanfiles/poipa_ll_bernoulli.stan",package = "ibis.iSDM",mustWork = TRUE) ) - } - assertthat::assert_that(length(ir)>0) - for(i in ir) sm_code$model <- append(sm_code$model, i) - } else { - # Else - stop("Model as of now not implemented for Stan!") - } - # Append prior contributions to model - sm_code$model <- append(sm_code$model, " - // Prior contributions - target += lprior;") - - # Wrap list entries in model code and save in model object - self$set_data("stancode", wrap_stanmodel(sm_code)) - - # --- # - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Engine setup.') - - # Return modified model object - return(model) - }, - train = function(self, model, settings, ...){ - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Starting fitting...') - - # Define an algorithm for MCMC sampling - # an be "sampling" for MCMC (the default), "optimize" for optimization, - # "variational" for variational inference with independent normal distributions, - settings$set('algorithm', self$stan_param$algorithm) - settings$set('cores', self$stan_param$cores) - settings$set('chains', self$stan_param$chains) - settings$set('iter', self$stan_param$iter) - settings$set('warmup', self$stan_param$warmup) - settings$set('type', 
self$stan_param$type) - - # --- # - # Collect data for stan modelling - if(length(model$biodiversity)>1){ - stop("done") - } else { - has_intercept <- attr(terms(model$biodiversity[[1]]$equation), "intercept") - # Format data list - if(has_intercept == 1){ - pn <- c("Intercept",model$biodiversity[[1]]$predictors_names) - } else { pn <- model$biodiversity[[1]]$predictors_names } - - dl <- list( - N = nrow( model$biodiversity[[1]]$observations), - observed = model$biodiversity[[1]]$observations[["observed"]], - X = as.matrix( model$biodiversity[[1]]$predictors[, pn] ), - K = length( pn ), - offsets = log(model$biodiversity[[1]]$expect), # Notice that exposure is log-transformed here! - has_intercept = attr(terms(model$biodiversity[[1]]$equation), "intercept"), - has_spatial = ifelse(is.null(self$get_equation_latent_spatial()), 0, 1), - # Horseshoe prior default parameters - # FIXME: Allow passing this one via a parameter - hs_df = 1, - hs_df_global = 1, hs_df_slab = 4, - hs_scale_global = 1, hs_scale_slab = 2 - ) - # If any additional offset is set, simply to the existing one in sum - # This works as log(2) + log(5) == log(2*5) - if(!is.Waiver(model$offset)) dl$offsets <- dl$offsets + model$biodiversity[[1]]$offset[,"spatial_offset"] - } - - # Model estimation - # ---- # - # Fitting - fpath_code <- write_stanmodel( self$get_data("stancode") ) - fit_stan <- run_stan( - model_code = fpath_code, - data = dl, - algorithm = settings$get('algorithm'), - cores = self$stan_param$cores, - chains = self$stan_param$chains, - iter = self$stan_param$iter, - warmup = self$stan_param$warmup, - path = getwd(), - force = TRUE # Force recompile - ) - - # Prediction - if(!settings$get('inference_only')){ - # Messager - if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Starting prediction...') - - # Prepare prediction dataset - prediction <- self$get_data('template') # Get output raster and new data - # Full data for prediction - full <- subset(model$predictors, select = c('x','y',model$predictors_names)) - - # Clamp? - if( settings$get("clamp") ) full <- clamp_predictions(model, full) - - if(has_intercept==1) full$Intercept <- 1 - - # If poipo, add w to prediction container - bd_poipo <- sapply(model$biodiversity, function(x) x$type) == "poipo" - if(any(bd_poipo)){ - # FIXME: Bit hackish. 
See if works for other projections - full$w <- unique(model$biodiversity[[which(bd_poipo)]]$expect)[2] # Absence location being second unique value - } - # Add offset if set - if(!is.Waiver(model$offset)) { - # Offsets are simply added linearly (albeit transformed) - if(hasName(full,"w")) full$w <- full$w + model$offset[,"spatial_offset"] else full$w <- model$offset[,"spatial_offset"] - } - suppressWarnings( - full <- sp::SpatialPointsDataFrame(coords = full[,c("x","y")], - data = full, - proj4string = sp::CRS( sp::proj4string(as(model$background, "Spatial")) ) - ) - ) - full <- as(full, 'SpatialPixelsDataFrame') - - # Set target variables to bias_value for prediction if specified - if(!is.Waiver(settings$get('bias_variable'))){ - for(i in 1:length(settings$get('bias_variable'))){ - if(settings$get('bias_variable')[i] %notin% names(full)) next() - full[[settings$get('bias_variable')[i]]] <- settings$get('bias_value')[i] - } - } - - # For Integrated model, follow poisson - fam <- ifelse(length(model$biodiversity)>1, "poisson", model$biodiversity[[1]]$family) - - # Do the prediction by sampling from the posterior - out <- posterior_predict_stanfit(obj = fit_stan, - form = to_formula(paste0("observed ~ ", - ifelse(has_intercept==1, "Intercept + ", ""), - paste(model$biodiversity[[1]]$predictors_names,collapse = " + "))), - newdata = full@data, - offset = (full$w), - family = fam, # Family - mode = self$stan_param$type # Type - ) - - # Convert full to raster - prediction <- raster::stack(full) - # Fill output - prediction <- fill_rasters(post = out, background = prediction) - prediction <- raster::mask(prediction, model$background) # Mask with background - # plot(prediction$mean, col = ibis.iSDM:::ibis_colours$sdm_colour) - try({ rm(out) }) - } else { - prediction <- NULL - } - - # Compute end of computation time - settings$set('end.time', Sys.time()) - - # Define output - # Create output - out <- bdproto( - "STAN-Model", - DistributionModel, - id = model$id, - model = model, - settings = settings, - fits = list( - "fit_best" = fit_stan, - "prediction" = prediction, - "sm_code" = self$get_data("stancode") - ), - # Project function - project = function(self, newdata, offset = NULL, type = NULL, layer = "mean"){ - assertthat::assert_that( - nrow(newdata) > 0, - all( c("x", "y") %in% names(newdata) ), - is.null(offset) || is.numeric(offset), - is.character(type) || is.null(type) - ) - # Check that fitted model exists - obj <- self$get_data("fit_best") - model <- self$model - settings <- self$settings - if(is.null(type)) type <- settings$get("type") - assertthat::assert_that(inherits(obj, "stanfit"), - all(model$predictors_names %in% colnames(newdata))) - - # Clamp? - if( settings$get("clamp") ) newdata <- clamp_predictions(model, newdata) - - # Set target variables to bias_value for prediction if specified - if(!is.Waiver(settings$get('bias_variable'))){ - for(i in 1:length(settings$get('bias_variable'))){ - if(settings$get('bias_variable')[i] %notin% names(newdata)) next() - newdata[[settings$get('bias_variable')[i]]] <- settings$get('bias_value')[i] - } - } - - # For Integrated model, follow poisson - fam <- ifelse(length(model$biodiversity)>1, "poisson", model$biodiversity[[1]]$family) - - # Build prediction stack - full <- subset(newdata, select = c('x','y', model$predictors_names)) - - # If poipo, add w to prediction container - bd_poipo <- sapply(model$biodiversity, function(x) x$type) == "poipo" - if(any(bd_poipo)){ - # FIXME: Bit hackish. 
See if works for other projections - full$w <- unique(model$biodiversity[[which(bd_poipo)]]$expect)[2] # Absence location being second unique value - } - # Add offset if set - if(!is.null(offset)) { - # Offsets are simply added linearly (albeit transformed) - if(hasName(full,"w")) full$w <- full$w + offset else full$w <- offset - } - suppressWarnings( - full <- sp::SpatialPointsDataFrame(coords = full[,c("x","y")], - data = full, - proj4string = sp::CRS( sp::proj4string(as(model$background, "Spatial")) ) - ) - ) - full <- as(full, 'SpatialPixelsDataFrame') - - # Do the prediction by sampling from the posterior - pred_stan <- posterior_predict_stanfit(obj = obj, - form = to_formula(paste0("observed ~ ", paste(model$predictors_names,collapse = " + "))), - newdata = full@data, - offset = (full$w), - family = fam, - mode = type # Linear predictor - ) - - # Fill output with summaries of the posterior - prediction <- emptyraster( self$model$predictors_object$get_data()[[1]] ) # Background - prediction <- fill_rasters(pred_stan, prediction) - - return(prediction) - - }, - # Partial effect - partial = function(self, x.var, constant = NULL, variable_length = 100, values = NULL, plot = FALSE, type = "predictor"){ - # Get model and intercept if present - mod <- self$get_data('fit_best') - model <- self$model - has_intercept <- attr(terms(model$biodiversity[[1]]$equation), "intercept") - if(is.null(type)) type <- self$settings$get("type") - assertthat::assert_that(inherits(mod,'stanfit'), - is.character(x.var), - is.numeric(variable_length) && variable_length > 1, - is.null(constant) || is.numeric(constant) - ) - # Check that given variable is in x.var - assertthat::assert_that(x.var %in% model$predictors_names) - # Calculate - rr <- sapply(model$predictors, function(x) range(x, na.rm = TRUE)) |> as.data.frame() - df_partial <- list() - if(!is.null(values)){ variable_length <- length(values) } - - # Add all others as constant - if(is.null(constant)){ - for(n in names(rr)) df_partial[[n]] <- rep( mean(model$predictors[[n]], na.rm = TRUE), variable_length ) - } else { - for(n in names(rr)) df_partial[[n]] <- rep( constant, variable_length ) - } - if(!is.null(values)){ - df_partial[[x.var]] <- values - } else { - df_partial[[x.var]] <- seq(rr[1,x.var], rr[2,x.var], length.out = variable_length) - } - df_partial <- df_partial %>% as.data.frame() - - # For Integrated model, follow poisson - fam <- ifelse(length(model$biodiversity)>1, "poisson", model$biodiversity[[1]]$family) - - # Add intercept if present - if(has_intercept==1) df_partial$Intercept <- 1 - # If poipo, add w to prediction container - bd_poipo <- sapply(model$biodiversity, function(x) x$type) == "poipo" - if(any(bd_poipo)){ - # FIXME: Bit hackish. See if works for other projections - df_partial$w <- unique(model$biodiversity[[which(bd_poipo)]]$expect)[2] # Absence location being second unique value - } - # Add offset if set - if(!is.Waiver(model$offset)) { - # Offsets are simply added linearly (albeit transformed). - # FIXME: Taken here as average. To be re-evaluated as use case develops! 
- if(hasName(df_partial,"w")) df_partial$w <- df_partial$w + mean(model$offset,na.rm = TRUE) else df_partial$w <- mean(model$offset,na.rm = TRUE) - } - # Simulate from the posterior - pred_part <- posterior_predict_stanfit(obj = mod, - form = to_formula(paste0("observed ~ ", - ifelse(has_intercept==1, "Intercept +", ""), - paste(model$predictors_names,collapse = " + "))), - newdata = df_partial, - offset = df_partial$w, - family = fam, - mode = type # Linear predictor - ) - # Also attach the partial variable - pred_part <- cbind("partial_effect" = df_partial[[x.var]], pred_part) - if(plot){ - o <- pred_part - pm <- ggplot2::ggplot(data = o, ggplot2::aes(x = partial_effect, y = mean, - ymin = mean-sd, - ymax = mean+sd) ) + - ggplot2::theme_classic() + - ggplot2::geom_ribbon(fill = "grey90") + - ggplot2::geom_line() + - ggplot2::labs(x = x.var, y = "Partial effect") - print(pm) - } - return(pred_part) # Return the partial data - }, - # Spatial partial effect plots - spartial = function(self, x.var, constant = NULL, plot = TRUE,type = "predictor", ...){ - # Get model object and check that everything is in order - mod <- self$get_data('fit_best') - model <- self$model - has_intercept <- attr(terms(model$biodiversity[[1]]$equation), "intercept") - assertthat::assert_that(inherits(mod,'stanfit'), - 'model' %in% names(self), - is.character(x.var), - is.null(constant) || is.numeric(constant) - ) - - # Match variable name - x.var <- match.arg(x.var, model$predictors_names, several.ok = FALSE) - - # Make spatial container for prediction - suppressWarnings( - df_partial <- sp::SpatialPointsDataFrame(coords = model$predictors[,c('x', 'y')], - data = model$predictors[, names(model$predictors) %notin% c('x','y')], - proj4string = sp::CRS( sp::proj4string(as(model$background, "Spatial")) ) - ) - ) - df_partial <- as(df_partial, 'SpatialPixelsDataFrame') - - # Add all others as constant - if(is.null(constant)){ - for(n in names(df_partial)) if(n != x.var) df_partial[[n]] <- mean(model$predictors[[n]], na.rm = TRUE) - } else { - for(n in names(df_partial)) if(n != x.var) df_partial[[n]] <- constant - } - bd_poipo <- sapply(model$biodiversity, function(x) x$type) == "poipo" - if(any(bd_poipo)){ - # FIXME: Bit hackish. See if works for other projections - df_partial$w <- unique(model$biodiversity[[which(bd_poipo)]]$expect)[2] # Absence location being second unique value - } - - # For Integrated model, follow poisson - fam <- ifelse(length(model$biodiversity)>1, "poisson", model$biodiversity[[1]]$family) - - # Check intercept - if(has_intercept==1) df_partial$Intercept <- 1 - # If poipo, add w to prediction container - bd_poipo <- sapply(model$biodiversity, function(x) x$type) == "poipo" - if(any(bd_poipo)){ - # FIXME: Bit hackish. 
See if works for other projections
-            df_partial$w <- unique(model$biodiversity[[which(bd_poipo)]]$expect)[2] # Absence location being second unique value
-          } else { df_partial$w <- NULL }
-
-          # Simulate from the posterior
-          pred_part <- posterior_predict_stanfit(obj = mod,
-                                                 form = to_formula(paste0("observed ~ Intercept + ",
-                                                                          paste(model$predictors_names,collapse = " + "))),
-                                                 newdata = df_partial@data,
-                                                 offset = df_partial$w,
-                                                 family = fam,
-                                                 mode = type # Linear predictor
-          )
-
-          prediction <- emptyraster( self$get_data('prediction') ) # Background
-          prediction <- fill_rasters(pred_part, prediction)
-
-          # Do plot and return result
-          if(plot){
-            plot(prediction[[c("mean","sd")]], col = ibis_colours$viridis_orig)
-          }
-          return(prediction)
-        },
-        get_coefficients = function(self){
-          # Returns a vector of the coefficients with direction/importance
-          cofs <- self$summary()
-          if(nrow(cofs)==0) return(NULL)
-          cofs <- subset(cofs, select = c("parameter", "mean", "sd"))
-          names(cofs) <- c("Feature", "Beta", "Sigma")
-          # Remove intercept(s)
-          int <- grep("Intercept",cofs$Feature,ignore.case = TRUE)
-          if(length(int)>0) cofs <- cofs[-int,]
-          return(cofs)
-        },
-        # Spatial latent effect
-        plot_spatial = function(self, plot = TRUE){
-          return(NULL)
-        },
-        # Custom function to show stan code
-        stancode = function(self){
-          message(
-            self$get_data("sm_code")
-          )
-        }
-      )
-      # Return
-      return(out)
-
-    } # End of Train
-    )
-  ) # End of engine definition
-}
+#' @include bdproto-engine.R bdproto-distributionmodel.R
+NULL
+#' Use Stan as engine
+#'
+#' @description Stan is a probabilistic programming language that can be used to
+#' specify most types of statistical linear and non-linear regression models.
+#' Stan provides full Bayesian inference for continuous-variable models through Markov chain Monte Carlo methods
+#' such as the No-U-Turn sampler, an adaptive form of Hamiltonian Monte Carlo sampling.
+#' Stan code has to be written separately and this function acts as a compiler to
+#' build the Stan model.
+#' **Requires the [cmdstanr] package to be installed!**
+#' @details
+#' By default the posterior is obtained through sampling; however, Stan also supports
+#' approximate inference forms through penalized maximum likelihood estimation (see Carpenter et al. 2017).
+#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object.
+#' @param chains A positive [`integer`] specifying the number of Markov chains (Default: \code{4} chains).
+#' @param iter A positive [`integer`] specifying the number of iterations for each chain, including warmup (Default: \code{2000}).
+#' @param warmup A positive [`integer`] specifying the number of warmup (aka burn-in) iterations per chain.
+#' If step-size adaptation is on (Default: \code{TRUE}), this also controls the number of iterations for which
+#' adaptation is run (and hence these warmup samples should not be used for inference).
+#' The number of warmup iterations should be smaller than \code{iter} and the default is \code{iter/2}.
+#' @param cores If set to \code{NULL}, take the value from the ibis option \code{getOption('ibis.nthread')}.
+#' @param init Initial values for parameters (Default: \code{'random'}). Can also be specified as a [list] (see: [`rstan::stan`]).
+#' @param algorithm Mode used to sample from the posterior. Available options are \code{"sampling"}, \code{"optimize"},
+#' or \code{"variational"}.
+#' See the [`cmdstanr`] package for more details. (Default: \code{"sampling"}).
+#' @param control See [`rstan::stan`] for more details on specifying the controls.
+#' @param type The mode used for creating posterior predictions. Either summarizing the linear \code{"predictor"} or \code{"response"} (Default: \code{"response"}).
+#' @param ... Other arguments passed on.
+#' @seealso [rstan], [cmdstanr]
+#' @note
+#' The function \code{obj$stancode()} can be used to print out the stancode of the model.
+#' @references
+#' * Jonah Gabry and Rok Češnovar (2021). cmdstanr: R Interface to 'CmdStan'. https://mc-stan.org/cmdstanr, https://discourse.mc-stan.org.
+#' * Carpenter, B., Gelman, A., Hoffman, M. D., Lee, D., Goodrich, B., Betancourt, M., ... & Riddell, A. (2017). Stan: A probabilistic programming language. Journal of Statistical Software, 76(1), 1-32.
+#' * Piironen, J., & Vehtari, A. (2017). Sparsity information and regularization in the horseshoe and other shrinkage priors. Electronic Journal of Statistics, 11(2), 5018-5051.
+#' @family engine
+#' @returns An [engine].
+#' @examples
+#' \dontrun{
+#' # Add Stan as an engine
+#' x <- distribution(background) |> engine_stan(iter = 1000)
+#' }
+#' @name engine_stan
+NULL
+#' @rdname engine_stan
+#' @export
+engine_stan <- function(x,
+                        chains = 4,
+                        iter = 2000,
+                        warmup = floor(iter/2),
+                        init = "random",
+                        cores = getOption("ibis.nthread"),
+                        algorithm = 'sampling',
+                        control = list(adapt_delta = 0.95),
+                        type = "response",
+                        ...) {
+  # Check whether the rstan and cmdstanr packages are available
+  check_package('rstan')
+  if(!isNamespaceLoaded("rstan")) { attachNamespace("rstan");requireNamespace('rstan') }
+  stan_check_cmd(install = TRUE)
+  check_package("cmdstanr")
+  assertthat::assert_that( cmdstanr::cmdstan_version()>"2.26.0")
+
+  # Assert that arguments are valid
+  assertthat::assert_that(inherits(x, "BiodiversityDistribution"),
+                          inherits(x$background,'sf')
+  )
+  # Other checks of parameters
+  assertthat::assert_that(
+    is.numeric(chains), is.numeric(iter), is.numeric(warmup),
+    is.null(cores) || is.numeric(cores),
+    is.character(init) || is.list(init),
+    is.null(control) || is.list(control),
+    is.character(algorithm),
+    msg = 'Input parameters wrongly specified!'
+  )
+  # Match algorithm and posterior prediction type
+  algorithm <- match.arg(algorithm, c("sampling", "optimize", "variational"), several.ok = FALSE)
+  type <- match.arg(type, c("response", "predictor"), several.ok = FALSE)
+  if(is.null(cores)) cores <- getOption('ibis.nthread')
+
+  # Create a background raster
+  if(is.Waiver(x$predictors)){
+    # Create from background
+    template <- raster::raster(
+      ext = raster::extent(x$background),
+      crs = raster::projection(x$background),
+      res = c(diff( (sf::st_bbox(x$background)[c(1,3)]) ) / 100, # Simplified assumption for resolution
+              diff( (sf::st_bbox(x$background)[c(2,4)]) ) / 100
+      )
+    )
+  } else {
+    # If predictors exist, use them
+    template <- emptyraster(x$predictors$get_data() )
+  }
+  # Burn in the background
+  template <- raster::rasterize(x$background, template, field = 0)
+
+  # Set engine in distribution object
+  x$set_engine(
+    bdproto(
+      "STAN-Engine",
+      Engine,
+      name = "",
+      data = list(
+        'template' = template
+      ),
+      # Stan options
+      stan_param = list(
+        chains = chains, iter = iter,
+        warmup = warmup, init = init,
+        cores = cores, algorithm = algorithm,
+        control = control,
+        type = type
+      ),
+      # Function to respecify the control parameters
+      set_control = function(self,
+                             chains = 4,
+                             iter = 2000,
+                             warmup = 500,
+                             init = "random",
+                             cores = NULL,
+                             control = NULL){
+
+        # Overwrite existing
+        self$stan_param <- list(
+          chains = chains, iter = iter,
+          warmup = warmup, init = init,
+          cores = cores, algorithm = algorithm,
+          control = control
+        )
+
+      },
+      # Spatial latent effect
+      get_equation_latent_spatial = function(self){
+        return( NULL )
+      },
+      # Setup a model
+      setup = function(self, model, settings = NULL, ...){
+        # Simple security checks
+        assertthat::assert_that(
+          assertthat::has_name(model, 'background'),
+          assertthat::has_name(model, 'biodiversity'),
+          inherits(settings,'Settings') || is.null(settings),
+          nrow(model$predictors) == ncell(self$get_data('template'))
+        )
+        # Check that all stan parameters are appropriately set
+        assertthat::assert_that(
+          is.numeric(self$stan_param$chains),
+          is.numeric(self$stan_param$iter),
+          is.numeric(self$stan_param$warmup)
+        )
+        # Set cores
+        options(mc.cores = self$stan_param$cores)
+
+        # FIXME: Stan should handle factors directly. For now they are split into binary dummy variables.
+        if(any(model$predictors_types$type=="factor")){
+          vf <- model$predictors_types$predictors[model$predictors_types$type=="factor"]
+          for(k in vf){
+            o <- explode_factor(model$predictors[[k]],name = k)
+            model$predictors <- cbind(model$predictors, o)
+            model$predictors_names <- c(model$predictors_names, colnames(o))
+            model$predictors_types <- rbind(model$predictors_types,
+                                            data.frame(predictors = colnames(o), type = "numeric") )
+            # Finally remove the original column from the predictor object
+            model$predictors[[k]] <- NULL
+            model$predictors_names <- model$predictors_names[-which( model$predictors_names == k )]
+            model$predictors_types <- subset(model$predictors_types, subset = predictors != k)
+            # Explode the columns in the raster object
+            model$predictors_object$data <- raster::addLayer(
+              model$predictors_object$data,
+              explode_factorized_raster(model$predictors_object$data[[k]])
+            )
+            model$predictors_object$data <- raster::dropLayer(model$predictors_object$data, k)
+          }
+        }
+
+        # Stan procedure - First add integration points to all poipo datasets
+        # FIXME: Possibly outsource this across methods
+        for(i in 1:length(model$biodiversity)){
+
+          # If there are any factor variables, split them per type and explode them
+          if(any(model$biodiversity[[i]]$predictors_types$type=="factor")){
+            vf <- model$biodiversity[[i]]$predictors_types$predictors[model$biodiversity[[i]]$predictors_types$type=="factor"]
+            fv <- model$biodiversity[[i]]$predictors[vf]
+            for(k in 1:ncol(fv)){
+              o <- explode_factor(fv[,k],name = colnames(fv)[k])
+              # Add
+              model$biodiversity[[i]]$predictors <- cbind(model$biodiversity[[i]]$predictors, o)
+              model$biodiversity[[i]]$predictors_names <- c(model$biodiversity[[i]]$predictors_names, colnames(o))
+              model$biodiversity[[i]]$predictors_types <- rbind(model$biodiversity[[i]]$predictors_types,
+                                                                data.frame(predictors = colnames(o), type = "numeric") )
+              # Finally remove the original column from the predictor object
+              model$biodiversity[[i]]$predictors[[colnames(fv)[k]]] <- NULL
+              model$biodiversity[[i]]$predictors_names <- model$biodiversity[[i]]$predictors_names[-which( model$biodiversity[[i]]$predictors_names == colnames(fv)[k] )]
+              model$biodiversity[[i]]$predictors_types <- subset(model$biodiversity[[i]]$predictors_types, subset = predictors != colnames(fv)[k])
+            }
+          }
+
+          # Add pseudo-absence points if necessary, by including nearest predictor values for each point
+          if('poipo' == model$biodiversity[[i]]$type){
+
+            # Get background layer
+            bg <- x$engine$get_data('template')
+            assertthat::assert_that(!is.na(cellStats(bg,min)))
+
+            # Add pseudo-absence points
+            presabs <- add_pseudoabsence(df = model$biodiversity[[i]]$observations,
+                                         field_occurrence = 'observed',
+                                         template = bg,
+                                         settings = model$biodiversity[[i]]$pseudoabsence_settings)
+            if(inherits(presabs, 'sf')) presabs <- presabs |> sf::st_drop_geometry()
+            # Sample environmental points for absence-only points
+            abs <- subset(presabs, observed == 0)
+            # Re-extract environmental information for absence points
+            envs <- get_rastervalue(coords = abs[,c('x','y')],
+                                    env = model$predictors_object$get_data(df = FALSE),
+                                    rm.na = FALSE)
+            if(assertthat::has_name(model$biodiversity[[i]]$predictors, "Intercept")){ envs$Intercept <- 1}
+
+            # Format out
+            df <- rbind(model$biodiversity[[i]]$predictors[,c('x','y','Intercept', model$biodiversity[[i]]$predictors_names)],
+                        envs[,c('x','y','Intercept', model$biodiversity[[i]]$predictors_names)] )
+            any_missing <- which(apply(df, 1, function(x) any(is.na(x))))
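+            # Note: presences come first and the pseudo-absences follow in the same
+            # row order, so dropping the `any_missing` indices from both the
+            # observations and the expectation vector below keeps observations,
+            # predictors and weights aligned.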
+ if(length(any_missing)>0){ + presabs <- presabs[-any_missing,] # This works as they are in the same order + model$biodiversity[[i]]$expect <- model$biodiversity[[i]]$expect[-any_missing] + # Fill the absences with 1 as multiplier. This works since absences follow the presences + model$biodiversity[[i]]$expect <- c( model$biodiversity[[i]]$expect, + rep(1, nrow(presabs)-length(model$biodiversity[[i]]$expect) )) + } + df <- subset(df, stats::complete.cases(df)) + assertthat::assert_that(nrow(presabs) == nrow(df)) + + # Overwrite observation data + model$biodiversity[[i]]$observations <- presabs + + # Preprocessing security checks + assertthat::assert_that( all( model$biodiversity[[i]]$observations[['observed']] >= 0 ), + any(!is.na(presabs[['observed']])), + length(model$biodiversity[[i]]$expect)==nrow(model$biodiversity[[i]]$observations), + nrow(df) == nrow(model$biodiversity[[i]]$observations) + ) + + # Add offset if existent + if(!is.Waiver(model$offset)) { + # Respecify offset if not set + of <- model$offset; of[, "spatial_offset" ] <- ifelse(is.na(of[, "spatial_offset" ]), 1, of[, "spatial_offset"]) + of1 <- get_ngbvalue(coords = model$biodiversity[[i]]$observations[,c("x","y")], + env = of, + longlat = raster::isLonLat(bg), + field_space = c('x','y') + ) + df[["spatial_offset"]] <- of1 + } + + # Define expectation as very small vector following Renner et al. + w <- ppm_weights(df = df, + pa = model$biodiversity[[i]]$observations[['observed']], + bg = bg, + weight = 1e-6 + ) + df$w <- w * (1/model$biodiversity[[i]]$expect) # Also add as column + + model$biodiversity[[i]]$predictors <- df + model$biodiversity[[i]]$expect <- df$w + } else { + # calculating the case weights (equal weights) + # the order of weights should be the same as presences and backgrounds in the training data + prNum <- as.numeric(table(model$biodiversity[[i]]$observations[['observed']])["1"]) # number of presences + bgNum <- as.numeric(table(model$biodiversity[[i]]$observations[['observed']])["0"]) # number of backgrounds + w <- ifelse(model$biodiversity[[i]]$observations[['observed']] == 1, 1, prNum / bgNum) + model$biodiversity[[i]]$expect <- w * model$biodiversity[[i]]$expect # Multiply with provided weights + } + } + # --- # + # Messenger + if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Building stan code.') + sm_code <- vector("list", 7) + names(sm_code) <- c("functions","data","transformed_data","parameters","transformed_parameters", + "model","generated_quantities") + + # Has intercept? + has_intercept <- attr(stats::terms.formula(model$biodiversity[[1]]$equation), "intercept") == 1 + # Family and link function + fam <- model$biodiversity[[1]]$family + li <- model$biodiversity[[1]]$link + + # Any spatial or other functions needed? 
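+        # (Note: get_equation_latent_spatial() above currently returns NULL for this
+        # engine, so no CAR/GP helper functions are read in yet; the branch below is
+        # retained for when a spatial latent effect becomes supported.)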
+        if(!is.null(self$get_equation_latent_spatial())){
+          # Stan functions for CAR and GP models
+          ir <- readLines( system.file("stanfiles/spatial_functions.stan",package = "ibis.iSDM",mustWork = TRUE) )
+          assertthat::assert_that(length(ir)>0)
+          for(i in ir) sm_code$functions <- append(sm_code$functions, i)
+        }
+
+        # Load all the data parameters
+        ir <- readLines( system.file("stanfiles/data_parameters.stan",package = "ibis.iSDM",mustWork = TRUE) )
+        assertthat::assert_that(length(ir)>0)
+        for(i in ir) sm_code$data <- append(sm_code$data, i)
+
+        # Append prior to transformed parameters
+        sm_code$transformed_parameters <- append(sm_code$transformed_parameters,"
+        // Prior contribution to log posterior
+        real lprior = 0;")
+
+        # Equation has overall intercept
+        if(has_intercept){
+          # Add data
+          sm_code$transformed_data <- append(sm_code$transformed_data,"
+          int Kc = K - 1;
+          matrix[N, Kc] Xc;  // centered version of X without an intercept
+          vector[Kc] means_X;  // column means of X before centering
+          for (i in 2:K) {
+            means_X[i - 1] = mean(X[, i]);
+            Xc[, i - 1] = X[, i] - means_X[i - 1];
+          }
+          ")
+          # Add population level effects for rest of coefficients
+          sm_code$parameters <- append(sm_code$parameters,"
+          vector[Kc] b;  // population-level effects
+          real Intercept;  // temporary intercept for centered predictors
+          ")
+          # Add a prior on the intercept
+          sm_code$transformed_parameters <- append(sm_code$transformed_parameters,
+                                                   paste0("
+          // priors including constants
+          lprior += student_t_lpdf(Intercept | 3, ",
+                                                          ifelse(fam == "poisson", -2, 0), # Adapted student prior for poisson
+                                                          ", 2.5);
+          ")
+          )
+          # Generate actual population-level intercept
+          sm_code$generated_quantities <- append(sm_code$generated_quantities,"
+          // actual population-level intercept
+          real b_Intercept = Intercept - dot_product(means_X, b);
+          ")
+        }
+
+        # Transformed parameters
+        # Add (gaussian) priors to model likelihood if set
+        if((!is.Waiver(model$priors) || settings$get(what='varsel') == "none")){
+          # Add beta; its dimension depends on whether an intercept is present
+          if(has_intercept){
+            # Parameters
+            sm_code$parameters <- append(sm_code$parameters, "
+            vector[Kc] beta;")
+          } else {
+            sm_code$parameters <- append(sm_code$parameters, "vector[K] beta;")
+          }
+
+          # Add priors for each variable for which one is set in the model
+          sm_code$transformed_parameters <- append(sm_code$transformed_parameters, "// beta priors including constants")
+          # Now add for each one a normal effect
+          for(i in 1:length(model$predictors_names)){
+            if(!is.Waiver(model$priors)){
+              if(model$predictors_names[i] %in% model$priors$varnames()) {
+                # Get prior estimates
+                pp <- model$priors$get(model$predictors_names[i])
+                sm_code$transformed_parameters <- append(sm_code$transformed_parameters, paste0(
+                  "lprior += normal_lpdf(beta[",i,"] | ",pp[1],", ",pp[2],");"
+                ))
+              } else {
+                # Default gaussian prior
+                sm_code$transformed_parameters <- append(sm_code$transformed_parameters,
+                                                         paste0("lprior += normal_lpdf(beta[",i,"] | 0, 2);"))
+              }
+            } else {
+              # Default gaussian prior
+              sm_code$transformed_parameters <- append(sm_code$transformed_parameters,
+                                                       paste0("lprior += normal_lpdf(beta[",i,"] | 0, 2);"))
+            }
+          }
+        } else
+          if( settings$get(what='varsel') == "reg" ){
+            if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Adding regularized Bayesian priors.')
+            # Add regularized horseshoe prior
+            # See brms::horseshoe
+            ir <- readLines( system.file("stanfiles/prior_functions.stan",package = "ibis.iSDM",mustWork = TRUE) )
+            assertthat::assert_that(length(ir)>0)
+            for(i in ir) sm_code$functions <- append(sm_code$functions, i)
+
+            sm_code$data <- append(sm_code$data,"
+            // data for the horseshoe prior
+            real<lower=0> hs_df;  // local degrees of freedom
+            real<lower=0> hs_df_global;  // global degrees of freedom
+            real<lower=0> hs_df_slab;  // slab degrees of freedom
+            real<lower=0> hs_scale_global;  // global prior scale
+            real<lower=0> hs_scale_slab;  // slab prior scale"
+            )
+
+            # Parameters and transformed parameters
+            sm_code$parameters <- append(sm_code$parameters,"
+            // local parameters for horseshoe prior
+            vector[K] zb;
+            vector<lower=0>[K] hs_local;
+            // horseshoe shrinkage parameters
+            real<lower=0> hs_global;  // global shrinkage parameter
+            real<lower=0> hs_slab;  // slab regularization parameter
+            ")
+            sm_code$transformed_parameters <- append(sm_code$transformed_parameters,"
+            vector[K] beta;  // population-level effects
+            // compute actual regression coefficients
+            beta = horseshoe(zb, hs_local, hs_global, hs_scale_slab^2 * hs_slab);
+            ")
+
+            # Finally add priors to model
+            sm_code$model <- append(sm_code$model, "
+            // priors including constants
+            target += std_normal_lpdf(zb);
+            target += student_t_lpdf(hs_local | hs_df, 0, 1)
+            - rows(hs_local) * log(0.5);
+            target += student_t_lpdf(hs_global | hs_df_global, 0, hs_scale_global)
+            - 1 * log(0.5);
+            target += inv_gamma_lpdf(hs_slab | 0.5 * hs_df_slab, 0.5 * hs_df_slab);
+            ")
+          }
+
+        # Now add the model depending on the type
+        if(length(model$biodiversity)>1){
+          # For integrated model
+          stop("Integrated models are not yet implemented for engine_stan!")
+
+        } else if(model$biodiversity[[1]]$type == "poipo" && model$biodiversity[[1]]$family == "poisson"){
+          # For the Poisson process model add the likelihood
+          if(has_intercept){
+            ir <- readLines( system.file("stanfiles/poipo_ll_poisson_intercept.stan",package = "ibis.iSDM",mustWork = TRUE))
+          } else {
+            ir <- readLines( system.file("stanfiles/poipo_ll_poisson.stan",package = "ibis.iSDM",mustWork = TRUE) )
+          }
+          assertthat::assert_that(length(ir)>0)
+          for(i in ir) sm_code$model <- append(sm_code$model, i)
+
+        } else if(model$biodiversity[[1]]$type == "poipa" && model$biodiversity[[1]]$family == "binomial"){
+          # For logistic regression
+          if(has_intercept){
+            ir <- readLines( system.file("stanfiles/poipa_ll_bernoulli_intercept.stan",package = "ibis.iSDM",mustWork = TRUE) )
+          } else {
+            ir <- readLines( system.file("stanfiles/poipa_ll_bernoulli.stan",package = "ibis.iSDM",mustWork = TRUE) )
+          }
+          assertthat::assert_that(length(ir)>0)
+          for(i in ir) sm_code$model <- append(sm_code$model, i)
+        } else {
+          # Else
+          stop("Model as of now not implemented for Stan!")
+        }
+        # Append prior contributions to model
+        sm_code$model <- append(sm_code$model, "
+        // Prior contributions
+        target += lprior;")
+
+        # Wrap list entries in model code and save in model object
+        self$set_data("stancode", wrap_stanmodel(sm_code))
+
+        # --- #
+        # Messenger
+        if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Engine setup.')
+
+        # Return modified model object
+        return(model)
+      },
+      train = function(self, model, settings, ...){
+        # Messenger
+        if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Starting fitting...')
+
+        # Define an algorithm for MCMC sampling
+        # Can be "sampling" for MCMC (the default), "optimize" for optimization,
+        # "variational" for variational inference with independent normal distributions.
+        settings$set('algorithm', self$stan_param$algorithm)
+        settings$set('cores', self$stan_param$cores)
+        settings$set('chains', self$stan_param$chains)
+        settings$set('iter', self$stan_param$iter)
+        settings$set('warmup', self$stan_param$warmup)
+        settings$set('type', self$stan_param$type)
+
+        # --- #
+        # Collect data for stan modelling
+        if(length(model$biodiversity)>1){
+          stop("Integrated models are not yet implemented for engine_stan!")
+        } else {
+          has_intercept <- attr(stats::terms(model$biodiversity[[1]]$equation), "intercept")
+          # Format data list
+          if(has_intercept == 1){
+            pn <- c("Intercept",model$biodiversity[[1]]$predictors_names)
+          } else { pn <- model$biodiversity[[1]]$predictors_names }
+
+          dl <- list(
+            N = nrow( model$biodiversity[[1]]$observations),
+            observed = model$biodiversity[[1]]$observations[["observed"]],
+            X = as.matrix( model$biodiversity[[1]]$predictors[, pn] ),
+            K = length( pn ),
+            offsets = log(model$biodiversity[[1]]$expect), # Notice that exposure is log-transformed here!
+            has_intercept = attr(stats::terms(model$biodiversity[[1]]$equation), "intercept"),
+            has_spatial = ifelse(is.null(self$get_equation_latent_spatial()), 0, 1),
+            # Horseshoe prior default parameters
+            # FIXME: Allow passing this one via a parameter
+            hs_df = 1,
+            hs_df_global = 1, hs_df_slab = 4,
+            hs_scale_global = 1, hs_scale_slab = 2
+          )
+          # If any additional offset is set, simply add it to the existing one
+          # This works as log(2) + log(5) == log(2*5)
+          if(!is.Waiver(model$offset)) dl$offsets <- dl$offsets + model$biodiversity[[1]]$offset[,"spatial_offset"]
+        }
+
+        # Model estimation
+        # ---- #
+        # Fitting
+        fpath_code <- write_stanmodel( self$get_data("stancode") )
+        fit_stan <- run_stan(
+          model_code = fpath_code,
+          data = dl,
+          algorithm = settings$get('algorithm'),
+          cores = self$stan_param$cores,
+          chains = self$stan_param$chains,
+          iter = self$stan_param$iter,
+          warmup = self$stan_param$warmup,
+          path = getwd(),
+          force = TRUE # Force recompile
+        )
+
+        # Prediction
+        if(!settings$get('inference_only')){
+          # Messenger
+          if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Starting prediction...')
+
+          # Prepare prediction dataset
+          prediction <- self$get_data('template') # Get output raster and new data
+          # Full data for prediction
+          full <- subset(model$predictors, select = c('x','y',model$predictors_names))
+
+          # Clamp?
+          if( settings$get("clamp") ) full <- clamp_predictions(model, full)
+
+          if(has_intercept==1) full$Intercept <- 1
+
+          # If poipo, add w to prediction container
+          bd_poipo <- sapply(model$biodiversity, function(x) x$type) == "poipo"
+          if(any(bd_poipo)){
+            # FIXME: Bit hackish. 
See if works for other projections + full$w <- unique(model$biodiversity[[which(bd_poipo)]]$expect)[2] # Absence location being second unique value + } + # Add offset if set + if(!is.Waiver(model$offset)) { + # Offsets are simply added linearly (albeit transformed) + if(utils::hasName(full,"w")) full$w <- full$w + model$offset[,"spatial_offset"] else full$w <- model$offset[,"spatial_offset"] + } + suppressWarnings( + full <- sp::SpatialPointsDataFrame(coords = full[,c("x","y")], + data = full, + proj4string = sp::CRS(sp::proj4string(methods::as(model$background, "Spatial"))) + ) + ) + full <- methods::as(full, 'SpatialPixelsDataFrame') + + # Set target variables to bias_value for prediction if specified + if(!is.Waiver(settings$get('bias_variable'))){ + for(i in 1:length(settings$get('bias_variable'))){ + if(settings$get('bias_variable')[i] %notin% names(full)) next() + full[[settings$get('bias_variable')[i]]] <- settings$get('bias_value')[i] + } + } + + # For Integrated model, follow poisson + fam <- ifelse(length(model$biodiversity)>1, "poisson", model$biodiversity[[1]]$family) + + # Do the prediction by sampling from the posterior + out <- posterior_predict_stanfit(obj = fit_stan, + form = to_formula(paste0("observed ~ ", + ifelse(has_intercept==1, "Intercept + ", ""), + paste(model$biodiversity[[1]]$predictors_names,collapse = " + "))), + newdata = full@data, + offset = (full$w), + family = fam, # Family + mode = self$stan_param$type # Type + ) + + # Convert full to raster + prediction <- raster::stack(full) + # Fill output + prediction <- fill_rasters(post = out, background = prediction) + prediction <- raster::mask(prediction, model$background) # Mask with background + # plot(prediction$mean, col = ibis.iSDM:::ibis_colours$sdm_colour) + try({ rm(out) }) + } else { + prediction <- NULL + } + + # Compute end of computation time + settings$set('end.time', Sys.time()) + + # Define output + # Create output + out <- bdproto( + "STAN-Model", + DistributionModel, + id = model$id, + model = model, + settings = settings, + fits = list( + "fit_best" = fit_stan, + "prediction" = prediction, + "sm_code" = self$get_data("stancode") + ), + # Project function + project = function(self, newdata, offset = NULL, type = NULL, layer = "mean"){ + assertthat::assert_that( + nrow(newdata) > 0, + all( c("x", "y") %in% names(newdata) ), + is.null(offset) || is.numeric(offset), + is.character(type) || is.null(type) + ) + # Check that fitted model exists + obj <- self$get_data("fit_best") + model <- self$model + settings <- self$settings + if(is.null(type)) type <- settings$get("type") + assertthat::assert_that(inherits(obj, "stanfit"), + all(model$predictors_names %in% colnames(newdata))) + + # Clamp? + if( settings$get("clamp") ) newdata <- clamp_predictions(model, newdata) + + # Set target variables to bias_value for prediction if specified + if(!is.Waiver(settings$get('bias_variable'))){ + for(i in 1:length(settings$get('bias_variable'))){ + if(settings$get('bias_variable')[i] %notin% names(newdata)) next() + newdata[[settings$get('bias_variable')[i]]] <- settings$get('bias_value')[i] + } + } + + # For Integrated model, follow poisson + fam <- ifelse(length(model$biodiversity)>1, "poisson", model$biodiversity[[1]]$family) + + # Build prediction stack + full <- subset(newdata, select = c('x','y', model$predictors_names)) + + # If poipo, add w to prediction container + bd_poipo <- sapply(model$biodiversity, function(x) x$type) == "poipo" + if(any(bd_poipo)){ + # FIXME: Bit hackish. 
See if works for other projections
+            full$w <- unique(model$biodiversity[[which(bd_poipo)]]$expect)[2] # Absence location being second unique value
+          }
+          # Add offset if set
+          if(!is.null(offset)) {
+            # Offsets are simply added linearly (albeit transformed)
+            if(utils::hasName(full,"w")) full$w <- full$w + offset else full$w <- offset
+          }
+          suppressWarnings(
+            full <- sp::SpatialPointsDataFrame(coords = full[,c("x","y")],
+                                               data = full,
+                                               proj4string = sp::CRS(sp::proj4string(methods::as(model$background, "Spatial")))
+            )
+          )
+          full <- methods::as(full, 'SpatialPixelsDataFrame')
+
+          # Do the prediction by sampling from the posterior
+          pred_stan <- posterior_predict_stanfit(obj = obj,
+                                                 form = to_formula(paste0("observed ~ ", paste(model$predictors_names,collapse = " + "))),
+                                                 newdata = full@data,
+                                                 offset = (full$w),
+                                                 family = fam,
+                                                 mode = type # Linear predictor
+          )
+
+          # Fill output with summaries of the posterior
+          prediction <- emptyraster( self$model$predictors_object$get_data()[[1]] ) # Background
+          prediction <- fill_rasters(pred_stan, prediction)
+
+          return(prediction)
+
+        },
+        # Partial effect
+        partial = function(self, x.var, constant = NULL, variable_length = 100, values = NULL, plot = FALSE, type = "predictor"){
+          # Get model and intercept if present
+          mod <- self$get_data('fit_best')
+          model <- self$model
+          has_intercept <- attr(stats::terms(model$biodiversity[[1]]$equation), "intercept")
+          if(is.null(type)) type <- self$settings$get("type")
+          assertthat::assert_that(inherits(mod,'stanfit'),
+                                  is.character(x.var),
+                                  is.numeric(variable_length) && variable_length > 1,
+                                  is.null(constant) || is.numeric(constant)
+          )
+          # Check that the given variable is among the predictors
+          assertthat::assert_that(x.var %in% model$predictors_names)
+          # Calculate the ranges of all predictors
+          rr <- sapply(model$predictors, function(x) range(x, na.rm = TRUE)) |> as.data.frame()
+          df_partial <- list()
+          if(!is.null(values)){ variable_length <- length(values) }
+
+          # Add all others as constant
+          if(is.null(constant)){
+            for(n in names(rr)) df_partial[[n]] <- rep( mean(model$predictors[[n]], na.rm = TRUE), variable_length )
+          } else {
+            for(n in names(rr)) df_partial[[n]] <- rep( constant, variable_length )
+          }
+          if(!is.null(values)){
+            df_partial[[x.var]] <- values
+          } else {
+            df_partial[[x.var]] <- seq(rr[1,x.var], rr[2,x.var], length.out = variable_length)
+          }
+          df_partial <- df_partial |> as.data.frame()
+
+          # For Integrated model, follow poisson
+          fam <- ifelse(length(model$biodiversity)>1, "poisson", model$biodiversity[[1]]$family)
+
+          # Add intercept if present
+          if(has_intercept==1) df_partial$Intercept <- 1
+          # If poipo, add w to prediction container
+          bd_poipo <- sapply(model$biodiversity, function(x) x$type) == "poipo"
+          if(any(bd_poipo)){
+            # FIXME: Bit hackish. See if works for other projections
+            df_partial$w <- unique(model$biodiversity[[which(bd_poipo)]]$expect)[2] # Absence location being second unique value
+          }
+          # Add offset if set
+          if(!is.Waiver(model$offset)) {
+            # Offsets are simply added linearly (albeit transformed).
+            # FIXME: Taken here as average. To be re-evaluated as use case develops!
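+            # Offsets enter on the log (link) scale, so adding them to the linear
+            # predictor corresponds to multiplication on the response scale
+            # (log(2) + log(5) == log(2*5)).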
+            if(utils::hasName(df_partial,"w")) df_partial$w <- df_partial$w + mean(model$offset,na.rm = TRUE) else df_partial$w <- mean(model$offset,na.rm = TRUE)
+          }
+          # Simulate from the posterior
+          pred_part <- posterior_predict_stanfit(obj = mod,
+                                                 form = to_formula(paste0("observed ~ ",
+                                                                          ifelse(has_intercept==1, "Intercept + ", ""),
+                                                                          paste(model$predictors_names,collapse = " + "))),
+                                                 newdata = df_partial,
+                                                 offset = df_partial$w,
+                                                 family = fam,
+                                                 mode = type # Linear predictor
+          )
+          # Also attach the partial variable
+          pred_part <- cbind("partial_effect" = df_partial[[x.var]], pred_part)
+          if(plot){
+            o <- pred_part
+            pm <- ggplot2::ggplot(data = o, ggplot2::aes(x = partial_effect, y = mean,
+                                                         ymin = mean-sd,
+                                                         ymax = mean+sd) ) +
+              ggplot2::theme_classic() +
+              ggplot2::geom_ribbon(fill = "grey90") +
+              ggplot2::geom_line() +
+              ggplot2::labs(x = x.var, y = "Partial effect")
+            print(pm)
+          }
+          return(pred_part) # Return the partial data
+        },
+        # Spatial partial effect plots
+        spartial = function(self, x.var, constant = NULL, plot = TRUE, type = "predictor", ...){
+          # Get model object and check that everything is in order
+          mod <- self$get_data('fit_best')
+          model <- self$model
+          has_intercept <- attr(stats::terms(model$biodiversity[[1]]$equation), "intercept")
+          assertthat::assert_that(inherits(mod,'stanfit'),
+                                  'model' %in% names(self),
+                                  is.character(x.var),
+                                  is.null(constant) || is.numeric(constant)
+          )
+
+          # Match variable name
+          x.var <- match.arg(x.var, model$predictors_names, several.ok = FALSE)
+
+          # Make spatial container for prediction
+          suppressWarnings(
+            df_partial <- sp::SpatialPointsDataFrame(coords = model$predictors[,c('x', 'y')],
+                                                     data = model$predictors[, names(model$predictors) %notin% c('x','y')],
+                                                     proj4string = sp::CRS(sp::proj4string(methods::as(model$background, "Spatial")))
+            )
+          )
+          df_partial <- methods::as(df_partial, 'SpatialPixelsDataFrame')
+
+          # Add all others as constant
+          if(is.null(constant)){
+            for(n in names(df_partial)) if(n != x.var) df_partial[[n]] <- mean(model$predictors[[n]], na.rm = TRUE)
+          } else {
+            for(n in names(df_partial)) if(n != x.var) df_partial[[n]] <- constant
+          }
+
+          # For Integrated model, follow poisson
+          fam <- ifelse(length(model$biodiversity)>1, "poisson", model$biodiversity[[1]]$family)
+
+          # Check intercept
+          if(has_intercept==1) df_partial$Intercept <- 1
+          # If poipo, add w to prediction container
+          bd_poipo <- sapply(model$biodiversity, function(x) x$type) == "poipo"
+          if(any(bd_poipo)){
+            # FIXME: Bit hackish.
See if works for other projections + df_partial$w <- unique(model$biodiversity[[which(bd_poipo)]]$expect)[2] # Absence location being second unique value + } else { df_partial$w <- NULL } + + # Simulate from the posterior + pred_part <- posterior_predict_stanfit(obj = mod, + form = to_formula(paste0("observed ~ Intercept + ", + paste(model$predictors_names,collapse = " + "))), + newdata = df_partial@data, + offset = df_partial$w, + family = fam, + mode = type # Linear predictor + ) + + prediction <- emptyraster( self$get_data('prediction') ) # Background + prediction <- fill_rasters(pred_part, prediction) + + # Do plot and return result + if(plot){ + plot(prediction[[c("mean","sd")]], col = ibis_colours$viridis_orig) + } + return(prediction) + }, + get_coefficients = function(self){ + # Returns a vector of the coefficients with direction/importance + cofs <- self$summary() + if(nrow(cofs)==0) return(NULL) + cofs <- subset(cofs, select = c("parameter", "mean", "sd")) + names(cofs) <- c("Feature", "Beta", "Sigma") + # Remove intercept(s) + int <- grep("Intercept",cofs$Feature,ignore.case = TRUE) + if(length(int)>0) cofs <- cofs[-int,] + return(cofs) + }, + # Spatial latent effect + plot_spatial = function(self, plot = TRUE){ + return(NULL) + }, + # Custom function to show stan code + stancode = function(self){ + message( + self$get_data("sm_code") + ) + } + ) + # Return + return(out) + + } # End of Train + ) + ) # End of engine definition +} diff --git a/R/engine_xgboost.R b/R/engine_xgboost.R index fe5b843d..376e1935 100644 --- a/R/engine_xgboost.R +++ b/R/engine_xgboost.R @@ -1,815 +1,821 @@ -#' @include bdproto-engine.R utils-spatial.R bdproto-distributionmodel.R -NULL - -#' Engine for extreme gradient boosting (XGBoost) -#' -#' @description Allows to estimate eXtreme gradient descent boosting for tree-based or linear boosting regressions. -#' The XGBoost engine is a flexible, yet powerful engine with many customization options, -#' supporting multiple options to perform single and multi-class regression -#' and classification tasks. For a full list of options users are advised to have a look at the -#' [xgboost::xgb.train] help file and [https://xgboost.readthedocs.io](https://xgboost.readthedocs.io). -#' -#' @details -#' The default parameters have been set relatively conservative as to reduce overfitting. -#' -#' XGBoost supports the specification of monotonic constraints on certain variables. Within -#' ibis this is possible via [`XGBPrior`]. However constraints are available only for the -#' \code{"gbtree"} baselearners. -#' -#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object. -#' @param booster A [`character`] of the booster to use. Either \code{"gbtree"} or \code{"gblinear"} (Default: \code{gblinear}) -#' @param learning_rate [`numeric`] value indicating the learning rate (eta). -#' Lower values generally being better but also computationally more costly. (Default: \code{1e-3}) -#' @param iter [`numeric`] value giving the the maximum number of boosting iterations for cross-validation (Default: \code{8e3L}). -#' @param gamma [`numeric`] A regularization parameter in the model. Lower values for better estimates (Default: \code{3}). -#' Also see [reg_lambda] parameter for the L2 regularization on the weights -#' @param reg_lambda [`numeric`] L2 regularization term on weights (Default: \code{0}). -#' @param reg_alpha [`numeric`] L1 regularization term on weights (Default: \code{0}). 
-#' @param max_depth [`numeric`] The Maximum depth of a tree (Default: \code{3}). -#' @param subsample [`numeric`] The ratio used for subsampling to prevent overfitting. Also used for creating a random -#' tresting dataset (Default: \code{0.75}). -#' @param colsample_bytree [`numeric`] Sub-sample ratio of columns when constructing each tree (Default: \code{0.4}). -#' @param min_child_weight [`numeric`] Broadly related to the number of instances necessary for each node (Default: \code{3}). -#' @param nthread [`numeric`] on the number of CPU-threads to use. -#' @param ... Other none specified parameters. -#' @note -#' *'Machine learning is statistics minus any checking of models and assumptions‘* ~ Brian D. Ripley, useR! 2004, Vienna -#' @seealso [xgboost::xgb.train] -#' @references -#' * Tianqi Chen and Carlos Guestrin, "XGBoost: A Scalable Tree Boosting System", 22nd SIGKDD Conference on Knowledge Discovery and Data Mining, 2016, https://arxiv.org/abs/1603.02754 -#' @family engine -#' @name engine_xgboost -NULL -#' @rdname engine_xgboost -#' @export - -engine_xgboost <- function(x, - booster = "gbtree", - iter = 8e3L, - learning_rate = 1e-3, - gamma = 6, - reg_lambda = 0, - reg_alpha = 0, - max_depth = 2, - subsample = 0.75, - colsample_bytree = 0.4, - min_child_weight = 3, - nthread = getOption('ibis.nthread'), - ...) { - - # Check whether xgboost package is available - check_package('xgboost') - if(!("xgboost" %in% loadedNamespaces()) || ('xgboost' %notin% sessionInfo()$otherPkgs) ) { - try({requireNamespace('xgboost');attachNamespace("xgboost")},silent = TRUE) - } - - # assert that arguments are valid - assertthat::assert_that(inherits(x, "BiodiversityDistribution"), - inherits(x$background,'sf'), - is.character(booster) && booster %in% c("gbtree","gblinear"), - is.numeric(iter), - is.numeric(learning_rate) && (learning_rate > 0 && learning_rate < 1), - is.numeric(max_depth), - is.numeric(subsample) && (subsample > 0 && subsample <= 1), - is.numeric(colsample_bytree), - is.numeric(nthread) - ) - - # Create a background raster - if(is.Waiver(x$predictors)){ - # Create from background - template <- raster::raster( - ext = raster::extent(x$background), - crs = raster::projection(x$background), - res = c(diff( (sf::st_bbox(x$background)[c(1,3)]) ) / 100, # Simplified assumption for resolution - diff( (sf::st_bbox(x$background)[c(1,3)]) ) / 100 - ) - ) - } else { - # If predictor existing, use them - template <- emptyraster(x$predictors$get_data() ) - } - - # Burn in the background - template <- raster::rasterize(x$background, template, field = 0) - - # Set up the parameter list - params <- list( - booster = booster, - nrounds = iter, - eta = learning_rate, - gamma = gamma, - lambda = reg_lambda, - alpha = reg_alpha, - max_depth = max_depth, - subsample = subsample, - colsample_bytree = colsample_bytree, - min_child_weight = min_child_weight, - nthread = nthread, - ... 
- ) - - # Print a message in case there is already an engine object - if(!is.Waiver(x$engine)) myLog('[Setup]','yellow','Replacing currently selected engine.') - - # Set engine in distribution object - x$set_engine( - bdproto( - "XGBOOST-Engine", - Engine, - name = "", - data = list( - 'template' = template, - 'params' = params - ), - # Dummy function for spatial latent effects - calc_latent_spatial = function(self, type = NULL, priors = NULL){ - new_waiver() - }, - # Dummy function for getting the equation of latent effects - get_equation_latent_spatial = function(self, method){ - new_waiver() - }, - # Function to respecify the control parameters - set_control = function(self, - params - ){ - assertthat::assert_that(is.list(params)) - # Overwrite existing - self$data$params <- params - invisible() - }, - # Setup function - setup = function(self, model, settings = NULL, ...){ - # Simple security checks - assertthat::assert_that( - assertthat::has_name(model, 'background'), - assertthat::has_name(model, 'biodiversity'), - inherits(settings,'Settings') || is.null(settings), - nrow(model$predictors) == ncell(self$get_data('template')), - !is.Waiver(self$get_data("params")), - length(model$biodiversity) == 1 # Only works with single likelihood. To be processed separately - ) - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Engine setup.') - - # Get parameters - params <- self$data$params - - # Distribution specific procedure - fam <- switch(model$biodiversity[[1]]$family, - "poisson" = "count:poisson", - "binomial" = "binary:logistic", - model$biodiversity[[1]]$family - ) - - # Change the number of variables included if custom equation is used - if(!is.Waiver(model$biodiversity[[1]]$equation)){ - form <- model$biodiversity[[1]]$equation - varn <- model$biodiversity[[1]]$predictors_names[which( all.vars(form) %in% model$biodiversity[[1]]$predictors_names )] - assertthat::assert_that(length(varn)>0) - # Match to existing ones and remove those not covered - model$biodiversity[[1]]$predictors_names <- model$biodiversity[[1]]$predictors_names[match(varn, model$biodiversity[[1]]$predictors_names)] - model$biodiversity[[1]]$predictors_types <- subset(model$biodiversity[[1]]$predictors_types, - predictors %in% model$biodiversity[[1]]$predictors_names) - } - - # If a poisson family is used, weight the observations by their exposure - if(fam == "count:poisson" && model$biodiversity[[1]]$type == "poipo"){ - # Get background layer - bg <- self$get_data("template") - assertthat::assert_that(!is.na(cellStats(bg,min))) - - # Add pseudo-absence points - suppressMessages( - presabs <- add_pseudoabsence(df = model$biodiversity[[1]]$observations, - field_occurrence = 'observed', - template = bg, - settings = model$biodiversity[[1]]$pseudoabsence_settings) - ) - if(inherits(presabs, 'sf')) presabs <- presabs %>% sf::st_drop_geometry() - - # Sample environmental points for absence only points - abs <- subset(presabs, observed == 0) - # Re-extract environmental information for absence points - envs <- get_rastervalue(coords = abs[,c('x','y')], - env = model$predictors_object$get_data(df = FALSE), - rm.na = FALSE) - if(assertthat::has_name(model$biodiversity[[1]]$predictors, "Intercept")){ envs$Intercept <- 1} - - # Format out - df <- rbind(model$biodiversity[[1]]$predictors[,c('x','y','Intercept', model$biodiversity[[1]]$predictors_names)], - envs[,c('x','y','Intercept', model$biodiversity[[1]]$predictors_names)] ) - any_missing <- which(apply(df, 1, function(x) any(is.na(x)))) - 
if(length(any_missing)>0){ - presabs <- presabs[-any_missing,] # This works as they are in the same order - model$biodiversity[[1]]$expect <- model$biodiversity[[1]]$expect[-any_missing] - # Fill the absences with 1 as multiplier. This works since absences follow the presences - model$biodiversity[[1]]$expect <- c( model$biodiversity[[1]]$expect, - rep(1, nrow(presabs)-length(model$biodiversity[[1]]$expect) )) - } - df <- subset(df, complete.cases(df)) - assertthat::assert_that(nrow(presabs) == nrow(df)) - - # Overwrite observation data - model$biodiversity[[1]]$observations <- presabs - - # Preprocessing security checks - assertthat::assert_that( all( model$biodiversity[[1]]$observations[['observed']] >= 0 ), - any(!is.na(presabs[['observed']])), - length(model$biodiversity[[1]]$expect)==nrow(model$biodiversity[[1]]$observations), - nrow(df) == nrow(model$biodiversity[[1]]$observations) - ) - - # Add offset if existent - if(!is.Waiver(model$offset)){ - # ofs <- get_ngbvalue(coords = df[,c('x','y')], - # env = model$offset, - # longlat = raster::isLonLat(bg), - # field_space = c('x','y') - # ) - ofs <- get_rastervalue(coords = df[,c('x','y')], - env = model$offset_object, - rm.na = FALSE) - # Rename to spatial offset - names(ofs)[which(names(ofs)==names(model$offset_object))] <- "spatial_offset" - model$biodiversity[[1]]$offset <- ofs - } - - # Define expectation as very small vector following Renner et al. - w <- ppm_weights(df = df, - pa = model$biodiversity[[1]]$observations[['observed']], - bg = bg, - weight = 1e-6 # Set those to 1 so that absences become ratio of pres/abs - ) - assertthat::assert_that(length(w) == nrow(df)) - - model$biodiversity[[1]]$predictors <- df - model$biodiversity[[1]]$expect <- w * (1/model$biodiversity[[1]]$expect) - - # Get for the full dataset - pres <- raster::rasterize(model$biodiversity[[1]]$observations[,c("x","y")], - bg, fun = 'count', background = 0) - w_full <- ppm_weights(df = model$predictors, - pa = pres[], - bg = bg, - weight = 1 # Set those to 1 so that absences become ratio of pres/abs - ) - # Multiply with first weight value - w_full <- w_full * (1/unique(model$biodiversity[[1]]$expect)[1]) - assertthat::assert_that( - !anyNA(w_full), all(is.finite(log(w_full))), - !anyNA(w_full), - length(w_full) == nrow(model$predictors) - ) - - } else if(fam == "binary:logistic"){ - # calculating the case weights (equal weights) - # the order of weights should be the same as presences and backgrounds in the training data - prNum <- as.numeric(table(model$biodiversity[[1]]$observations[['observed']])["1"]) # number of presences - bgNum <- as.numeric(table(model$biodiversity[[1]]$observations[['observed']])["0"]) # number of backgrounds - w <- ifelse(model$biodiversity[[1]]$observations[['observed']] == 1, 1, prNum / bgNum) - model$biodiversity[[1]]$expect <- w * model$biodiversity[[1]]$expect - # Convert to numeric - model$biodiversity[[1]]$observations$observed <- as.numeric( model$biodiversity[[1]]$observations$observed ) - } - - # Get Preds and convert to sparse matrix with set labels - # FIXME: Support manual provision of data via xgb.DMatrix.save to save preprocessing time? 
- train_cov <- model$biodiversity[[1]]$predictors[,model$biodiversity[[1]]$predictors_names] - # Check if there any factors, if yes split up - if(any(model$biodiversity[[1]]$predictors_types$type=='factor')){ - vf <- model$biodiversity[[1]]$predictors_types$predictors[which(model$biodiversity[[1]]$predictors_types$type == "factor")] - # Get factors - z <- explode_factor(train_cov[[vf]], name = vf) - # Remove variables from train_cov and append - train_cov[[vf]] <- NULL - train_cov <- cbind(train_cov, z) - model$biodiversity[[1]]$predictors <- train_cov # Save new in model object - model$biodiversity[[1]]$predictors_types <- rbind(model$biodiversity[[1]]$predictors_types, data.frame(predictors = colnames(z), type = "numeric")) - } - train_cov <- as.matrix( train_cov ) - labels <- model$biodiversity[[1]]$observations$observed - - # ---- # - # Create the subsample based on the subsample parameter for all presence data - # if(model$biodiversity[[1]]$type == "poipo"){ - # ind <- sample(which(labels>0), size = params$subsample * length(which(labels>0)) ) - # ind2 <- which( which(labels>0) %notin% ind ) - # ind_ab <- which(labels==0) - # ind_train <- c(ind, ind_ab); ind_test <- c(ind2, ind_ab) - # } else { - # ind_train <- sample(1:length(labels), size = params$subsample * length(labels) ) - # ind_test <- which((1:length(labels)) %notin% ind_train ) - # } - # Create the sparse matrix for training and testing data - df_train <- xgboost::xgb.DMatrix(data = train_cov, - label = labels#[ind_train] - ) - # df_test <- xgboost::xgb.DMatrix(data = train_cov[c(ind_test),], - # label = labels[c(ind_test)] - # ) - # --- # - # Prediction container - pred_cov <- model$predictors[,model$biodiversity[[1]]$predictors_names] - if(any(model$predictors_types$type=='factor')){ - vf <- model$predictors_types$predictors[which(model$predictors_types$type == "factor")] - # Get factors - z <- explode_factor(pred_cov[[vf]], name = vf) - # Remove variables from train_cov and append - pred_cov[[vf]] <- NULL - pred_cov <- cbind(pred_cov, z) - model$predictors <- pred_cov # Save new in model object - model$predictors_types <- rbind(model$predictors_types, data.frame(predictors = colnames(z), type = "numeric")) - model$biodiversity[[1]]$predictors_names <- colnames(pred_cov) - model$predictors_names <- colnames(pred_cov) - } - pred_cov <- as.matrix( pred_cov ) - # Ensure that the column names are identical for both - pred_cov <- pred_cov[, colnames(train_cov)] - - # Clamp? 
- if( settings$get("clamp") ) pred_cov <- clamp_predictions(model, pred_cov) - - # Set target variables to bias_value for prediction if specified - if(!is.Waiver(settings$get('bias_variable'))){ - for(i in 1:length(settings$get('bias_variable'))){ - if(settings$get('bias_variable')[i] %notin% colnames(pred_cov)){ - if(getOption('ibis.setupmessages')) myLog('[Estimation]','red','Did not find bias variable in prediction object!') - next() - } - pred_cov[,settings$get('bias_variable')[i]] <- settings$get('bias_value')[i] - } - } - df_pred <- xgboost::xgb.DMatrix(data = as.matrix(pred_cov)) - assertthat::assert_that(all(colnames(df_train) == colnames(df_pred))) - - if(fam == "count:poisson"){ - # Specifically for count poisson data we will set the areas - assertthat::assert_that(all(is.finite(log(w))), - all(is.finite(log(w_full)))) - # as an exposure offset for the base_margin - xgboost::setinfo(df_train, "base_margin", log(w)) - # xgboost::setinfo(df_test, "base_margin", log(w[ind_test])) - assertthat::assert_that(nrow(df_pred) == length(w_full)) - xgboost::setinfo(df_pred, "base_margin", log(w_full)) - params$eval_metric <- "logloss" - } else if(fam == 'binary:logistic'){ - params$eval_metric <- "logloss" - } - - # Process and add priors if set - if(!is.Waiver(model$priors)){ - assertthat::assert_that( - all( model$priors$varnames() %in% model$biodiversity[[1]]$predictors_names ) - ) - # Match position of variables with monotonic constrains - mc <- rep(0, ncol(train_cov)) - names(mc) <- colnames(train_cov) - for(v in model$priors$varnames()){ - mc[v] <- switch (model$priors$get(v), - 'increasing' = 1, 'positive' = 1, - 'decreasing' = -1, 'negative' = -1, - 0 - ) - } - # Save the monotonic constrain - params$monotone_constraints <- mc - } - - if(!is.Waiver(model$offset) ){ - # Set offset to 1 (log(0)) in case nothing is found - if(is.null(xgboost::getinfo(df_train, "base_margin"))) { - of_train <- rep(1, nrow(model$biodiversity[[1]]$observations[,c("x","y")])) - of_pred <- rep(1, nrow(model$offset)) - } else { - # For the offset we simply add the (log-transformed) offset to the existing one - # given that for example log(2*3) == log(2) + log(3) - of_train <- xgboost::getinfo(df_train, "base_margin") - # of_test <- xgboost::getinfo(df_test, "base_marginfit_xgb") |> exp() - of_pred <- xgboost::getinfo(df_pred, "base_margin") - } - # -- Add offset to full prediction and load vector -- - - # Respecify offset - # (Set NA to 1 so that log(1) == 0) - of <- model$offset; of[, "spatial_offset" ] <- ifelse(is.na(of[, "spatial_offset" ]), 1, of[, "spatial_offset"]) - of1 <- get_rastervalue(coords = model$biodiversity[[1]]$observations[,c("x","y")], - env = model$offset_object, - rm.na = FALSE - ) - names(of1)[which(names(of1)==names(model$offset_object))] <- "spatial_offset" - # of2 <- get_rastervalue(coords = model$biodiversity[[1]]$observations[ind_test,c("x","y")], - # env = model$offset_object, - # rm.na = FALSE - # # longlat = raster::isLonLat(self$get_data("template")), - # # field_space = c('x','y') - # ) - # names(of2)[which(names(of2)==names(model$offset_object))] <- "spatial_offset" - assertthat::assert_that(nrow(of1) == length(of_train), - # nrow(of2) == length(of_test), - nrow(of) == length(of_pred)) - of_train <- of_train + of1[,"spatial_offset"] - # of_test <- of_test + of2[,"spatial_offset"] - of_pred <- of_pred + of[,"spatial_offset"] - - # Check that values are valid - assertthat::assert_that(all(is.finite(of_train)), all(is.finite(of_pred)), - !anyNA(of_train), !anyNA(of_pred)) - - # 
Set the new offset - xgboost::setinfo(df_train, "base_margin", ( of_train )) - # xgboost::setinfo(df_test, "base_margin", of_test) - xgboost::setinfo(df_pred, "base_margin", ( of_pred )) - } - - # --- # - # Save both training and predicting data in the engine data - self$set_data("df_train", df_train) - # self$set_data("df_test", df_test) - self$set_data("df_pred", df_pred) - # --- # - - # Set objective - params$objective <- fam - - self$set_control( params = params ) - - # Instead of invisible return the model object - return( model ) - }, - # Training function - train = function(self, model, settings, ...){ - assertthat::assert_that( - inherits(settings,'Settings'), - is.list(model),length(model)>1, - # Check that model id and setting id are identical - settings$modelid == model$id - ) - # Get name - name <- model$biodiversity[[1]]$name - - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Estimation]','green', paste0('Starting fitting: ', name)) - - # Verbosity - verbose <- settings$get("verbose") - - # Get output raster - prediction <- self$get_data('template') - - # Get parameters control - params <- self$get_data('params') - # Check only linear and reset to linear booster then - if(settings$get("only_linear")) params$booster <- "gblinear" else params$booster <- "gbtree" - # Check that link function and objective is changed if needed - li <- model$biodiversity[[1]]$link - if(!is.null(li)){ - if(model$biodiversity[[1]]$family=="binomial"){ - li <- match.arg(li, c("logit", "cloglog"),several.ok = FALSE) - if(li=="cloglog") params$objective <- "binary:logitraw" - } else { - if(getOption('ibis.setupmessages')) myLog('[Estimation]','red',paste0("Package does not support custom link functions. Ignored!")) - } - } - - # All other needed data for model fitting - df_train <- self$get_data("df_train") - # df_test <- self$get_data("df_test") - df_pred <- self$get_data("df_pred") - w <- model$biodiversity[[1]]$expect # The expected weight - - assertthat::assert_that( - !is.null(df_pred), - all( colnames(df_pred) %in% colnames(df_train) ) - ) - - # Get number of rounds from parameters - nrounds <- params$nrounds;params$nrounds <- NULL - - # --- # - # Pass this parameter possibly on from upper level - # This implements a simple grid search for optimal parameter values - # Using the training data only (!) 
- if(settings$get('varsel') == "reg"){ - if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Starting hyperparameters search.') - - # Create combinations of random hyper parameters - set.seed(20) - if(params$booster == 'gblinear'){ - parameters_df <- expand.grid( - lambda = seq(0,7,0.25), alpha = seq(0,1, 0.1), - eval = NA - ) - } else { - parameters_df <- expand.grid( - # overfitting - max_depth = 1:6, - gamma = 0:5, - min_child_weight = 0:6, - # Randomness - subsample = runif(1, .7, 1), - colsample_bytree = runif(1, .6, 1), - eval = NA - ) - } - - # Progressbar - pb <- progress::progress_bar$new(total = nrow(parameters_df)) - # TODO: Could be parallized - for (row in 1:nrow(parameters_df)){ - test_params <- list( - booster = params$booster, - objective = params$objective - ) - if(test_params$booster=='gblinear'){ - test_params$lambda <- parameters_df$lambda[row] - test_params$alpha <- parameters_df$alpha[row] - } else { - test_params$gamma <- parameters_df$gamma[row] - test_params$max_depth <- parameters_df$max_depth[row] - test_params$min_child_weight <- parameters_df$min_child_weight[row] - test_params$subsample <- parameters_df$subsample[row] - test_params$colsample_bytree <- parameters_df$colsample_bytree[row] - } - - suppressMessages( - test_xgb <- xgboost::xgboost( - params = test_params, - data = df_train, - nrounds = 100, - verbose = ifelse(verbose, 1, 0) - ) - ) - if(verbose) pb$tick() - parameters_df$eval[row] <- min(test_xgb$evaluation_log[,2]) - } - # Get the one with minimum error and replace params values - p <- parameters_df[which.min(parameters_df$eval),] - p$eval <- NULL - for(i in names(p)){ params[[i]] <- as.numeric(p[i]) } - - # Find the optimal number of rounds using 5-fold cross validation - if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Crossvalidation for determining early stopping rule.') - - fit_cv <- xgboost::xgb.cv( - params = params, - data = df_train, - verbose = ifelse(verbose, 1, 0), - print_every_n = 100, - nrounds = nrounds, nfold = 5, - showsd = TRUE, # standard deviation of loss across folds - stratified = TRUE, # sample is unbalanced; use stratified sampling - maximize = FALSE, - early_stopping_rounds = 10 - ) - # Set new number of rounds - nround <- fit_cv$best_iteration - } - - # Remove unneeded parameters - if(settings$get('only_linear') && params$booster == "gblinear"){ - params[c("colsample_bytree", "gamma", "max_depth", "min_child_weight", "subsample")] <- NULL - } - if(settings$get('only_linear') && !is.Waiver(model$priors)){ - if(getOption('ibis.setupmessages')) myLog('[Estimation]','red','Monotonic constraints not supported for linear regressor.') - } - # Fit the model. 
- # watchlist <- list(train = df_train,test = df_test) - fit_xgb <- xgboost::xgboost( - params = params, - data = df_train, - # watchlist = watchlist, - nrounds = nrounds, - verbose = ifelse(verbose, 1, 0), - early_stopping_rounds = min(nrounds, ceiling(nrounds*.25)), - print_every_n = 100 - ) - # --- # - - # Predict spatially - if(!settings$get('inference_only')){ - # Messager - if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Starting prediction...') - - # Make a prediction - suppressWarnings( - pred_xgb <- xgboost:::predict.xgb.Booster( - object = fit_xgb, - newdata = df_pred - ) - ) - if(params$objective=="binary:logitraw") pred_xgb <- ilink(pred_xgb, "cloglog") - - # Fill output with summaries of the posterior - prediction[] <- pred_xgb - names(prediction) <- 'mean' - prediction <- raster::mask(prediction, self$get_data("template") ) - - } else { - # No prediction done - prediction <- NULL - } - # Compute end of computation time - settings$set('end.time', Sys.time()) - # Also append boosting control option to settings - for(entry in names(params)) settings$set(entry, params[entry]) - - # Definition of XGBOOST Model object ---- - # Create output - out <- bdproto( - "XGBOOST-Model", - DistributionModel, - id = model$id, - model = model, - settings = settings, - fits = list( - "fit_best" = fit_xgb, - "prediction" = prediction - ), - # Partial effects - partial = function(self, x.var = NULL, constant = NULL, variable_length = 100, values = NULL, plot = TRUE, type = "response"){ - assertthat::assert_that(is.character(x.var) || is.null(x.var)) - if(!is.null(constant)) message("Constant is ignored for xgboost!") - check_package("pdp") - mod <- self$get_data('fit_best') - df <- self$model$biodiversity[[length( self$model$biodiversity )]]$predictors - df <- subset(df, select = mod$feature_names) - - # Match x.var to argument - if(is.null(x.var)){ - x.var <- colnames(df) - } else { - x.var <- match.arg(x.var, mod$feature_names, several.ok = FALSE) - } - - # if values are set, make sure that they cover the data.frame - if(!is.null(values)){ - assertthat::assert_that(length(x.var) == 1) - df2 <- list() - df2[[x.var]] <- values - # Then add the others - for(var in colnames(df)){ - if(var == x.var) next() - df2[[var]] <- mean(df[[var]], na.rm = TRUE) - } - df2 <- df2 |> as.data.frame() - df2 <- df2[, mod$feature_names] - } else { - df2 <- df - } - - # Check that variables are in - assertthat::assert_that(all( x.var %in% colnames(df) ), - all( names(df) == mod$feature_names ), - msg = 'Variable not in predicted model.') - - pp <- data.frame() - pb <- progress::progress_bar$new(total = length(x.var)) - for(v in x.var){ - p1 <- pdp::partial(mod, pred.var = v, pred.grid = df2, ice = FALSE, center = FALSE, - plot = FALSE, rug = TRUE, train = df) - p1 <- p1[,c(x.var, "yhat")] - names(p1) <- c("partial_effect", "mean") - p1$variable <- v - pp <- rbind(pp, p1) - if(length(x.var) > 1) pb$tick() - } - - if(plot){ - # Make a plot - pm <- ggplot2::ggplot(data = pp, ggplot2::aes(x = partial_effect, y = mean)) + - ggplot2::theme_classic(base_size = 18) + - ggplot2::geom_line() + - ggplot2::labs(x = "", y = expression(hat(y))) + - ggplot2::facet_wrap(~variable,scales = 'free') - print(pm) - } - # Return the data - return(pp) - }, - # Spatial partial dependence plot - spartial = function(self, x.var, constant = NULL, plot = TRUE, ...){ - assertthat::assert_that(is.character(x.var) || is.null(x.var), - "model" %in% names(self)) - - # Get data - mod <- self$get_data('fit_best') - model <- self$model 
- x.var <- match.arg(x.var, model$predictors_names, several.ok = FALSE) - - # Get predictor - df <- subset(model$predictors, select = mod$feature_names) - # Convert all non x.vars to the mean - # Make template of target variable(s) - template <- raster::rasterFromXYZ( - raster::coordinates( model$predictors_object$get_data() ), - crs = raster::projection(model$background) - ) - - # Set all variables other the target variable to constant - if(is.null(constant)){ - # Calculate mean - # FIXME: for factor use mode! - constant <- apply(df, 2, function(x) mean(x, na.rm=T)) - for(v in mod$feature_names[ mod$feature_names %notin% x.var]){ - if(v %notin% names(df) ) next() - df[!is.na(df[v]),v] <- as.numeric( constant[v] ) - } - } else { - df[!is.na(df[,x.var]), mod$feature_names[ mod$feature_names %notin% x.var]] <- constant - } - df <- xgboost::xgb.DMatrix(data = as.matrix(df)) - - # Spartial prediction - suppressWarnings( - pp <- xgboost:::predict.xgb.Booster( - object = mod, - newdata = df - ) - ) - # Fill output with summaries of the posterior - template[] <- pp - names(template) <- 'mean' - template <- raster::mask(template, model$background) - - if(plot){ - # Quick plot - raster::plot(template, col = ibis_colours$viridis_plasma, main = paste0(x.var, collapse ='|')) - } - # Also return spatial - return(template) - }, - # Engine-specific projection function - project = function(self, newdata, layer = "mean"){ - assertthat::assert_that(!missing(newdata), - is.data.frame(newdata) || inherits(newdata, "xgb.DMatrix") ) - - mod <- self$get_data('fit_best') - # Get model object - model <- self$model - - # Also get settings for bias values - settings <- self$settings - - if(!inherits(newdata, "xgb.DMatrix")){ - assertthat::assert_that( - all( mod$feature_names %in% colnames(newdata) ) - ) - newdata <- subset(newdata, select = mod$feature_names) - - # Clamp? - if( settings$get("clamp") ) newdata <- clamp_predictions(model, newdata) - - if(!is.Waiver(settings$get('bias_variable'))){ - for(i in 1:length(settings$get('bias_variable'))){ - if(settings$get('bias_variable')[i] %notin% colnames(newdata)){ - if(getOption('ibis.setupmessages')) myLog('[Estimation]','red','Did not find bias variable in prediction object!') - next() - } - newdata[,settings$get('bias_variable')[i]] <- settings$get('bias_value')[i] - } - } - newdata <- xgboost::xgb.DMatrix(as.matrix(newdata)) - } else {stop("Not implemented. 
Supply a data.frame as newdata!")}
-
-          # Make a prediction
-          suppressWarnings(
-            pred_xgb <- xgboost:::predict.xgb.Booster(
-              object = mod,
-              newdata = newdata
-            )
-          )
-
-          # Fill output with summaries of the posterior
-          prediction <- emptyraster( self$model$predictors_object$get_data()[[1]] ) # Background
-          prediction[] <- pred_xgb
-          prediction <- raster::mask(prediction, self$model$predictors_object$get_data()[[1]] )
-
-          return(prediction)
-        },
-        # Get coefficients
-        get_coefficients = function(self){
-          # Returns a vector of the coefficients with direction/importance
-          obj <- self$get_data('fit_best')
-          # Simply use the weights from the importance estimates
-          cofs <- xgboost:::xgb.importance(model = obj) %>%
-            as.data.frame()
-          cofs$Sigma <- NA
-          if(!self$settings$get("only_linear")){
-            cofs <- subset(cofs, select = c("Feature", "Gain", "Sigma"))
-          }
-          names(cofs) <- c("Feature", "Beta", "Sigma")
-          return(cofs)
-        },
-        # Save the model object
-        save = function(self, fname, what = "fit_best"){
-          assertthat::assert_that(is.character(fname))
-          xgboost::xgb.save( self$get_data(what), fname)
-        }
-      )
-      return(out)
-    }
-  )
-  ) # End of bdproto object
-} # End of function
+#' @include bdproto-engine.R utils-spatial.R bdproto-distributionmodel.R
+NULL
+
+#' Engine for extreme gradient boosting (XGBoost)
+#'
+#' @description Allows estimation of eXtreme gradient boosting ('XGBoost') models for tree-based or linear boosting regressions.
+#' The XGBoost engine is a flexible yet powerful engine with many customization options,
+#' supporting multiple options to perform single and multi-class regression
+#' and classification tasks. For a full list of options users are advised to have a look at the
+#' [xgboost::xgb.train] help file and [https://xgboost.readthedocs.io](https://xgboost.readthedocs.io).
+#'
+#' @details
+#' The default parameters have been set relatively conservatively so as to reduce overfitting.
+#'
+#' XGBoost supports the specification of monotonic constraints on certain variables. Within
+#' ibis this is possible via [`XGBPrior`]. However, constraints are available only for the
+#' \code{"gbtree"} baselearners.
+#'
+#' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object.
+#' @param booster A [`character`] of the booster to use. Either \code{"gbtree"} or \code{"gblinear"} (Default: \code{"gbtree"}).
+#' @param learning_rate [`numeric`] value indicating the learning rate (eta).
+#' Lower values are generally better but also computationally more costly. (Default: \code{1e-3})
+#' @param iter [`numeric`] value giving the maximum number of boosting iterations for cross-validation (Default: \code{8e3L}).
+#' @param gamma [`numeric`] A regularization parameter in the model. Lower values for better estimates (Default: \code{6}).
+#' Also see the [reg_lambda] parameter for the L2 regularization on the weights.
+#' @param reg_lambda [`numeric`] L2 regularization term on weights (Default: \code{0}).
+#' @param reg_alpha [`numeric`] L1 regularization term on weights (Default: \code{0}).
+#' @param max_depth [`numeric`] The maximum depth of a tree (Default: \code{2}).
+#' @param subsample [`numeric`] The ratio used for subsampling to prevent overfitting. Also used for creating a random
+#' testing dataset (Default: \code{0.75}).
+#' @param colsample_bytree [`numeric`] Sub-sample ratio of columns when constructing each tree (Default: \code{0.4}).
+#' @param min_child_weight [`numeric`] Broadly related to the number of instances necessary for each node (Default: \code{3}).
+#' @param nthread [`numeric`] giving the number of CPU threads to use.
+#' @param ... Other non-specified parameters.
+#' @note
+#' *'Machine learning is statistics minus any checking of models and assumptions'* ~ Brian D. Ripley, useR! 2004, Vienna
+#' @seealso [xgboost::xgb.train]
+#' @references
+#' * Tianqi Chen and Carlos Guestrin, "XGBoost: A Scalable Tree Boosting System", 22nd SIGKDD Conference on Knowledge Discovery and Data Mining, 2016, https://arxiv.org/abs/1603.02754
+#' @family engine
+#' @returns An [engine].
+#' @examples
+#' \dontrun{
+#' # Add xgboost as an engine
+#' x <- distribution(background) |> engine_xgboost(iter = 4000)
+#' }
+#' @name engine_xgboost
+NULL
+#' @rdname engine_xgboost
+#' @export
+
+engine_xgboost <- function(x,
+                           booster = "gbtree",
+                           iter = 8e3L,
+                           learning_rate = 1e-3,
+                           gamma = 6,
+                           reg_lambda = 0,
+                           reg_alpha = 0,
+                           max_depth = 2,
+                           subsample = 0.75,
+                           colsample_bytree = 0.4,
+                           min_child_weight = 3,
+                           nthread = getOption('ibis.nthread'),
+                           ...) {
+
+  # Check whether xgboost package is available
+  check_package('xgboost')
+  if(!("xgboost" %in% loadedNamespaces()) || ('xgboost' %notin% utils::sessionInfo()$otherPkgs) ) {
+    try({requireNamespace('xgboost');attachNamespace("xgboost")},silent = TRUE)
+  }
+
+  # assert that arguments are valid
+  assertthat::assert_that(inherits(x, "BiodiversityDistribution"),
+                          inherits(x$background,'sf'),
+                          is.character(booster) && booster %in% c("gbtree","gblinear"),
+                          is.numeric(iter),
+                          is.numeric(learning_rate) && (learning_rate > 0 && learning_rate < 1),
+                          is.numeric(max_depth),
+                          is.numeric(subsample) && (subsample > 0 && subsample <= 1),
+                          is.numeric(colsample_bytree),
+                          is.numeric(nthread)
+  )
+
+  # Create a background raster
+  if(is.Waiver(x$predictors)){
+    # Create from background
+    template <- raster::raster(
+      ext = raster::extent(x$background),
+      crs = raster::projection(x$background),
+      res = c(diff( (sf::st_bbox(x$background)[c(1,3)]) ) / 100, # Simplified assumption for resolution
+              diff( (sf::st_bbox(x$background)[c(1,3)]) ) / 100
+      )
+    )
+  } else {
+    # If predictors exist, use them
+    template <- emptyraster(x$predictors$get_data() )
+  }
+
+  # Burn in the background
+  template <- raster::rasterize(x$background, template, field = 0)
+
+  # Set up the parameter list
+  params <- list(
+    booster = booster,
+    nrounds = iter,
+    eta = learning_rate,
+    gamma = gamma,
+    lambda = reg_lambda,
+    alpha = reg_alpha,
+    max_depth = max_depth,
+    subsample = subsample,
+    colsample_bytree = colsample_bytree,
+    min_child_weight = min_child_weight,
+    nthread = nthread,
+    ...
+ ) + + # Print a message in case there is already an engine object + if(!is.Waiver(x$engine)) myLog('[Setup]','yellow','Replacing currently selected engine.') + + # Set engine in distribution object + x$set_engine( + bdproto( + "XGBOOST-Engine", + Engine, + name = "", + data = list( + 'template' = template, + 'params' = params + ), + # Dummy function for spatial latent effects + calc_latent_spatial = function(self, type = NULL, priors = NULL){ + new_waiver() + }, + # Dummy function for getting the equation of latent effects + get_equation_latent_spatial = function(self, method){ + new_waiver() + }, + # Function to respecify the control parameters + set_control = function(self, + params + ){ + assertthat::assert_that(is.list(params)) + # Overwrite existing + self$data$params <- params + invisible() + }, + # Setup function + setup = function(self, model, settings = NULL, ...){ + # Simple security checks + assertthat::assert_that( + assertthat::has_name(model, 'background'), + assertthat::has_name(model, 'biodiversity'), + inherits(settings,'Settings') || is.null(settings), + nrow(model$predictors) == ncell(self$get_data('template')), + !is.Waiver(self$get_data("params")), + length(model$biodiversity) == 1 # Only works with single likelihood. To be processed separately + ) + # Messenger + if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Engine setup.') + + # Get parameters + params <- self$data$params + + # Distribution specific procedure + fam <- switch(model$biodiversity[[1]]$family, + "poisson" = "count:poisson", + "binomial" = "binary:logistic", + model$biodiversity[[1]]$family + ) + + # Change the number of variables included if custom equation is used + if(!is.Waiver(model$biodiversity[[1]]$equation)){ + form <- model$biodiversity[[1]]$equation + varn <- model$biodiversity[[1]]$predictors_names[which( all.vars(form) %in% model$biodiversity[[1]]$predictors_names )] + assertthat::assert_that(length(varn)>0) + # Match to existing ones and remove those not covered + model$biodiversity[[1]]$predictors_names <- model$biodiversity[[1]]$predictors_names[match(varn, model$biodiversity[[1]]$predictors_names)] + model$biodiversity[[1]]$predictors_types <- subset(model$biodiversity[[1]]$predictors_types, + predictors %in% model$biodiversity[[1]]$predictors_names) + } + + # If a poisson family is used, weight the observations by their exposure + if(fam == "count:poisson" && model$biodiversity[[1]]$type == "poipo"){ + # Get background layer + bg <- self$get_data("template") + assertthat::assert_that(!is.na(cellStats(bg,min))) + + # Add pseudo-absence points + suppressMessages( + presabs <- add_pseudoabsence(df = model$biodiversity[[1]]$observations, + field_occurrence = 'observed', + template = bg, + settings = model$biodiversity[[1]]$pseudoabsence_settings) + ) + if(inherits(presabs, 'sf')) presabs <- presabs |> sf::st_drop_geometry() + + # Sample environmental points for absence only points + abs <- subset(presabs, observed == 0) + # Re-extract environmental information for absence points + envs <- get_rastervalue(coords = abs[,c('x','y')], + env = model$predictors_object$get_data(df = FALSE), + rm.na = FALSE) + if(assertthat::has_name(model$biodiversity[[1]]$predictors, "Intercept")){ envs$Intercept <- 1} + + # Format out + df <- rbind(model$biodiversity[[1]]$predictors[,c('x','y','Intercept', model$biodiversity[[1]]$predictors_names)], + envs[,c('x','y','Intercept', model$biodiversity[[1]]$predictors_names)] ) + any_missing <- which(apply(df, 1, function(x) any(is.na(x)))) + 
if(length(any_missing)>0){
+            presabs <- presabs[-any_missing,] # This works as they are in the same order
+            model$biodiversity[[1]]$expect <- model$biodiversity[[1]]$expect[-any_missing]
+            # Fill the absences with 1 as multiplier. This works since absences follow the presences
+            model$biodiversity[[1]]$expect <- c( model$biodiversity[[1]]$expect,
+                                                 rep(1, nrow(presabs)-length(model$biodiversity[[1]]$expect) ))
+          }
+          df <- subset(df, stats::complete.cases(df))
+          assertthat::assert_that(nrow(presabs) == nrow(df))
+
+          # Overwrite observation data
+          model$biodiversity[[1]]$observations <- presabs
+
+          # Preprocessing security checks
+          assertthat::assert_that( all( model$biodiversity[[1]]$observations[['observed']] >= 0 ),
+                                   any(!is.na(presabs[['observed']])),
+                                   length(model$biodiversity[[1]]$expect)==nrow(model$biodiversity[[1]]$observations),
+                                   nrow(df) == nrow(model$biodiversity[[1]]$observations)
+          )
+
+          # Add offset if it exists
+          if(!is.Waiver(model$offset)){
+            # ofs <- get_ngbvalue(coords = df[,c('x','y')],
+            #                     env = model$offset,
+            #                     longlat = raster::isLonLat(bg),
+            #                     field_space = c('x','y')
+            # )
+            ofs <- get_rastervalue(coords = df[,c('x','y')],
+                                   env = model$offset_object,
+                                   rm.na = FALSE)
+            # Rename to spatial offset
+            names(ofs)[which(names(ofs)==names(model$offset_object))] <- "spatial_offset"
+            model$biodiversity[[1]]$offset <- ofs
+          }
+
+          # Define expectation as very small vector following Renner et al.
+          w <- ppm_weights(df = df,
+                           pa = model$biodiversity[[1]]$observations[['observed']],
+                           bg = bg,
+                           weight = 1e-6 # Give presence records a very small weight following Renner et al.
+          )
+          assertthat::assert_that(length(w) == nrow(df))
+
+          model$biodiversity[[1]]$predictors <- df
+          model$biodiversity[[1]]$expect <- w * (1/model$biodiversity[[1]]$expect)
+
+          # Get the weights for the full dataset
+          pres <- raster::rasterize(model$biodiversity[[1]]$observations[,c("x","y")],
+                                    bg, fun = 'count', background = 0)
+          w_full <- ppm_weights(df = model$predictors,
+                                pa = pres[],
+                                bg = bg,
+                                weight = 1 # Set those to 1 so that absences become ratio of pres/abs
+          )
+          # Multiply with first weight value
+          w_full <- w_full * (1/unique(model$biodiversity[[1]]$expect)[1])
+          assertthat::assert_that(
+            !anyNA(w_full), all(is.finite(log(w_full))),
+            !anyNA(w_full),
+            length(w_full) == nrow(model$predictors)
+          )
+
+        } else if(fam == "binary:logistic"){
+          # calculating the case weights (equal weights)
+          # the order of weights should be the same as presences and backgrounds in the training data
+          prNum <- as.numeric(table(model$biodiversity[[1]]$observations[['observed']])["1"]) # number of presences
+          bgNum <- as.numeric(table(model$biodiversity[[1]]$observations[['observed']])["0"]) # number of backgrounds
+          w <- ifelse(model$biodiversity[[1]]$observations[['observed']] == 1, 1, prNum / bgNum)
+          model$biodiversity[[1]]$expect <- w * model$biodiversity[[1]]$expect
+          # Convert to numeric
+          model$biodiversity[[1]]$observations$observed <- as.numeric( model$biodiversity[[1]]$observations$observed )
+        }
+
+        # Get Preds and convert to sparse matrix with set labels
+        # FIXME: Support manual provision of data via xgb.DMatrix.save to save preprocessing time?
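+        # One possible shape for the FIXME above (a sketch only, not wired in):
+        # a pre-built design matrix could be cached on disk and reloaded with
+        #   dtrain <- xgboost::xgb.DMatrix(data = as.matrix(covs), label = obs)
+        #   xgboost::xgb.DMatrix.save(dtrain, "train.dmatrix")
+        #   dtrain <- xgboost::xgb.DMatrix("train.dmatrix")
+        # where covs/obs stand in for the prepared covariates and labels.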
+          train_cov <- model$biodiversity[[1]]$predictors[,model$biodiversity[[1]]$predictors_names]
+          # Check if there are any factors; if yes, split them up
+          if(any(model$biodiversity[[1]]$predictors_types$type=='factor')){
+            vf <- model$biodiversity[[1]]$predictors_types$predictors[which(model$biodiversity[[1]]$predictors_types$type == "factor")]
+            # Get factors
+            z <- explode_factor(train_cov[[vf]], name = vf)
+            # Remove variables from train_cov and append
+            train_cov[[vf]] <- NULL
+            train_cov <- cbind(train_cov, z)
+            model$biodiversity[[1]]$predictors <- train_cov # Save new in model object
+            model$biodiversity[[1]]$predictors_types <- rbind(model$biodiversity[[1]]$predictors_types, data.frame(predictors = colnames(z), type = "numeric"))
+          }
+          train_cov <- as.matrix( train_cov )
+          labels <- model$biodiversity[[1]]$observations$observed
+
+          # ---- #
+          # Create the subsample based on the subsample parameter for all presence data
+          # if(model$biodiversity[[1]]$type == "poipo"){
+          #   ind <- sample(which(labels>0), size = params$subsample * length(which(labels>0)) )
+          #   ind2 <- which( which(labels>0) %notin% ind )
+          #   ind_ab <- which(labels==0)
+          #   ind_train <- c(ind, ind_ab); ind_test <- c(ind2, ind_ab)
+          # } else {
+          #   ind_train <- sample(1:length(labels), size = params$subsample * length(labels) )
+          #   ind_test <- which((1:length(labels)) %notin% ind_train )
+          # }
+          # Create the sparse matrix for training and testing data
+          df_train <- xgboost::xgb.DMatrix(data = train_cov,
+                                           label = labels#[ind_train]
+          )
+          # df_test <- xgboost::xgb.DMatrix(data = train_cov[c(ind_test),],
+          #                                 label = labels[c(ind_test)]
+          # )
+          # --- #
+          # Prediction container
+          pred_cov <- model$predictors[,model$biodiversity[[1]]$predictors_names]
+          if(any(model$predictors_types$type=='factor')){
+            vf <- model$predictors_types$predictors[which(model$predictors_types$type == "factor")]
+            # Get factors
+            z <- explode_factor(pred_cov[[vf]], name = vf)
+            # Remove variables from pred_cov and append
+            pred_cov[[vf]] <- NULL
+            pred_cov <- cbind(pred_cov, z)
+            model$predictors <- pred_cov # Save new in model object
+            model$predictors_types <- rbind(model$predictors_types, data.frame(predictors = colnames(z), type = "numeric"))
+            model$biodiversity[[1]]$predictors_names <- colnames(pred_cov)
+            model$predictors_names <- colnames(pred_cov)
+          }
+          pred_cov <- as.matrix( pred_cov )
+          # Ensure that the column names are identical for both
+          pred_cov <- pred_cov[, colnames(train_cov)]
+
+          # Clamp?
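+          # (Conceptually, clamping caps each prediction covariate at the range
+          #  seen during training, i.e. roughly
+          #    x <- pmin(pmax(x, min(x_train)), max(x_train))
+          #  for every numeric column; clamp_predictions() holds the actual logic.)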
+          if( settings$get("clamp") ) pred_cov <- clamp_predictions(model, pred_cov)
+
+          # Set target variables to bias_value for prediction if specified
+          if(!is.Waiver(settings$get('bias_variable'))){
+            for(i in 1:length(settings$get('bias_variable'))){
+              if(settings$get('bias_variable')[i] %notin% colnames(pred_cov)){
+                if(getOption('ibis.setupmessages')) myLog('[Estimation]','red','Did not find bias variable in prediction object!')
+                next()
+              }
+              pred_cov[,settings$get('bias_variable')[i]] <- settings$get('bias_value')[i]
+            }
+          }
+          df_pred <- xgboost::xgb.DMatrix(data = as.matrix(pred_cov))
+          assertthat::assert_that(all(colnames(df_train) == colnames(df_pred)))
+
+          if(fam == "count:poisson"){
+            # Specifically for count poisson data we will set the areas
+            assertthat::assert_that(all(is.finite(log(w))),
+                                    all(is.finite(log(w_full))))
+            # as an exposure offset for the base_margin
+            xgboost::setinfo(df_train, "base_margin", log(w))
+            # xgboost::setinfo(df_test, "base_margin", log(w[ind_test]))
+            assertthat::assert_that(nrow(df_pred) == length(w_full))
+            xgboost::setinfo(df_pred, "base_margin", log(w_full))
+            params$eval_metric <- "logloss"
+          } else if(fam == 'binary:logistic'){
+            params$eval_metric <- "logloss"
+          }
+
+          # Process and add priors if set
+          if(!is.Waiver(model$priors)){
+            assertthat::assert_that(
+              all( model$priors$varnames() %in% model$biodiversity[[1]]$predictors_names )
+            )
+            # Match position of variables with monotonic constraints
+            mc <- rep(0, ncol(train_cov))
+            names(mc) <- colnames(train_cov)
+            for(v in model$priors$varnames()){
+              mc[v] <- switch (model$priors$get(v),
+                               'increasing' = 1, 'positive' = 1,
+                               'decreasing' = -1, 'negative' = -1,
+                               0
+              )
+            }
+            # Save the monotonic constraints
+            params$monotone_constraints <- mc
+          }
+
+          if(!is.Waiver(model$offset) ){
+            # Set offset to 1 (log(1) == 0) in case nothing is found
+            if(is.null(xgboost::getinfo(df_train, "base_margin"))) {
+              of_train <- rep(1, nrow(model$biodiversity[[1]]$observations[,c("x","y")]))
+              of_pred <- rep(1, nrow(model$offset))
+            } else {
+              # For the offset we simply add the (log-transformed) offset to the existing one
+              # given that for example log(2*3) == log(2) + log(3)
+              of_train <- xgboost::getinfo(df_train, "base_margin")
+              # of_test <- xgboost::getinfo(df_test, "base_marginfit_xgb") |> exp()
+              of_pred <- xgboost::getinfo(df_pred, "base_margin")
+            }
+            # -- Add offset to full prediction and load vector --
+
+            # Respecify offset
+            # (Set NA to 1 so that log(1) == 0)
+            of <- model$offset; of[, "spatial_offset" ] <- ifelse(is.na(of[, "spatial_offset" ]), 1, of[, "spatial_offset"])
+            of1 <- get_rastervalue(coords = model$biodiversity[[1]]$observations[,c("x","y")],
+                                   env = model$offset_object,
+                                   rm.na = FALSE
+            )
+            names(of1)[which(names(of1)==names(model$offset_object))] <- "spatial_offset"
+            # of2 <- get_rastervalue(coords = model$biodiversity[[1]]$observations[ind_test,c("x","y")],
+            #                        env = model$offset_object,
+            #                        rm.na = FALSE
+            #                        # longlat = raster::isLonLat(self$get_data("template")),
+            #                        # field_space = c('x','y')
+            # )
+            # names(of2)[which(names(of2)==names(model$offset_object))] <- "spatial_offset"
+            assertthat::assert_that(nrow(of1) == length(of_train),
+                                    # nrow(of2) == length(of_test),
+                                    nrow(of) == length(of_pred))
+            of_train <- of_train + of1[,"spatial_offset"]
+            # of_test <- of_test + of2[,"spatial_offset"]
+            of_pred <- of_pred + of[,"spatial_offset"]
+
+            # Check that values are valid
+            assertthat::assert_that(all(is.finite(of_train)), all(is.finite(of_pred)),
+                                    !anyNA(of_train), !anyNA(of_pred))
+
+            # 
Set the new offset + xgboost::setinfo(df_train, "base_margin", ( of_train )) + # xgboost::setinfo(df_test, "base_margin", of_test) + xgboost::setinfo(df_pred, "base_margin", ( of_pred )) + } + + # --- # + # Save both training and predicting data in the engine data + self$set_data("df_train", df_train) + # self$set_data("df_test", df_test) + self$set_data("df_pred", df_pred) + # --- # + + # Set objective + params$objective <- fam + + self$set_control( params = params ) + + # Instead of invisible return the model object + return( model ) + }, + # Training function + train = function(self, model, settings, ...){ + assertthat::assert_that( + inherits(settings,'Settings'), + is.list(model),length(model)>1, + # Check that model id and setting id are identical + settings$modelid == model$id + ) + # Get name + name <- model$biodiversity[[1]]$name + + # Messenger + if(getOption('ibis.setupmessages')) myLog('[Estimation]','green', paste0('Starting fitting: ', name)) + + # Verbosity + verbose <- settings$get("verbose") + + # Get output raster + prediction <- self$get_data('template') + + # Get parameters control + params <- self$get_data('params') + # Check only linear and reset to linear booster then + if(settings$get("only_linear")) params$booster <- "gblinear" else params$booster <- "gbtree" + # Check that link function and objective is changed if needed + li <- model$biodiversity[[1]]$link + if(!is.null(li)){ + if(model$biodiversity[[1]]$family=="binomial"){ + li <- match.arg(li, c("logit", "cloglog"),several.ok = FALSE) + if(li=="cloglog") params$objective <- "binary:logitraw" + } else { + if(getOption('ibis.setupmessages')) myLog('[Estimation]','red',paste0("Package does not support custom link functions. Ignored!")) + } + } + + # All other needed data for model fitting + df_train <- self$get_data("df_train") + # df_test <- self$get_data("df_test") + df_pred <- self$get_data("df_pred") + w <- model$biodiversity[[1]]$expect # The expected weight + + assertthat::assert_that( + !is.null(df_pred), + all( colnames(df_pred) %in% colnames(df_train) ) + ) + + # Get number of rounds from parameters + nrounds <- params$nrounds;params$nrounds <- NULL + + # --- # + # Pass this parameter possibly on from upper level + # This implements a simple grid search for optimal parameter values + # Using the training data only (!) 
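+        # Hedged usage sketch: this grid search is toggled from train(), presumably
+        # via an argument matching the settings key checked below, e.g.
+        #   fit <- train(x, only_linear = FALSE, optim_hyperparam = TRUE)
+        # (the argument name is assumed here from that key, not confirmed).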
+        if(settings$get('optim_hyperparam')){
+          if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Starting hyperparameter search...')
+
+          # Create combinations of random hyperparameters
+          set.seed(20)
+          if(params$booster == 'gblinear'){
+            parameters_df <- expand.grid(
+              lambda = seq(0,7,0.25), alpha = seq(0,1, 0.1),
+              eval = NA
+            )
+          } else {
+            parameters_df <- expand.grid(
+              # overfitting
+              max_depth = 1:6,
+              gamma = 0:5,
+              min_child_weight = 0:6,
+              # Randomness
+              subsample = stats::runif(1, .7, 1),
+              colsample_bytree = stats::runif(1, .6, 1),
+              eval = NA
+            )
+          }
+
+          # Progressbar
+          pb <- progress::progress_bar$new(total = nrow(parameters_df))
+          # TODO: Could be parallelized
+          for (row in 1:nrow(parameters_df)){
+            test_params <- list(
+              booster = params$booster,
+              objective = params$objective
+            )
+            if(test_params$booster=='gblinear'){
+              test_params$lambda <- parameters_df$lambda[row]
+              test_params$alpha <- parameters_df$alpha[row]
+            } else {
+              test_params$gamma <- parameters_df$gamma[row]
+              test_params$max_depth <- parameters_df$max_depth[row]
+              test_params$min_child_weight <- parameters_df$min_child_weight[row]
+              test_params$subsample <- parameters_df$subsample[row]
+              test_params$colsample_bytree <- parameters_df$colsample_bytree[row]
+            }
+
+            suppressMessages(
+              test_xgb <- xgboost::xgboost(
+                params = test_params,
+                data = df_train,
+                nrounds = 100,
+                verbose = ifelse(verbose, 1, 0)
+              )
+            )
+            if(verbose) pb$tick()
+            parameters_df$eval[row] <- min(test_xgb$evaluation_log[,2])
+          }
+          # Get the one with minimum error and replace params values
+          p <- parameters_df[which.min(parameters_df$eval),]
+          p$eval <- NULL
+          for(i in names(p)){ params[[i]] <- as.numeric(p[i]) }
+
+          # Find the optimal number of rounds using 5-fold cross validation
+          if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Cross-validation for determining early stopping rule.')
+
+          fit_cv <- xgboost::xgb.cv(
+            params = params,
+            data = df_train,
+            verbose = ifelse(verbose, 1, 0),
+            print_every_n = 100,
+            nrounds = nrounds, nfold = 5,
+            showsd = TRUE, # standard deviation of loss across folds
+            stratified = TRUE, # sample is unbalanced; use stratified sampling
+            maximize = FALSE,
+            early_stopping_rounds = 10
+          )
+          # Set new number of rounds
+          nrounds <- fit_cv$best_iteration
+        }
+
+        # Remove unneeded parameters
+        if(settings$get('only_linear') && params$booster == "gblinear"){
+          params[c("colsample_bytree", "gamma", "max_depth", "min_child_weight", "subsample")] <- NULL
+        }
+        if(settings$get('only_linear') && !is.Waiver(model$priors)){
+          if(getOption('ibis.setupmessages')) myLog('[Estimation]','red','Monotonic constraints not supported for linear regressor.')
+        }
+        # Fit the model.
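+        # (Sketch only: if the test split commented out earlier were restored,
+        #  held-out loss could be monitored through the watchlist mechanism
+        #  hinted at below, e.g.
+        #    xgboost::xgb.train(params = params, data = df_train, nrounds = nrounds,
+        #                       watchlist = list(train = df_train, test = df_test))
+        #  df_test is not created in this version, hence only a sketch.)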
+      # watchlist <- list(train = df_train, test = df_test)
+      fit_xgb <- xgboost::xgboost(
+        params = params,
+        data = df_train,
+        # watchlist = watchlist,
+        nrounds = nrounds,
+        verbose = ifelse(verbose, 1, 0),
+        early_stopping_rounds = min(nrounds, ceiling(nrounds*.25)),
+        print_every_n = 100
+      )
+      # --- #
+
+      # Predict spatially
+      if(!settings$get('inference_only')){
+        # Messenger
+        if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Starting prediction...')
+
+        # Make a prediction
+        suppressWarnings(
+          pred_xgb <- xgboost:::predict.xgb.Booster(
+            object = fit_xgb,
+            newdata = df_pred
+          )
+        )
+        if(params$objective=="binary:logitraw") pred_xgb <- ilink(pred_xgb, "cloglog")
+
+        # Fill output with summaries of the posterior
+        prediction[] <- pred_xgb
+        names(prediction) <- 'mean'
+        prediction <- raster::mask(prediction, self$get_data("template") )
+
+      } else {
+        # No prediction done
+        prediction <- NULL
+      }
+      # Record the computation end time
+      settings$set('end.time', Sys.time())
+      # Also append the boosting control options to the settings
+      for(entry in names(params)) settings$set(entry, params[entry])
+
+      # Definition of XGBOOST Model object ----
+      # Create output
+      out <- bdproto(
+        "XGBOOST-Model",
+        DistributionModel,
+        id = model$id,
+        model = model,
+        settings = settings,
+        fits = list(
+          "fit_best" = fit_xgb,
+          "prediction" = prediction
+        ),
+        # Partial effects
+        partial = function(self, x.var = NULL, constant = NULL, variable_length = 100, values = NULL, plot = TRUE, type = "response"){
+          assertthat::assert_that(is.character(x.var) || is.null(x.var))
+          if(!is.null(constant)) message("Constant is ignored for xgboost!")
+          check_package("pdp")
+          mod <- self$get_data('fit_best')
+          df <- self$model$biodiversity[[length( self$model$biodiversity )]]$predictors
+          df <- subset(df, select = mod$feature_names)
+
+          # Match x.var to argument
+          if(is.null(x.var)){
+            x.var <- colnames(df)
+          } else {
+            x.var <- match.arg(x.var, mod$feature_names, several.ok = FALSE)
+          }
+
+          # If values are set, make sure that they cover the data.frame
+          if(!is.null(values)){
+            assertthat::assert_that(length(x.var) == 1)
+            df2 <- list()
+            df2[[x.var]] <- values
+            # Then add the others
+            for(var in colnames(df)){
+              if(var == x.var) next()
+              df2[[var]] <- mean(df[[var]], na.rm = TRUE)
+            }
+            df2 <- df2 |> as.data.frame()
+            df2 <- df2[, mod$feature_names]
+          } else {
+            df2 <- df
+          }
+
+          # Check that the variables are present
+          assertthat::assert_that(all( x.var %in% colnames(df) ),
+                                  all( names(df) == mod$feature_names ),
+                                  msg = 'Variable not in predicted model.')
+
+          pp <- data.frame()
+          pb <- progress::progress_bar$new(total = length(x.var))
+          for(v in x.var){
+            p1 <- pdp::partial(mod, pred.var = v, pred.grid = df2, ice = FALSE, center = FALSE,
+                               plot = FALSE, rug = TRUE, train = df)
+            p1 <- p1[,c(v, "yhat")]
+            names(p1) <- c("partial_effect", "mean")
+            p1$variable <- v
+            pp <- rbind(pp, p1)
+            if(length(x.var) > 1) pb$tick()
+          }
+
+          if(plot){
+            # Make a plot
+            pm <- ggplot2::ggplot(data = pp, ggplot2::aes(x = partial_effect, y = mean)) +
+              ggplot2::theme_classic(base_size = 18) +
+              ggplot2::geom_line() +
+              ggplot2::labs(x = "", y = expression(hat(y))) +
+              ggplot2::facet_wrap(~variable, scales = 'free')
+            print(pm)
+          }
+          # Return the data
+          return(pp)
+        },
+        # Spatial partial dependence plot
+        spartial = function(self, x.var, constant = NULL, plot = TRUE, ...){
+          assertthat::assert_that(is.character(x.var) || is.null(x.var),
+                                  "model" %in% names(self))
+
+          # Get data
+          mod <- self$get_data('fit_best')
+          model <- self$model
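+          # Everything below holds each non-target predictor at a constant
+          # (its mean unless `constant` is given) and predicts over the full
+          # grid, so the returned layer maps the spatial response to x.var alone.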
+ x.var <- match.arg(x.var, model$predictors_names, several.ok = FALSE) + + # Get predictor + df <- subset(model$predictors, select = mod$feature_names) + # Convert all non x.vars to the mean + # Make template of target variable(s) + template <- raster::rasterFromXYZ( + raster::coordinates( model$predictors_object$get_data() ), + crs = raster::projection(model$background) + ) + + # Set all variables other the target variable to constant + if(is.null(constant)){ + # Calculate mean + # FIXME: for factor use mode! + constant <- apply(df, 2, function(x) mean(x, na.rm=T)) + for(v in mod$feature_names[ mod$feature_names %notin% x.var]){ + if(v %notin% names(df) ) next() + df[!is.na(df[v]),v] <- as.numeric( constant[v] ) + } + } else { + df[!is.na(df[,x.var]), mod$feature_names[ mod$feature_names %notin% x.var]] <- constant + } + df <- xgboost::xgb.DMatrix(data = as.matrix(df)) + + # Spartial prediction + suppressWarnings( + pp <- xgboost:::predict.xgb.Booster( + object = mod, + newdata = df + ) + ) + # Fill output with summaries of the posterior + template[] <- pp + names(template) <- 'mean' + template <- raster::mask(template, model$background) + + if(plot){ + # Quick plot + raster::plot(template, col = ibis_colours$viridis_plasma, main = paste0(x.var, collapse ='|')) + } + # Also return spatial + return(template) + }, + # Engine-specific projection function + project = function(self, newdata, layer = "mean"){ + assertthat::assert_that(!missing(newdata), + is.data.frame(newdata) || inherits(newdata, "xgb.DMatrix") ) + + mod <- self$get_data('fit_best') + # Get model object + model <- self$model + + # Also get settings for bias values + settings <- self$settings + + if(!inherits(newdata, "xgb.DMatrix")){ + assertthat::assert_that( + all( mod$feature_names %in% colnames(newdata) ) + ) + newdata <- subset(newdata, select = mod$feature_names) + + # Clamp? + if( settings$get("clamp") ) newdata <- clamp_predictions(model, newdata) + + if(!is.Waiver(settings$get('bias_variable'))){ + for(i in 1:length(settings$get('bias_variable'))){ + if(settings$get('bias_variable')[i] %notin% colnames(newdata)){ + if(getOption('ibis.setupmessages')) myLog('[Estimation]','red','Did not find bias variable in prediction object!') + next() + } + newdata[,settings$get('bias_variable')[i]] <- settings$get('bias_value')[i] + } + } + newdata <- xgboost::xgb.DMatrix(as.matrix(newdata)) + } else {stop("Not implemented. 
Supply a data.frame as newdata!")} + + # Make a prediction + suppressWarnings( + pred_xgb <- xgboost:::predict.xgb.Booster( + object = mod, + newdata = newdata + ) + ) + + # Fill output with summaries of the posterior + prediction <- emptyraster( self$model$predictors_object$get_data()[[1]] ) # Background + prediction[] <- pred_xgb + prediction <- raster::mask(prediction, self$model$predictors_object$get_data()[[1]] ) + + return(prediction) + }, + # Get coefficients + get_coefficients = function(self){ + # Returns a vector of the coefficients with direction/importance + obj <- self$get_data('fit_best') + # Simply use the weights from the importance estimates + cofs <- xgboost::xgb.importance(model = obj) |> + as.data.frame() + cofs$Sigma <- NA + if(!self$settings$get("only_linear")){ + cofs <- subset(cofs, select = c("Feature", "Gain", "Sigma")) + } + names(cofs) <- c("Feature", "Beta", "Sigma") + return(cofs) + }, + # Save the model object + save = function(self, fname, what = "fit_best"){ + assertthat::assert_that(is.character(fname)) + xgboost::xgb.save( self$get_data(what), fname) + } + ) + return(out) + } + ) + ) # End of bdproto object +} # End of function diff --git a/R/ensemble.R b/R/ensemble.R index b24a79c3..d19dd6fd 100644 --- a/R/ensemble.R +++ b/R/ensemble.R @@ -1,543 +1,543 @@ -#' @include utils-spatial.R -NULL - -#' Function to create an ensemble of multiple fitted models -#' -#' @description -#' Ensemble models calculated on multiple models have often been shown to -#' outcompete any single model in comparative assessments (Valavi et al. 2022). -#' -#' This function creates an ensemble of multiple provided distribution models -#' fitted with the [`ibis.iSDM-package`]. Each model has to have estimated predictions with a given method and -#' optional uncertainty in form of the standard deviation or similar. -#' Through the `layer` parameter it can be specified which part of the prediction -#' should be averaged in an ensemble. This can be for instance the *mean* prediction and/or -#' the standard deviation *sd*. See Details below for an overview of the different methods. -#' -#' Also returns a coefficient of variation (cv) as output of the ensemble, but note -#' this should not be interpreted as measure of model uncertainty as it cannot -#' capture parameter uncertainty of individual models; rather it reflects variation among predictions which -#' can be due to many factors including simply differences in model complexity. -#' -#' @details -#' Possible options for creating an ensemble includes: -#' * \code{'mean'} - Calculates the mean of several predictions. -#' * \code{'median'} - Calculates the median of several predictions. -#' * \code{'weighted.mean'} - Calculates a weighted mean. Weights have to be supplied separately (e.g. TSS). -#' * \code{'min.sd'} - Ensemble created by minimizing the uncertainty among predictions. -#' * \code{'threshold.frequency'} - Returns an ensemble based on threshold frequency (simple count). Requires thresholds to be computed. -#' * \code{'pca'} - Calculates a PCA between predictions of each algorithm and then extract the first axis (the one explaining the most variation). -#' -#' In addition to the different ensemble methods, a minimal threshold (\code{min.value}) can be set that needs to be surpassed for averaging. -#' By default this option is not used (Default: \code{NULL}). -#' -#' Note by default only the band in the \code{layer} parameter is composited. If supported by the model -#' other summary statistics from the posterior (e.g. 
\code{'sd'}) can be specified. -#' -#' @note -#' If a list is supplied, then it is assumed that each entry in the list is a fitted [`DistributionModel`] object. -#' Take care not to create an ensemble of models constructed with different link functions, e.g. [logistic] vs [log]. In this case -#' the \code{"normalize"} parameter has to be set. -#' @param ... Provided [`DistributionModel`] objects. -#' @param method Approach on how the ensemble is to be created. See details for available options (Default: \code{'mean'}). -#' @param weights (*Optional*) weights provided to the ensemble function if weighted means are to be constructed (Default: \code{NULL}). -#' @param min.value A [`numeric`] stating a minimum threshold value that needs to be surpassed in each layer (Default: \code{NULL}). -#' @param layer A [`character`] of the layer to be taken from each prediction (Default: \code{'mean'}). If set to \code{NULL} -#' ignore any of the layer names in ensembles of `Raster` objects. -#' @param normalize [`logical`] on whether the inputs of the ensemble should be normalized to a scale of 0-1 (Default: \code{FALSE}). -#' @param uncertainty A [`character`] indicating how the uncertainty among models should be calculated. Available options include -#' \code{"none"}, the standard deviation (\code{"sd"}), the coefficient of variation (\code{"cv"}, Default) -#' or the range between the lowest and highest value (\code{"range"}). -#' @references -#' * Valavi, R., Guillera‐Arroita, G., Lahoz‐Monfort, J. J., & Elith, J. (2022). Predictive performance of presence‐only species distribution models: a benchmark study with reproducible code. Ecological Monographs, 92(1), e01486. -#' @examples -#' \dontrun{ -#' # Assumes previously computed predictions -#' ex <- ensemble(mod1, mod2, mod3, method = "mean") -#' names(ex) -#' -#' # Make a bivariate plot (might require other packages) -#' bivplot(ex) -#' } -#' @returns A [`RasterStack`] containing the ensemble of the provided predictions specified by \code{method} and a -#' coefficient of variation across all models. - -#' @name ensemble -#' @aliases ensemble -#' @keywords train -#' @exportMethod ensemble -#' @export -NULL -methods::setGeneric("ensemble", - signature = methods::signature("..."), - function(..., method = "mean", weights = NULL, min.value = NULL, layer = "mean", - normalize = FALSE, uncertainty = "cv") standardGeneric("ensemble")) - -#' @name ensemble -#' @rdname ensemble -#' @usage \S4method{ensemble}{ANY}(...) -methods::setMethod( - "ensemble", - methods::signature("ANY"), - function(..., method = "mean", weights = NULL, min.value = NULL, layer = "mean", - normalize = FALSE, uncertainty = "cv"){ - if(length(list(...))>1) { - mc <- list(...) - } else { - # Collate provided models - if(!is.list(...)){ - mc <- list(...) - } else mc <- c(...) - } - - # Get all those that are DistributionModels - mods <- mc[ sapply(mc, function(x) inherits(x, "DistributionModel") ) ] - if(length(mods)==0) { - # Check whether scenario objects were not provided instead - mods1 <- mc[ sapply(mc, function(x) inherits(x, "BiodiversityScenario") ) ] - mods2 <- mc[ sapply(mc, function(x) is.Raster(x) ) ] - mods3 <- mc[ sapply(mc, function(x) inherits(x, "stars") ) ] - assertthat::assert_that(length(mods1)>0 || length(mods2)>0 || length(mods3)>0, - msg = "Ensemble only works with DistributionModel or BiodiversityScenario objects! 
Alternativel supply raster or stars objects.") - if(length(mods1)>0) mods <- mods1 else if(length(mods2)>0) mods <- mods2 else mods <- mods3 - } - - # Further checks - assertthat::assert_that(length(mods)>=2, # Need at least 2 otherwise this does not make sense - msg = "No use calculating an ensemble on one object only..." - ) - assertthat::assert_that( - is.character(method), - is.null(min.value) || is.numeric(min.value), - is.null(layer) || is.character(layer), - is.null(weights) || is.vector(weights), - is.logical(normalize), - is.character(uncertainty) - ) - - # Check the method - method <- match.arg(method, c('mean', 'weighted.mean', 'median', 'threshold.frequency', 'min.sd', 'pca'), several.ok = FALSE) - # Uncertainty calculation - uncertainty <- match.arg(uncertainty, c('none','sd', 'cv', 'range'), several.ok = FALSE) - - # Check that weight lengths is equal to provided distribution objects - if(!is.null(weights)) assertthat::assert_that(length(weights) == length(mods)) - # If weights vector is numeric, standardize the weights - if(is.numeric(weights)) weights <- weights / sum(weights) - - # For Distribution model ensembles - if( all( sapply(mods, function(z) inherits(z, "DistributionModel")) ) ){ - # Check that layers all have a prediction layer - assertthat::assert_that( - all( sapply(mods, function(x) !is.Waiver(x$get_data('prediction')) ) ), - msg = "All distribution models need a fitted prediction object!" - ) - # Check that layer is present in supplied mods - assertthat::assert_that( - all( sapply(mods, function(x) layer %in% names(x$get_data('prediction')) ) ), - msg = paste("Layer", text_red(layer), "not found in supplied objects!") - ) - - # Get prediction stacks from all mods - ll_ras <- sapply(mods, function(x) x$get_data('prediction')[[layer]]) - # Ensure that the layers have the same resolution, otherwise align - if(!compareRaster(ll_ras[[1]], ll_ras[[2]], stopiffalse = FALSE)){ - if(getOption('ibis.setupmessages')) myLog('[Ensemble]','red','Rasters need to be aligned. Check.') - ll_ras[[2]] <- raster::resample(ll_ras[[2]], ll_ras[[1]]) - } - # Now ensemble per layer entry - out <- raster::stack() - for(lyr in layer){ - ras <- raster::stack(sapply(ll_ras, function(x) x[[lyr]])) - - # If normalize before running an ensemble if parameter set - if(normalize) ras <- predictor_transform(ras, option = "norm") - - # Apply threshold if set. Set to 0 thus reducing the value of the ensembled layer. - if(!is.null(min.value)) ras[ras < min.value] <- 0 - - # Now create the ensemble depending on the option - if(method == 'mean'){ - new <- mean( ras, na.rm = TRUE) - } else if(method == 'median'){ - new <- raster::calc(ras, fun = median, na.rm = TRUE) - } else if(method == 'weighted.mean'){ - new <- weighted.mean( ras, w = weights, na.rm = TRUE) - } else if(method == 'threshold.frequency'){ - # Check that thresholds are available - assertthat::assert_that( - all( sapply(mods, function(x) length( grep("threshold", x$show_rasters()) )>0 ) ), - msg = "This function requires thresholds to be computed!" 
- ) - n_tr <- sapply(mods, function(x) grep("threshold", x$show_rasters(),value = TRUE) ) - # Get layer of each threshold if there are multiple - ras_tr <- raster::stack() - for(i in 1:length(n_tr)){ - o <- mods[[i]]$get_data(n_tr[i]) - # Grep layer name from the stack - ras_tr <- raster::addLayer(ras_tr, o[[grep(layer, names(o))]] ) - } - # Calculate frequency - new <- sum(ras_tr, na.rm = TRUE) - new <- raster::mask(new, ras_tr[[1]]) - } else if(method == 'min.sd'){ - # If method 'min.sd' furthermore check that there is a sd object for all of them - assertthat::assert_that( - all( sapply(mods, function(x) "sd" %in% names(x$get_data('prediction')) ) ), - msg = "Method \'min.sd\' needs parametrized uncertainty (sd) for all objects." - ) - # Also get SD prediction from models - ras_sd <- raster::stack( sapply(mods, function(x) x$get_data('prediction')[['sd']])) - # Normalize the sds for each - ras_sd <- predictor_transform(ras_sd, option = "norm") - # Get the id of the layer where standard deviation is lower - min_sd <- raster::whiches.min(ras_sd) - new <- emptyraster(ras) - for(cl in raster::unique(min_sd)){ - new[min_sd == cl] <- ras[[cl]][min_sd == cl] - } - } else if(method == 'pca'){ - # Calculate a pca on the layers and return the first axes - new <- predictor_transform(ras, option = "pca",pca.var = 1)[[1]] - } - - # Rename - names(new) <- paste0("ensemble_", lyr) - # Add attributes on the method of ensembling - attr(new, "method") <- method - if(uncertainty!='none'){ - # Add uncertainty - ras_uncertainty <- switch (uncertainty, - "sd" = raster::calc(ras, sd, na.rm = TRUE), - "cv" = raster::cv(ras, na.rm = TRUE), - "range" = max(ras, na.rm = TRUE) - min(ras, na.rm = TRUE) - ) - names(ras_uncertainty) <- paste0(uncertainty, "_", lyr) - # Add attributes on the method of ensembling - attr(ras_uncertainty, "method") <- uncertainty - - # Add all layers to out - out <- raster::stack(out, new, ras_uncertainty) - } else { - out <- raster::stack(out, new) - } - } - - assertthat::assert_that(is.Raster(out)) - - return(out) - } else if(is.Raster(mods[[1]])) { - # Check that layer is present in supplied mods - if(!is.null(layer)){ - assertthat::assert_that( - all( sapply(mods, function(x) layer %in% names(x) ) ), - msg = paste("Layer", text_red(layer), "not found in supplied objects!") - ) - } else { layer <- 1 } # Take the first one - # TODO: - if(length(layer)>1) stop("Not implemented yet") - # Get prediction stacks from all mods - ll_ras <- sapply(mods, function(x) x[[layer]]) - # Ensure that the layers have the same resolution, otherwise align - if(!compareRaster(ll_ras[[1]], ll_ras[[2]], stopiffalse = FALSE)){ - if(getOption('ibis.setupmessages')) myLog('[Ensemble]','red','Rasters need to be aligned. Check.') - ll_ras[[2]] <- raster::resample(ll_ras[[2]], ll_ras[[1]]) - } - ras <- raster::stack(ll_ras) - # If normalize before running an ensemble if parameter set - if(normalize) ras <- predictor_transform(ras, option = "norm") - - # Apply threshold if set. Set to 0 thus reducing the value of the ensembled layer. 
- if(!is.null(min.value)) ras[ras < min.value] <- 0 - - # Now ensemble per layer entry - out <- raster::stack() - for(lyr in layer){ - - # Now create the ensemble depending on the option - if(method == 'mean'){ - new <- mean( ras, na.rm = TRUE) - } else if(method == 'median'){ - new <- median( ras, na.rm = TRUE) - } else if(method == 'weighted.mean'){ - new <- weighted.mean( ras, w = weights, na.rm = TRUE) - } else if(method == 'threshold.frequency'){ - # Check that thresholds are available - stop("This function does not (yet) work with directly provided Raster objects.") - - } else if(method == 'min.sd'){ - # If method 'min.sd' furthermore check that there is a sd object for all of them - assertthat::assert_that( - all( sapply(mods, function(x) "sd" %in% names(mods) ) ), - msg = "Method \'min.sd\' needs parametrized uncertainty (sd) for all objects." - ) - # Also get SD prediction from models - ras_sd <- raster::stack( sapply(mods, function(x) x[['sd']])) - # Get the id of the layer where standard deviation is lower - min_sd <- raster::whiches.min(ras_sd) - new <- emptyraster(ras) - for(cl in raster::unique(min_sd)){ - new[min_sd == cl] <- ras[[cl]][min_sd == cl] - } - } else if(method == 'pca'){ - # Calculate a pca on the layers and return the first axes - new <- predictor_transform(ras, option = "pca", pca.var = 1)[[1]] - } - # Rename - names(new) <- paste0("ensemble_", lyr) - # Add attributes on the method of ensemble - attr(new, "method") <- method - if(uncertainty != "none"){ - # Add uncertainty - ras_uncertainty <- switch (uncertainty, - "sd" = raster::calc(ras, sd, na.rm = TRUE), - "cv" = raster::cv(ras, na.rm = TRUE), - "range" = max(ras, na.rm = TRUE) - min(ras, na.rm = TRUE) - ) - names(ras_uncertainty) <- paste0(uncertainty, "_", lyr) - # Add attributes on the method of ensembling - attr(ras_uncertainty, "method") <- uncertainty - # Add all layers to out - out <- raster::stack(out, new, ras_uncertainty) - } else { - out <- raster::stack(out, new) - } - } - - assertthat::assert_that(is.Raster(out)) - return(out) - } else { - # Scenario objects as stars or Scenario objects - if(all(sapply(mods, function(z) inherits(z, "stars")))){ - # Check that layer is in stars - if(!assertthat::see_if(all( sapply(mods, function(z) layer %in% names(z)) ))){ - if(getOption('ibis.setupmessages')) myLog('[Ensemble]','red','Provided layer not in objects. Taking first option!') - layer <- names(mods[[1]])[1] - } - # Format to table - lmat <- do.call("rbind", mods) |> as.data.frame() - # Get dimensions - lmat_dim <- stars:::st_dimensions(mods[[1]]) - - } else { - # Check that layers all have a prediction layer - assertthat::assert_that( - all( sapply(mods, function(x) !is.Waiver(x$get_data()) ) ), - msg = "All distribution models need a fitted scenario object!" 
- ) - # Check that layer is present in supplied mods - assertthat::assert_that( - all( sapply(mods, function(x) layer %in% names(x$get_data()) ) ), - msg = paste("Layer", text_red(layer), "not found in supplied objects!") - ) - # Get projected suitability from all mods - lmat <- stars::st_as_stars( - sapply(mods, function(x) x$get_data()[layer]) - ) |> as.data.frame() - # Get dimensions - lmat_dim <- stars:::st_dimensions(mods[[1]]$get_data()) - } - if(normalize){ - lmat[,4:ncol(lmat)] <- apply(lmat[,4:ncol(lmat)], # On the assumption that col 1-3 are coordinates+time - 2, function(x) { - (x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE) ) - }) - } - - # Now create the ensemble depending on the option - if(method == 'mean'){ - out <- apply(lmat[,4:ncol(lmat)], # On the assumption that col 1-3 are coordinates+time - 1, function(x) mean(x, na.rm = TRUE)) - } else if(method == 'median'){ - out <- apply(lmat[,4:ncol(lmat)], # On the assumption that col 1-3 are coordinates+time - 1, function(x) median(x, na.rm = TRUE)) - } else if(method == 'weighted.mean'){ - out <- apply(lmat[,4:ncol(lmat)], # On the assumption that col 1-3 are coordinates+time - 1, function(x) weighted.mean(x, w = weights, na.rm = TRUE)) - } else if(method == 'threshold.frequency'){ - out <- apply(lmat[,4:ncol(lmat)], # On the assumption that col 1-3 are coordinates+time - 1, function(x) sum(x, na.rm = TRUE) / (ncol(lmat)-3) ) - # Check that thresholds are available - } else if(method == 'min.sd'){ - stop("This has not been reasonably implemented in this context.") - } else if(method == 'pca'){ - stop("This has not been reasonably implemented in this context.") - } - # Add dimensions to output - out <- cbind( sf::st_coordinates(mods[[1]]$get_data()[layer]), "ensemble" = out ) |> as.data.frame() - - # Convert to stars - out <- out |> stars:::st_as_stars.data.frame(dims = c(1,2,3), coords = 1:2) - # Rename dimension names - out <- out |> stars:::st_set_dimensions(names = c("x", "y", "band")) - # Rename - names(out) <- paste0("ensemble_", layer) - # Add attributes on the method of ensemble - attr(out, "method") <- method - - # --- # - if(uncertainty != 'none'){ - # Add uncertainty - out_uncertainty <- switch (uncertainty, - "sd" = apply(lmat[,4:ncol(lmat)], 1, function(x) sd(x, na.rm = TRUE)), - "cv" = apply(lmat[,4:ncol(lmat)], 1, function(x) raster::cv(x, na.rm = TRUE)), - "range" = apply(lmat[,4:ncol(lmat)], 1, function(x) (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))) - ) - if(any(is.infinite(out_uncertainty))) out_uncertainty[is.infinite(out_uncertainty)] <- NA - # Add dimensions to output - out_uncertainty <- cbind( sf::st_coordinates(mods[[1]]$get_data()[layer]), "ensemble" = out_uncertainty ) |> as.data.frame() - - # Convert to stars - out_uncertainty <- out_uncertainty |> stars:::st_as_stars.data.frame(dims = c(1,2,3), coords = 1:2) - # Rename dimension names - out_uncertainty <- out_uncertainty |> stars:::st_set_dimensions(names = c("x", "y", "band")) - # Rename - names(out_uncertainty) <- paste0(uncertainty, "_", layer) - # Add attributes on the method of ensembling - attr(out_uncertainty, "method") <- uncertainty - # --- # - # Combine both ensemble and uncertainty - ex <- stars:::c.stars(out, out_uncertainty) - # Correct projection is unset - if(is.na(sf::st_crs(ex))) ex <- st_set_crs(ex, st_crs(mods[[1]]$get_data())) - } else { - # Only the output - ex <- out - } - # Correct projection is unset - if(is.na(sf::st_crs(ex))) ex <- st_set_crs(ex, st_crs(mods[[1]]$get_data())) - 
assertthat::assert_that(inherits(ex, "stars")) - return(ex) - } - } -) - -#' Function to create an ensemble of partial effects from multiple models -#' -#' @description Similar to the `ensemble()` function, this function creates an ensemble of -#' partial responses of provided distribution models fitted with the [`ibis.iSDM-package`]. -#' Through the `layer` parameter it can be specified which part of the partial prediction -#' should be averaged in an ensemble (if given). This can be for instance the *mean* prediction and/or -#' the standard deviation *sd*. Ensemble partial is also being called if more than one input -#' [`DistributionModel`] object is provided to `partial`. -#' -#' By default the ensemble of partial responses is created as average across all models with the -#' uncertainty being the standard deviation of responses. -#' -#' @details -#' Possible options for creating an ensemble includes: -#' * \code{'mean'} - Calculates the mean of several predictions. -#' * \code{'median'} - Calculates the median of several predictions. -#' -#' @note -#' If a list is supplied, then it is assumed that each entry in the list is a fitted [`DistributionModel`] object. -#' Take care not to create an ensemble of models constructed with different link functions, e.g. [logistic] vs [log]. -#' By default the response functions of each model are normalized. -#' @param ... Provided [`DistributionModel`] objects from which partial responses can be called. In the future provided data.frames might be supported as well. -#' @param x.var A [`character`] of the variable from which an ensemble is to be created. -#' @param method Approach on how the ensemble is to be created. See details for options (Default: \code{'mean'}). -#' @param layer A [`character`] of the layer to be taken from each prediction (Default: \code{'mean'}). If set to \code{NULL} -#' ignore any of the layer names in ensembles of `Raster` objects. -#' @param normalize [`logical`] on whether the inputs of the ensemble should be normalized to a scale of 0-1 (Default: \code{TRUE}). -#' @returns A [`RasterStack`] containing the ensemble of the provided predictions specified by \code{method} and a -#' coefficient of variation across all models. - -#' @name ensemble_partial -#' @aliases ensemble_partial -#' @keywords train -#' @exportMethod ensemble_partial -#' @export -NULL -methods::setGeneric("ensemble_partial", - signature = methods::signature("..."), - function(..., x.var, method = "mean", layer = "mean", normalize = TRUE) standardGeneric("ensemble_partial")) - -#' @name ensemble_partial -#' @rdname ensemble_partial -#' @usage \S4method{ensemble_partial}{ANY}(...) -methods::setMethod( - "ensemble_partial", - methods::signature("ANY"), - function(..., x.var, method = "mean", layer = "mean", normalize = TRUE){ - assertthat::assert_that( - is.character(x.var), - msg = "Partial ensemble requires explicit specification of the parameter x.var." - ) - if(length(list(...))>1) { - mc <- list(...) - } else { - # Collate provided models - if(!is.list(...)){ - mc <- list(...) - } else mc <- c(...) 
- } - - # Get all those that are DistributionModels - mods <- mc[ sapply(mc, function(x) inherits(x, "DistributionModel") ) ] - - if(length(mods)==1){ - # Only one object provided, just return partial results for it - obj <- mods[[1]] - return( obj$partial(x.var = x.var) ) - } - - # Further checks - assertthat::assert_that( - is.character(method), - is.null(layer) || is.character(layer), - is.logical(normalize) - ) - - # Check the method - method <- match.arg(method, c('mean', 'median'), several.ok = FALSE) - - if(getOption("ibis.setupmessages")) myLog("[Inference]","green","Creating a partial ensemble...") - - # Get variable range from the first object - # FIXME: Ideally make a consensus, otherwise assumes that same predictor been used - rr <- range(mods[[1]]$model$predictors[,x.var], na.rm = TRUE) - assertthat::assert_that(length(rr)==2, !anyNA(rr)) - rr <- seq(rr[1], rr[2], length.out = 100) - - # Now for each object get the partial values for the target variable - out <- data.frame() - for(obj in mods){ - if(length(grep(x.var, summary(obj)[[1]]))==0){ - message(paste("Layer", text_red(layer), "not found in model. Skipping!")) - next() - } - # Get partial with identical variable length - o <- partial(mod = obj, x.var = x.var, variable_length = 100, values = rr, plot = FALSE) - assertthat::assert_that(all( o$partial_effect == rr )) - # Subset to target variable - o <- o[, c("partial_effect", layer)] - # Normalize if set - if(normalize){ - if(length(unique(o[[layer]]))>1){ - o[[layer]] <- (o[[layer]] - min( o[[layer]])) / (max(o[[layer]] ) - min(o[[layer]] )) - } else { - o[[layer]] <- 0 # Assumption being the variable has been regularized out - } - } - o$cid <- 1:nrow(o) - o$id <- as.character(obj$id) - out <- rbind(out, o) - } - - # Now composite the ensemble depending on the option - if(method == 'mean'){ - new <- aggregate(out[,layer], by = list(partial_effect = out$partial_effect), - FUN = function(x = out[[layer]]) { - return(cbind( mean = mean(x), sd = sd(x))) - }) |> as.matrix() |> as.data.frame() - colnames(new) <- c("partial_effect", "mean", "sd") - } else if(method == 'median'){ - new <- aggregate(out[,layer], by = list(partial_effect = out$partial_effect), - FUN = function(x = out[[layer]]) { - return(cbind( median = median(x), mad = mad(x))) - }) |> as.matrix() |> as.data.frame() - colnames(new) <- c("partial_effect", "median", "mad") - } - return(new) - } -) +#' @include utils-spatial.R +NULL + +#' Function to create an ensemble of multiple fitted models +#' +#' @description +#' Ensemble models calculated on multiple models have often been shown to +#' outcompete any single model in comparative assessments (Valavi et al. 2022). +#' +#' This function creates an ensemble of multiple provided distribution models +#' fitted with the [`ibis.iSDM-package`]. Each model has to have estimated predictions with a given method and +#' optional uncertainty in form of the standard deviation or similar. +#' Through the `layer` parameter it can be specified which part of the prediction +#' should be averaged in an ensemble. This can be for instance the *mean* prediction and/or +#' the standard deviation *sd*. See Details below for an overview of the different methods. 
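+#'
+#' For example, \code{ensemble(mod1, mod2, layer = c("mean", "sd"))} would
+#' composite both the *mean* and *sd* bands in one call (assuming the fitted
+#' models carry an *sd* band in their prediction).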
+#'
+#' Also returns a coefficient of variation (cv) as output of the ensemble, but note that
+#' this should not be interpreted as a measure of model uncertainty as it cannot
+#' capture the parameter uncertainty of individual models; rather it reflects variation among predictions which
+#' can be due to many factors including simply differences in model complexity.
+#'
+#' @details
+#' Possible options for creating an ensemble include:
+#' * \code{'mean'} - Calculates the mean of several predictions.
+#' * \code{'median'} - Calculates the median of several predictions.
+#' * \code{'weighted.mean'} - Calculates a weighted mean. Weights have to be supplied separately (e.g. TSS).
+#' * \code{'min.sd'} - Ensemble created by minimizing the uncertainty among predictions.
+#' * \code{'threshold.frequency'} - Returns an ensemble based on threshold frequency (simple count). Requires thresholds to be computed.
+#' * \code{'pca'} - Calculates a PCA between predictions of each algorithm and then extracts the first axis (the one explaining the most variation).
+#'
+#' In addition to the different ensemble methods, a minimal threshold (\code{min.value}) can be set that needs to be surpassed for averaging.
+#' By default this option is not used (Default: \code{NULL}).
+#'
+#' Note that by default only the band in the \code{layer} parameter is composited. If supported by the model,
+#' other summary statistics from the posterior (e.g. \code{'sd'}) can be specified.
+#'
+#' @note
+#' If a list is supplied, then it is assumed that each entry in the list is a fitted [`DistributionModel`] object.
+#' Take care not to create an ensemble of models constructed with different link functions, e.g. [logistic] vs [log]. In this case
+#' the \code{"normalize"} parameter has to be set.
+#' @param ... Provided [`DistributionModel`] objects.
+#' @param method Approach on how the ensemble is to be created. See details for available options (Default: \code{'mean'}).
+#' @param weights (*Optional*) weights provided to the ensemble function if weighted means are to be constructed (Default: \code{NULL}).
+#' @param min.value A [`numeric`] stating a minimum threshold value that needs to be surpassed in each layer (Default: \code{NULL}).
+#' @param layer A [`character`] of the layer to be taken from each prediction (Default: \code{'mean'}). If set to \code{NULL},
+#' any layer names in ensembles of `Raster` objects are ignored.
+#' @param normalize [`logical`] on whether the inputs of the ensemble should be normalized to a scale of 0-1 (Default: \code{FALSE}).
+#' @param uncertainty A [`character`] indicating how the uncertainty among models should be calculated. Available options include
+#' \code{"none"}, the standard deviation (\code{"sd"}), the coefficient of variation (\code{"cv"}, Default)
+#' or the range between the lowest and highest value (\code{"range"}).
+#' @references
+#' * Valavi, R., Guillera‐Arroita, G., Lahoz‐Monfort, J. J., & Elith, J. (2022). Predictive performance of presence‐only species distribution models: a benchmark study with reproducible code. Ecological Monographs, 92(1), e01486.
+#' @examples
+#' \dontrun{
+#' # Assumes previously computed predictions
+#' ex <- ensemble(mod1, mod2, mod3, method = "mean")
+#' names(ex)
+#'
+#' # Make a bivariate plot (might require other packages)
+#' bivplot(ex)
+#' }
+#' @returns A [`RasterStack`] containing the ensemble of the provided predictions specified by \code{method} and a
+#' coefficient of variation across all models.
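+#' @seealso [ensemble_partial()] for ensembles of partial responses.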
+
+#' @name ensemble
+#' @aliases ensemble
+#' @keywords train
+#' @exportMethod ensemble
+#' @export
+NULL
+methods::setGeneric("ensemble",
+                    signature = methods::signature("..."),
+                    function(..., method = "mean", weights = NULL, min.value = NULL, layer = "mean",
+                             normalize = FALSE, uncertainty = "cv") standardGeneric("ensemble"))
+
+#' @name ensemble
+#' @rdname ensemble
+#' @usage \S4method{ensemble}{ANY}(...)
+methods::setMethod(
+  "ensemble",
+  methods::signature("ANY"),
+  function(..., method = "mean", weights = NULL, min.value = NULL, layer = "mean",
+           normalize = FALSE, uncertainty = "cv"){
+    if(length(list(...))>1) {
+      mc <- list(...)
+    } else {
+      # Collate provided models
+      if(!is.list(...)){
+        mc <- list(...)
+      } else mc <- c(...)
+    }
+
+    # Get all those that are DistributionModels
+    mods <- mc[ sapply(mc, function(x) inherits(x, "DistributionModel") ) ]
+    if(length(mods)==0) {
+      # Check whether scenario, raster or stars objects were provided instead
+      mods1 <- mc[ sapply(mc, function(x) inherits(x, "BiodiversityScenario") ) ]
+      mods2 <- mc[ sapply(mc, function(x) is.Raster(x) ) ]
+      mods3 <- mc[ sapply(mc, function(x) inherits(x, "stars") ) ]
+      assertthat::assert_that(length(mods1)>0 || length(mods2)>0 || length(mods3)>0,
+                              msg = "Ensemble only works with DistributionModel or BiodiversityScenario objects! Alternatively, supply raster or stars objects.")
+      if(length(mods1)>0) mods <- mods1 else if(length(mods2)>0) mods <- mods2 else mods <- mods3
+    }
+
+    # Further checks
+    assertthat::assert_that(length(mods)>=2, # Need at least 2, otherwise this does not make sense
+                            msg = "No use calculating an ensemble on one object only..."
+    )
+    assertthat::assert_that(
+      is.character(method),
+      is.null(min.value) || is.numeric(min.value),
+      is.null(layer) || is.character(layer),
+      is.null(weights) || is.vector(weights),
+      is.logical(normalize),
+      is.character(uncertainty)
+    )
+
+    # Check the method
+    method <- match.arg(method, c('mean', 'weighted.mean', 'median', 'threshold.frequency', 'min.sd', 'pca'), several.ok = FALSE)
+    # Uncertainty calculation
+    uncertainty <- match.arg(uncertainty, c('none','sd', 'cv', 'range'), several.ok = FALSE)
+
+    # Check that the length of the weights equals the number of provided objects
+    if(!is.null(weights)) assertthat::assert_that(length(weights) == length(mods))
+    # If the weights vector is numeric, standardize the weights
+    if(is.numeric(weights)) weights <- weights / sum(weights)
+
+    # For Distribution model ensembles
+    if( all( sapply(mods, function(z) inherits(z, "DistributionModel")) ) ){
+      # Check that all models have a fitted prediction layer
+      assertthat::assert_that(
+        all( sapply(mods, function(x) !is.Waiver(x$get_data('prediction')) ) ),
+        msg = "All distribution models need a fitted prediction object!"
+      )
+      # Check that the layer is present in the supplied mods
+      assertthat::assert_that(
+        all( sapply(mods, function(x) layer %in% names(x$get_data('prediction')) ) ),
+        msg = paste("Layer", text_red(layer), "not found in supplied objects!")
+      )
+
+      # Get prediction stacks from all mods
+      ll_ras <- sapply(mods, function(x) x$get_data('prediction')[[layer]])
+      # Ensure that the layers have the same resolution, otherwise align
+      if(!raster::compareRaster(ll_ras[[1]], ll_ras[[2]], stopiffalse = FALSE)){
+        if(getOption('ibis.setupmessages')) myLog('[Ensemble]','red','Rasters need to be aligned.
Check.') + ll_ras[[2]] <- raster::resample(ll_ras[[2]], ll_ras[[1]]) + } + # Now ensemble per layer entry + out <- raster::stack() + for(lyr in layer){ + ras <- raster::stack(sapply(ll_ras, function(x) x[[lyr]])) + + # If normalize before running an ensemble if parameter set + if(normalize) ras <- predictor_transform(ras, option = "norm") + + # Apply threshold if set. Set to 0 thus reducing the value of the ensembled layer. + if(!is.null(min.value)) ras[ras < min.value] <- 0 + + # Now create the ensemble depending on the option + if(method == 'mean'){ + new <- mean( ras, na.rm = TRUE) + } else if(method == 'median'){ + new <- raster::calc(ras, fun = median, na.rm = TRUE) + } else if(method == 'weighted.mean'){ + new <- weighted.mean( ras, w = weights, na.rm = TRUE) + } else if(method == 'threshold.frequency'){ + # Check that thresholds are available + assertthat::assert_that( + all( sapply(mods, function(x) length( grep("threshold", x$show_rasters()) )>0 ) ), + msg = "This function requires thresholds to be computed!" + ) + n_tr <- sapply(mods, function(x) grep("threshold", x$show_rasters(),value = TRUE) ) + # Get layer of each threshold if there are multiple + ras_tr <- raster::stack() + for(i in 1:length(n_tr)){ + o <- mods[[i]]$get_data(n_tr[i]) + # Grep layer name from the stack + ras_tr <- raster::addLayer(ras_tr, o[[grep(layer, names(o))]] ) + } + # Calculate frequency + new <- sum(ras_tr, na.rm = TRUE) + new <- raster::mask(new, ras_tr[[1]]) + } else if(method == 'min.sd'){ + # If method 'min.sd' furthermore check that there is a sd object for all of them + assertthat::assert_that( + all( sapply(mods, function(x) "sd" %in% names(x$get_data('prediction')) ) ), + msg = "Method \'min.sd\' needs parametrized uncertainty (sd) for all objects." + ) + # Also get SD prediction from models + ras_sd <- raster::stack( sapply(mods, function(x) x$get_data('prediction')[['sd']])) + # Normalize the sds for each + ras_sd <- predictor_transform(ras_sd, option = "norm") + # Get the id of the layer where standard deviation is lower + min_sd <- raster::whiches.min(ras_sd) + new <- emptyraster(ras) + for(cl in raster::unique(min_sd)){ + new[min_sd == cl] <- ras[[cl]][min_sd == cl] + } + } else if(method == 'pca'){ + # Calculate a pca on the layers and return the first axes + new <- predictor_transform(ras, option = "pca",pca.var = 1)[[1]] + } + + # Rename + names(new) <- paste0("ensemble_", lyr) + # Add attributes on the method of ensembling + attr(new, "method") <- method + if(uncertainty!='none'){ + # Add uncertainty + ras_uncertainty <- switch (uncertainty, + "sd" = raster::calc(ras, sd, na.rm = TRUE), + "cv" = raster::cv(ras, na.rm = TRUE), + "range" = max(ras, na.rm = TRUE) - min(ras, na.rm = TRUE) + ) + names(ras_uncertainty) <- paste0(uncertainty, "_", lyr) + # Add attributes on the method of ensembling + attr(ras_uncertainty, "method") <- uncertainty + + # Add all layers to out + out <- raster::stack(out, new, ras_uncertainty) + } else { + out <- raster::stack(out, new) + } + } + + assertthat::assert_that(is.Raster(out)) + + return(out) + } else if(is.Raster(mods[[1]])) { + # Check that layer is present in supplied mods + if(!is.null(layer)){ + assertthat::assert_that( + all( sapply(mods, function(x) layer %in% names(x) ) ), + msg = paste("Layer", text_red(layer), "not found in supplied objects!") + ) + } else { layer <- 1 } # Take the first one + # TODO: + if(length(layer)>1) stop("Not implemented yet") + # Get prediction stacks from all mods + ll_ras <- sapply(mods, function(x) x[[layer]]) + # 
Ensure that the layers have the same resolution, otherwise align + if(!compareRaster(ll_ras[[1]], ll_ras[[2]], stopiffalse = FALSE)){ + if(getOption('ibis.setupmessages')) myLog('[Ensemble]','red','Rasters need to be aligned. Check.') + ll_ras[[2]] <- raster::resample(ll_ras[[2]], ll_ras[[1]]) + } + ras <- raster::stack(ll_ras) + # If normalize before running an ensemble if parameter set + if(normalize) ras <- predictor_transform(ras, option = "norm") + + # Apply threshold if set. Set to 0 thus reducing the value of the ensembled layer. + if(!is.null(min.value)) ras[ras < min.value] <- 0 + + # Now ensemble per layer entry + out <- raster::stack() + for(lyr in layer){ + + # Now create the ensemble depending on the option + if(method == 'mean'){ + new <- mean( ras, na.rm = TRUE) + } else if(method == 'median'){ + new <- median( ras, na.rm = TRUE) + } else if(method == 'weighted.mean'){ + new <- weighted.mean( ras, w = weights, na.rm = TRUE) + } else if(method == 'threshold.frequency'){ + # Check that thresholds are available + stop("This function does not (yet) work with directly provided Raster objects.") + + } else if(method == 'min.sd'){ + # If method 'min.sd' furthermore check that there is a sd object for all of them + assertthat::assert_that( + all( sapply(mods, function(x) "sd" %in% names(mods) ) ), + msg = "Method \'min.sd\' needs parametrized uncertainty (sd) for all objects." + ) + # Also get SD prediction from models + ras_sd <- raster::stack( sapply(mods, function(x) x[['sd']])) + # Get the id of the layer where standard deviation is lower + min_sd <- raster::whiches.min(ras_sd) + new <- emptyraster(ras) + for(cl in raster::unique(min_sd)){ + new[min_sd == cl] <- ras[[cl]][min_sd == cl] + } + } else if(method == 'pca'){ + # Calculate a pca on the layers and return the first axes + new <- predictor_transform(ras, option = "pca", pca.var = 1)[[1]] + } + # Rename + names(new) <- paste0("ensemble_", lyr) + # Add attributes on the method of ensemble + attr(new, "method") <- method + if(uncertainty != "none"){ + # Add uncertainty + ras_uncertainty <- switch (uncertainty, + "sd" = raster::calc(ras, sd, na.rm = TRUE), + "cv" = raster::cv(ras, na.rm = TRUE), + "range" = max(ras, na.rm = TRUE) - min(ras, na.rm = TRUE) + ) + names(ras_uncertainty) <- paste0(uncertainty, "_", lyr) + # Add attributes on the method of ensembling + attr(ras_uncertainty, "method") <- uncertainty + # Add all layers to out + out <- raster::stack(out, new, ras_uncertainty) + } else { + out <- raster::stack(out, new) + } + } + + assertthat::assert_that(is.Raster(out)) + return(out) + } else { + # Scenario objects as stars or Scenario objects + if(all(sapply(mods, function(z) inherits(z, "stars")))){ + # Check that layer is in stars + if(!assertthat::see_if(all( sapply(mods, function(z) layer %in% names(z)) ))){ + if(getOption('ibis.setupmessages')) myLog('[Ensemble]','red','Provided layer not in objects. Taking first option!') + layer <- names(mods[[1]])[1] + } + # Format to table + lmat <- do.call("rbind", mods) |> as.data.frame() + # Get dimensions + lmat_dim <- stars::st_dimensions(mods[[1]]) + + } else { + # Check that layers all have a prediction layer + assertthat::assert_that( + all( sapply(mods, function(x) !is.Waiver(x$get_data()) ) ), + msg = "All distribution models need a fitted scenario object!" 
+ ) + # Check that layer is present in supplied mods + assertthat::assert_that( + all( sapply(mods, function(x) layer %in% names(x$get_data()) ) ), + msg = paste("Layer", text_red(layer), "not found in supplied objects!") + ) + # Get projected suitability from all mods + lmat <- stars::st_as_stars( + sapply(mods, function(x) x$get_data()[layer]) + ) |> as.data.frame() + # Get dimensions + lmat_dim <- stars::st_dimensions(mods[[1]]$get_data()) + } + if(normalize){ + lmat[,4:ncol(lmat)] <- apply(lmat[,4:ncol(lmat)], # On the assumption that col 1-3 are coordinates+time + 2, function(x) { + (x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE) ) + }) + } + + # Now create the ensemble depending on the option + if(method == 'mean'){ + out <- apply(lmat[,4:ncol(lmat)], # On the assumption that col 1-3 are coordinates+time + 1, function(x) mean(x, na.rm = TRUE)) + } else if(method == 'median'){ + out <- apply(lmat[,4:ncol(lmat)], # On the assumption that col 1-3 are coordinates+time + 1, function(x) stats::median(x, na.rm = TRUE)) + } else if(method == 'weighted.mean'){ + out <- apply(lmat[,4:ncol(lmat)], # On the assumption that col 1-3 are coordinates+time + 1, function(x) weighted.mean(x, w = weights, na.rm = TRUE)) + } else if(method == 'threshold.frequency'){ + out <- apply(lmat[,4:ncol(lmat)], # On the assumption that col 1-3 are coordinates+time + 1, function(x) sum(x, na.rm = TRUE) / (ncol(lmat)-3) ) + # Check that thresholds are available + } else if(method == 'min.sd'){ + stop("This has not been reasonably implemented in this context.") + } else if(method == 'pca'){ + stop("This has not been reasonably implemented in this context.") + } + # Add dimensions to output + out <- cbind( sf::st_coordinates(mods[[1]]$get_data()[layer]), "ensemble" = out ) |> as.data.frame() + + # Convert to stars + out <- out |> stars:::st_as_stars.data.frame(dims = c(1,2,3), coords = 1:2) + # Rename dimension names + out <- out |> stars::st_set_dimensions(names = c("x", "y", "band")) + # Rename + names(out) <- paste0("ensemble_", layer) + # Add attributes on the method of ensemble + attr(out, "method") <- method + + # --- # + if(uncertainty != 'none'){ + # Add uncertainty + out_uncertainty <- switch (uncertainty, + "sd" = apply(lmat[,4:ncol(lmat)], 1, function(x) sd(x, na.rm = TRUE)), + "cv" = apply(lmat[,4:ncol(lmat)], 1, function(x) raster::cv(x, na.rm = TRUE)), + "range" = apply(lmat[,4:ncol(lmat)], 1, function(x) (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))) + ) + if(any(is.infinite(out_uncertainty))) out_uncertainty[is.infinite(out_uncertainty)] <- NA + # Add dimensions to output + out_uncertainty <- cbind( sf::st_coordinates(mods[[1]]$get_data()[layer]), "ensemble" = out_uncertainty ) |> as.data.frame() + + # Convert to stars + out_uncertainty <- out_uncertainty |> stars:::st_as_stars.data.frame(dims = c(1,2,3), coords = 1:2) + # Rename dimension names + out_uncertainty <- out_uncertainty |> stars::st_set_dimensions(names = c("x", "y", "band")) + # Rename + names(out_uncertainty) <- paste0(uncertainty, "_", layer) + # Add attributes on the method of ensembling + attr(out_uncertainty, "method") <- uncertainty + # --- # + # Combine both ensemble and uncertainty + ex <- stars:::c.stars(out, out_uncertainty) + # Correct projection is unset + if(is.na(sf::st_crs(ex))) ex <- st_set_crs(ex, st_crs(mods[[1]]$get_data())) + } else { + # Only the output + ex <- out + } + # Correct projection is unset + if(is.na(sf::st_crs(ex))) ex <- st_set_crs(ex, st_crs(mods[[1]]$get_data())) + 
assertthat::assert_that(inherits(ex, "stars"))
+      return(ex)
+    }
+  }
+)
+
+#' Function to create an ensemble of partial effects from multiple models
+#'
+#' @description Similar to the `ensemble()` function, this function creates an ensemble of
+#' partial responses of provided distribution models fitted with the [`ibis.iSDM-package`].
+#' Through the `layer` parameter it can be specified which part of the partial prediction
+#' should be averaged in an ensemble (if given). This can be for instance the *mean* prediction and/or
+#' the standard deviation *sd*. This function is also called when more than one input
+#' [`DistributionModel`] object is provided to `partial`.
+#'
+#' By default the ensemble of partial responses is created as the average across all models, with the
+#' uncertainty being the standard deviation of the responses.
+#'
+#' @details
+#' Possible options for creating an ensemble include:
+#' * \code{'mean'} - Calculates the mean of several predictions.
+#' * \code{'median'} - Calculates the median of several predictions.
+#'
+#' @note
+#' If a list is supplied, then it is assumed that each entry in the list is a fitted [`DistributionModel`] object.
+#' Take care not to create an ensemble of models constructed with different link functions, e.g. [logistic] vs [log].
+#' By default the response functions of each model are normalized.
+#' @param ... Provided [`DistributionModel`] objects from which partial responses can be called. In the future provided data.frames might be supported as well.
+#' @param x.var A [`character`] of the variable from which an ensemble is to be created.
+#' @param method Approach on how the ensemble is to be created. See details for options (Default: \code{'mean'}).
+#' @param layer A [`character`] of the layer to be taken from each prediction (Default: \code{'mean'}). If set to \code{NULL},
+#' any layer names in ensembles of `Raster` objects are ignored.
+#' @param normalize [`logical`] on whether the inputs of the ensemble should be normalized to a scale of 0-1 (Default: \code{TRUE}).
+#' @returns A [`data.frame`] with the ensemble of partial responses summarised by \code{method},
+#' together with a measure of their spread (the standard deviation for \code{'mean'} and the
+#' median absolute deviation for \code{'median'}).
+
+#' @name ensemble_partial
+#' @aliases ensemble_partial
+#' @keywords train
+#' @exportMethod ensemble_partial
+#' @export
+NULL
+methods::setGeneric("ensemble_partial",
+                    signature = methods::signature("..."),
+                    function(..., x.var, method = "mean", layer = "mean", normalize = TRUE) standardGeneric("ensemble_partial"))
+
+#' @name ensemble_partial
+#' @rdname ensemble_partial
+#' @usage \S4method{ensemble_partial}{ANY}(...)
+methods::setMethod(
+  "ensemble_partial",
+  methods::signature("ANY"),
+  function(..., x.var, method = "mean", layer = "mean", normalize = TRUE){
+    assertthat::assert_that(
+      is.character(x.var),
+      msg = "Partial ensemble requires explicit specification of the parameter x.var."
+    )
+    if(length(list(...))>1) {
+      mc <- list(...)
+    } else {
+      # Collate provided models
+      if(!is.list(...)){
+        mc <- list(...)
+      } else mc <- c(...)
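+      # `mc` now holds every object passed via `...`, whether the models were
+      # supplied individually or wrapped in a single list.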
+ } + + # Get all those that are DistributionModels + mods <- mc[ sapply(mc, function(x) inherits(x, "DistributionModel") ) ] + + if(length(mods)==1){ + # Only one object provided, just return partial results for it + obj <- mods[[1]] + return( obj$partial(x.var = x.var) ) + } + + # Further checks + assertthat::assert_that( + is.character(method), + is.null(layer) || is.character(layer), + is.logical(normalize) + ) + + # Check the method + method <- match.arg(method, c('mean', 'median'), several.ok = FALSE) + + if(getOption("ibis.setupmessages")) myLog("[Inference]","green","Creating a partial ensemble...") + + # Get variable range from the first object + # FIXME: Ideally make a consensus, otherwise assumes that same predictor been used + rr <- range(mods[[1]]$model$predictors[,x.var], na.rm = TRUE) + assertthat::assert_that(length(rr)==2, !anyNA(rr)) + rr <- seq(rr[1], rr[2], length.out = 100) + + # Now for each object get the partial values for the target variable + out <- data.frame() + for(obj in mods){ + if(length(grep(x.var, summary(obj)[[1]]))==0){ + message(paste("Layer", text_red(layer), "not found in model. Skipping!")) + next() + } + # Get partial with identical variable length + o <- partial(mod = obj, x.var = x.var, variable_length = 100, values = rr, plot = FALSE) + assertthat::assert_that(all( o$partial_effect == rr )) + # Subset to target variable + o <- o[, c("partial_effect", layer)] + # Normalize if set + if(normalize){ + if(length(unique(o[[layer]]))>1){ + o[[layer]] <- (o[[layer]] - min( o[[layer]])) / (max(o[[layer]] ) - min(o[[layer]] )) + } else { + o[[layer]] <- 0 # Assumption being the variable has been regularized out + } + } + o$cid <- 1:nrow(o) + o$id <- as.character(obj$id) + out <- rbind(out, o) + } + + # Now composite the ensemble depending on the option + if(method == 'mean'){ + new <- aggregate(out[,layer], by = list(partial_effect = out$partial_effect), + FUN = function(x = out[[layer]]) { + return(cbind( mean = mean(x), sd = sd(x))) + }) |> as.matrix() |> as.data.frame() + colnames(new) <- c("partial_effect", "mean", "sd") + } else if(method == 'median'){ + new <- aggregate(out[,layer], by = list(partial_effect = out$partial_effect), + FUN = function(x = out[[layer]]) { + return(cbind( median = stats::median(x), mad = mad(x))) + }) |> as.matrix() |> as.data.frame() + colnames(new) <- c("partial_effect", "median", "mad") + } + return(new) + } +) diff --git a/R/ibis.iSDM-package.R b/R/ibis.iSDM-package.R new file mode 100644 index 00000000..037efeb5 --- /dev/null +++ b/R/ibis.iSDM-package.R @@ -0,0 +1,29 @@ +#' @keywords internal +"_PACKAGE" + +#' @importFrom foreach %do% %dopar% +#' @importFrom stats effects + +## usethis namespace: start +## usethis namespace: end +NULL + +globalVariables(c("background", "band", "bi_class", "bias", + "change", "cid", + "data", + "form2", + "id", "included", + "km", + "limit", "lower", + "median", "model", + "name", + "observed", "oversampled", + "partial_effect", "polpo", "predictor", "predictors", + "q05", "q50", "q95", + "ras", "region.poly", + "s", "state", "suitability", + "tothin", "type", "time", + "upper", + "var1", "var2", "value", "variable", + "x", "y", "z", + ".")) diff --git a/R/misc.R b/R/misc.R index 12c05c2a..add494f8 100644 --- a/R/misc.R +++ b/R/misc.R @@ -1,200 +1,170 @@ -#' @include utils.R -NULL - -#' Pipe operator -#' -#' This package uses the pipe operator (`\%>\%`) to express nested code -#' as a series of imperative procedures. -#' -#' @param lhs, rhs An object and a function. 
-#' @seealso [magrittr::%>%()], [tee()]. -#' @return An object. -#' @keywords internal -#' @examples -#' # set seed for reproducibility -#' set.seed(500) -#' -#' # generate 100 random numbers and calculate the mean -#' mean(runif(100)) -#' -#' # reset the seed -#' set.seed(500) -#' -#' # repeat the previous procedure but use the pipe operator instead of nesting -#' # function calls inside each other. -#' runif(100) %>% mean() -#' -#' @name %>% -#' @rdname pipe -#' @aliases pipe -#' @importFrom magrittr %>% -#' @export -NULL - -#' Central colour repository -#' @description This command contains all the colours -#' specified for use in \pkg{ibis.iSDM}. -#' @name ibis_colours -#' @examples -#' ibis_colours[['viridis_plasma']] -#' @keywords internal -#' @noRd -ibis_colours <- list( - sdm_colour = colorRampPalette(c('grey90','steelblue4','steelblue1','gold','red1','red4'))(100), - prob_colour = colorRampPalette(c("grey90","springgreen4","cornflowerblue","dodgerblue4","yellow","orange","mediumvioletred","red"))(100), - ohsu_palette = colorRampPalette(c("white","#fbcc3f", "#56ab6c", "#5e9dcc", "#575d5f"))(100), - divg_bluegreen = c("#2C194C","#284577","#4B76A0","#8CA7C3","#D0DCE6","#D4E6D6","#98C39B","#5C9F61","#3E7229","#434C01"), - divg_bluered = c("#4E193D","#44234E","#3B3264","#34487B","#376091","#4B7BA5","#6996B6","#8DADC3","#B1BEC7","#CCC1BE","#D8B7A7", - "#D8A589","#CE8C6A","#BF724C","#A95432","#8E3821","#77231D","#661723","#5A152D","#50193B"), - viridis_orig = c("#440154FF","#482878FF","#3E4A89FF","#31688EFF","#26828EFF","#1F9E89FF","#35B779FF","#6DCD59FF","#B4DE2CFF","#FDE725FF"), - viridis_cividis = c("#00204DFF","#00336FFF","#39486BFF","#575C6DFF","#707173FF","#8A8779FF","#A69D75FF","#C4B56CFF","#E4CF5BFF","#FFEA46FF"), - viridis_plasma = c("#0D0887FF","#47039FFF","#7301A8FF","#9C179EFF","#BD3786FF","#D8576BFF","#ED7953FF","#FA9E3BFF","#FDC926FF","#F0F921FF"), - distinct_random = c("#000000", "#FFFF00", "#1CE6FF", "#FF34FF", "#FF4A46", "#008941", "#006FA6", "#A30059", - "#FFDBE5", "#7A4900", "#0000A6", "#63FFAC", "#B79762", "#004D43", "#8FB0FF", "#997D87", - "#5A0007", "#809693", "#FEFFE6", "#1B4400", "#4FC601", "#3B5DFF", "#4A3B53", "#FF2F80", - "#61615A", "#BA0900", "#6B7900", "#00C2A0", "#FFAA92", "#FF90C9", "#B903AA", "#D16100", - "#DDEFFF", "#000035", "#7B4F4B", "#A1C299", "#300018", "#0AA6D8", "#013349", "#00846F", - "#372101", "#FFB500", "#C2FFED", "#A079BF", "#CC0744", "#C0B9B2", "#C2FF99", "#001E09", - "#00489C", "#6F0062", "#0CBD66", "#EEC3FF", "#456D75", "#B77B68", "#7A87A1", "#788D66", - "#885578", "#FAD09F", "#FF8A9A", "#D157A0", "#BEC459", "#456648", "#0086ED", "#886F4C", - "#34362D", "#B4A8BD", "#00A6AA", "#452C2C", "#636375", "#A3C8C9", "#FF913F", "#938A81", - "#575329", "#00FECF", "#B05B6F", "#8CD0FF", "#3B9700", "#04F757", "#C8A1A1", "#1E6E00", - "#7900D7", "#A77500", "#6367A9", "#A05837", "#6B002C", "#772600", "#D790FF", "#9B9700", - "#549E79", "#FFF69F", "#201625", "#72418F", "#BC23FF", "#99ADC0", "#3A2465", "#922329", - "#5B4534", "#FDE8DC", "#404E55", "#0089A3", "#CB7E98", "#A4E804", "#324E72", "#6A3A4C") -) - -#' Print ibis options -#' -#' @description There are a number of hidden options that can be specified for ibis.iSDM. 
-#' Currently supported are: -#' * \code{'ibis.runparallel'} : [`logical`] value on whether processing should be run in parallel -#' * \code{'ibis.nthread'} : [`numeric`] value on how many cores should be used by default -#' * \code{'ibis.setupmessages'} : [`logical`] value indicating whether message during object creation should be shown -#' * \code{'ibis.engines'} : Returns a [`vector`] with all valid engines. -#' * \code{'ibis.use_future'} : [`logical`] on whether the \pkg{future} package should be used for parallel computing. -#' @return The output of \code{getOptions} for all ibis related variables -#' @keywords misc -#' @examples -#' \dontrun{ -#' ibis_options() -#' } -#' @export -ibis_options <- function(){ - what <- grep('ibis',names(options()),value = TRUE) - items <- options()[what] - items -} - -#' Install ibis dependencies -#' -#' @description -#' Some of the dependencies (R-Packages) that ibis.iSDM relies on are by intention -#' not added to the Description of the file to keep the number of mandatory dependencies small -#' and enable the package to run even on systems that might not have all libraries pre-installed. -#' -#' This function provides a convenience wrapper to install those missing dependencies as needed. It -#' furthermore checks which packages require updating and updates them as needed. -#' @note -#' INLA is handled in a special way as it is not available via cran. -#' @param deps A [`vector`] with the names of the packages to be installed (Default: \code{"ibis.dependencies"} in [`ibis_options`]). -#' @param update A [`logical`] flag of whether all (installed) packages should also be checked for updates (Default: \code{TRUE}). -#' @returns Nothing. Packages will be installed. -#' @examples -#' \dontrun{ -#' # Install and update all dependencies -#' ibis_dependencies() -#' } -#' @keywords misc -#' @export -ibis_dependencies <- function(deps = getOption("ibis.dependencies"), update = TRUE){ - assertthat::assert_that( - is.vector(deps), - length(deps) >= 1, - is.logical(update) - ) - # First check which packages are not installed and then do so. - new.packages <- deps[!(deps %in% utils::installed.packages()[, "Package"])] - if(length(new.packages)>0){ - if("INLA" %in% new.packages){ - suppressMessages( - install.packages("INLA", repos=c(getOption("repos"), INLA="https://inla.r-inla-download.org/R/stable"), - dependencies = TRUE, quiet = TRUE) - ) - } - suppressMessages( - install.packages(new.packages, dependencies = TRUE, quiet = TRUE) - ) - } - - # Update packages if set - if(update){ - if("INLA" %in% deps){ - # For windows - if(length(grep("Windows", osVersion, ignore.case = TRUE)) && !("INLA" %in% utils::installed.packages()[, "Package"])){ - # On windows we remove INLA and reinstall - install.packages("INLA", repos="https://inla.r-inla-download.org/R/stable") - } else { - require("INLA") - suppressPackageStartupMessages( - inla.upgrade(ask = FALSE) - ) - } - } - # Update all the package excluding INLA - suppressMessages( - utils::update.packages(deps, ask = FALSE) - ) - } - invisible() -} - -#' Options to set up ibis for parallel processing with future -#' -#' @param cores A [`numeric`] number stating the number of cores to use. -#' @param strategy A [`character`] denoting the strategy to be used for future. See help of [`future`] for options. -#' (Default: \code{"multisession"}). 
-#' @seealso [future] -#' @return None -#' @examples -#' \dontrun{ -#' # Starts future job -#' ibis_future(cores = 4) -#' } -#' @keywords misc -#' @export -ibis_future <- function(cores = getOption("ibis.nthread"), strategy = getOption("ibis.futurestrategy")) { - assertthat::assert_that( - is.numeric(cores), - is.character(strategy) - ) - check_package("future") - # Check that number of cores don't exceed what is possible - assertthat::assert_that(cores <= future::availableCores()) - - strategy <- match.arg(strategy, c("sequential", "multisession", "multicore", "cluster", "remote"), - several.ok = FALSE) - - if(isTRUE(Sys.info()[["sysname"]] == "Windows")){ - if(strategy == "multicore") stop("Multicore is not supported on windows!") - } - - # Define plan based on formulated strategy - if(strategy == "remote"){ - #TODO: See if a testing environment could be found. - stop("TBD. Requires specific setup.") - #e.g. cl <- makeCluster(4, type = "MPI") - } else if(strategy == "sequential") { - future::plan(strategy = future::sequential()) - } else if(strategy == "multisession"){ - future::plan(strategy = future::multisession(workers = cores) ) - } else if(strategy == "multicore"){ - future::plan(strategy = future::multicore(workers = cores) ) - } else if(strategy == "cluster"){ - future::plan(strategy = future::cluster(workers = cores) ) - } - # Register the doFuture adapate - doFuture::registerDoFuture() - invisible() -} - +#' @include utils.R +NULL + +#' Central colour repository +#' @description This command contains all the colours +#' specified for use in \pkg{ibis.iSDM}. +#' @name ibis_colours +#' @examples +#' ibis_colours[['viridis_plasma']] +#' @keywords internal +#' @noRd +ibis_colours <- list( + sdm_colour = colorRampPalette(c('grey90','steelblue4','steelblue1','gold','red1','red4'))(100), + prob_colour = colorRampPalette(c("grey90","springgreen4","cornflowerblue","dodgerblue4","yellow","orange","mediumvioletred","red"))(100), + ohsu_palette = colorRampPalette(c("white","#fbcc3f", "#56ab6c", "#5e9dcc", "#575d5f"))(100), + divg_bluegreen = c("#2C194C","#284577","#4B76A0","#8CA7C3","#D0DCE6","#D4E6D6","#98C39B","#5C9F61","#3E7229","#434C01"), + divg_bluered = c("#4E193D","#44234E","#3B3264","#34487B","#376091","#4B7BA5","#6996B6","#8DADC3","#B1BEC7","#CCC1BE","#D8B7A7", + "#D8A589","#CE8C6A","#BF724C","#A95432","#8E3821","#77231D","#661723","#5A152D","#50193B"), + viridis_orig = c("#440154FF","#482878FF","#3E4A89FF","#31688EFF","#26828EFF","#1F9E89FF","#35B779FF","#6DCD59FF","#B4DE2CFF","#FDE725FF"), + viridis_cividis = c("#00204DFF","#00336FFF","#39486BFF","#575C6DFF","#707173FF","#8A8779FF","#A69D75FF","#C4B56CFF","#E4CF5BFF","#FFEA46FF"), + viridis_plasma = c("#0D0887FF","#47039FFF","#7301A8FF","#9C179EFF","#BD3786FF","#D8576BFF","#ED7953FF","#FA9E3BFF","#FDC926FF","#F0F921FF"), + distinct_random = c("#000000", "#FFFF00", "#1CE6FF", "#FF34FF", "#FF4A46", "#008941", "#006FA6", "#A30059", + "#FFDBE5", "#7A4900", "#0000A6", "#63FFAC", "#B79762", "#004D43", "#8FB0FF", "#997D87", + "#5A0007", "#809693", "#FEFFE6", "#1B4400", "#4FC601", "#3B5DFF", "#4A3B53", "#FF2F80", + "#61615A", "#BA0900", "#6B7900", "#00C2A0", "#FFAA92", "#FF90C9", "#B903AA", "#D16100", + "#DDEFFF", "#000035", "#7B4F4B", "#A1C299", "#300018", "#0AA6D8", "#013349", "#00846F", + "#372101", "#FFB500", "#C2FFED", "#A079BF", "#CC0744", "#C0B9B2", "#C2FF99", "#001E09", + "#00489C", "#6F0062", "#0CBD66", "#EEC3FF", "#456D75", "#B77B68", "#7A87A1", "#788D66", + "#885578", "#FAD09F", "#FF8A9A", "#D157A0", "#BEC459", "#456648", 
"#0086ED", "#886F4C", + "#34362D", "#B4A8BD", "#00A6AA", "#452C2C", "#636375", "#A3C8C9", "#FF913F", "#938A81", + "#575329", "#00FECF", "#B05B6F", "#8CD0FF", "#3B9700", "#04F757", "#C8A1A1", "#1E6E00", + "#7900D7", "#A77500", "#6367A9", "#A05837", "#6B002C", "#772600", "#D790FF", "#9B9700", + "#549E79", "#FFF69F", "#201625", "#72418F", "#BC23FF", "#99ADC0", "#3A2465", "#922329", + "#5B4534", "#FDE8DC", "#404E55", "#0089A3", "#CB7E98", "#A4E804", "#324E72", "#6A3A4C") +) + +#' Print ibis options +#' +#' @description There are a number of hidden options that can be specified for ibis.iSDM. +#' Currently supported are: +#' * \code{'ibis.runparallel'} : [`logical`] value on whether processing should be run in parallel +#' * \code{'ibis.nthread'} : [`numeric`] value on how many cores should be used by default +#' * \code{'ibis.setupmessages'} : [`logical`] value indicating whether message during object creation should be shown +#' * \code{'ibis.engines'} : Returns a [`vector`] with all valid engines. +#' * \code{'ibis.use_future'} : [`logical`] on whether the \pkg{future} package should be used for parallel computing. +#' @return The output of \code{getOptions} for all ibis related variables +#' @keywords misc +#' @examples +#' \dontrun{ +#' ibis_options() +#' } +#' @export +ibis_options <- function(){ + what <- grep('ibis',names(options()),value = TRUE) + items <- options()[what] + items +} + +#' Install ibis dependencies +#' +#' @description +#' Some of the dependencies (R-Packages) that ibis.iSDM relies on are by intention +#' not added to the Description of the file to keep the number of mandatory dependencies small +#' and enable the package to run even on systems that might not have all libraries pre-installed. +#' +#' This function provides a convenience wrapper to install those missing dependencies as needed. It +#' furthermore checks which packages require updating and updates them as needed. +#' @note +#' INLA is handled in a special way as it is not available via cran. +#' @param deps A [`vector`] with the names of the packages to be installed (Default: \code{"ibis.dependencies"} in [`ibis_options`]). +#' @param update A [`logical`] flag of whether all (installed) packages should also be checked for updates (Default: \code{TRUE}). +#' @returns Nothing. Packages will be installed. +#' @examples +#' \dontrun{ +#' # Install and update all dependencies +#' ibis_dependencies() +#' } +#' @keywords misc +#' @export +ibis_dependencies <- function(deps = getOption("ibis.dependencies"), update = TRUE){ + assertthat::assert_that( + is.vector(deps), + length(deps) >= 1, + is.logical(update) + ) + # First check which packages are not installed and then do so. 
+  new.packages <- deps[!(deps %in% utils::installed.packages()[, "Package"])]
+  if(length(new.packages)>0){
+    if("INLA" %in% new.packages){
+      suppressMessages(
+        utils::install.packages("INLA", repos=c(getOption("repos"), INLA="https://inla.r-inla-download.org/R/stable"),
+                                dependencies = TRUE, quiet = TRUE)
+      )
+    }
+    suppressMessages(
+      utils::install.packages(new.packages, dependencies = TRUE, quiet = TRUE,
+                              repos = "https://cloud.r-project.org")
+    )
+  }
+
+  # Update packages if set
+  if(update){
+    if("INLA" %in% deps){
+      # For windows
+      if(length(grep("Windows", utils::osVersion, ignore.case = TRUE)) && !("INLA" %in% utils::installed.packages()[, "Package"])){
+        # On windows we remove INLA and reinstall
+        utils::install.packages("INLA", repos="https://inla.r-inla-download.org/R/stable")
+      } else if(requireNamespace("INLA", quietly = TRUE)) {
+        suppressPackageStartupMessages(
+          INLA::inla.upgrade(ask = FALSE)
+        )
+      }
+    }
+    # Update all the packages excluding INLA
+    suppressMessages(
+      utils::update.packages(deps, ask = FALSE, repos = "https://cloud.r-project.org")
+    )
+  }
+  invisible()
+}
+
+#' Options to set up ibis for parallel processing with future
+#'
+#' @param cores A [`numeric`] number stating the number of cores to use.
+#' @param strategy A [`character`] denoting the strategy to be used for future. See help of [`future`] for options.
+#' (Default: \code{"multisession"}).
+#' @seealso [future]
+#' @return None
+#' @examples
+#' \dontrun{
+#' # Starts future job
+#' ibis_future(cores = 4)
+#' }
+#' @keywords misc
+#' @export
+ibis_future <- function(cores = getOption("ibis.nthread"), strategy = getOption("ibis.futurestrategy")) {
+  assertthat::assert_that(
+    is.numeric(cores),
+    is.character(strategy)
+  )
+  check_package("future")
+  # Check that the number of cores doesn't exceed what is possible
+  assertthat::assert_that(cores <= future::availableCores())
+
+  strategy <- match.arg(strategy, c("sequential", "multisession", "multicore", "cluster", "remote"),
+                        several.ok = FALSE)
+
+  if(isTRUE(Sys.info()[["sysname"]] == "Windows")){
+    if(strategy == "multicore") stop("Multicore is not supported on windows!")
+  }
+
+  # Define plan based on formulated strategy
+  if(strategy == "remote"){
+    #TODO: See if a testing environment could be found.
+    stop("TBD. Requires specific setup.")
+    #e.g. cl <- makeCluster(4, type = "MPI")
+  } else if(strategy == "sequential") {
+    future::plan(strategy = future::sequential())
+  } else if(strategy == "multisession"){
+    future::plan(strategy = future::multisession(workers = cores) )
+  } else if(strategy == "multicore"){
+    future::plan(strategy = future::multicore(workers = cores) )
+  } else if(strategy == "cluster"){
+    future::plan(strategy = future::cluster(workers = cores) )
+  }
+  # Register the doFuture adapter
+  doFuture::registerDoFuture()
+  invisible()
+}
+
diff --git a/R/plot.R b/R/plot.R
index 96afd568..bdad9f4c 100644
--- a/R/plot.R
+++ b/R/plot.R
@@ -1,209 +1,209 @@
-#' @include utils.R
-NULL
-
-#' Plot wrappers
-#'
-#' @description
-#' Plots information from a given object where a plotting object is available.
-#'
-#' @param x Any object belonging to [DistributionModel], [BiodiversityDatasetCollection], [PredictorDataset] or [BiodiversityScenario].
-#' @param what In case a [RasterLayer] is supplied, this parameter specifies the layer to be shown (Default: \code{"mean"}).
-#' @param ... Further arguments passed on to \code{x$plot}.
-#'
-#' @details
-#' The plotted outputs vary depending on what object is being plotted.
-#' For example for a fitted [DistributionModel] the output is usually the fitted spatial prediction (Default: \code{'mean'}). -#' @examples -#' \dontrun{ -#' # Build and train a model -#' mod <- distribution(background) |> -#' add_biodiversity_poipo(species) |> -#' add_predictors(predictors) |> -#' engine_glmnet() |> -#' train() -#' # Plot the resulting model -#' plot(mod) -#' } -#' @return Graphical output -#' @keywords misc -#' @name plot -NULL - -#' @rdname plot -#' @method plot DistributionModel -#' @keywords misc -#' @export -plot.DistributionModel <- function(x, ...) x$plot(...) - -#' @rdname plot -#' @method plot BiodiversityDatasetCollection -#' @keywords misc -#' @export -plot.BiodiversityDatasetCollection <- function(x, ...) x$plot(...) - -#' @rdname plot -#' @method plot PredictorDataset -#' @keywords misc -#' @export -plot.PredictorDataset <- function(x, ...) x$plot(...) - -#' @rdname plot -#' @method plot Engine -#' @keywords misc -#' @export -plot.Engine <- function(x,...) x$plot(...) - -#' @rdname plot -#' @method plot BiodiversityScenario -#' @keywords misc -#' @export -plot.BiodiversityScenario <- function(x,...) x$plot(...) - -# ------------ ' -#' Bivariate plot wrapper for distribution objects -#' -#' @description -#' Often there is an intention to display not only the predictions made with a SDM, but -#' also the uncertainty of the prediction. Uncertainty be estimated either directly by the model -#' or by calculating the variation in prediction values among a set of models. -#' -#' In particular Bayesian engines can produce not only mean estimates of fitted responses, -#' but also pixel-based estimates of uncertainty from the posterior such as the standard deviation (SD) -#' or the coefficient of variation of a given prediction. -#' -#' This function makes use of the [`biscale`] R-package to create bivariate plots of the fitted distribution object, -#' allowing to visualize two variables at once. It is mostly thought of as a convenience function to -#' create such bivariate plots for quick visualization. -#' -#' Supported Inputs are either single trained Bayesian [`DistributionModel`] with uncertainty or -#' the output of an [`ensemble()`] call. In both cases, users have to make sure that \code{"xvar"} and -#' \code{"yvar"} are set accordingly. -#' -#' @param mod A trained [`DistributionModel`] or alternatively a [`Raster`] object with \code{prediction} model within. -#' @param xvar A [`character`] denoting the value on the x-axis (Default: \code{'mean'}). -#' @param yvar A [`character`] denoting the value on the y-axis (Default: \code{'sd'}). -#' @param plot A [`logical`] indication of whether the result is to be plotted (Default: \code{TRUE})? -#' @param fname A [`character`] specifying the output filename a created figure should be written to. -#' @param title Allows to respecify the title through a [`character`] (Default: \code{NULL}). -#' @param col A [`character`] stating the colour palette to use. Has to be either a predefined value or a -#' vector of colours. See \code{"biscale::bi_pal_manual"}. Default: \code{"BlueGold"}. -#' @param ... Other engine specific parameters. -#' @seealso [partial], [plot.DistributionModel] -#' @note -#' **This function requires the biscale package to be installed.** -#' Although a work around without the package could be developed, it was not deemed necessary at this point. -#' See also this [gist](https://gist.github.com/scbrown86/2779137a9378df7b60afd23e0c45c188). 
-#' @return Saved bivariate plot in \code{'fname'} if specified, otherwise plot. -#' @keywords misc -#' @export -#' @name bivplot -methods::setGeneric( - "bivplot", - signature = methods::signature("mod"), - function(mod, xvar = "mean", yvar = "sd", plot = TRUE, fname = NULL, col = "BlueGold",...) standardGeneric("bivplot")) - -#' @name bivplot -#' @rdname bivplot -#' @usage \S4method{bivplot}{ANY}(mod) -methods::setMethod( - "bivplot", - methods::signature(mod = "ANY"), - function(mod, xvar = "mean", yvar = "sd", plot = TRUE, fname = NULL, title = NULL, col = "BlueGold",...) { - # Generic checks - assertthat::assert_that(is.logical(plot), - is.character(xvar), - is.character(yvar), - is.character(col) || is.vector(col), - is.null(title) || is.character(title), - is.null(fname) || is.character(fname), - isTRUE(plot) || is.character(fname) - ) - # Check whether object is a raster, otherwise extract object - if(is.Raster(mod)){ - assertthat::assert_that(raster::nlayers(mod)>1) - obj <- mod - # If number of layers equal to 2 (output from ensemble?), change xvar and yvar - if(raster::nlayers(mod)==2 && !(xvar %in% names(obj))){ - if(getOption('ibis.setupmessages')) myLog('[Parameter]','yellow','Variable not found. Changing to layer names...') - xvar <- names(obj)[1]; yvar <- names(obj)[2] - } - } else { - assertthat::assert_that(inherits(mod, "DistributionModel"), - msg = "The bivplot function currently only works with fitted distribution objects!") - # Check that distribution object has a prediction - assertthat::assert_that("prediction" %in% mod$show_rasters(), - is.Raster(mod$get_data()), - msg = "No prediction found in the provided object.") - obj <- mod$get_data() - } - - # Check that at least mean and standard deviation is available - assertthat::assert_that(xvar %in% names(obj), - yvar %in% names(obj), - msg = "Specified (default?) 
variables for xvar/yvar not found in model!") - - # Check whether biscale package is available - check_package('biscale') - check_package('ggplot2') - check_package("cowplot") - if(!("biscale" %in% loadedNamespaces()) || ('biscale' %notin% sessionInfo()$otherPkgs) ) { - try({requireNamespace('biscale');attachNamespace("biscale")},silent = TRUE) - } - - # Check provided colours - if(is.character(col)){ - choices <- c("Bluegill", "BlueGold", "BlueOr", "BlueYl", "Brown", - "Brown2", "DkBlue","DkBlue2", "DkCyan","DkCyan2", "DkViolet", - "DkViolet2", "GrPink","GrPink2", "PinkGrn", "PurpleGrn", "PurpleOr") - col <- match.arg(col, choices, several.ok = FALSE) - } - - # Define default title - if(is.null(title)){ - title <- paste("Bivariate plot of prediction\n (",mod$model$runname,')') - } - - # Create dimensions - legend <- biscale::bi_legend(pal = col, - dim = 3, - xlab = paste0("Larger ", xvar), - ylab = paste0("Larger ", yvar), - size = 16) - - # Create data for plotting - df <- obj[[c(xvar,yvar)]] |> predictor_transform(option = "norm") |> - raster::as.data.frame(xy = TRUE) - names(df)[3:4] <- c("var1", "var2") - suppressWarnings( - df <- biscale::bi_class(df, x = var1, y = var2, dim = 3, style = "quantile") - ) - - map <- ggplot2::ggplot() + - ggplot2::geom_raster(data = df , ggplot2::aes(x = x, y = y, fill = bi_class)) + - biscale::bi_theme(base_size = 16) + - biscale::bi_scale_fill(pal = col, na.value = "transparent") + - # coord_quickmap() + - ggplot2::labs( - title = title, - x = "", - y = "" - ) + - ggplot2::theme(legend.position = "none") - - # Add legend with cowplot - finalPlot <- cowplot::ggdraw() + - cowplot::draw_plot(map, 0, 0, 1, 1) + - cowplot::draw_plot(legend, 0.2, .65, 0.2, 0.2) - - # Print the plot - if(plot){ - print(finalPlot) - } - if(is.character(fname)){ - cowplot::ggsave2(filename = fname, plot = finalPlot) - } - - return(finalPlot) - } -) - +#' @include utils.R +NULL + +#' Plot wrappers +#' +#' @description +#' Plots information from a given object where a plotting object is available. +#' +#' @param x Any object belonging to [DistributionModel], [BiodiversityDatasetCollection], [PredictorDataset] or [BiodiversityScenario]. +#' @param what In case a [RasterLayer] is supplied, this parameter specifies the layer to be shown (Default: \code{"mean"}). +#' @param ... Further arguments passed on to \code{x$plot}. +#' +#' @details +#' The plotted outputs vary depending on what object is being plotted. +#' For example for a fitted [DistributionModel] the output is usually the fitted spatial prediction (Default: \code{'mean'}). +#' @examples +#' \dontrun{ +#' # Build and train a model +#' mod <- distribution(background) |> +#' add_biodiversity_poipo(species) |> +#' add_predictors(predictors) |> +#' engine_glmnet() |> +#' train() +#' # Plot the resulting model +#' plot(mod) +#' } +#' @return Graphical output +#' @keywords misc +#' @name plot +NULL + +#' @rdname plot +#' @method plot DistributionModel +#' @keywords misc +#' @export +plot.DistributionModel <- function(x, ...) x$plot(...) + +#' @rdname plot +#' @method plot BiodiversityDatasetCollection +#' @keywords misc +#' @export +plot.BiodiversityDatasetCollection <- function(x, ...) x$plot(...) + +#' @rdname plot +#' @method plot PredictorDataset +#' @keywords misc +#' @export +plot.PredictorDataset <- function(x, ...) x$plot(...) + +#' @rdname plot +#' @method plot Engine +#' @keywords misc +#' @export +plot.Engine <- function(x,...) x$plot(...) 
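+
+# Minimal usage sketch for the S3 wrappers above (objects are hypothetical and
+# mirror the roxygen example in `plot`; not part of the exported API):
+#   fit <- distribution(background) |> add_biodiversity_poipo(species) |>
+#     add_predictors(predictors) |> engine_glmnet() |> train()
+#   plot(fit)           # dispatches to plot.DistributionModel, i.e. fit$plot()
+#   plot(fit, "mean")   # further arguments are passed on to x$plot(...)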
+
+#' @rdname plot
+#' @method plot BiodiversityScenario
+#' @keywords misc
+#' @export
+plot.BiodiversityScenario <- function(x,...) x$plot(...)
+
+# ------------ '
+#' Bivariate plot wrapper for distribution objects
+#'
+#' @description
+#' Often there is an intention to display not only the predictions made with a SDM, but
+#' also the uncertainty of the prediction. Uncertainty can be estimated either directly by the model
+#' or by calculating the variation in prediction values among a set of models.
+#'
+#' In particular Bayesian engines can produce not only mean estimates of fitted responses,
+#' but also pixel-based estimates of uncertainty from the posterior such as the standard deviation (SD)
+#' or the coefficient of variation of a given prediction.
+#'
+#' This function makes use of the [`biscale`] R-package to create bivariate plots of the fitted distribution object,
+#' allowing users to visualize two variables at once. It is mostly thought of as a convenience function to
+#' create such bivariate plots for quick visualization.
+#'
+#' Supported inputs are either a single trained Bayesian [`DistributionModel`] with uncertainty or
+#' the output of an [`ensemble()`] call. In both cases, users have to make sure that \code{"xvar"} and
+#' \code{"yvar"} are set accordingly.
+#'
+#' @param mod A trained [`DistributionModel`] or alternatively a [`Raster`] object with a \code{prediction} layer within.
+#' @param xvar A [`character`] denoting the value on the x-axis (Default: \code{'mean'}).
+#' @param yvar A [`character`] denoting the value on the y-axis (Default: \code{'sd'}).
+#' @param plot A [`logical`] indication of whether the result is to be plotted (Default: \code{TRUE}).
+#' @param fname A [`character`] specifying the output filename a created figure should be written to.
+#' @param title Allows to respecify the title through a [`character`] (Default: \code{NULL}).
+#' @param col A [`character`] stating the colour palette to use. Has to be either a predefined value or a
+#' vector of colours. See \code{"biscale::bi_pal_manual"}. Default: \code{"BlueGold"}.
+#' @param ... Other engine specific parameters.
+#' @seealso [partial], [plot.DistributionModel]
+#' @note
+#' **This function requires the biscale package to be installed.**
+#' Although a workaround without the package could be developed, it was not deemed necessary at this point.
+#' See also this [gist](https://gist.github.com/scbrown86/2779137a9378df7b60afd23e0c45c188).
+#' @return Saved bivariate plot in \code{'fname'} if specified, otherwise plot.
+#' @keywords misc
+#' @export
+#' @name bivplot
+methods::setGeneric(
+  "bivplot",
+  signature = methods::signature("mod"),
+  function(mod, xvar = "mean", yvar = "sd", plot = TRUE, fname = NULL, col = "BlueGold",...) standardGeneric("bivplot"))
+
+#' @name bivplot
+#' @rdname bivplot
+#' @usage \S4method{bivplot}{ANY}(mod)
+methods::setMethod(
+  "bivplot",
+  methods::signature(mod = "ANY"),
+  function(mod, xvar = "mean", yvar = "sd", plot = TRUE, fname = NULL, title = NULL, col = "BlueGold",...)
{ + # Generic checks + assertthat::assert_that(is.logical(plot), + is.character(xvar), + is.character(yvar), + is.character(col) || is.vector(col), + is.null(title) || is.character(title), + is.null(fname) || is.character(fname), + isTRUE(plot) || is.character(fname) + ) + # Check whether object is a raster, otherwise extract object + if(is.Raster(mod)){ + assertthat::assert_that(raster::nlayers(mod)>1) + obj <- mod + # If number of layers equal to 2 (output from ensemble?), change xvar and yvar + if(raster::nlayers(mod)==2 && !(xvar %in% names(obj))){ + if(getOption('ibis.setupmessages')) myLog('[Parameter]','yellow','Variable not found. Changing to layer names...') + xvar <- names(obj)[1]; yvar <- names(obj)[2] + } + } else { + assertthat::assert_that(inherits(mod, "DistributionModel"), + msg = "The bivplot function currently only works with fitted distribution objects!") + # Check that distribution object has a prediction + assertthat::assert_that("prediction" %in% mod$show_rasters(), + is.Raster(mod$get_data()), + msg = "No prediction found in the provided object.") + obj <- mod$get_data() + } + + # Check that at least mean and standard deviation is available + assertthat::assert_that(xvar %in% names(obj), + yvar %in% names(obj), + msg = "Specified (default?) variables for xvar/yvar not found in model!") + + # Check whether biscale package is available + check_package('biscale') + check_package('ggplot2') + check_package("cowplot") + if(!("biscale" %in% loadedNamespaces()) || ('biscale' %notin% utils::sessionInfo()$otherPkgs) ) { + try({requireNamespace('biscale');attachNamespace("biscale")},silent = TRUE) + } + + # Check provided colours + if(is.character(col)){ + choices <- c("Bluegill", "BlueGold", "BlueOr", "BlueYl", "Brown", + "Brown2", "DkBlue","DkBlue2", "DkCyan","DkCyan2", "DkViolet", + "DkViolet2", "GrPink","GrPink2", "PinkGrn", "PurpleGrn", "PurpleOr") + col <- match.arg(col, choices, several.ok = FALSE) + } + + # Define default title + if(is.null(title)){ + title <- paste("Bivariate plot of prediction\n (",mod$model$runname,')') + } + + # Create dimensions + legend <- biscale::bi_legend(pal = col, + dim = 3, + xlab = paste0("Larger ", xvar), + ylab = paste0("Larger ", yvar), + size = 16) + + # Create data for plotting + df <- obj[[c(xvar,yvar)]] |> predictor_transform(option = "norm") |> + raster::as.data.frame(xy = TRUE) + names(df)[3:4] <- c("var1", "var2") + suppressWarnings( + df <- biscale::bi_class(df, x = var1, y = var2, dim = 3, style = "quantile") + ) + + map <- ggplot2::ggplot() + + ggplot2::geom_raster(data = df , ggplot2::aes(x = x, y = y, fill = bi_class)) + + biscale::bi_theme(base_size = 16) + + biscale::bi_scale_fill(pal = col, na.value = "transparent") + + # coord_quickmap() + + ggplot2::labs( + title = title, + x = "", + y = "" + ) + + ggplot2::theme(legend.position = "none") + + # Add legend with cowplot + finalPlot <- cowplot::ggdraw() + + cowplot::draw_plot(map, 0, 0, 1, 1) + + cowplot::draw_plot(legend, 0.2, .65, 0.2, 0.2) + + # Print the plot + if(plot){ + print(finalPlot) + } + if(is.character(fname)){ + cowplot::ggsave2(filename = fname, plot = finalPlot) + } + + return(finalPlot) + } +) + diff --git a/R/project.R b/R/project.R index 86c32db6..2ad4e06f 100644 --- a/R/project.R +++ b/R/project.R @@ -1,463 +1,463 @@ -#' @include utils.R bdproto-biodiversityscenario.R -NULL - -#' Project a fitted model to a new environment and covariates -#' -#' @description -#' Equivalent to [train], this function acts as a -#' wrapper to project the model stored in a 
[`BiodiversityScenario-class`] object to -#' newly supplied (future) covariates. Supplied predictors are usually spatial-temporal predictors -#' which should be prepared via [`add_predictors()`] (e.g. transformations and derivates) in the same way as they have been during -#' the initial modelling with [`distribution()`]. -#' Any constrains specified in the scenario object are applied during the projection. -#' -#' @details -#' In the background the function \code{x$project()} for the respective model object is called, where -#' \code{x} is fitted model object. For specifics on the constrains, see the relevant [constrain] functions, -#' respectively: -#' * [`add_constrain()`] for generic wrapper to add any of the available constrains. -#' * [`add_constrain_dispersal()`] for specifying dispersal constrain on the temporal projections at each step. -#' * [`add_constrain_MigClim()`] Using the \pkg{MigClim} R-package to simulate dispersal in projections. -#' * [`add_constrain_connectivity()`] Apply a connectivity constrain at the projection, for instance by adding -#' a barrier that prevents migration. -#' * [`add_constrain_adaptability()`] Apply an adaptability constrain to the projection, for instance -#' constraining the speed a species is able to adapt to new conditions. -#' * [`add_constrain_boundary()`] To artificially limit the distribution change. Similar as specifying projection limits, but -#' can be used to specifically constrain a projection within a certain area (e.g. a species range or an island). -#' -#' Many constrains also requires thresholds to be calculated. Adding [`threshold()`] to a -#' [`BiodiversityScenario-class`] object enables the computation of thresholds at every step based on the threshold -#' used for the main model (threshold values are taken from there). -#' -#' Finally this function also allows temporal stabilization across prediction steps via enabling -#' the parameter \code{stabilize} and checking the \code{stablize_method} argument. Stabilization can for instance -#' be helpful in situations where environmental variables are quite dynamic, but changes in projected suitability -#' are not expected to abruptly increase or decrease. It is thus a way to smoothen out outliers from the projection. -#' Options are so far for instance \code{'loess'} which fits a [`loess()`] model per pixel and time step. This is conducted at -#' the very of the processing steps and any thresholds will be recalculated afterwards. -#' -#' @seealso [`scenario()`] -#' @param mod A [`BiodiversityScenario`] object with set predictors. -#' Note that some constrains such as [MigClim] can still simulate future change without projections. -#' @param date_interpolation A [`character`] on whether dates should be interpolated. Options -#' include \code{"none"} (Default), \code{"annual"}, \code{"monthly"}, \code{"daily"}. -#' @param stabilize A [`boolean`] value indicating whether the suitability projection should be stabilized (Default: \code{FALSE}). -#' @param stabilize_method [`character`] stating the stabilization method to be applied. Currently supported is \code{`loess`}. -#' @param layer A [`character`] specifying the layer to be projected (Default: \code{"mean"}). -#' @param ... passed on parameters. -#' @returns Saves [`stars`] objects of the obtained predictions in mod. 
-#' -#' @name project -#' @aliases project -#' @keywords scenarios -#' @exportMethod project -#' @export -NULL -methods::setGeneric("project", - signature = methods::signature("mod"), - function(mod, date_interpolation = "none", stabilize = FALSE, stabilize_method = "loess", - layer = "mean", ...) standardGeneric("project")) - -#' @name project -#' @rdname project -#' @usage \S4method{project}{BiodiversityScenario, character, logical, character, character}(mod, date_interpolation, stabilize, stabilize_method, layer) -methods::setMethod( - "project", - methods::signature(mod = "BiodiversityScenario"), - function(mod, date_interpolation = "none", stabilize = FALSE, stabilize_method = "loess", - layer = "mean", ...){ - # date_interpolation = "none"; stabilize = FALSE; stabilize_method = "loess"; layer="mean" - assertthat::assert_that( - inherits(mod, "BiodiversityScenario"), - !is.Waiver(mod$get_predictors()), - is.character(date_interpolation), - is.logical(stabilize), - is.character(layer) - ) - # Match methods - date_interpolation <- match.arg(date_interpolation, c("none", "yearly", "annual", "monthly", "daily"), several.ok = FALSE) - stabilize_method <- match.arg(stabilize_method, c("loess"), several.ok = FALSE) - if(!is.Waiver(mod$get_data())) if(getOption('ibis.setupmessages')) myLog('[Scenario]','red','Overwriting existing scenarios...') - - # Get the model object - fit <- mod$get_model() - # Check that coefficients and model exist - assertthat::assert_that(!is.Waiver(fit), - nrow(fit$get_coefficients())>0, - msg = "No model or coefficients found!") - # Get predictors - new_preds <- mod$get_predictors() - if(is.Waiver(new_preds)) stop('No scenario predictors found.') - new_crs <- new_preds$get_projection() - if(is.na(new_crs)) if(getOption('ibis.setupmessages')) myLog('[Scenario]','yellow','Missing projection of future predictors.') - - # Interpolate dates if set - if(date_interpolation!="none"){ - if(getOption('ibis.setupmessages')) myLog('[Scenario]','green',paste0('Interpolating dates for scenario predictors as: ', date_interpolation)) - o <- approximate_gaps(env = new_preds$get_data(), date_interpolation = date_interpolation) - # Set new data - #new_preds$set_data() - } - - # Get limits if present - if(!is.null( mod$get_limits() )){ - # FIXME: Scenarios need to be checked that the right layer is taken!! - # Get prediction - n <- fit$show_rasters()[grep("threshold",fit$show_rasters())] - tr <- fit$get_data(n)[[1]] - tr <- cbind( raster::coordinates(tr), data.frame(thresh = values(tr))) - tr[['thresh']] <- ifelse(tr[['thresh']]==0, NA, tr[['thresh']]) - tr <- tr %>% subset(., complete.cases(thresh)) - - # Get zones from the limiting area, e.g. 
those intersecting with input - suppressMessages( - suppressWarnings( - zones <- st_intersection(sf::st_as_sf(tr, coords = c('x','y'), crs = sf::st_crs(fit$model$background)), - mod$get_limits() - ) - ) - ) - # Limit zones - zones <- subset(mod$get_limits(), limit %in% unique(zones$limit) ) - # Now clip all provided new predictors and background to this - new_preds$crop_data(zones) - } - - # Check that predictor names are all present - mod_pred_names <- fit$model$predictors_names - pred_names <- mod$get_predictor_names() - assertthat::assert_that( all(mod_pred_names %in% pred_names), - msg = paste0('Model predictors are missing from the scenario predictor!') ) - - # Get constraints, threshold values and other parameters - scenario_threshold <- mod$get_threshold() - # Not get the baseline raster - thresh_reference <- grep('threshold',fit$show_rasters(),value = T)[1] # Use the first one always - baseline_threshold <- mod$get_model()$get_data(thresh_reference) - if(!is.Waiver(scenario_threshold)){ - if(is.na(raster::projection(baseline_threshold))) projection(baseline_threshold) <- raster::projection( fit$model$background ) - } - - if(inherits(baseline_threshold, 'RasterStack') || inherits(baseline_threshold, 'RasterBrick')){ - baseline_threshold <- baseline_threshold[[grep(layer,names(baseline_threshold))]] - } - scenario_constraints <- mod$get_constraints() - - # --- Check that everything is there --- - # Check that thresholds are set for constrains - if("dispersal" %in% names(scenario_constraints)){ - if(scenario_constraints[["dispersal"]]$method == "MigClim") { - assertthat::assert_that(is.Raster(baseline_threshold)) - } else if(scenario_constraints[["dispersal"]]$method == "kissmig"){ - assertthat::assert_that( is.Raster(baseline_threshold)) - if(!is.Waiver(scenario_threshold)) { - if(getOption('ibis.setupmessages')) myLog('[Scenario]','green','Using kissmig to calculate updated distribution thresholds.') - scenario_threshold <- new_waiver() - } - } else { - assertthat::assert_that(!is.Waiver(scenario_threshold),msg = "Other constrains require threshold option!") - } - } - if("connectivity" %in% names(scenario_constraints) && "dispersal" %notin% names(scenario_constraints)){ - if(getOption('ibis.setupmessages')) myLog('[Scenario]','yellow','Connectivity contraints make most sense with a dispersal constraint.') - } - # ----------------------------- # - # Start of projection # - # ----------------------------- # - - # Now convert to data.frame and subset - df <- new_preds$get_data(df = TRUE) - names(df)[1:3] <- tolower(names(df)[1:3]) # Assuming the first three attributes are x,y,t - assertthat::assert_that(nrow(df)>0, - hasName(df,'x'), hasName(df,'y'), hasName(df,'time'), - msg = "Error: Projection data and training data are not of equal size and format!") - df <- subset(df, select = c("x", "y", "time", mod_pred_names) ) - df$time <- to_POSIXct( df$time ) - # Convert all units classes to numeric or character to avoid problems - df <- units::drop_units(df) - - # ------------------ # - if(getOption('ibis.setupmessages')) myLog('[Scenario]','green','Starting suitability projections for ', length(unique(df$time)), ' timesteps.') - - # Now for each unique element, loop and project in order - proj <- raster::stack() - proj_thresh <- raster::stack() - - pb <- progress::progress_bar$new(format = "Creating projections (:spin) [:bar] :percent", - total = length(unique(df$time))) - # TODO: Consider doing this in parallel but sequential - times <- sort(unique(df$time)) - for(step in times){ - # Get 
data - nd <- subset(df, time == step) - - # Apply adaptability constrain - if("adaptability" %in% names(scenario_constraints)){ - if(scenario_constraints[["adaptability"]]$method == "nichelimit") { - nd <- .nichelimit(newdata = nd, model = mod$get_model()[['model']], - names = scenario_constraints[["adaptability"]]$params['names'], - value = scenario_constraints[["adaptability"]]$params['value'], - increment = scenario_constraints[["adaptability"]]$params['increment'], - increment_step = which(step==times) ) - } - } - - # Project suitability - out <- fit$project(newdata = nd, layer = layer) - names(out) <- paste0("suitability", "_", layer, "_", step) - if(is.na(raster::projection(out))) raster::projection(out) <- raster::projection( fit$model$background ) - - # If other constrains are set, apply them posthoc - if(!is.Waiver(scenario_constraints)){ - # Apply a resistance surface if found - if("connectivity" %in% names(scenario_constraints)){ - # Get the layer for later - resistance <- scenario_constraints$connectivity$params$resistance - # By definition a hard barrier removes all suitable again - if(any(scenario_constraints$connectivity$method == "resistance")){ - if(raster::nlayers(resistance)>1){ - ind <- which( raster::getZ(resistance) == as.Date(step) ) # Get specific step - assertthat::assert_that(is.numeric(ind)) - resistance <- resistance[[ind]] - } - out <- out * resistance - } - } else { - resistance <- NULL - } - - # Calculate dispersal constraint if set - if("dispersal" %in% names(scenario_constraints) ){ - # MigClim simulations are run posthoc - if(scenario_constraints$dispersal$method %in% c("sdd_fixed", "sdd_nexpkernel")){ - out <- switch (scenario_constraints$dispersal$method, - "sdd_fixed" = .sdd_fixed(baseline_threshold, out, - value = scenario_constraints$dispersal$params[1], - resistance = resistance ), - "sdd_nexpkernel" = .sdd_nexpkernel(baseline_threshold, out, - value = scenario_constraints$dispersal$params[1], - resistance = resistance) - ) - names(out) <- paste0('suitability_', step) - } - # For kissmig generate threshold and masked suitabilities - if(scenario_constraints$dispersal$method == "kissmig"){ - out <- .kissmig_dispersal(baseline_threshold, - new_suit = out, - resistance = resistance, - params = scenario_constraints$dispersal$params) - # Returns a layer of two with both the simulated threshold and the masked suitability raster - names(out) <- paste0(c('threshold_', 'suitability_'), step) - # Add threshold to result stack - proj_thresh <- raster::addLayer(proj_thresh, out[[1]] ) - baseline_threshold <- out[[1]] - out <- out[[2]] - } - } - - # Connectivity constraints with hard barriers - if("connectivity" %in% names(scenario_constraints)){ - # By definition a hard barrier removes all suitable again - if(any(scenario_constraints$connectivity$method == "hardbarrier")){ - out[resistance==1] <- 0 - } - } - - } - - # Recalculate thresholds if set manually - if(!is.Waiver(scenario_threshold)){ - # FIXME: Currently this works only for mean thresholds. Think of how the other are to be handled - scenario_threshold <- scenario_threshold[1] - out_thresh <- out - out_thresh[out_thresh < scenario_threshold] <- 0; out_thresh[out_thresh >= scenario_threshold] <- 1 - names(out_thresh) <- paste0('threshold_', step) - # If threshold is - if( cellStats(out_thresh, 'max') == 0){ - if(getOption('ibis.setupmessages')) myLog('[Scenario]','yellow','Thresholding removed all grid cells. 
Using last years threshold.') - out_thresh <- baseline_threshold - } else { baseline_threshold <- out_thresh } - # Add to result stack - proj_thresh <- raster::addLayer(proj_thresh, out_thresh) - } - # Add to result stack - proj <- raster::addLayer(proj, out) - pb$tick() - } - rm(pb) - proj <- raster::setZ(proj, times ) - if(raster::nlayers(proj_thresh)>1) proj_thresh <- raster::setZ(proj_thresh, times ) - - # Apply MigClim and other post-hoc constraints if set - # FIXME: Ideally make this whole setup more modular. So create suitability projections first - if(!is.Waiver(scenario_constraints)){ - # Calculate dispersal constraint if set - if("dispersal" %in% names(scenario_constraints) ){ - # MigClim simulations are run posthoc - if(scenario_constraints$dispersal$method == 'MigClim'){ - # Get Parameters - params <- scenario_constraints$dispersal$params - - pb <- progress::progress_bar$new(total = raster::nlayers(proj)) - for(lyr in 1:raster::nlayers(proj)){ - pb$tick() - # Normalize the projected suitability rasters to be in range 0-1000 and save - hsMap <- predictor_transform(env = proj[[lyr]], option = "norm") * 1000 - # Write as filename in the destined folder - suppressWarnings( - raster::writeRaster(x = hsMap, filename = paste0( params[["hsMap"]],lyr,".tif"), - dt = "INT2S", varNA = -9999, prj = TRUE, overwrite = TRUE) - ) - rm(hsMap) - };rm(pb) - # For threshold, define based on type - tr <- ifelse(params[["rcThreshold"]] == "continuous", 0, 750) # Set to 75% as suitability threshold - - # Now run MigClim by switching to temporary dir - dir.ori <- getwd() - setwd(params[["dtmp"]]) - try({ - m <- MigClim::MigClim.migrate( - iniDist = basename(params[["iniDist"]]), - hsMap = basename(params[["hsMap"]]), - rcThreshold = tr, - envChgSteps = raster::nlayers(proj), # Use number of projected suitability layers - dispSteps = params[["dispSteps"]], - dispKernel = params[["dispKernel"]], - barrier = "", # TBD. 
Loaded via another arguement - barrierType = params[["barrierType"]], - iniMatAge = params[["iniMatAge"]], - propaguleProd = params[["propaguleProdProb"]], - lddFreq = params[["lddFreq"]], - lddMinDist = params[["lddMinDist"]], lddMaxDist = params[["lddMaxDist"]], - simulName = basename(params[["simulName"]]), - replicateNb = params[["replicateNb"]], - overWrite = params[["overwrite"]], - testMode = FALSE, - fullOutput = params[["fullOutput"]], - keepTempFiles = params[["keepTempFiles"]] - ) - }) - # Get average stats - run_stats <- read.table( - file.path(basename(params[["simulName"]]), paste0(basename(params[["simulName"]]),"_stats.txt")), - header = TRUE - ) - run_sums <- read.table( - file.path(basename(params[["simulName"]]), paste0(basename(params[["simulName"]]),"_summary.txt")), - header = TRUE - ) - # Get MigClim outputs - ll <- list.files(basename(params[["simulName"]]),'asc',full.names = TRUE) - run_sims <- raster::stack(ll); names(run_sims) <- tools::file_path_sans_ext(basename(ll)) - # Condense the simulation runs into one modal prediction - run_sim <- raster::calc(run_sims, raster::modal) - raster::projection(run_sim) <- raster::projection(fit$get_data('prediction')) - all(sapply(list.files(getwd(),".tif"), file.remove)) # Cleanup - setwd(dir.ori) # Flip back to original directory - - # Wrap all results in a list - mc <- list(params = params, - stats = run_stats, - summary = run_sums, - raster = run_sim) - } - } # End of MigClim processing chain - } - # If not found, set a waiver - if(!exists("mc")) mc <- new_waiver() - - # ---- # - assertthat::assert_that( - is.Raster(proj), is.Raster(proj_thresh), - msg = "Something went wrong with the projection." - ) - - # Apply boundary constraints if set - if("boundary" %in% names(scenario_constraints)){ - if(!raster::compareRaster(proj, scenario_constraints$boundary$params$layer, stopiffalse = FALSE)){ - scenario_constraints$boundary$params$layer <- alignRasters( - scenario_constraints$boundary$params$layer, - proj, - method = "ngb", func = raster::modal, cl = FALSE - ) - } - proj <- raster::mask(proj, scenario_constraints$boundary$params$layer) - # Get background and ensure that all values outside are set to 0 - proj[is.na(proj)] <- 0 - proj <- raster::mask(proj, fit$model$background ) - # Also for thresholds if existing - if(raster::nlayers(proj_thresh)>0){ - proj_thresh <- raster::mask(proj_thresh, scenario_constraints$boundary$params$layer) - proj_thresh[is.na(proj_thresh)] <- 0 - proj_thresh <- raster::mask(proj_thresh, fit$model$background ) - } - } - - # Should stabilization be applied? - if(stabilize){ - if(getOption('ibis.setupmessages')) myLog('[Scenario]','green','Applying stabilization.') - if(stabilize_method == "loess"){ - # FIXME: Could outsource this code - impute.loess <- function(y, x.length = NULL, s = 0.75, - smooth = TRUE, na.rm, ...) { - if (is.null(x.length)) { - x.length = length(y) - } - if(length(y[!is.na(y)]) < 8) { - y <- rep(NA, x.length) - } else { - x <- 1:x.length - p <- suppressWarnings( stats::loess(y ~ x, span = s, - data.frame(x = x, y = y)) ) - if (smooth == TRUE) { - y <- stats::predict(p, x) - } else { - na.idx <- which(is.na(y)) - if (length(na.idx) > 1) { - y[na.idx] <- stats::predict(p, data.frame(x = na.idx)) - } - } - } - return(y) - } - new_proj <- raster::overlay(proj, fun = impute.loess, unstack = TRUE, forcefun = FALSE) - # Rename again - names(new_proj) <- names(proj) - new_proj <- raster::setZ(new_proj, times ) - proj <- new_proj; rm(new_proj) - # Were thresholds calculated? 
If yes, recalculate on the smoothed estimates
-      if(raster::nlayers(proj_thresh)>0){
-        new_thresh <- proj
-        new_thresh[new_thresh < scenario_threshold[1]] <- 0
-        new_thresh[new_thresh >= scenario_threshold[1]] <- 1
-        names(new_thresh) <- names(proj_thresh)
-        thresh <- new_thresh; rm(new_thresh)
-      }
-    }
-  }
-
-  # Finally convert to stars and rename
-  proj <- stars::st_as_stars(proj,
-                             crs = sf::st_crs(new_crs)
-  ); names(proj) <- 'suitability'
-
-  if(raster::nlayers(proj_thresh)>0){
-    # Add the thresholded maps as well
-    proj_thresh <- stars::st_as_stars(proj_thresh,
-                                      crs = sf::st_crs(new_crs)
-    ); names(proj_thresh) <- 'threshold'
-    # Correct band if different
-    if(all(!stars::st_get_dimension_values(proj, 3) != stars::st_get_dimension_values(proj_thresh, 3 ))){
-      proj_thresh <- stars::st_set_dimensions(proj_thresh, 3, values = stars::st_get_dimension_values(proj, 3))
-    }
-    proj <- stars:::c.stars(proj, proj_thresh)
-  }
-
-  # Return output by adding it to the scenario object
-  bdproto(NULL, mod,
-          scenarios = proj,
-          scenarios_migclim = mc
-  )
-  }
-)
+#' @include utils.R bdproto-biodiversityscenario.R
+NULL
+
+#' Project a fitted model to a new environment and covariates
+#'
+#' @description
+#' Equivalent to [train], this function acts as a
+#' wrapper to project the model stored in a [`BiodiversityScenario-class`] object to
+#' newly supplied (future) covariates. Supplied predictors are usually spatial-temporal predictors
+#' which should be prepared via [`add_predictors()`] (e.g. transformations and derivates) in the same way as they have been during
+#' the initial modelling with [`distribution()`].
+#' Any constraints specified in the scenario object are applied during the projection.
+#'
+#' @details
+#' In the background the function \code{x$project()} for the respective model object is called, where
+#' \code{x} is the fitted model object. For specifics on the constraints, see the relevant [constrain] functions,
+#' respectively:
+#' * [`add_constrain()`] for a generic wrapper to add any of the available constraints.
+#' * [`add_constrain_dispersal()`] for specifying a dispersal constraint on the temporal projections at each step.
+#' * [`add_constrain_MigClim()`] Using the \pkg{MigClim} R-package to simulate dispersal in projections.
+#' * [`add_constrain_connectivity()`] Apply a connectivity constraint at the projection, for instance by adding
+#' a barrier that prevents migration.
+#' * [`add_constrain_adaptability()`] Apply an adaptability constraint to the projection, for instance
+#' constraining the speed at which a species is able to adapt to new conditions.
+#' * [`add_constrain_boundary()`] To artificially limit the distribution change. Similar to specifying projection limits, but
+#' can be used to specifically constrain a projection within a certain area (e.g. a species range or an island).
+#'
+#' Many constraints also require thresholds to be calculated. Adding [`threshold()`] to a
+#' [`BiodiversityScenario-class`] object enables the computation of thresholds at every step based on the threshold
+#' used for the main model (threshold values are taken from there).
+#'
+#' Finally this function also allows temporal stabilization across prediction steps via enabling
+#' the parameter \code{stabilize} and checking the \code{stabilize_method} argument. Stabilization can for instance
+#' be helpful in situations where environmental variables are quite dynamic, but changes in projected suitability
+#' are not expected to abruptly increase or decrease.
It is thus a way to smooth out outliers from the projection.
+#' Options are so far for instance \code{'loess'} which fits a [`loess()`] model per pixel and time step. This is conducted at
+#' the very end of the processing steps and any thresholds will be recalculated afterwards.
+#'
+#' @seealso [`scenario()`]
+#' @param mod A [`BiodiversityScenario`] object with set predictors.
+#' Note that some constraints such as [MigClim] can still simulate future change without projections.
+#' @param date_interpolation A [`character`] on whether dates should be interpolated. Options
+#' include \code{"none"} (Default), \code{"annual"}, \code{"monthly"}, \code{"daily"}.
+#' @param stabilize A [`boolean`] value indicating whether the suitability projection should be stabilized (Default: \code{FALSE}).
+#' @param stabilize_method [`character`] stating the stabilization method to be applied. Currently supported is \code{`loess`}.
+#' @param layer A [`character`] specifying the layer to be projected (Default: \code{"mean"}).
+#' @param ... passed on parameters.
+#' @returns Saves [`stars`] objects of the obtained predictions in mod.
+#'
+#' @name project
+#' @aliases project
+#' @keywords scenarios
+#' @exportMethod project
+#' @export
+NULL
+methods::setGeneric("project",
+                    signature = methods::signature("mod"),
+                    function(mod, date_interpolation = "none", stabilize = FALSE, stabilize_method = "loess",
+                             layer = "mean", ...) standardGeneric("project"))
+
+#' @name project
+#' @rdname project
+#' @usage \S4method{project}{BiodiversityScenario, character, logical, character, character}(mod, date_interpolation, stabilize, stabilize_method, layer)
+methods::setMethod(
+  "project",
+  methods::signature(mod = "BiodiversityScenario"),
+  function(mod, date_interpolation = "none", stabilize = FALSE, stabilize_method = "loess",
+           layer = "mean", ...){
+    # date_interpolation = "none"; stabilize = FALSE; stabilize_method = "loess"; layer="mean"
+    assertthat::assert_that(
+      inherits(mod, "BiodiversityScenario"),
+      !is.Waiver(mod$get_predictors()),
+      is.character(date_interpolation),
+      is.logical(stabilize),
+      is.character(layer)
+    )
+    # Match methods
+    date_interpolation <- match.arg(date_interpolation, c("none", "yearly", "annual", "monthly", "daily"), several.ok = FALSE)
+    stabilize_method <- match.arg(stabilize_method, c("loess"), several.ok = FALSE)
+    if(!is.Waiver(mod$get_data())) if(getOption('ibis.setupmessages')) myLog('[Scenario]','red','Overwriting existing scenarios...')
+
+    # Get the model object
+    fit <- mod$get_model()
+    # Check that coefficients and model exist
+    assertthat::assert_that(!is.Waiver(fit),
+                            nrow(fit$get_coefficients())>0,
+                            msg = "No model or coefficients found!")
+    # Get predictors
+    new_preds <- mod$get_predictors()
+    if(is.Waiver(new_preds)) stop('No scenario predictors found.')
+    new_crs <- new_preds$get_projection()
+    if(is.na(new_crs)) if(getOption('ibis.setupmessages')) myLog('[Scenario]','yellow','Missing projection of future predictors.')
+
+    # Interpolate dates if set
+    if(date_interpolation!="none"){
+      if(getOption('ibis.setupmessages')) myLog('[Scenario]','green',paste0('Interpolating dates for scenario predictors as: ', date_interpolation))
+      o <- approximate_gaps(env = new_preds$get_data(), date_interpolation = date_interpolation)
+      # Set new data
+      #new_preds$set_data()
+    }
+
+    # Get limits if present
+    if(!is.null( mod$get_limits() )){
+      # FIXME: Scenarios need to be checked that the right layer is taken!!
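+      # Summary of the limit-clipping logic below (descriptive only): read the
+      # binary 'threshold' raster of the fitted model, keep presence cells as
+      # sf points, intersect them with the mod$get_limits() polygons, and then
+      # crop the scenario predictors to the intersecting limit zones.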
+      # Get prediction
+      n <- fit$show_rasters()[grep("threshold",fit$show_rasters())]
+      tr <- fit$get_data(n)[[1]]
+      tr <- cbind( raster::coordinates(tr), data.frame(thresh = values(tr)))
+      tr[['thresh']] <- ifelse(tr[['thresh']]==0, NA, tr[['thresh']])
+      tr <- tr |> (\(.) subset(., stats::complete.cases(thresh)))()
+
+      # Get zones from the limiting area, e.g. those intersecting with input
+      suppressMessages(
+        suppressWarnings(
+          zones <- st_intersection(sf::st_as_sf(tr, coords = c('x','y'), crs = sf::st_crs(fit$model$background)),
+                                   mod$get_limits()
+          )
+        )
+      )
+      # Limit zones
+      zones <- subset(mod$get_limits(), limit %in% unique(zones$limit) )
+      # Now clip all provided new predictors and background to this
+      new_preds$crop_data(zones)
+    }
+
+    # Check that predictor names are all present
+    mod_pred_names <- fit$model$predictors_names
+    pred_names <- mod$get_predictor_names()
+    assertthat::assert_that( all(mod_pred_names %in% pred_names),
+                             msg = paste0('Model predictors are missing from the scenario predictor!') )
+
+    # Get constraints, threshold values and other parameters
+    scenario_threshold <- mod$get_threshold()
+    # Now get the baseline raster
+    thresh_reference <- grep('threshold',fit$show_rasters(),value = T)[1] # Use the first one always
+    baseline_threshold <- mod$get_model()$get_data(thresh_reference)
+    if(!is.Waiver(scenario_threshold)){
+      if(is.na(raster::projection(baseline_threshold))) projection(baseline_threshold) <- raster::projection( fit$model$background )
+    }
+
+    if(inherits(baseline_threshold, 'RasterStack') || inherits(baseline_threshold, 'RasterBrick')){
+      baseline_threshold <- baseline_threshold[[grep(layer,names(baseline_threshold))]]
+    }
+    scenario_constraints <- mod$get_constraints()
+
+    # --- Check that everything is there ---
+    # Check that thresholds are set for constraints
+    if("dispersal" %in% names(scenario_constraints)){
+      if(scenario_constraints[["dispersal"]]$method == "MigClim") {
+        assertthat::assert_that(is.Raster(baseline_threshold))
+      } else if(scenario_constraints[["dispersal"]]$method == "kissmig"){
+        assertthat::assert_that( is.Raster(baseline_threshold))
+        if(!is.Waiver(scenario_threshold)) {
+          if(getOption('ibis.setupmessages')) myLog('[Scenario]','green','Using kissmig to calculate updated distribution thresholds.')
+          scenario_threshold <- new_waiver()
+        }
+      } else {
+        assertthat::assert_that(!is.Waiver(scenario_threshold),msg = "Other constraints require the threshold option!")
+      }
+    }
+    if("connectivity" %in% names(scenario_constraints) && "dispersal" %notin% names(scenario_constraints)){
+      if(getOption('ibis.setupmessages')) myLog('[Scenario]','yellow','Connectivity constraints make most sense with a dispersal constraint.')
+    }
+    # ----------------------------- #
+    #     Start of projection       #
+    # ----------------------------- #
+
+    # Now convert to data.frame and subset
+    df <- new_preds$get_data(df = TRUE)
+    names(df)[1:3] <- tolower(names(df)[1:3]) # Assuming the first three attributes are x,y,t
+    assertthat::assert_that(nrow(df)>0,
+                            utils::hasName(df,'x'), utils::hasName(df,'y'), utils::hasName(df,'time'),
+                            msg = "Error: Projection data and training data are not of equal size and format!")
+    df <- subset(df, select = c("x", "y", "time", mod_pred_names) )
+    df$time <- to_POSIXct( df$time )
+    # Convert all units classes to numeric or character to avoid problems
+    df <- units::drop_units(df)
+
+    # ------------------ #
+    if(getOption('ibis.setupmessages')) myLog('[Scenario]','green','Starting suitability projections for ', length(unique(df$time)), '
+    # ------------------ #
+    if(getOption('ibis.setupmessages')) myLog('[Scenario]','green','Starting suitability projections for ', length(unique(df$time)), ' timesteps.')
+
+    # Now for each unique element, loop and project in order
+    proj <- raster::stack()
+    proj_thresh <- raster::stack()
+
+    pb <- progress::progress_bar$new(format = "Creating projections (:spin) [:bar] :percent",
+                                     total = length(unique(df$time)))
+    # TODO: Consider doing this in parallel while preserving the time order
+    times <- sort(unique(df$time))
+    for(step in times){
+      # Get data
+      nd <- subset(df, time == step)
+
+      # Apply adaptability constraint
+      if("adaptability" %in% names(scenario_constraints)){
+        if(scenario_constraints[["adaptability"]]$method == "nichelimit") {
+          nd <- .nichelimit(newdata = nd, model = mod$get_model()[['model']],
+                            names = scenario_constraints[["adaptability"]]$params['names'],
+                            value = scenario_constraints[["adaptability"]]$params['value'],
+                            increment = scenario_constraints[["adaptability"]]$params['increment'],
+                            increment_step = which(step==times) )
+        }
+      }
+
+      # Project suitability
+      out <- fit$project(newdata = nd, layer = layer)
+      names(out) <- paste0("suitability", "_", layer, "_", step)
+      if(is.na(raster::projection(out))) raster::projection(out) <- raster::projection( fit$model$background )
+
+      # If other constraints are set, apply them post hoc
+      if(!is.Waiver(scenario_constraints)){
+        # Apply a resistance surface if found
+        if("connectivity" %in% names(scenario_constraints)){
+          # Get the layer for later
+          resistance <- scenario_constraints$connectivity$params$resistance
+          # Apply the resistance surface to the projected suitability
+          if(any(scenario_constraints$connectivity$method == "resistance")){
+            if(raster::nlayers(resistance)>1){
+              ind <- which( raster::getZ(resistance) == as.Date(step) ) # Get specific step
+              assertthat::assert_that(is.numeric(ind))
+              resistance <- resistance[[ind]]
+            }
+            out <- out * resistance
+          }
+        } else {
+          resistance <- NULL
+        }
+
+        # Calculate dispersal constraint if set
+        if("dispersal" %in% names(scenario_constraints) ){
+          # MigClim simulations are run post hoc
+          if(scenario_constraints$dispersal$method %in% c("sdd_fixed", "sdd_nexpkernel")){
+            out <- switch (scenario_constraints$dispersal$method,
+                           "sdd_fixed" = .sdd_fixed(baseline_threshold, out,
+                                                    value = scenario_constraints$dispersal$params[1],
+                                                    resistance = resistance ),
+                           "sdd_nexpkernel" = .sdd_nexpkernel(baseline_threshold, out,
+                                                              value = scenario_constraints$dispersal$params[1],
+                                                              resistance = resistance)
+            )
+            names(out) <- paste0('suitability_', step)
+          }
+          # For kissmig generate threshold and masked suitabilities
+          if(scenario_constraints$dispersal$method == "kissmig"){
+            out <- .kissmig_dispersal(baseline_threshold,
+                                      new_suit = out,
+                                      resistance = resistance,
+                                      params = scenario_constraints$dispersal$params)
+            # Returns a stack of two layers with both the simulated threshold and the masked suitability raster
+            names(out) <- paste0(c('threshold_', 'suitability_'), step)
+            # Add threshold to result stack
+            proj_thresh <- raster::addLayer(proj_thresh, out[[1]] )
+            baseline_threshold <- out[[1]]
+            out <- out[[2]]
+          }
+        }
+
+        # Connectivity constraints with hard barriers
+        if("connectivity" %in% names(scenario_constraints)){
+          # By definition a hard barrier removes all suitable habitat again
+          if(any(scenario_constraints$connectivity$method == "hardbarrier")){
+            out[resistance==1] <- 0
+          }
+        }
+
+      }
+
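+      # Conceptual sketch of a negative-exponential dispersal weight, the
+      # idea behind .sdd_nexpkernel() (assumed functional form; the actual
+      # implementation lives elsewhere in the package):
+      # nexp_weight <- function(d, scale = 1e4) exp(-d / scale)
+      # nexp_weight(c(0, 5e3, 2e4)) # ~1.00, ~0.61, ~0.14
+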
+      # Recalculate thresholds if set manually
+      if(!is.Waiver(scenario_threshold)){
+        # FIXME: Currently this works only for mean thresholds. Think of how the others are to be handled
+        scenario_threshold <- scenario_threshold[1]
+        out_thresh <- out
+        out_thresh[out_thresh < scenario_threshold] <- 0; out_thresh[out_thresh >= scenario_threshold] <- 1
+        names(out_thresh) <- paste0('threshold_', step)
+        # If thresholding removed all cells, keep the previous threshold
+        if( cellStats(out_thresh, 'max') == 0){
+          if(getOption('ibis.setupmessages')) myLog('[Scenario]','yellow','Thresholding removed all grid cells. Using the previous threshold.')
+          out_thresh <- baseline_threshold
+        } else { baseline_threshold <- out_thresh }
+        # Add to result stack
+        proj_thresh <- raster::addLayer(proj_thresh, out_thresh)
+      }
+      # Add to result stack
+      proj <- raster::addLayer(proj, out)
+      pb$tick()
+    }
+    rm(pb)
+    proj <- raster::setZ(proj, times )
+    if(raster::nlayers(proj_thresh)>1) proj_thresh <- raster::setZ(proj_thresh, times )
+
+    # Apply MigClim and other post-hoc constraints if set
+    # FIXME: Ideally make this whole setup more modular, i.e. create suitability projections first
+    if(!is.Waiver(scenario_constraints)){
+      # Calculate dispersal constraint if set
+      if("dispersal" %in% names(scenario_constraints) ){
+        # MigClim simulations are run post hoc
+        if(scenario_constraints$dispersal$method == 'MigClim'){
+          # Get parameters
+          params <- scenario_constraints$dispersal$params
+
+          pb <- progress::progress_bar$new(total = raster::nlayers(proj))
+          for(lyr in 1:raster::nlayers(proj)){
+            pb$tick()
+            # Normalize the projected suitability rasters to be in range 0-1000 and save
+            hsMap <- predictor_transform(env = proj[[lyr]], option = "norm") * 1000
+            # Write to file in the designated folder
+            suppressWarnings(
+              raster::writeRaster(x = hsMap, filename = paste0( params[["hsMap"]],lyr,".tif"),
+                                  dt = "INT2S", varNA = -9999, prj = TRUE, overwrite = TRUE)
+            )
+            rm(hsMap)
+          };rm(pb)
+          # For threshold, define based on type
+          tr <- ifelse(params[["rcThreshold"]] == "continuous", 0, 750) # Set to 75% as suitability threshold
+
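+          # The 0-1000 rescaling above on a toy vector (illustrative only;
+          # predictor_transform() handles the raster case):
+          # v <- c(2, 5, 9)
+          # round((v - min(v)) / (max(v) - min(v)) * 1000) # 0 429 1000
+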
+          # Now run MigClim by switching to temporary dir
+          dir.ori <- getwd()
+          setwd(params[["dtmp"]])
+          try({
+            m <- MigClim::MigClim.migrate(
+              iniDist = basename(params[["iniDist"]]),
+              hsMap = basename(params[["hsMap"]]),
+              rcThreshold = tr,
+              envChgSteps = raster::nlayers(proj), # Use number of projected suitability layers
+              dispSteps = params[["dispSteps"]],
+              dispKernel = params[["dispKernel"]],
+              barrier = "", # TBD: loaded via another argument
+              barrierType = params[["barrierType"]],
+              iniMatAge = params[["iniMatAge"]],
+              propaguleProd = params[["propaguleProdProb"]],
+              lddFreq = params[["lddFreq"]],
+              lddMinDist = params[["lddMinDist"]], lddMaxDist = params[["lddMaxDist"]],
+              simulName = basename(params[["simulName"]]),
+              replicateNb = params[["replicateNb"]],
+              overWrite = params[["overwrite"]],
+              testMode = FALSE,
+              fullOutput = params[["fullOutput"]],
+              keepTempFiles = params[["keepTempFiles"]]
+            )
+          })
+          # Get average stats
+          run_stats <- utils::read.table(
+            file.path(basename(params[["simulName"]]), paste0(basename(params[["simulName"]]),"_stats.txt")),
+            header = TRUE
+          )
+          run_sums <- utils::read.table(
+            file.path(basename(params[["simulName"]]), paste0(basename(params[["simulName"]]),"_summary.txt")),
+            header = TRUE
+          )
+          # Get MigClim outputs
+          ll <- list.files(basename(params[["simulName"]]),'asc',full.names = TRUE)
+          run_sims <- raster::stack(ll); names(run_sims) <- tools::file_path_sans_ext(basename(ll))
+          # Condense the simulation runs into one modal prediction
+          run_sim <- raster::calc(run_sims, raster::modal)
+          raster::projection(run_sim) <- raster::projection(fit$get_data('prediction'))
+          all(sapply(list.files(getwd(),".tif"), file.remove)) # Cleanup
+          setwd(dir.ori) # Flip back to original directory
+
+          # Wrap all results in a list
+          mc <- list(params = params,
+                     stats = run_stats,
+                     summary = run_sums,
+                     raster = run_sim)
+        }
+      } # End of MigClim processing chain
+    }
+    # If not found, set a waiver
+    if(!exists("mc")) mc <- new_waiver()
+
+    # ---- #
+    assertthat::assert_that(
+      is.Raster(proj), is.Raster(proj_thresh),
+      msg = "Something went wrong with the projection."
+    )
+
+    # Apply boundary constraints if set
+    if("boundary" %in% names(scenario_constraints)){
+      if(!raster::compareRaster(proj, scenario_constraints$boundary$params$layer, stopiffalse = FALSE)){
+        scenario_constraints$boundary$params$layer <- alignRasters(
+          scenario_constraints$boundary$params$layer,
+          proj,
+          method = "ngb", func = raster::modal, cl = FALSE
+        )
+      }
+      proj <- raster::mask(proj, scenario_constraints$boundary$params$layer)
+      # Get background and ensure that all values outside are set to 0
+      proj[is.na(proj)] <- 0
+      proj <- raster::mask(proj, fit$model$background )
+      # Also for thresholds if existing
+      if(raster::nlayers(proj_thresh)>0){
+        proj_thresh <- raster::mask(proj_thresh, scenario_constraints$boundary$params$layer)
+        proj_thresh[is.na(proj_thresh)] <- 0
+        proj_thresh <- raster::mask(proj_thresh, fit$model$background )
+      }
+    }
+
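+    # Majority-vote consensus across replicate runs, as computed for the
+    # MigClim outputs above (illustrative toy example):
+    # s <- raster::stack(replicate(5, raster::raster(matrix(rbinom(100, 1, 0.5), 10, 10))))
+    # consensus <- raster::calc(s, raster::modal)
+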
+    # Should stabilization be applied?
+    if(stabilize){
+      if(getOption('ibis.setupmessages')) myLog('[Scenario]','green','Applying stabilization.')
+      if(stabilize_method == "loess"){
+        # FIXME: Could outsource this code
+        impute.loess <- function(y, x.length = NULL, s = 0.75,
+                                 smooth = TRUE, na.rm, ...) {
+          if (is.null(x.length)) {
+            x.length = length(y)
+          }
+          if(length(y[!is.na(y)]) < 8) {
+            y <- rep(NA, x.length)
+          } else {
+            x <- 1:x.length
+            p <- suppressWarnings( stats::loess(y ~ x, span = s,
+                                                data.frame(x = x, y = y)) )
+            if (smooth == TRUE) {
+              y <- stats::predict(p, x)
+            } else {
+              na.idx <- which(is.na(y))
+              if (length(na.idx) > 1) {
+                y[na.idx] <- stats::predict(p, data.frame(x = na.idx))
+              }
+            }
+          }
+          return(y)
+        }
+        new_proj <- raster::overlay(proj, fun = impute.loess, unstack = TRUE, forcefun = FALSE)
+        # Rename again
+        names(new_proj) <- names(proj)
+        new_proj <- raster::setZ(new_proj, times )
+        proj <- new_proj; rm(new_proj)
+        # Were thresholds calculated? If yes, recalculate on the smoothed estimates
+        if(raster::nlayers(proj_thresh)>0){
+          new_thresh <- proj
+          new_thresh[new_thresh < scenario_threshold[1]] <- 0
+          new_thresh[new_thresh >= scenario_threshold[1]] <- 1
+          names(new_thresh) <- names(proj_thresh)
+          # Replace the threshold stack with the recalculated one
+          proj_thresh <- new_thresh; rm(new_thresh)
+        }
+      }
+    }
+
+    # Finally convert to stars and rename
+    proj <- stars::st_as_stars(proj,
+                               crs = sf::st_crs(new_crs)
+    ); names(proj) <- 'suitability'
+
+    if(raster::nlayers(proj_thresh)>0){
+      # Add the thresholded maps as well
+      proj_thresh <- stars::st_as_stars(proj_thresh,
+                                        crs = sf::st_crs(new_crs)
+      ); names(proj_thresh) <- 'threshold'
+      # Correct band if different
+      if(all(!stars::st_get_dimension_values(proj, 3) != stars::st_get_dimension_values(proj_thresh, 3 ))){
+        proj_thresh <- stars::st_set_dimensions(proj_thresh, 3, values = stars::st_get_dimension_values(proj, 3))
+      }
+      proj <- stars:::c.stars(proj, proj_thresh)
+    }
+
+    # Return output by adding it to the scenario object
+    bdproto(NULL, mod,
+            scenarios = proj,
+            scenarios_migclim = mc
+    )
+  }
+)
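+# A compact illustration of the per-pixel loess smoothing used by the
+# stabilization step above, on a toy time series (sketch only; mirrors the
+# in-function impute.loess() logic):
+# y <- c(0.2, 0.8, 0.3, 0.35, 0.4, 0.9, 0.45, 0.5)
+# x <- seq_along(y)
+# y_smooth <- stats::predict(stats::loess(y ~ x, span = 0.75), x)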
diff --git a/R/pseudoabsence.R b/R/pseudoabsence.R
index 0616cdbb..76f17411 100644
--- a/R/pseudoabsence.R
+++ b/R/pseudoabsence.R
@@ -52,7 +52,8 @@ NULL
 #' @param ... Any other settings to be added to the pseudoabs settings.
 #' @examples
 #' \dontrun{
-#' # This setting generates 10000 pseudo-absence points outside the minimum convex polygon of presence points
+#' # This setting generates 10000 pseudo-absence points outside the minimum convex polygon
+#' # of presence points
 #' ass1 <- pseudoabs_settings(nrpoints = 10000, method = 'mcp', inside = FALSE)
 #'
 #' # This setting would match the number of presence-absence points directly.
@@ -64,8 +65,10 @@ NULL
 #' template = background, settings = ass1)
 #' }
 #' @references
-#' * Renner IW, Elith J, Baddeley A, Fithian W, Hastie T, Phillips SJ, Popovic G, Warton DI. 2015. Point process models for presence-only analysis. Methods in Ecology and Evolution 6:366–379. DOI: 10.1111/2041-210X.12352.
-#' * Renner, I. W., & Warton, D. I. (2013). Equivalence of MAXENT and Poisson point process models for species distribution modeling in ecology. Biometrics, 69(1), 274-281.
+#' * Renner IW, Elith J, Baddeley A, Fithian W, Hastie T, Phillips SJ, Popovic G, Warton DI. 2015. Point process
+#' models for presence-only analysis. Methods in Ecology and Evolution 6:366–379. DOI: 10.1111/2041-210X.12352.
+#' * Renner, I. W., & Warton, D. I. (2013). Equivalence of MAXENT and Poisson point
+#' process models for species distribution modeling in ecology. Biometrics, 69(1), 274-281.
 #' @name pseudoabs_settings
 #' @aliases pseudoabs_settings
 #' @keywords train
@@ -148,8 +151,11 @@
 #' then \code{background} in the [`pseudoabs_settings()`] has to be a [`RasterLayer`] object.
 #' @param settings A [`pseudoabs_settings()`] object. Absence settings are taken from [ibis_options] otherwise (Default).
 #' @references
-#' * Stolar, J., & Nielsen, S. E. (2015). Accounting for spatially biased sampling effort in presence‐only species distribution modelling. Diversity and Distributions, 21(5), 595-608.
-#' * Bird, T.J., Bates, A.E., Lefcheck, J.S., Hill, N.A., Thomson, R.J., Edgar, G.J., Stuart-Smith, R.D., Wotherspoon, S., Krkosek, M., Stuart-Smith, J.F. and Pecl, G.T., 2014. Statistical solutions for error and bias in global citizen science datasets. Biological Conservation, 173, pp.144-154.
+#' * Stolar, J., & Nielsen, S. E. (2015). Accounting for spatially biased sampling effort in
+#' presence‐only species distribution modelling. Diversity and Distributions, 21(5), 595-608.
+#' * Bird, T.J., Bates, A.E., Lefcheck, J.S., Hill, N.A., Thomson, R.J., Edgar, G.J., Stuart-Smith, R.D.,
+#' Wotherspoon, S., Krkosek, M., Stuart-Smith, J.F. and Pecl, G.T., 2014. Statistical solutions
+#' for error and bias in global citizen science datasets. Biological Conservation, 173, pp.144-154.
 #' @keywords train
 #' @returns A [`data.frame`] containing the newly created pseudo absence points.
 #' @export
@@ -227,7 +233,7 @@ add_pseudoabsence <- function(df, field_occurrence = "observed", template = NULL
   } else { bias <- NULL }
   # Rasterize the presence estimates
-  bg1 <- raster::rasterize(df[,c('x','y')] %>% sf::st_drop_geometry(),
+  bg1 <- raster::rasterize(df[,c('x','y')] |> sf::st_drop_geometry(),
                            background, fun = 'count', background = 0)
   bg1 <- raster::mask(bg1, background)
diff --git a/R/similarity.R b/R/similarity.R
index ac356d6b..e0c14ad0 100644
--- a/R/similarity.R
+++ b/R/similarity.R
@@ -1,440 +1,440 @@
-#' @include utils.R utils-spatial.R bdproto-biodiversitydistribution.R bdproto-distributionmodel.R
-NULL
-
-#' Calculate environmental similarity of reference datasets to predictors.
-#'
-#' @description
-#' Calculate the environmental similarity of the provided covariates
-#' with respect to a reference dataset.
-#' Currently supported is Multivariate Environmental Similarity index
-#' and the multivariate combination novelty index (NT2) based on the Mahalanobis divergence (see references).
-#'
-#' @param obj A [`BiodiversityDistribution`], [`DistributionModel`] or alternatively a [`Raster`] object.
-#' @param ref A [`BiodiversityDistribution`], [`DistributionModel`] or alternatively
-#' a [`data.frame`] with extracted values (corresponding to those given in `obj`).
-#' @param ref_type A [`character`] specifying the type of biodiversity to use when obj is a [`BiodiversityDistribution`].
-#' @param method A specifc method for similarity calculation. Currently supported: \code{'mess'}, \code{'nt'}.
-#' @param full should similarity values be returned for all variables (Default: \code{FALSE})?
-#' @param plot Should the result be plotted? Otherwise return the output list (Default: \code{TRUE}).
-#' @param ... other options (Non specified).
-#' @return
-#' This function returns a list containing:
-#' * `similarity`: a `RasterStack` giving the environmental similarities for
-#' each variable in `x` (only included when `full=TRUE`);
-#' * `mis`: a `Raster` layer giving the minimum similarity value
-#' across all variables for each location (i.e. the MESS);
-#' * `exip`: a `Raster` layer indicating whether any model would interpolate
-#' or extrapolate to this location based on environmental surface;
-#' * `mod`: a factor `Raster` layer indicating which variable was most
-#' dissimilar to its reference range (i.e.
the MoD map, Elith et al. 2010); -#' and -#' * `mos`: a factor `Raster` layer indicating which variable was most -#' similar to its reference range. -#' -#' @details [`similarity`] implements the MESS algorithm described in Appendix S3 -#' of Elith et al. (2010) as well as the Mahalanobis dissimilarity described in Mesgaran et al. (2014) -#' @keywords mess, mahalanobis, similarity, environment -#' @references -#' * Elith, J., Kearney, M., and Phillips, S. (2010) "The art of modelling range-shifting species" https://doi.org/10.1111/j.2041-210X.2010.00036.x _Methods in Ecology and Evolution_, 1: 330-342 -#' * Mesgaran, M.B., Cousens, R.D. and Webber, B.L. (2014) "Here be dragons: a tool for quantifying novelty due to covariate range and correlation change when projecting species distribution models" https://doi.org/10.1111/ddi.12209 _Diversity and Distributions_, 20: 1147-1159. -#' @importFrom raster stack nlayers init as.data.frame raster -#' @importFrom methods is -#' @importFrom stats na.omit -#' @seealso [dismo] R-package -#' @name similarity -#' @export -#' @examples -#' \dontrun{ -#' plot( -#' similarity(x) # Where x is a distribution or Raster object -#' ) -#' } -NULL - -#' @name similarity -#' @rdname similarity -#' @exportMethod similarity -#' @export -methods::setGeneric( - "similarity", - signature = methods::signature("obj"), - function(obj, ref, ref_type = 'poipo', method = 'mess', predictor_names = NULL, full = FALSE, plot = TRUE, ...) standardGeneric("similarity")) - -#' Similarity of used predictors from a trained distribution model -#' @name similarity -#' @rdname similarity -#' @usage \S4method{similarity}{BiodiversityDistribution}(obj) -methods::setMethod( - "similarity", - methods::signature(obj = "BiodiversityDistribution"), - function(obj, ref_type = 'poipo', method = 'mess', predictor_names = NULL, full = FALSE, plot = TRUE, ...) 
{ - assertthat::assert_that(inherits(obj, "BiodiversityDistribution"), - is.character(ref_type), - ref_type %in% c('poipo','poipa','polpo','polpa'), - is.character(method), - is.null(predictor_names) || is.character(predictor_names) - ) - # Check that data and predictors are there - assertthat::assert_that(!is.Waiver(obj$biodiversity),!is.Waiver(obj$predictors)) - assertthat::assert_that(is.null(predictor_names) || all(predictor_names %in% obj$get_predictor_names()) ) - # Match to correct spelling mistakes - method <- match.arg(tolower(method), c('mess','nt'), several.ok = FALSE) - - # Get biodiversity data - bid <- obj$biodiversity$get_id_byType(ref_type) - - # Get covariates - covs <- obj$predictors$get_data() - # Extract covariates for reference data - ref <- get_rastervalue(coords = obj$biodiversity$get_coordinates(names(bid)), - env = covs, - rm.na = FALSE) - ref <- ref[,names(covs)] - # ref <- get_ngbvalue( - # coords = obj$biodiversity$get_coordinates(names(bid)), - # env = obj$predictors$get_data(df = TRUE), - # field_space = c('x','y'), - # longlat = raster::isLonLat(covs) - # ) - # Subset if necessary - if(!is.null(predictor_names)){ covs <- covs[[predictor_names]]; ref <- ref[,predictor_names]} - - assertthat::assert_that(nlayers(covs) == ncol(ref)) - - # Run mess function - if(method == 'mess'){ - out <- .mess(covs = covs, - ref = ref, - full = full) - # Calculate interpolation/extrapolated - rip <- raster::cut(out$mis,c(cellStats(out$mis,'min'),0,cellStats(out$mis,'max')),ordered_result = TRUE) - rip <- raster::ratify(rip) - levels(rip) <- data.frame(ID = levels(rip)[[1]], - what = c('Extrapolation','Interpolation')) - out$exip <- rip;rm(rip) - - } else if(method == 'nt') { - out <- .nt12(prodat = covs, - refdat = ref) - } - - # If plot is specified, make figures. Otherwise return the list of rasters - if(plot){ - if(method == 'mess'){ - par.ori <- par(no.readonly = TRUE) - par(mfrow=c(2,2)) - raster::plot(out$mis,col = ibis_colours[['viridis_plasma']],main = paste0('Similarity surface (method: ',method,')')) - raster::plot(out$exip,col = ibis_colours[['distinct_random']][1:2],main = paste0('Extrapolated vs interpolated conditions')) - raster::plot(out$mod,col = ibis_colours[['distinct_random']][1:length(unique(out$mod))], main = paste0('Most dissimilar from reference')) - raster::plot(out$mos,col = ibis_colours[['distinct_random']][length(ibis_colours[['distinct_random']]):(length(ibis_colours[['distinct_random']])-length(unique(out$mos)))], main = paste0('Most similar to reference')) - par(par.ori) - } else if(method == 'nt'){ - par.ori <- par(no.readonly = TRUE) - par(mfrow=c(1,3)) - raster::plot(out$NT1,col = ibis_colours[['viridis_plasma']],main = paste0('Univariate extrapolation')) - raster::plot(out$NT2,col = ibis_colours[['viridis_orig']],main = paste0('Non-analogous dissimilarity')) - raster::plot(out$novel,col = ibis_colours[['distinct_random']][1:3],main = paste0('Novel conditions (method: ',method,')')) - # FIXME: add categorical legend left to it - par(par.ori) - } - } else { - return( out ) - } - } -) - -#' Similarity of used predictors by providing a RasterBrick directly -#' @name similarity -#' @rdname similarity -#' @usage \S4method{similarity}{RasterBrick}(obj) -methods::setMethod( - "similarity", - methods::signature(obj = "RasterBrick"), - function(obj, ref, method = 'mess', full = FALSE, plot = TRUE, ...) 
{ - assertthat::assert_that(!missing(ref),msg = 'Provide a sf object of reference sites') - assertthat::assert_that(inherits(obj, "RasterBrick"), - inherits(ref, 'sf'), - is.character(method) - ) - # Convert RasterBrick to stack - obj <- raster::stack(obj) - # Now recall - similarity(obj = obj,ref = ref, method = method, - full = full,plot = plot, ...) - } -) - -#' Similarity of used predictors by providing a RasterStack directly -#' @name similarity -#' @rdname similarity -#' @usage \S4method{similarity}{RasterStack}(obj) -methods::setMethod( - "similarity", - methods::signature(obj = "RasterStack"), - function(obj, ref, method = 'mess', full = FALSE, plot = TRUE, ...) { - assertthat::assert_that(!missing(ref),msg = 'Provide a sf object of reference sites') - assertthat::assert_that(inherits(obj, "RasterStack"), - raster::nlayers(obj)>=1, - inherits(ref, 'sf'), - is.character(method) - ) - # Check that points are of same projection as raster - assertthat::assert_that(sf::st_crs(obj) == sf::st_crs(ref)) - - # Match to correct spelling mistakes - method <- match.arg(tolower(method), c('mess','nt2'), several.ok = FALSE) - - # Extract values for each provided value - ex <- raster::extract(x = obj, - y = ref, - df = TRUE) - # Subset to variables in obj and remove missing rows - ex <- subset.data.frame(ex, select = names(obj)) - ex <- subset.data.frame(ex, complete.cases(ex)) - - if(method == 'mess'){ - out <- .mess(covs = obj, - ref = ex, - full = full) - # Calculate interpolation/extrapolated - rip <- raster::cut(out$mis,c(cellStats(out$mis,'min'),0,cellStats(out$mis,'max')),ordered_result = TRUE) - rip <- raster::ratify(rip) - levels(rip) <- data.frame(ID = levels(rip)[[1]], - what = c('Extrapolation','Interpolation')) - out$exip <- rip;rm(rip) - - } else { - stop('Not yet implemented!') - } - - # If plot is specified, make figures. Otherwise return the list of rasters - if(plot){ - if(method == 'mess'){ - par.ori <- par(no.readonly = TRUE) - par(mfrow=c(2,2)) - raster::plot(out$mis,col = ibis_colours[['viridis_plasma']],main = paste0('Similarity surface (method: ',method,')')) - raster::plot(out$exip,col = ibis_colours[['distinct_random']][1:2],main = paste0('Extrapolated vs interpolated conditions')) - raster::plot(out$mod,col = ibis_colours[['distinct_random']][1:length(unique(out$mod))], main = paste0('Most dissimilar from reference')) - raster::plot(out$mos,col = ibis_colours[['distinct_random']][length(ibis_colours[['distinct_random']]):(length(ibis_colours[['distinct_random']])-length(unique(out$mos)))], main = paste0('Most similar to reference')) - par(par.ori) - } else if(method == 'nt'){ - par.ori <- par(no.readonly = TRUE) - par(mfrow=c(1,3)) - raster::plot(out$NT1,col = ibis_colours[['viridis_plasma']],main = paste0('Univariate extrapolation')) - raster::plot(out$NT2,col = ibis_colours[['viridis_orig']],main = paste0('Non-analogous dissimilarity')) - raster::plot(out$novel,col = ibis_colours[['distinct_random']][1:3],main = paste0('Novel conditions (method: ',method,')')) - par(par.ori) - } - } else { - return( out ) - } - } -) - -#' Function to calculate the multivariate combination novelty index (NT2) -#' -#' @description -#' NT1 ranges from infinite negative values to zero where zero indicates no -#' extrapolation beyond the univariate coverage of reference data" -#' (Mesgaran et al. 2014). -#' -#' "NT2 can range from zero up to unbounded positive values. 
NT2 values -#' ranging from zero to one indicate similarity (in terms of both univariate -#' range and multivariate combination), with values closer to zero being more -#' similar. Values larger than one are indicative of novel combinations" -#' (Mesgaran et al. 2014). -#' -#' @param prodat A [`RasterStack`]. The projected values. The layer names must -#' match the column names of \code{refdat}. -#' @param refdat A numerical [`matrix`] or [`data.frame`]. The reference values of variables organized -#' in columns. -#' -#' @references -#' * Mesgaran, M. B., R. D. Cousens, B. L. Webber, and J. Franklin. -#' 2014. Here be dragons: a tool for quantifying novelty due to covariate -#' range and correlation change when projecting species distribution models. -#' Diversity and Distributions 20:1147-1159. -#' @section Notes: The code is adapted from Bell & Schlaepfer 2015 (available -#' at \url{https://github.com/bellland/SDM.Virtual.Species_Bell.Schlaepfer}) -#' which was based on a comment by Matthew Bayly made at -#' \url{https://pvanb.wordpress.com/2014/05/13/a-new-method-and-tool-exdet-to-evaluate-novelty-environmental-conditions/}. -#' @noRd -#' @keywords internal -.nt12 <- function(prodat, refdat){ - stopifnot(requireNamespace("matrixStats")) - # Input checks - assertthat::assert_that(is.Raster(prodat), - nlayers(prodat) == ncol(refdat)) - # Make a background layer for filling - bg <- emptyraster(prodat) - # Now convert both to matrix - prodat <- raster::as.matrix(prodat) - refdat <- as.matrix(refdat) - # Further checks - assertthat::assert_that(identical(colnames(refdat), colnames(prodat)), - is.data.frame(prodat) || is.matrix(prodat), - is.data.frame(prodat) || is.matrix(prodat)) - - # First calculate univariate novelty, e.g. NT1 - # Get ranges of variables and multiply - range_ref <- t(matrixStats::colRanges(as.matrix(refdat), na.rm = TRUE)) - diffs_ref <- matrixStats::colDiffs(range_ref) - # Remove those with 0 range, e.g. singular values for the reference - range_ref <- range_ref[,which(diffs_ref!=0)] - refdat <- refdat[,which(diffs_ref!=0)] - prodat <- prodat[,which(diffs_ref!=0)] - diffs_ref <- diffs_ref[,which(diffs_ref!=0)] - - range_ref_arr <- array(range_ref, dim = c(dim(range_ref), nrow(prodat)), - dimnames = list(c("min", "max"), colnames(refdat), NULL)) - - diffs_ref_arr <- matrix(diffs_ref, nrow = nrow(prodat), ncol = ncol(prodat), - byrow = TRUE) - - # Make empty raster with 3 dimensions - iud <- array(0, dim = c(dim(prodat), 3)) - iud[ , , 2] <- prodat - t(range_ref_arr["min", ,]) - iud[ , , 3] <- t(range_ref_arr["max", ,]) - prodat - - # Univariate novelty - # NT1 ranges from infinite negative values to zero where zero indicates no extrapolation - # beyond the univariate coverage of reference data. - UDs <- apply(iud, 1:2, min) / diffs_ref_arr - nt1 <- emptyraster(bg) - nt1[] <- rowSums(UDs) - - # --- # - # Multivariate combination novelty index (NT2) - # Calculate the center of reference data: average and covariance matrix - ref_av <- colMeans(refdat, na.rm = TRUE) - ref_cov <- stats::var(refdat, na.rm = TRUE) - - # Mahalanobis distance of reference data to center of reference data - mah_ref <- stats::mahalanobis(x = refdat, center = ref_av, cov = ref_cov,tol=1e-20) - # Mahalanobis distance of projected data to center of reference data - mah_pro <- stats::mahalanobis(x = prodat, center = ref_av, cov = ref_cov,tol=1e-20) - # Correction when mah_pro is negative (numerical instability. 
Correct to 0) - if(min(mah_pro,na.rm = T) < 0) mah_pro[which(!is.na(mah_pro) & mah_pro < 0)] <- 0 - - # Ratio - mah_max <- max(mah_ref[is.finite(mah_ref)]) - nt2 <- emptyraster(bg) - nt2[] <- (mah_pro / mah_max) - - # Calculate most dissimilar value (MOD) - # FIXME: Implement when have time https://onlinelibrary.wiley.com/doi/full/10.1111/ddi.12209 - # mod <- emptyraster(bg) - # For any point, the MIC is the covariate which produces the highest ICp value. - # icp <- 100 * (mah_ref - icp) - # matrixStats::rowMaxs(icp) - - # --- # - # Calculate areas outside the univariate range of combinations and non-analogous novel combinations - nt_novel <- emptyraster(bg) - # First areas areas in the projection space with at least one covariate outside the univariate range of reference data - or1 <- c(raster::cellStats(nt1,'min'), 0) - o_low <- raster::cut(nt1, or1, include.lowest=T) - # Next areas with NT2 ranging from 0 to 1 that are similar to the reference data - o_mid <- nt2 %in% c(0,1) - # non-analogous covariate combinations - o_high <- nt2 > 1 - nt_novel[o_low == 1] <- 1 - nt_novel[o_mid == 1] <- 2 - nt_novel[o_high == 1] <- 3 - nt_novel <- raster::ratify(nt_novel) - levels(nt_novel) <- data.frame(ID = levels(nt_novel)[[1]], - what = c('Outside reference','Within reference','Novel combinations')) - - # Create output stack - out <- raster::stack(nt1,nt2,nt_novel, - quick = TRUE) - names(out) <- c('NT1','NT2','novel') - return(out) -} - -#' Function to calculate Multivariate Environmental Similarity index -#' @noRd -#' @keywords internal -.mess <- function(covs, ref, full=FALSE) { - # Convert to data.frame - if(!methods::is(ref, 'data.frame')) { - ref <- as.data.frame(ref) - } - # Make dummy template rasters - if(is(covs, 'Raster')) { - r <- TRUE - if(isTRUE(full)) { - out <- raster::stack(replicate( - raster::nlayers(covs), raster::init(covs, function(x) NA))) - } else { - out <- raster::init(covs, function(x) NA) - } - } else r <- FALSE - ref <- stats::na.omit(ref) # Remove NAs - if(!methods::is(covs, 'data.frame')) { - covs <- as.data.frame(covs) - } - # Calculate dimensions (range) - if(is.null(dim(ref))) { - rng <- as.data.frame(range(ref, na.rm=TRUE)) - } else { - rng <- as.data.frame(apply(ref, 2, range, na.rm=TRUE)) - } - # remove variables where max-min is 0 - rng <- rng[which(apply(rng, 2, diff)>0)] - covs <- covs[,names(rng)] - ref <- ref[,names(rng)] - - # Find intervals within ranges - pct_less <- mapply(function(x, ref) { - findInterval(x, sort(ref))/length(ref) - }, covs, ref, SIMPLIFY=FALSE) - # Calculate similarity surface - sim <- mapply(function(f, rng, p) { - ifelse(f==0, (p-rng[1])/diff(rng)*100, - ifelse(f > 0 & f <= 0.5, f*200, - ifelse(f > 0.5 & f < 1, (1-f)*200, - (rng[2]-p)/diff(rng)*100))) - }, pct_less, rng, covs) - - min_sim <- if(is.matrix(sim)) apply(sim, 1, min) else(min(sim)) - - # Get minimum similarity and most (dis)similiar values - mins <- apply(sim, 1, which.min) - most_dissimilar_vec <- unlist(ifelse(lengths(mins)==0, NA, mins)) - maxs <- apply(sim, 1, which.max) - most_similar_vec <- unlist(ifelse(lengths(maxs)==0, NA, maxs)) - - if(isTRUE(r)) { - # Calculate most dissimilar surface - most_dissimilar <- raster::raster(out) - most_dissimilar[] <- most_dissimilar_vec - most_dissimilar <- as.factor(most_dissimilar) - rat <- levels(most_dissimilar)[[1]] - rat$varname <- colnames(sim)[rat$ID] - levels(most_dissimilar) <- rat - - # Calculate most similar surface - most_similar <- raster::raster(out) - most_similar[] <- most_similar_vec - most_similar <- 
as.factor(most_similar)
-    rat <- levels(most_similar)[[1]]
-    rat$varname <- colnames(sim)[rat$ID]
-    levels(most_similar) <- rat
-
-    # Fill template rasters
-    out_min <- raster::raster(out)
-    out_min[] <- min_sim
-    if(isTRUE(full)) {
-      out[] <- sim
-      list(similarity=out, mis=out_min, mod=most_dissimilar,
-           mos=most_similar)
-    } else list(mis=out_min, mod=most_dissimilar, mos=most_similar)
-  } else {
-    if(isTRUE(full)) {
-      list(similarity=sim, mis=min_sim,
-           mod=most_dissimilar_vec, mos=most_similar_vec)
-    } else list(mis=min_sim, mod=most_dissimilar_vec,
-                mos=most_similar_vec)
-  }
-}
+#' @include utils.R utils-spatial.R bdproto-biodiversitydistribution.R bdproto-distributionmodel.R
+NULL
+
+#' Calculate environmental similarity of reference datasets to predictors.
+#'
+#' @description
+#' Calculate the environmental similarity of the provided covariates
+#' with respect to a reference dataset.
+#' Currently supported are the Multivariate Environmental Similarity index (MESS)
+#' and the multivariate combination novelty index (NT2) based on the Mahalanobis divergence (see references).
+#'
+#' @param obj A [`BiodiversityDistribution`], [`DistributionModel`] or alternatively a [`Raster`] object.
+#' @param ref A [`BiodiversityDistribution`], [`DistributionModel`] or alternatively
+#' a [`data.frame`] with extracted values (corresponding to those given in `obj`).
+#' @param ref_type A [`character`] specifying the type of biodiversity to use when obj is a [`BiodiversityDistribution`].
+#' @param method A specific method for similarity calculation. Currently supported: \code{'mess'}, \code{'nt'}.
+#' @param predictor_names An optional [`character`] vector specifying the predictors to be used (Default: \code{NULL} for all).
+#' @param full Should similarity values be returned for all variables (Default: \code{FALSE})?
+#' @param plot Should the result be plotted? Otherwise return the output list (Default: \code{TRUE}).
+#' @param ... Other options (not specified).
+#' @return
+#' This function returns a list containing:
+#' * `similarity`: a `RasterStack` giving the environmental similarities for
+#' each variable in `x` (only included when `full=TRUE`);
+#' * `mis`: a `Raster` layer giving the minimum similarity value
+#' across all variables for each location (i.e. the MESS);
+#' * `exip`: a `Raster` layer indicating whether any model would interpolate
+#' or extrapolate to this location based on environmental surface;
+#' * `mod`: a factor `Raster` layer indicating which variable was most
+#' dissimilar to its reference range (i.e. the MoD map, Elith et al. 2010);
+#' and
+#' * `mos`: a factor `Raster` layer indicating which variable was most
+#' similar to its reference range.
+#'
+#' @details [`similarity`] implements the MESS algorithm described in Appendix S3
+#' of Elith et al. (2010) as well as the Mahalanobis dissimilarity described in Mesgaran et al. (2014).
+#' @keywords mess, mahalanobis, similarity, environment
+#' @references
+#' * Elith, J., Kearney, M., and Phillips, S. (2010) "The art of modelling range-shifting
+#' species" https://doi.org/10.1111/j.2041-210X.2010.00036.x _Methods in Ecology and Evolution_, 1: 330-342
+#' * Mesgaran, M.B., Cousens, R.D. and Webber, B.L. (2014) "Here be dragons: a tool
+#' for quantifying novelty due to covariate range and correlation change when projecting
+#' species distribution models" https://doi.org/10.1111/ddi.12209 _Diversity and Distributions_, 20: 1147-1159.
+#' @seealso [dismo] R-package +#' @name similarity +#' @export +#' @examples +#' \dontrun{ +#' plot( +#' similarity(x) # Where x is a distribution or Raster object +#' ) +#' } +NULL + +#' @name similarity +#' @rdname similarity +#' @exportMethod similarity +#' @export +methods::setGeneric( + "similarity", + signature = methods::signature("obj"), + function(obj, ref, ref_type = 'poipo', method = 'mess', predictor_names = NULL, full = FALSE, plot = TRUE, ...) standardGeneric("similarity")) + +#' Similarity of used predictors from a trained distribution model +#' @name similarity +#' @rdname similarity +#' @usage \S4method{similarity}{BiodiversityDistribution}(obj) +methods::setMethod( + "similarity", + methods::signature(obj = "BiodiversityDistribution"), + function(obj, ref_type = 'poipo', method = 'mess', predictor_names = NULL, full = FALSE, plot = TRUE, ...) { + assertthat::assert_that(inherits(obj, "BiodiversityDistribution"), + is.character(ref_type), + ref_type %in% c('poipo','poipa','polpo','polpa'), + is.character(method), + is.null(predictor_names) || is.character(predictor_names) + ) + # Check that data and predictors are there + assertthat::assert_that(!is.Waiver(obj$biodiversity),!is.Waiver(obj$predictors)) + assertthat::assert_that(is.null(predictor_names) || all(predictor_names %in% obj$get_predictor_names()) ) + # Match to correct spelling mistakes + method <- match.arg(tolower(method), c('mess','nt'), several.ok = FALSE) + + # Get biodiversity data + bid <- obj$biodiversity$get_id_byType(ref_type) + + # Get covariates + covs <- obj$predictors$get_data() + # Extract covariates for reference data + ref <- get_rastervalue(coords = obj$biodiversity$get_coordinates(names(bid)), + env = covs, + rm.na = FALSE) + ref <- ref[,names(covs)] + # ref <- get_ngbvalue( + # coords = obj$biodiversity$get_coordinates(names(bid)), + # env = obj$predictors$get_data(df = TRUE), + # field_space = c('x','y'), + # longlat = raster::isLonLat(covs) + # ) + # Subset if necessary + if(!is.null(predictor_names)){ covs <- covs[[predictor_names]]; ref <- ref[,predictor_names]} + + assertthat::assert_that(nlayers(covs) == ncol(ref)) + + # Run mess function + if(method == 'mess'){ + out <- .mess(covs = covs, + ref = ref, + full = full) + # Calculate interpolation/extrapolated + rip <- raster::cut(out$mis,c(cellStats(out$mis,'min'),0,cellStats(out$mis,'max')),ordered_result = TRUE) + rip <- raster::ratify(rip) + levels(rip) <- data.frame(ID = levels(rip)[[1]], + what = c('Extrapolation','Interpolation')) + out$exip <- rip;rm(rip) + + } else if(method == 'nt') { + out <- .nt12(prodat = covs, + refdat = ref) + } + + # If plot is specified, make figures. 
Otherwise return the list of rasters + if(plot){ + if(method == 'mess'){ + par.ori <- graphics::par(no.readonly = TRUE) + graphics::par(mfrow=c(2,2)) + raster::plot(out$mis,col = ibis_colours[['viridis_plasma']],main = paste0('Similarity surface (method: ',method,')')) + raster::plot(out$exip,col = ibis_colours[['distinct_random']][1:2],main = paste0('Extrapolated vs interpolated conditions')) + raster::plot(out$mod,col = ibis_colours[['distinct_random']][1:length(unique(out$mod))], main = paste0('Most dissimilar from reference')) + raster::plot(out$mos,col = ibis_colours[['distinct_random']][length(ibis_colours[['distinct_random']]):(length(ibis_colours[['distinct_random']])-length(unique(out$mos)))], main = paste0('Most similar to reference')) + graphics::par(par.ori) + } else if(method == 'nt'){ + par.ori <- graphics::par(no.readonly = TRUE) + graphics::par(mfrow=c(1,3)) + raster::plot(out$NT1,col = ibis_colours[['viridis_plasma']],main = paste0('Univariate extrapolation')) + raster::plot(out$NT2,col = ibis_colours[['viridis_orig']],main = paste0('Non-analogous dissimilarity')) + raster::plot(out$novel,col = ibis_colours[['distinct_random']][1:3],main = paste0('Novel conditions (method: ',method,')')) + # FIXME: add categorical legend left to it + graphics::par(par.ori) + } + } else { + return( out ) + } + } +) + +#' Similarity of used predictors by providing a RasterBrick directly +#' @name similarity +#' @rdname similarity +#' @usage \S4method{similarity}{RasterBrick}(obj) +methods::setMethod( + "similarity", + methods::signature(obj = "RasterBrick"), + function(obj, ref, method = 'mess', full = FALSE, plot = TRUE, ...) { + assertthat::assert_that(!missing(ref),msg = 'Provide a sf object of reference sites') + assertthat::assert_that(inherits(obj, "RasterBrick"), + inherits(ref, 'sf'), + is.character(method) + ) + # Convert RasterBrick to stack + obj <- raster::stack(obj) + # Now recall + similarity(obj = obj,ref = ref, method = method, + full = full,plot = plot, ...) + } +) + +#' Similarity of used predictors by providing a RasterStack directly +#' @name similarity +#' @rdname similarity +#' @usage \S4method{similarity}{RasterStack}(obj) +methods::setMethod( + "similarity", + methods::signature(obj = "RasterStack"), + function(obj, ref, method = 'mess', full = FALSE, plot = TRUE, ...) { + assertthat::assert_that(!missing(ref),msg = 'Provide a sf object of reference sites') + assertthat::assert_that(inherits(obj, "RasterStack"), + raster::nlayers(obj)>=1, + inherits(ref, 'sf'), + is.character(method) + ) + # Check that points are of same projection as raster + assertthat::assert_that(sf::st_crs(obj) == sf::st_crs(ref)) + + # Match to correct spelling mistakes + method <- match.arg(tolower(method), c('mess','nt2'), several.ok = FALSE) + + # Extract values for each provided value + ex <- raster::extract(x = obj, + y = ref, + df = TRUE) + # Subset to variables in obj and remove missing rows + ex <- subset.data.frame(ex, select = names(obj)) + ex <- subset.data.frame(ex, stats::complete.cases(ex)) + + if(method == 'mess'){ + out <- .mess(covs = obj, + ref = ex, + full = full) + # Calculate interpolation/extrapolated + rip <- raster::cut(out$mis,c(cellStats(out$mis,'min'),0,cellStats(out$mis,'max')),ordered_result = TRUE) + rip <- raster::ratify(rip) + levels(rip) <- data.frame(ID = levels(rip)[[1]], + what = c('Extrapolation','Interpolation')) + out$exip <- rip;rm(rip) + + } else { + stop('Not yet implemented!') + } + + # If plot is specified, make figures. 
Otherwise return the list of rasters + if(plot){ + if(method == 'mess'){ + par.ori <- graphics::par(no.readonly = TRUE) + graphics::par(mfrow=c(2,2)) + raster::plot(out$mis,col = ibis_colours[['viridis_plasma']],main = paste0('Similarity surface (method: ',method,')')) + raster::plot(out$exip,col = ibis_colours[['distinct_random']][1:2],main = paste0('Extrapolated vs interpolated conditions')) + raster::plot(out$mod,col = ibis_colours[['distinct_random']][1:length(unique(out$mod))], main = paste0('Most dissimilar from reference')) + raster::plot(out$mos,col = ibis_colours[['distinct_random']][length(ibis_colours[['distinct_random']]):(length(ibis_colours[['distinct_random']])-length(unique(out$mos)))], main = paste0('Most similar to reference')) + graphics::par(par.ori) + } else if(method == 'nt'){ + par.ori <- graphics::par(no.readonly = TRUE) + graphics::par(mfrow=c(1,3)) + raster::plot(out$NT1,col = ibis_colours[['viridis_plasma']],main = paste0('Univariate extrapolation')) + raster::plot(out$NT2,col = ibis_colours[['viridis_orig']],main = paste0('Non-analogous dissimilarity')) + raster::plot(out$novel,col = ibis_colours[['distinct_random']][1:3],main = paste0('Novel conditions (method: ',method,')')) + graphics::par(par.ori) + } + } else { + return( out ) + } + } +) + +#' Function to calculate the multivariate combination novelty index (NT2) +#' +#' @description +#' NT1 ranges from infinite negative values to zero where zero indicates no +#' extrapolation beyond the univariate coverage of reference data" +#' (Mesgaran et al. 2014). +#' +#' "NT2 can range from zero up to unbounded positive values. NT2 values +#' ranging from zero to one indicate similarity (in terms of both univariate +#' range and multivariate combination), with values closer to zero being more +#' similar. Values larger than one are indicative of novel combinations" +#' (Mesgaran et al. 2014). +#' +#' @param prodat A [`RasterStack`]. The projected values. The layer names must +#' match the column names of \code{refdat}. +#' @param refdat A numerical [`matrix`] or [`data.frame`]. The reference values of variables organized +#' in columns. +#' +#' @references +#' * Mesgaran, M. B., R. D. Cousens, B. L. Webber, and J. Franklin. +#' 2014. Here be dragons: a tool for quantifying novelty due to covariate +#' range and correlation change when projecting species distribution models. +#' Diversity and Distributions 20:1147-1159. +#' @section Notes: The code is adapted from Bell & Schlaepfer 2015 (available +#' at \url{https://github.com/bellland/SDM.Virtual.Species_Bell.Schlaepfer}) +#' which was based on a comment by Matthew Bayly made at +#' \url{https://pvanb.wordpress.com/2014/05/13/a-new-method-and-tool-exdet-to-evaluate-novelty-environmental-conditions/}. +#' @noRd +#' @keywords internal +.nt12 <- function(prodat, refdat){ + stopifnot(requireNamespace("matrixStats")) + # Input checks + assertthat::assert_that(is.Raster(prodat), + nlayers(prodat) == ncol(refdat)) + # Make a background layer for filling + bg <- emptyraster(prodat) + # Now convert both to matrix + prodat <- raster::as.matrix(prodat) + refdat <- as.matrix(refdat) + # Further checks + assertthat::assert_that(identical(colnames(refdat), colnames(prodat)), + is.data.frame(prodat) || is.matrix(prodat), + is.data.frame(prodat) || is.matrix(prodat)) + + # First calculate univariate novelty, e.g. 
NT1 + # Get ranges of variables and multiply + range_ref <- t(matrixStats::colRanges(as.matrix(refdat), na.rm = TRUE)) + diffs_ref <- matrixStats::colDiffs(range_ref) + # Remove those with 0 range, e.g. singular values for the reference + range_ref <- range_ref[,which(diffs_ref!=0)] + refdat <- refdat[,which(diffs_ref!=0)] + prodat <- prodat[,which(diffs_ref!=0)] + diffs_ref <- diffs_ref[,which(diffs_ref!=0)] + + range_ref_arr <- array(range_ref, dim = c(dim(range_ref), nrow(prodat)), + dimnames = list(c("min", "max"), colnames(refdat), NULL)) + + diffs_ref_arr <- matrix(diffs_ref, nrow = nrow(prodat), ncol = ncol(prodat), + byrow = TRUE) + + # Make empty raster with 3 dimensions + iud <- array(0, dim = c(dim(prodat), 3)) + iud[ , , 2] <- prodat - t(range_ref_arr["min", ,]) + iud[ , , 3] <- t(range_ref_arr["max", ,]) - prodat + + # Univariate novelty + # NT1 ranges from infinite negative values to zero where zero indicates no extrapolation + # beyond the univariate coverage of reference data. + UDs <- apply(iud, 1:2, min) / diffs_ref_arr + nt1 <- emptyraster(bg) + nt1[] <- rowSums(UDs) + + # --- # + # Multivariate combination novelty index (NT2) + # Calculate the center of reference data: average and covariance matrix + ref_av <- colMeans(refdat, na.rm = TRUE) + ref_cov <- stats::var(refdat, na.rm = TRUE) + + # Mahalanobis distance of reference data to center of reference data + mah_ref <- stats::mahalanobis(x = refdat, center = ref_av, cov = ref_cov,tol=1e-20) + # Mahalanobis distance of projected data to center of reference data + mah_pro <- stats::mahalanobis(x = prodat, center = ref_av, cov = ref_cov,tol=1e-20) + # Correction when mah_pro is negative (numerical instability. Correct to 0) + if(min(mah_pro,na.rm = T) < 0) mah_pro[which(!is.na(mah_pro) & mah_pro < 0)] <- 0 + + # Ratio + mah_max <- max(mah_ref[is.finite(mah_ref)]) + nt2 <- emptyraster(bg) + nt2[] <- (mah_pro / mah_max) + + # Calculate most dissimilar value (MOD) + # FIXME: Implement when have time https://onlinelibrary.wiley.com/doi/full/10.1111/ddi.12209 + # mod <- emptyraster(bg) + # For any point, the MIC is the covariate which produces the highest ICp value. 
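+  # A possible leave-one-out sketch for the FIXME above (untested; follows
+  # the notation already defined in this function): drop each covariate in
+  # turn, recompute the Mahalanobis distance, and take the covariate whose
+  # omission lowers the distance the most as the MIC.
+  # mah_loo <- sapply(seq_len(ncol(prodat)), function(j)
+  #   stats::mahalanobis(prodat[, -j, drop = FALSE], ref_av[-j],
+  #                      ref_cov[-j, -j], tol = 1e-20))
+  # icp <- 100 * (mah_pro - mah_loo) / mah_pro
+  # mic <- colnames(prodat)[max.col(icp)]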
+  # icp <- 100 * (mah_ref - icp)
+  # matrixStats::rowMaxs(icp)
+
+  # --- #
+  # Calculate areas outside the univariate range of combinations and non-analogous novel combinations
+  nt_novel <- emptyraster(bg)
+  # First, areas in the projection space with at least one covariate outside the univariate range of reference data
+  or1 <- c(raster::cellStats(nt1,'min'), 0)
+  o_low <- raster::cut(nt1, or1, include.lowest=T)
+  # Next areas with NT2 ranging from 0 to 1 that are similar to the reference data
+  o_mid <- nt2 %in% c(0,1)
+  # non-analogous covariate combinations
+  o_high <- nt2 > 1
+  nt_novel[o_low == 1] <- 1
+  nt_novel[o_mid == 1] <- 2
+  nt_novel[o_high == 1] <- 3
+  nt_novel <- raster::ratify(nt_novel)
+  levels(nt_novel) <- data.frame(ID = levels(nt_novel)[[1]],
+                                 what = c('Outside reference','Within reference','Novel combinations'))
+
+  # Create output stack
+  out <- raster::stack(nt1,nt2,nt_novel,
+                       quick = TRUE)
+  names(out) <- c('NT1','NT2','novel')
+  return(out)
+}
+
+#' Function to calculate Multivariate Environmental Similarity index
+#' @noRd
+#' @keywords internal
+.mess <- function(covs, ref, full=FALSE) {
+  # Convert to data.frame
+  if(!methods::is(ref, 'data.frame')) {
+    ref <- as.data.frame(ref)
+  }
+  # Make dummy template rasters
+  if(is(covs, 'Raster')) {
+    r <- TRUE
+    if(isTRUE(full)) {
+      out <- raster::stack(replicate(
+        raster::nlayers(covs), raster::init(covs, function(x) NA)))
+    } else {
+      out <- raster::init(covs, function(x) NA)
+    }
+  } else r <- FALSE
+  ref <- stats::na.omit(ref) # Remove NAs
+  if(!methods::is(covs, 'data.frame')) {
+    covs <- as.data.frame(covs)
+  }
+  # Calculate dimensions (range)
+  if(is.null(dim(ref))) {
+    rng <- as.data.frame(range(ref, na.rm=TRUE))
+  } else {
+    rng <- as.data.frame(apply(ref, 2, range, na.rm=TRUE))
+  }
+  # remove variables where max-min is 0
+  rng <- rng[which(apply(rng, 2, diff)>0)]
+  covs <- covs[,names(rng)]
+  ref <- ref[,names(rng)]
+
+  # Find intervals within ranges
+  pct_less <- mapply(function(x, ref) {
+    findInterval(x, sort(ref))/length(ref)
+  }, covs, ref, SIMPLIFY=FALSE)
+  # Calculate similarity surface
+  sim <- mapply(function(f, rng, p) {
+    ifelse(f==0, (p-rng[1])/diff(rng)*100,
+           ifelse(f > 0 & f <= 0.5, f*200,
+                  ifelse(f > 0.5 & f < 1, (1-f)*200,
+                         (rng[2]-p)/diff(rng)*100)))
+  }, pct_less, rng, covs)
+
+  min_sim <- if(is.matrix(sim)) apply(sim, 1, min) else(min(sim))
+
+  # Get minimum similarity and most (dis)similar values
+  mins <- apply(sim, 1, which.min)
+  most_dissimilar_vec <- unlist(ifelse(lengths(mins)==0, NA, mins))
+  maxs <- apply(sim, 1, which.max)
+  most_similar_vec <- unlist(ifelse(lengths(maxs)==0, NA, maxs))
+
+  if(isTRUE(r)) {
+    # Calculate most dissimilar surface
+    most_dissimilar <- raster::raster(out)
+    most_dissimilar[] <- most_dissimilar_vec
+    most_dissimilar <- as.factor(most_dissimilar)
+    rat <- levels(most_dissimilar)[[1]]
+    rat$varname <- colnames(sim)[rat$ID]
+    levels(most_dissimilar) <- rat
+
+    # Calculate most similar surface
+    most_similar <- raster::raster(out)
+    most_similar[] <- most_similar_vec
+    most_similar <- as.factor(most_similar)
+    rat <- levels(most_similar)[[1]]
+    rat$varname <- colnames(sim)[rat$ID]
+    levels(most_similar) <- rat
+
+    # Fill template rasters
+    out_min <- raster::raster(out)
+    out_min[] <- min_sim
+    if(isTRUE(full)) {
+      out[] <- sim
+      list(similarity=out, mis=out_min, mod=most_dissimilar,
+           mos=most_similar)
+    } else list(mis=out_min, mod=most_dissimilar, mos=most_similar)
+  } else {
+    if(isTRUE(full)) {
+      list(similarity=sim, mis=min_sim,
+
mod=most_dissimilar_vec, mos=most_similar_vec) + } else list(mis=min_sim, mod=most_dissimilar_vec, + mos=most_similar_vec) + } +} diff --git a/R/summary.R b/R/summary.R index f2c91556..166f636a 100644 --- a/R/summary.R +++ b/R/summary.R @@ -11,18 +11,18 @@ NULL #' #' When unsure, it is usually a good strategy to run [summary] on any object. #' -#' @param x Any prepared object. +#' @param object Any prepared object. #' @param ... not used. #' #' @seealso [base::summary()]. #' @examples #' \dontrun{ #' # Example with a trained model -#' x <- distribution(background) %>% +#' x <- distribution(background) |> #' # Presence-absence data -#' add_biodiversity_poipa(surveydata) %>% +#' add_biodiversity_poipa(surveydata) |> #' # Add predictors and scale them -#' add_predictors(env = predictors) %>% +#' add_predictors(env = predictors) |> #' # Use glmnet and lasso regression for estimation #' engine_glmnet(alpha = 1) #' # Train the model @@ -43,37 +43,37 @@ NULL #' @method summary distribution #' @keywords summary #' @export -summary.distribution <- function(x, ...) x$summary() +summary.distribution <- function(object, ...) object$summary() #' @rdname summary #' @method summary DistributionModel #' @keywords summary #' @export -summary.DistributionModel <- function(x, ...) x$summary() +summary.DistributionModel <- function(object, ...) object$summary() #' @rdname summary #' @method summary PredictorDataset #' @keywords summary #' @export -summary.PredictorDataset <- function(x, ...) x$summary() +summary.PredictorDataset <- function(object, ...) object$summary() #' @rdname summary #' @method summary BiodiversityScenario #' @keywords summary #' @export -summary.BiodiversityScenario <- function(x, ...) x$summary(...) +summary.BiodiversityScenario <- function(object, ...) object$summary() #' @rdname summary #' @method summary PriorList #' @keywords summary #' @export -summary.PriorList <- function(x, ...) x$summary() +summary.PriorList <- function(object, ...) object$summary() #' @rdname summary #' @method summary Settings #' @keywords summary #' @export -summary.Settings <- function(x, ...) x$summary() +summary.Settings <- function(object, ...) object$summary() #' Obtains the coefficients of a trained model #' @@ -95,4 +95,4 @@ NULL #' @method coef DistributionModel #' @keywords coef #' @export -coef.DistributionModel <- function(x, ...) x$get_coefficients() +coef.DistributionModel <- function(object, ...) object$get_coefficients() diff --git a/R/threshold.R b/R/threshold.R index 30881202..866a2c7a 100644 --- a/R/threshold.R +++ b/R/threshold.R @@ -1,376 +1,376 @@ -#' @include utils.R -NULL - -#' Threshold a continuous prediction to a categorical layer -#' -#' @description -#' It is common in many applications of species distribution modelling that estimated -#' continuous suitability surfaces are converted into discrete representations of where -#' suitable habitat might or might not exist. This so called *threshold'ing* -#' can be done in various ways which are further described in the details. -#' -#' In case a [RasterLayer] or [RasterBrick] is provided as input in this function -#' for \code{obj}, it is furthermore necessary to provide a [`sf`] object for validation as -#' there is no [`DistributionModel`] to read this information from. -#' **Note:** This of course also allows to estimate the threshold based on withheld data, for instance -#' those created from an a-priori cross-validation procedure. 
-#' -#' For [`BiodiversityScenario`] objects, adding this function to the processing pipeline -#' stores a threshold attribute in the created [scenario] object. -#' -#' @param obj A trained [`DistributionModel`] or alternatively a [`Raster`] object. -#' @param method A specifc method for thresholding. See details for available options. -#' @param value A [`numeric`] value for thresholding if method is fixed (Default: \code{NULL}). -#' @param poi A [`sf`] object containing observational data used for model training. -#' @param format [`character`] indication of whether \code{"binary"}, \code{"normalize"} or \code{"percentile"} -#' formatted thresholds are to be created (Default: \code{"binary"}). Also see Muscatello et al. (2021). -#' @param ... other parameters not yet set. -#' @param return_threshold Should threshold value be returned instead (Default: \code{FALSE}) -#' @details -#' The following options are currently implemented: -#' * \code{'fixed'} = applies a single pre-determined threshold. Requires \code{value} to be set. -#' * \code{'mtp'} = minimum training presence is used to find and set the lowest predicted suitability for any occurrence point. -#' * \code{'percentile'} = For a percentile threshold. A \code{value} as parameter has to be set here. -#' * \code{'min.cv'} = Threshold the raster so to minimize the coefficient of variation (cv) of the posterior. Uses the lowest tercile of the cv in space. Only feasible with Bayesian engines. -#' * \code{'TSS'} = Determines the optimal TSS (True Skill Statistic). Requires the [modEvA] package to be installed. -#' * \code{'kappa'} = Determines the optimal kappa value (Kappa). Requires the [modEvA] package to be installed. -#' * \code{'F1score'} = Determines the optimal F1score (also known as Sorensen similarity). Requires the [modEvA] package to be installed. -#' * \code{'F1score'} = Determines the optimal sensitivity of presence records. Requires the [modEvA] package to be installed. -#' * \code{'Sensitivity'} = Determines the optimal sensitivity of presence records. Requires the [modEvA] package to be installed. -#' * \code{'Specificity'} = Determines the optimal sensitivity of presence records. Requires the [modEvA] package to be installed. -#' @name threshold -#' @references -#' * Lawson, C.R., Hodgson, J.A., Wilson, R.J., Richards, S.A., 2014. Prevalence, thresholds and the performance of presence-absence models. Methods Ecol. Evol. 5, 54–64. https://doi.org/10.1111/2041-210X.12123 -#' * Liu, C., White, M., Newell, G., 2013. Selecting thresholds for the prediction of species occurrence with presence-only data. J. Biogeogr. 40, 778–789. https://doi.org/10.1111/jbi.12058 -#' * Muscatello, A., Elith, J., Kujala, H., 2021. How decisions about fitting species distribution models affect conservation outcomes. Conserv. Biol. 35, 1309–1320. https://doi.org/10.1111/cobi.13669 -#' @seealso [modEvA] -#' @returns A [RasterLayer] if used with a [Raster] object as input. -#' Otherwise the threshold is added to the respective [`DistributionModel`] or [`BiodiversityScenario`] object. -#' @examples -#' \dontrun{ -#' # Where mod is an estimated DistributionModel -#' tr <- threshold(mod) -#' tr$plot_threshold() -#' } -#' @export -NULL - -#' @name threshold -#' @rdname threshold -#' @exportMethod threshold -#' @export -methods::setGeneric( - "threshold", - signature = methods::signature("obj", "method", "value"), - function(obj, method = 'mtp', value = NULL, poi = NULL, format = "binary", return_threshold = FALSE, ...) 
standardGeneric("threshold")) - -#' Generic threshold with supplied DistributionModel object -#' @name threshold -#' @rdname threshold -#' @usage \S4method{threshold}{ANY}(obj) -methods::setMethod( - "threshold", - methods::signature(obj = "ANY"), - function(obj, method = 'mtp', value = NULL, format = "binary", return_threshold = FALSE, ...) { - assertthat::assert_that(any( class(obj) %in% getOption('ibis.engines') ), - is.character(method), - is.null(value) || is.numeric(value), - is.character(format) - ) - # Check other and add legacy handling - dots <- list(...) - if("truncate" %in% names(dots)) format <- ifelse(dots[[truncate]],"normalize", "binary") - format <- match.arg(format, c("binary", "normalize", "percentile"), several.ok = FALSE) - - # Get prediction raster - ras <- obj$get_data('prediction') - # Get model object - model <- obj$model - - # Check that the object actually contains a prediction - assertthat::assert_that( - is.Raster(ras), - !is.Waiver(ras), - msg = 'No fitted prediction in object!' - ) - # Matching for correct method - method <- match.arg(method, c('fixed','mtp','percentile','min.cv', - 'TSS','kappa','F1score','Sensitivity','Specificity'), several.ok = FALSE) - - # If method is min.cv, check that posterior is accessible - if(method == "min.cv") assertthat::assert_that("cv" %in% names(ras), msg = "Method min.cv requires a posterior prediction and coefficient of variation!") - - # Get all point data in distribution model - poi <- do.call(sf:::rbind.sf, - lapply(obj$model$biodiversity, function(y){ - o <- guess_sf(y$observations) - o$name <- y$name; o$type <- y$type - subset(o, select = c('observed', "name", "type", "geometry")) - } ) - ) %>% tibble::remove_rownames() - suppressWarnings( - poi <- sf::st_set_crs(poi, value = sf::st_crs(obj$get_data('prediction'))) - ) - - # If TSS or kappa is chosen, check whether there is poipa data among the sources - if((!any(poi$observed==0) & method %in% c('TSS','kappa','F1score','Sensitivity','Specificity')) || length(unique(poi$name)) > 1){ - if(getOption('ibis.setupmessages')) myLog('[Threshold]','red','Threshold method needs absence-data. Generating some now...') - bg <- raster::rasterize(obj$model$background, emptyraster(obj$get_data('prediction'))) - abs <- add_pseudoabsence(df = poi, - field_occurrence = 'observed', - template = bg, - # Assuming that settings are comparable among objects - settings = model$biodiversity[[1]]$pseudoabsence_settings - ) - - abs <- subset(abs, select = c('x','y'));abs$observed <- 0 - abs <- guess_sf(abs) - abs$name <- 'Background point'; abs$type <- "generated" - suppressWarnings( - abs <- sf::st_set_crs(abs, value = sf::st_crs(obj$get_data('prediction'))) - ) - poi <- subset(poi, select = c("observed", "name", "type","geometry")) - abs <- subset(abs, select = c("observed", "name", "type","geometry")) - poi <- rbind(poi, abs);rm(abs) - } - - # Convert to sf - if(!inherits(poi,"sf")){ poi <- guess_sf(poi) } - - # Now self call threshold - out <- threshold(ras, method = method, value = value, poi = poi, format = format,...) 
- assertthat::assert_that(is.Raster(out)) - # Add result to new obj - new_obj <- obj - if(inherits(out,'RasterLayer')){ - new_obj <- new_obj$set_data(names(out), out) - } else if(inherits(out,'RasterStack')) { - # When stack loop through and add - new_obj <- new_obj$set_data(paste0("threshold_", method), out) - } - # Return altered object - return(new_obj) - } -) - -#' @noRd -#' @keywords internal -.stackthreshold <- function(obj, method = 'fixed', value = NULL, - poi = NULL, format = "binary", return_threshold = FALSE, ...) { - assertthat::assert_that(is.Raster(obj), - is.character(method), - inherits(poi,'sf'), - is.null(value) || is.numeric(value), - is.character(format) - ) - # Match format - format <- match.arg(format, c("binary", "normalize", "percentile"), several.ok = FALSE) - - # Apply threshold on each entry - if(return_threshold){ - # Return the threshold directly - out <- vector() - for(i in names(obj)) out <- c(out, threshold(obj[[i]], method = method, - value = value, poi = poi, format = format, return_threshold = return_threshold, ...) ) - names(out) <- names(obj) - } else { - # Return the raster instead - out <- raster::stack() - if(method == "min.cv"){ - # If the coefficient of variation is to be minmized, mask first all values with the threshold only - assertthat::assert_that(raster::nlayers(obj)>2, "sd" %in% names(obj)) - # Get global coefficient of variation - errortr <- quantile(obj[["cv"]], .3) - assertthat::assert_that(is.numeric(errortr)) - # Create mask - mm <- obj[["cv"]] - mm[mm > errortr] <- NA - obj <- raster::mask(obj, mm); rm(mm) - # Set the value to errortr - value <- errortr - } - # Now loop - for(i in names(obj)) out <- raster::addLayer(out, threshold(obj[[i]], method = method, - value = value, poi = poi, format = format, return_threshold = return_threshold, ...) 
) - } - return(out) -} - -#' @name threshold -#' @rdname threshold -#' @inheritParams threshold -#' @usage \S4method{threshold}{RasterBrick}(obj) -methods::setMethod("threshold",methods::signature(obj = "RasterBrick"),.stackthreshold) -#' @usage \S4method{threshold}{RasterStack}(obj) -methods::setMethod("threshold",methods::signature(obj = "RasterStack"),.stackthreshold) - -#' @name threshold -#' @rdname threshold -#' @usage \S4method{threshold}{RasterLayer}(obj) -methods::setMethod( - "threshold", - methods::signature(obj = "RasterLayer"), - function(obj, method = 'fixed', value = NULL, poi = NULL, format = "binary", return_threshold = FALSE, plot = FALSE) { - assertthat::assert_that(is.Raster(obj), - inherits(obj,'RasterLayer'), - is.character(method), - is.null(value) || is.numeric(value), - is.character(format) - ) - # Match format - format <- match.arg(format, c("binary", "normalize", "percentile"), several.ok = FALSE) - - # If poi is set, try to convert sf - if(!is.null(poi)) try({poi <- sf::st_as_sf(poi)}, silent = TRUE) - assertthat::assert_that(is.null(poi) || inherits(poi,'sf')) - - # If observed is a factor, convert to numeric - if(is.factor(poi$observed)){ - poi$observed <- as.numeric(as.character( poi$observed )) - } - - # Match to correct spelling mistakes - method <- match.arg(method, c('fixed','mtp','percentile','min.cv', - 'TSS','kappa','F1score','Sensitivity','Specificity'), several.ok = FALSE) - - # Check that raster has at least a mean prediction in name - if(!is.null(poi)) { - assertthat::assert_that(unique(sf::st_geometry_type(poi)) %in% c('POINT','MULTIPOINT')) - assertthat::assert_that(hasName(poi, 'observed')) - poi_pres <- subset(poi, observed > 0) # Remove any eventual absence data for a poi_pres evaluation - } - # Get the raster layer - raster_thresh <- obj - - # Specify by type: - if(method == "fixed"){ - # Fixed threshold. Confirm to be set - assertthat::assert_that(is.numeric(value), msg = 'Fixed value is missing!') - tr <- value - } else if(method == "mtp"){ - # minimum training presence - pointVals <- raster::extract(raster_thresh, poi_pres) # Extract point only estimates - # Minimum threshold - tr <- min( na.omit(pointVals) ) - - } else if(method == "percentile"){ - pointVals <- raster::extract(raster_thresh, poi_pres) # Extract point only estimates - pointVals <- subset(pointVals, complete.cases(pointVals)) # Remove any NA or NAN data here - # percentile training threshold - if(is.null(value)) value <- 0.1 # If value is not set, use 10% - if(length(pointVals) < 10) { - perc <- floor(length(pointVals) * (1 - value)) - } else { - perc <- ceiling(length(pointVals) * (1 - value)) - } - tr <- rev(sort(pointVals))[perc] # Percentile threshold - - } else if(method == "min.cv"){ - assertthat::assert_that(!is.null(value),msg = "Global minimum cv needs to be set!") - pointVals <- raster::extract(raster_thresh, poi_pres) # Extract point only estimates - - # Get standard deviation and calculate percentile - tr <- min( na.omit(pointVals) ) - names(tr) <- "tr" - names(value) <- "min.cv" - # Combine as a vector - tr <- c(tr, value) - - } else { - # Optimized threshold statistics using the modEvA package - # FIXME: Could think of porting these functions but too much effort for now. 
Rather have users install the package here - check_package("modEvA") - # Assure that point data is correctly specified - assertthat::assert_that(inherits(poi, 'sf'), hasName(poi, 'observed')) - poi$observed <- ifelse(poi$observed>1,1,poi$observed) # Ensure that observed is <=1 - assertthat::assert_that(all( unique(poi$observed) %in% c(0,1) )) - - # Re-extract point vals but with the full dataset - pointVals <- raster::extract(raster_thresh, poi) - assertthat::assert_that(length(pointVals)>2) - # Calculate the optimal thresholds - suppressWarnings( - opt <- modEvA::optiThresh(obs = poi$observed, pred = pointVals, - measures = c("TSS","kappa","F1score","Misclass","Omission","Commission", - "Sensitivity","Specificity"), - optimize = "each", plot = plot) - ) - if(method %in% opt$optimals.each$measure){ - tr <- opt$optimals.each$threshold[which(opt$optimals.each$measure==method)] - } else { - # Returning a collection of them as vector - tr <- opt$optimals.each$threshold; names(tr) <- opt$optimals.each$measure - } - } - # Security check - assertthat::assert_that(is.numeric(tr) || is.vector(tr)) - - # -- Threshold -- # - if(return_threshold){ - names(tr) <- method - return(tr) - } else { - # Finally threshold the raster - # Process depending on format - if(format == "binary"){ - # Default is to create a binary presence-absence. Otherwise truncated hinge - raster_thresh[raster_thresh < tr[1]] <- 0 - raster_thresh[raster_thresh >= tr[1]] <- 1 - raster_thresh <- raster::asFactor(raster_thresh) - } else if(format == "normalize"){ - raster_thresh[raster_thresh < tr[1]] <- NA - # If truncate, ensure that resulting values are normalized - raster_thresh <- predictor_transform(raster_thresh, option = "norm") - raster_thresh[is.na(raster_thresh)] <- 0 - raster_thresh <- raster::mask(raster_thresh, obj) - base::attr(raster_thresh, 'truncate') <- TRUE - - base::attr(raster_thresh, 'truncate') <- TRUE # Legacy truncate attribute - } else if(format == "percentile") { - raster_thresh[raster_thresh < tr[1]] <- NA - raster_thresh <- predictor_transform(raster_thresh, option = "percentile") - raster_thresh <- raster::mask(raster_thresh, obj) - base::attr(raster_thresh, 'truncate') <- TRUE - } - names(raster_thresh) <- paste0('threshold_',names(obj),'_',method) - # Assign attributes - base::attr(raster_thresh, 'method') <- method - base::attr(raster_thresh, 'format') <- format - base::attr(raster_thresh, 'threshold') <- tr - } - # Return result - return(raster_thresh) - } -) - -#### For scenarios #### - -#' Thresholds in scenario estimation -#' -#' @name threshold -#' @inheritParams threshold -#' @rdname threshold -#' @usage \S4method{threshold}{BiodiversityScenario}(obj) -methods::setMethod( - "threshold", - methods::signature(obj = "BiodiversityScenario"), - function(obj, tr = new_waiver(), ...) { - # Assert that predicted raster is present - assertthat::assert_that( is.Raster(obj$get_model()$get_data('prediction')) ) - # Unless set, check - if(is.Waiver(tr)){ - # Check that a threshold layer is available and get the methods and data from it - assertthat::assert_that( length( grep('threshold', obj$get_model()$show_rasters()) ) >0 , - msg = 'Call \' threshold \' for prediction first!') - # Get threshold layer - tr_lyr <- grep('threshold', obj$get_model()$show_rasters(),value = TRUE) - if(length(tr_lyr)>1) warning("There appear to be multiple thresholds. 
Using the first one.") - ras_tr <- obj$get_model()$get_data( tr_lyr[1] ) - tr <- attr(ras_tr[[1]], 'threshold') - names(tr) <- attr(ras_tr[[1]], 'method') - } else { - assertthat::assert_that(is.numeric(tr)) - } - bdproto(NULL, obj, threshold = tr) - } -) +#' @include utils.R +NULL + +#' Threshold a continuous prediction to a categorical layer +#' +#' @description +#' It is common in many applications of species distribution modelling that estimated +#' continuous suitability surfaces are converted into discrete representations of where +#' suitable habitat might or might not exist. This so called *threshold'ing* +#' can be done in various ways which are further described in the details. +#' +#' In case a [RasterLayer] or [RasterBrick] is provided as input in this function +#' for \code{obj}, it is furthermore necessary to provide a [`sf`] object for validation as +#' there is no [`DistributionModel`] to read this information from. +#' **Note:** This of course also allows to estimate the threshold based on withheld data, for instance +#' those created from an a-priori cross-validation procedure. +#' +#' For [`BiodiversityScenario`] objects, adding this function to the processing pipeline +#' stores a threshold attribute in the created [scenario] object. +#' +#' @param obj A trained [`DistributionModel`] or alternatively a [`Raster`] object. +#' @param method A specifc method for thresholding. See details for available options. +#' @param value A [`numeric`] value for thresholding if method is fixed (Default: \code{NULL}). +#' @param poi A [`sf`] object containing observational data used for model training. +#' @param format [`character`] indication of whether \code{"binary"}, \code{"normalize"} or \code{"percentile"} +#' formatted thresholds are to be created (Default: \code{"binary"}). Also see Muscatello et al. (2021). +#' @param ... other parameters not yet set. +#' @param return_threshold Should threshold value be returned instead (Default: \code{FALSE}) +#' @details +#' The following options are currently implemented: +#' * \code{'fixed'} = applies a single pre-determined threshold. Requires \code{value} to be set. +#' * \code{'mtp'} = minimum training presence is used to find and set the lowest predicted suitability for any occurrence point. +#' * \code{'percentile'} = For a percentile threshold. A \code{value} as parameter has to be set here. +#' * \code{'min.cv'} = Threshold the raster so to minimize the coefficient of variation (cv) of the posterior. Uses the lowest tercile of the cv in space. Only feasible with Bayesian engines. +#' * \code{'TSS'} = Determines the optimal TSS (True Skill Statistic). Requires the [modEvA] package to be installed. +#' * \code{'kappa'} = Determines the optimal kappa value (Kappa). Requires the [modEvA] package to be installed. +#' * \code{'F1score'} = Determines the optimal F1score (also known as Sorensen similarity). Requires the [modEvA] package to be installed. +#' * \code{'F1score'} = Determines the optimal sensitivity of presence records. Requires the [modEvA] package to be installed. +#' * \code{'Sensitivity'} = Determines the optimal sensitivity of presence records. Requires the [modEvA] package to be installed. +#' * \code{'Specificity'} = Determines the optimal sensitivity of presence records. Requires the [modEvA] package to be installed. +#' @name threshold +#' @references +#' * Lawson, C.R., Hodgson, J.A., Wilson, R.J., Richards, S.A., 2014. Prevalence, thresholds and the performance of presence-absence models. Methods Ecol. Evol. 
5, 54–64. https://doi.org/10.1111/2041-210X.12123
+#' * Liu, C., White, M., Newell, G., 2013. Selecting thresholds for the prediction of species occurrence with presence-only data. J. Biogeogr. 40, 778–789. https://doi.org/10.1111/jbi.12058
+#' * Muscatello, A., Elith, J., Kujala, H., 2021. How decisions about fitting species distribution models affect conservation outcomes. Conserv. Biol. 35, 1309–1320. https://doi.org/10.1111/cobi.13669
+#' @seealso [modEvA]
+#' @returns A [RasterLayer] if used with a [Raster] object as input.
+#' Otherwise the threshold is added to the respective [`DistributionModel`] or [`BiodiversityScenario`] object.
+#' @examples
+#' \dontrun{
+#' # Where mod is an estimated DistributionModel
+#' tr <- threshold(mod)
+#' tr$plot_threshold()
+#' }
+#' @export
+NULL
+
+#' @name threshold
+#' @rdname threshold
+#' @exportMethod threshold
+#' @export
+methods::setGeneric(
+  "threshold",
+  signature = methods::signature("obj", "method", "value"),
+  function(obj, method = 'mtp', value = NULL, poi = NULL, format = "binary", return_threshold = FALSE, ...) standardGeneric("threshold"))
+
+#' Generic threshold with supplied DistributionModel object
+#' @name threshold
+#' @rdname threshold
+#' @usage \S4method{threshold}{ANY}(obj)
+methods::setMethod(
+  "threshold",
+  methods::signature(obj = "ANY"),
+  function(obj, method = 'mtp', value = NULL, format = "binary", return_threshold = FALSE, ...) {
+    assertthat::assert_that(any( class(obj) %in% getOption('ibis.engines') ),
+                            is.character(method),
+                            is.null(value) || is.numeric(value),
+                            is.character(format)
+    )
+    # Check other arguments and handle legacy options
+    dots <- list(...)
+    if("truncate" %in% names(dots)) format <- ifelse(dots[["truncate"]], "normalize", "binary")
+    format <- match.arg(format, c("binary", "normalize", "percentile"), several.ok = FALSE)
+
+    # Get prediction raster
+    ras <- obj$get_data('prediction')
+    # Get model object
+    model <- obj$model
+
+    # Check that the object actually contains a prediction
+    assertthat::assert_that(
+      is.Raster(ras),
+      !is.Waiver(ras),
+      msg = 'No fitted prediction in object!'
+    )
+    # Matching for correct method
+    method <- match.arg(method, c('fixed','mtp','percentile','min.cv',
+                                  'TSS','kappa','F1score','Sensitivity','Specificity'), several.ok = FALSE)
+
+    # If method is min.cv, check that posterior is accessible
+    if(method == "min.cv") assertthat::assert_that("cv" %in% names(ras), msg = "Method min.cv requires a posterior prediction and coefficient of variation!")
+
+    # Get all point data in distribution model
+    poi <- do.call(sf:::rbind.sf,
+                   lapply(obj$model$biodiversity, function(y){
+                     o <- guess_sf(y$observations)
+                     o$name <- y$name; o$type <- y$type
+                     subset(o, select = c('observed', "name", "type", "geometry"))
+                   } )
+    ) |> tibble::remove_rownames()
+    suppressWarnings(
+      poi <- sf::st_set_crs(poi, value = sf::st_crs(obj$get_data('prediction')))
+    )
+
+    # If TSS or kappa is chosen, check whether there is poipa data among the sources
+    if((!any(poi$observed==0) & method %in% c('TSS','kappa','F1score','Sensitivity','Specificity')) || length(unique(poi$name)) > 1){
+      if(getOption('ibis.setupmessages')) myLog('[Threshold]','red','Threshold method needs absence-data.
Generating some now...')
+      bg <- raster::rasterize(obj$model$background, emptyraster(obj$get_data('prediction')))
+      abs <- add_pseudoabsence(df = poi,
+                               field_occurrence = 'observed',
+                               template = bg,
+                               # Assuming that settings are comparable among objects
+                               settings = model$biodiversity[[1]]$pseudoabsence_settings
+      )
+
+      abs <- subset(abs, select = c('x','y'));abs$observed <- 0
+      abs <- guess_sf(abs)
+      abs$name <- 'Background point'; abs$type <- "generated"
+      suppressWarnings(
+        abs <- sf::st_set_crs(abs, value = sf::st_crs(obj$get_data('prediction')))
+      )
+      poi <- subset(poi, select = c("observed", "name", "type","geometry"))
+      abs <- subset(abs, select = c("observed", "name", "type","geometry"))
+      poi <- rbind(poi, abs);rm(abs)
+    }
+
+    # Convert to sf
+    if(!inherits(poi,"sf")){ poi <- guess_sf(poi) }
+
+    # Now self-call threshold
+    out <- threshold(ras, method = method, value = value, poi = poi, format = format,...)
+    assertthat::assert_that(is.Raster(out))
+    # Add result to new obj
+    new_obj <- obj
+    if(inherits(out,'RasterLayer')){
+      new_obj <- new_obj$set_data(names(out), out)
+    } else if(inherits(out,'RasterStack')) {
+      # For a stack, loop through and add each layer
+      new_obj <- new_obj$set_data(paste0("threshold_", method), out)
+    }
+    # Return altered object
+    return(new_obj)
+  }
+)
+
+#' @noRd
+#' @keywords internal
+.stackthreshold <- function(obj, method = 'fixed', value = NULL,
+                            poi = NULL, format = "binary", return_threshold = FALSE, ...) {
+  assertthat::assert_that(is.Raster(obj),
+                          is.character(method),
+                          inherits(poi,'sf'),
+                          is.null(value) || is.numeric(value),
+                          is.character(format)
+  )
+  # Match format
+  format <- match.arg(format, c("binary", "normalize", "percentile"), several.ok = FALSE)
+
+  # Apply threshold on each entry
+  if(return_threshold){
+    # Return the threshold directly
+    out <- vector()
+    for(i in names(obj)) out <- c(out, threshold(obj[[i]], method = method,
+                                                 value = value, poi = poi, format = format, return_threshold = return_threshold, ...) )
+    names(out) <- names(obj)
+  } else {
+    # Return the raster instead
+    out <- raster::stack()
+    if(method == "min.cv"){
+      # If the coefficient of variation is to be minimized, mask first all values with the threshold only
+      assertthat::assert_that(raster::nlayers(obj)>2, "sd" %in% names(obj))
+      # Get global coefficient of variation
+      errortr <- quantile(obj[["cv"]], .3)
+      assertthat::assert_that(is.numeric(errortr))
+      # Create mask
+      mm <- obj[["cv"]]
+      mm[mm > errortr] <- NA
+      obj <- raster::mask(obj, mm); rm(mm)
+      # Set the value to errortr
+      value <- errortr
+    }
+    # Now loop
+    for(i in names(obj)) out <- raster::addLayer(out, threshold(obj[[i]], method = method,
+                                                                value = value, poi = poi, format = format, return_threshold = return_threshold, ...)
) + } + return(out) +} + +#' @name threshold +#' @rdname threshold +#' @inheritParams threshold +#' @usage \S4method{threshold}{RasterBrick}(obj) +methods::setMethod("threshold",methods::signature(obj = "RasterBrick"),.stackthreshold) +#' @usage \S4method{threshold}{RasterStack}(obj) +methods::setMethod("threshold",methods::signature(obj = "RasterStack"),.stackthreshold) + +#' @name threshold +#' @rdname threshold +#' @usage \S4method{threshold}{RasterLayer}(obj) +methods::setMethod( + "threshold", + methods::signature(obj = "RasterLayer"), + function(obj, method = 'fixed', value = NULL, poi = NULL, format = "binary", return_threshold = FALSE, plot = FALSE) { + assertthat::assert_that(is.Raster(obj), + inherits(obj,'RasterLayer'), + is.character(method), + is.null(value) || is.numeric(value), + is.character(format) + ) + # Match format + format <- match.arg(format, c("binary", "normalize", "percentile"), several.ok = FALSE) + + # If poi is set, try to convert sf + if(!is.null(poi)) try({poi <- sf::st_as_sf(poi)}, silent = TRUE) + assertthat::assert_that(is.null(poi) || inherits(poi,'sf')) + + # If observed is a factor, convert to numeric + if(is.factor(poi$observed)){ + poi$observed <- as.numeric(as.character( poi$observed )) + } + + # Match to correct spelling mistakes + method <- match.arg(method, c('fixed','mtp','percentile','min.cv', + 'TSS','kappa','F1score','Sensitivity','Specificity'), several.ok = FALSE) + + # Check that raster has at least a mean prediction in name + if(!is.null(poi)) { + assertthat::assert_that(unique(sf::st_geometry_type(poi)) %in% c('POINT','MULTIPOINT')) + assertthat::assert_that(utils::hasName(poi, 'observed')) + poi_pres <- subset(poi, observed > 0) # Remove any eventual absence data for a poi_pres evaluation + } + # Get the raster layer + raster_thresh <- obj + + # Specify by type: + if(method == "fixed"){ + # Fixed threshold. Confirm to be set + assertthat::assert_that(is.numeric(value), msg = 'Fixed value is missing!') + tr <- value + } else if(method == "mtp"){ + # minimum training presence + pointVals <- raster::extract(raster_thresh, poi_pres) # Extract point only estimates + # Minimum threshold + tr <- min( stats::na.omit(pointVals) ) + + } else if(method == "percentile"){ + pointVals <- raster::extract(raster_thresh, poi_pres) # Extract point only estimates + pointVals <- subset(pointVals, stats::complete.cases(pointVals)) # Remove any NA or NAN data here + # percentile training threshold + if(is.null(value)) value <- 0.1 # If value is not set, use 10% + if(length(pointVals) < 10) { + perc <- floor(length(pointVals) * (1 - value)) + } else { + perc <- ceiling(length(pointVals) * (1 - value)) + } + tr <- rev(sort(pointVals))[perc] # Percentile threshold + + } else if(method == "min.cv"){ + assertthat::assert_that(!is.null(value),msg = "Global minimum cv needs to be set!") + pointVals <- raster::extract(raster_thresh, poi_pres) # Extract point only estimates + + # Get standard deviation and calculate percentile + tr <- min( stats::na.omit(pointVals) ) + names(tr) <- "tr" + names(value) <- "min.cv" + # Combine as a vector + tr <- c(tr, value) + + } else { + # Optimized threshold statistics using the modEvA package + # FIXME: Could think of porting these functions but too much effort for now. 
Rather have users install the package here
+      check_package("modEvA")
+      # Ensure that point data is correctly specified
+      assertthat::assert_that(inherits(poi, 'sf'), utils::hasName(poi, 'observed'))
+      poi$observed <- ifelse(poi$observed>1,1,poi$observed) # Ensure that observed is <=1
+      assertthat::assert_that(all( unique(poi$observed) %in% c(0,1) ))
+
+      # Re-extract point vals but with the full dataset
+      pointVals <- raster::extract(raster_thresh, poi)
+      assertthat::assert_that(length(pointVals)>2)
+      # Calculate the optimal thresholds
+      suppressWarnings(
+        opt <- modEvA::optiThresh(obs = poi$observed, pred = pointVals,
+                                  measures = c("TSS","kappa","F1score","Misclass","Omission","Commission",
+                                               "Sensitivity","Specificity"),
+                                  optimize = "each", plot = plot)
+      )
+      if(method %in% opt$optimals.each$measure){
+        tr <- opt$optimals.each$threshold[which(opt$optimals.each$measure==method)]
+      } else {
+        # Returning a collection of them as vector
+        tr <- opt$optimals.each$threshold; names(tr) <- opt$optimals.each$measure
+      }
+    }
+    # Security check
+    assertthat::assert_that(is.numeric(tr) || is.vector(tr))
+
+    # -- Threshold -- #
+    if(return_threshold){
+      names(tr) <- method
+      return(tr)
+    } else {
+      # Finally threshold the raster
+      # Process depending on format
+      if(format == "binary"){
+        # Default is to create a binary presence-absence. Otherwise truncated hinge
+        raster_thresh[raster_thresh < tr[1]] <- 0
+        raster_thresh[raster_thresh >= tr[1]] <- 1
+        raster_thresh <- raster::asFactor(raster_thresh)
+      } else if(format == "normalize"){
+        raster_thresh[raster_thresh < tr[1]] <- NA
+        # If truncate, ensure that resulting values are normalized
+        raster_thresh <- predictor_transform(raster_thresh, option = "norm")
+        raster_thresh[is.na(raster_thresh)] <- 0
+        raster_thresh <- raster::mask(raster_thresh, obj)
+        base::attr(raster_thresh, 'truncate') <- TRUE # Legacy truncate attribute
+      } else if(format == "percentile") {
+        raster_thresh[raster_thresh < tr[1]] <- NA
+        raster_thresh <- predictor_transform(raster_thresh, option = "percentile")
+        raster_thresh <- raster::mask(raster_thresh, obj)
+        base::attr(raster_thresh, 'truncate') <- TRUE
+      }
+      names(raster_thresh) <- paste0('threshold_',names(obj),'_',method)
+      # Assign attributes
+      base::attr(raster_thresh, 'method') <- method
+      base::attr(raster_thresh, 'format') <- format
+      base::attr(raster_thresh, 'threshold') <- tr
+    }
+    # Return result
+    return(raster_thresh)
+  }
+)
+
+#### For scenarios ####
+
+#' Thresholds in scenario estimation
+#'
+#' @name threshold
+#' @inheritParams threshold
+#' @rdname threshold
+#' @usage \S4method{threshold}{BiodiversityScenario}(obj)
+methods::setMethod(
+  "threshold",
+  methods::signature(obj = "BiodiversityScenario"),
+  function(obj, tr = new_waiver(), ...) {
+    # Assert that predicted raster is present
+    assertthat::assert_that( is.Raster(obj$get_model()$get_data('prediction')) )
+    # Unless set, check
+    if(is.Waiver(tr)){
+      # Check that a threshold layer is available and get the methods and data from it
+      assertthat::assert_that( length( grep('threshold', obj$get_model()$show_rasters()) ) >0 ,
+                               msg = 'Call \' threshold \' for prediction first!')
+      # Get threshold layer
+      tr_lyr <- grep('threshold', obj$get_model()$show_rasters(),value = TRUE)
+      if(length(tr_lyr)>1) warning("There appear to be multiple thresholds.
Using the first one.") + ras_tr <- obj$get_model()$get_data( tr_lyr[1] ) + tr <- attr(ras_tr[[1]], 'threshold') + names(tr) <- attr(ras_tr[[1]], 'method') + } else { + assertthat::assert_that(is.numeric(tr)) + } + bdproto(NULL, obj, threshold = tr) + } +) diff --git a/R/train.R b/R/train.R index 5fed138e..22157633 100644 --- a/R/train.R +++ b/R/train.R @@ -17,20 +17,25 @@ NULL #' a [RasterLayer] object named [`prediction`] that contains the spatial prediction of the model. #' These objects can be requested via \code{object$get_data("fit_best")}. #' -#' Available options in this function include: +#' Other parameters in this function: +#' +#' * \code{"filter_predictors"} The parameter can be set to various options to remove highly correlated variables or those +#' with little additional information gain from the model prior to any estimation. Available options are \code{"none"} (Default) \code{"pearson"} for +#' applying a \code{0.7} correlation cutoff, \code{"abess"} for the regularization framework by Zhu et al. (2020), or \code{"RF"} or \code{"randomforest"} +#' for removing the least important variables according to a randomForest model. **Note**: This function is only applied on +#' predictors for which no prior has been provided (e.g. potentially non-informative ones). +#' +#' * \code{"optim_hyperparam"} This option allows to make use of hyper-parameter search for several models, which can improve +#' prediction accuracy although through the a substantial increase in computational cost. #' -#' * \code{"rm_corPred"} Setting this to \code{TRUE} removes highly correlated variables for the observation -#' prior to fitting. -#' * \code{"varsel"} This option allows to make use of hyper-parameter search for several models (\code{"reg"}) or -#' alternatively of variable selection methods to further reduce model complexity. Generally substantially increases -#' runtime. The option makes use of the \code{"abess"} approach (Zhu et al. 2020) to identify and remove the least-important -#' variables. #' * \code{"method_integration"} Only relevant if more than one [`BiodiversityDataset`] is supplied and when #' the engine does not support joint integration of likelihoods. #' See also Miller et al. (2019) in the references for more details on different types of integration. Of course, #' if users want more control about this aspect, another option is to fit separate models #' and make use of the [add_offset], [add_offset_range] and [ensemble] functionalities. -#' * \code{"clamp"} Clamps the projection predictors to the range of values observed during model training. +#' +#' * \code{"clamp"} Boolean parameter to support a clamping of the projection predictors to the range of values observed +#' during model training. #' #' @note #' There are no silver bullets in (correlative) species distribution modelling and for each model the analyst has to @@ -39,16 +44,19 @@ NULL #' #' @param x [distribution()] (i.e. [`BiodiversityDistribution-class`]) object). #' @param runname A [`character`] name of the trained run. -#' @param rm_corPred Remove highly correlated predictors (Default: \code{FALSE}). This option -#' removes - based on pairwise comparisons - those covariates that are highly collinear (Pearson's \code{r >= 0.7}). -#' @param varsel Perform a variable selection on the set of predictors either prior to building the model -#' or via variable selection / regularization of the model. Available options are: -#' * [`none`] for no or default priors and no extensive hyperparameter search. 
-#' * [`reg`] Model selection either through DIC or regularization / hyperparameter tuning depending on the
-#' engine (Default).
-#' * [`abess`] A-priori adaptive best subset selection of covariates via the [abess] package (see References).
-#' Note that this effectively fits a separate generalized linear model to reduce the number of covariates.
-#' Can be helpful for engines that don't directly support efficient variable regularization and when \code{N>100}.
+#' @param filter_predictors A [`character`] defining if and how predictors are to be filtered
+#' prior to any model estimation.
+#' Available options are:
+#' * \code{"none"} No prior variable removal is performed (Default).
+#' * \code{"pearson"}, \code{"spearman"} or \code{"kendall"} Makes use of pairwise comparisons to identify and
+#' remove highly collinear predictors (\code{r >= 0.7}).
+#' * \code{"abess"} A-priori adaptive best subset selection of covariates via the [abess] package (see References). Note that this
+#' effectively fits a separate generalized linear model to reduce the number of covariates.
+#' * \code{"boruta"} Uses the [Boruta] package to identify non-informative features.
+#'
+#' @param optim_hyperparam Perform a hyper-parameter search to tune the model, where supported
+#' by the engine (Default: \code{FALSE}).
+#'
 #' @param inference_only By default the [engine] is used to create
 #' a spatial prediction of the suitability surface, which can take time. If only inferences of
 #' the strength of relationship between covariates and observations are required, this parameter
@@ -61,6 +69,7 @@ NULL
 #' than one [`BiodiversityDataset-class`] object is provided in \code{x}. Particular relevant for engines
 #' that do not support the integration of more than one dataset. Integration methods are generally sensitive
 #' to the order in which they have been added to the [`BiodiversityDistribution`] object.
+#'
 #' Available options are:
 #' * \code{"predictor"} The predicted output of the first (or previously fitted) models are
 #' added to the predictor stack and thus are predictors for subsequent models (Default).
@@ -91,15 +100,15 @@ NULL
 #' @examples
 #' \dontrun{
 #' # Fit a linear penalized logistic regression model via stan
-#' x <- distribution(background) %>%
+#' x <- distribution(background) |>
 #'   # Presence-absence data
-#'   add_biodiversity_poipa(surveydata) %>%
+#'   add_biodiversity_poipa(surveydata) |>
 #'   # Add predictors and scale them
-#'   add_predictors(env = predictors, transform = "scale", derivates = "none") %>%
-#'   # Use stan for estimation
+#'   add_predictors(env = predictors, transform = "scale", derivates = "none") |>
+#'   # Use Stan for estimation
 #'   engine_stan(chains = 2, iter = 1000, warmup = 500)
 #' # Train the model
-#' mod <- train(x, only_linear = TRUE, varsel = 'none')
+#' mod <- train(x, only_linear = TRUE, filter_predictors = 'pearson')
 #' mod
 #' }
 #' @name train
@@ -115,7 +124,7 @@ NULL
 methods::setGeneric(
   "train",
   signature = methods::signature("x"),
-  function(x, runname, rm_corPred = FALSE, varsel = "none", inference_only = FALSE,
+  function(x, runname, filter_predictors = "none", optim_hyperparam = FALSE, inference_only = FALSE,
          only_linear = TRUE, method_integration = "predictor",
         aggregate_observations = TRUE, clamp = FALSE, verbose = FALSE,...)
standardGeneric("train")) @@ -125,7 +134,7 @@ methods::setGeneric( methods::setMethod( "train", methods::signature(x = "BiodiversityDistribution"), - function(x, runname, rm_corPred = FALSE, varsel = "none", inference_only = FALSE, + function(x, runname, filter_predictors = "none", optim_hyperparam = FALSE, inference_only = FALSE, only_linear = TRUE, method_integration = "predictor", aggregate_observations = TRUE, clamp = FALSE, verbose = FALSE,...) { if(missing(runname)) runname <- "Unnamed run" @@ -134,7 +143,8 @@ methods::setMethod( assertthat::assert_that( inherits(x, "BiodiversityDistribution"), is.character(runname), - is.logical(rm_corPred), + is.logical(optim_hyperparam), + is.character(filter_predictors), is.logical(inference_only), is.logical(only_linear), is.character(method_integration), @@ -150,15 +160,14 @@ methods::setMethod( # Messenger if(getOption('ibis.setupmessages')) myLog('[Estimation]','green','Collecting input parameters.') # --- # - #rm_corPred = TRUE; varsel = "none"; runname = "test";inference_only = FALSE; verbose = TRUE;only_linear=TRUE;method_integration="predictor";aggregate_observations = TRUE; clamp = FALSE + #filter_predictors = "none"; optim_hyperparam = FALSE; runname = "test";inference_only = FALSE; verbose = TRUE;only_linear=TRUE;method_integration="predictor";aggregate_observations = TRUE; clamp = FALSE # Match variable selection - if(is.logical(varsel)) varsel <- ifelse(varsel, "reg", "none") - varsel <- match.arg(varsel, c("none", "reg", "abess"), several.ok = FALSE) + filter_predictors <- match.arg(filter_predictors, c("none", "pearson", "spearman", "kendall", "abess", "RF", "randomForest", "boruta"), several.ok = FALSE) method_integration <- match.arg(method_integration, c("predictor", "offset", "interaction", "prior", "weight"), several.ok = FALSE) # Define settings object for any other information settings <- bdproto(NULL, Settings) - settings$set('rm_corPred', rm_corPred) - settings$set('varsel', varsel) + settings$set('filter_predictors', filter_predictors) + settings$set('optim_hyperparam', optim_hyperparam) settings$set('only_linear',only_linear) settings$set('inference_only', inference_only) settings$set('clamp', clamp) @@ -199,7 +208,7 @@ methods::setMethod( } else { dummy <- raster::raster(extent(x$background),nrow=100,ncol=100,val=1);names(dummy) <- 'dummy' } - model[['predictors']] <- as.data.frame(dummy, xy = TRUE) + model[['predictors']] <- raster::as.data.frame(dummy, xy = TRUE) model[['predictors_names']] <- 'dummy' model[['predictors_types']] <- data.frame(predictors = 'dummy', type = 'numeric') model[['predictors_object']] <- bdproto(NULL, PredictorDataset, id = new_id(), data = dummy) @@ -280,7 +289,7 @@ methods::setMethod( # Then calculate ras <- st_kde(points = poi, background = bg, bandwidth = 3) # Add to predictor objects, names, types and the object - model[['predictors']] <- cbind.data.frame( model[['predictors']], as.data.frame(ras) ) + model[['predictors']] <- cbind.data.frame( model[['predictors']], raster::as.data.frame(ras) ) model[['predictors_names']] <- c( model[['predictors_names']], names(ras) ) model[['predictors_types']] <- rbind.data.frame(model[['predictors_types']], data.frame(predictors = names(ras), @@ -307,12 +316,12 @@ methods::setMethod( rm(ras, o ) } # Add to predictor objects, names, types and the object - model[['predictors']] <- cbind.data.frame( model[['predictors']], as.data.frame(cc) ) + model[['predictors']] <- cbind.data.frame( model[['predictors']], raster::as.data.frame(cc) ) 
model[['predictors_names']] <- c( model[['predictors_names']], names(cc) ) model[['predictors_types']] <- rbind.data.frame(model[['predictors_types']], data.frame(predictors = names(cc), type = "numeric" ) - ) + ) if( !all(names(cc) %in% model[['predictors_object']]$get_names()) ){ model[['predictors_object']]$data <- raster::addLayer(model[['predictors_object']]$data, cc) } @@ -336,7 +345,7 @@ methods::setMethod( names(ras_of) <- "spatial_offset" } # Save overall offset - ofs <- as.data.frame(ras_of, xy = TRUE) + ofs <- raster::as.data.frame(ras_of, xy = TRUE) names(ofs)[which(names(ofs)==names(ras_of))] <- "spatial_offset" model[['offset']] <- ofs # Also add offset object for faster extraction @@ -380,7 +389,7 @@ methods::setMethod( # --- # # Rename observation column to 'observed'. Needs to be consistent for INLA # FIXME: try and not use dplyr as dependency (although it is probably loaded already) - model$biodiversity[[id]]$observations <- model$biodiversity[[id]]$observations %>% dplyr::rename('observed' = x$biodiversity$get_columns_occ()[[id]]) + model$biodiversity[[id]]$observations <- model$biodiversity[[id]]$observations |> dplyr::rename('observed' = x$biodiversity$get_columns_occ()[[id]]) names(model$biodiversity[[id]]$observations) <- tolower(names(model$biodiversity[[id]]$observations)) # Also generally transfer everything to lower case # If the type is polygon, convert to regular sampled points per covered grid cells @@ -389,7 +398,7 @@ methods::setMethod( poly = guess_sf(model$biodiversity[[id]]$observations), template = emptyraster(x$predictors$get_data(df = FALSE)), field_occurrence = "observed" # renamed above - ) + ) model[['biodiversity']][[id]][['observations']] <- o |> as.data.frame() model[['biodiversity']][[id]][['type']] <- ifelse(model[['biodiversity']][[id]][['type']] == 'polpo', 'poipo', 'poipa') # Check and reset multiplication weights @@ -431,7 +440,7 @@ methods::setMethod( rm.na = FALSE) # Remove missing values as several engines can't deal with those easily - miss <- complete.cases(env) + miss <- stats::complete.cases(env) if(sum( !miss )>0 && getOption('ibis.setupmessages')) { myLog('[Setup]','yellow', 'Excluded ', sum( !miss ), ' observations owing to missing values in covariates!' ) } @@ -465,9 +474,11 @@ methods::setMethod( all( apply(env, 1, function(x) all(!is.na(x) )) ),msg = 'Missing values in extracted environmental predictors.' ) - # Check whether predictors should be refined and do so - if(settings$get('rm_corPred') && ('dummy' %in% model[['predictors_names']])){ - if(getOption('ibis.setupmessages')) myLog('[Estimation]','yellow','Removing highly correlated variables...') + # Biodiversity dataset specific predictor refinement if the option is set + if(settings$get("filter_predictors")!= "none"){ + if(getOption('ibis.setupmessages')) myLog('[Estimation]','yellow', paste0('Filtering predictors via ', + settings$get("filter_predictors"),'...')) + # Make backups test <- env;test$x <- NULL;test$y <- NULL;test$Intercept <- NULL # Ignore variables for which we have priors @@ -475,73 +486,41 @@ methods::setMethod( keep <- unique( as.character(x$priors$varnames()) ) if('spde'%in% keep) keep <- keep[which(keep!='spde')] # Remove SPDE where existing test <- test[,-which(names(test) %in% keep)] - assert_that(!any(keep %in% names(test))) - } else keep <- NULL - - co <- find_correlated_predictors(env = test, - keep = keep, - cutoff = getOption('ibis.corPred'), # Probably keep default, but maybe sth. 
to vary in the future - method = 'pearson') + assertthat::assert_that(!any(keep %in% names(test))) + } else {keep <- NULL} + # Add bias variable to keep as we risk filtering it out otherwise + if(!is.Waiver(settings$get("bias_variable"))) keep <- c(keep, settings$get("bias_variable") ) + + # Filter the predictors + # Depending on the option this function returns the variables to be removed. + co <- predictor_filter(env = test, + keep = keep, + cutoff = getOption('ibis.corPred'), # Probably keep default, but maybe sth. to vary in the future + method = settings$get("filter_predictors"), + observed = model[['biodiversity']][[id]]$observations[['observed']], + family = model[['biodiversity']][[id]]$family, + tune.type = "gic", + weight = NULL, + verbose = getOption('ibis.setupmessages') + ) # For all factor variables, remove those with only the minimal value (e.g. 0) fac_min <- apply(test[,model$predictors_types$predictors[which(model$predictors_types$type=='factor')]], 2, function(x) min(x,na.rm = TRUE)) fac_mean <- apply(test[,model$predictors_types$predictors[which(model$predictors_types$type=='factor')]], 2, function(x) mean(x,na.rm = TRUE)) co <- unique(co, names(which(fac_mean == fac_min)) ) # Now add to co all those variables where the mean equals the minimum, indicating only absences + # Remove variables if found if(length(co)>0){ - env %>% dplyr::select(-dplyr::all_of(co)) -> env + env |> dplyr::select(-dplyr::all_of(co)) -> env } - } else { co <- NULL } - # Make use of adaptive best subset selection - if(settings$get("varsel") == "abess"){ - if(getOption('ibis.setupmessages')) myLog('[Estimation]','yellow','Applying abess method to reduce predictors...') - if(!is.Waiver(x$priors)){ - keep <- unique( as.character(x$priors$varnames()) ) - if('spde'%in% keep) keep <- keep[which(keep!='spde')] # Remove SPDE where existing - } else keep <- NULL - - # If PPM, calculate points per grid cell first - if(model[['biodiversity']][[id]]$family == "poisson"){ - bg <- x$engine$get_data("template") - if(!is.Raster(bg)) bg <- emptyraster(x$predictors$get_data() ) - - obs <- aggregate_observations2grid(df = model[['biodiversity']][[id]]$observations, - template = bg, field_occurrence = "observed") |> - # Add pseudo absences - add_pseudoabsence(template = bg, settings = getOption("ibis.pseudoabsence")) - - envs <- get_rastervalue( - coords = obs[,c('x','y')], - env = model$predictors_object$get_data(df = FALSE)[[ model[['predictors_names']][which( model[['predictors_names']] %notin% co )] ]], - rm.na = T - ) - # Assert observations match environmental data points - obs <- obs[envs$ID,] - envs$ID <- NULL - assertthat::assert_that(nrow(obs) == nrow(envs), nrow(obs)>0, nrow(envs)>0) - } else { - obs <- model[['biodiversity']][[id]]$observations - envs <- env[,model[['predictors_names']][which( model[['predictors_names']] %notin% co )]] - assertthat::assert_that(any(obs$observed == 0), - nrow(obs)==nrow(envs)) - } - # Add abess here - co2 <- find_subset_of_predictors( - env = envs, - observed = obs$observed, - family = model[['biodiversity']][[id]]$family, - tune.type = "gic", - weight = NULL, - keep = keep - ) - co <- c(co, co2) |> unique() - } + } else { co <- NULL } # Save predictors extracted for biodiversity extraction model[['biodiversity']][[id]][['predictors']] <- env model[['biodiversity']][[id]][['predictors_names']] <- model[['predictors_names']][which( model[['predictors_names']] %notin% co )] model[['biodiversity']][[id]][['predictors_types']] <- 
model[['predictors_types']][model[['predictors_types']]$predictors %notin% co,]
-    }
+    }
+
+    # If the method of integration is weights and there are more than 2 datasets, combine
     if(method_integration == "weight" && length(model$biodiversity)>=2){
       if(getOption('ibis.setupmessages')) myLog('[Setup]','yellow','Experimental: Integration by weights assumes identical data parameters!')
@@ -577,9 +556,9 @@
     }
     # Warning if Np is larger than Nb
-    if(settings$get("varsel") == "none"){
+    if(settings$get("filter_predictors") == "none"){
       if( sum(x$biodiversity$get_observations() )-1 <= length(model$predictors_names)){
-        if(getOption('ibis.setupmessages')) myLog('[Setup]','red', 'There are more predictors than observations! Consider setting varsel= to \"reg\" ')
+        if(getOption('ibis.setupmessages')) myLog('[Setup]','red', 'More predictors than observations! Consider setting optim_hyperparam or filter_predictors!')
       }
     }
@@ -630,8 +609,8 @@
     if(nrow(zones)>0){
       # Now clip all predictors and background to this
       model$background <- suppressMessages(
-        suppressWarnings( sf::st_union( sf::st_intersection(zones, model$background), by_feature = TRUE) %>%
-                            sf::st_buffer(dist = 0) %>% # 0 distance buffer trick
+        suppressWarnings( sf::st_union( sf::st_intersection(zones, model$background), by_feature = TRUE) |>
+                            sf::st_buffer(dist = 0) |> # 0 distance buffer trick
                             sf::st_cast("MULTIPOLYGON")
         )
       )
@@ -678,7 +657,7 @@
       length(model[['biodiversity']][[1]][['expect']])>1,
       all(c("predictors","background","biodiversity") %in% names(model) ),
       length(model$biodiversity[[1]]$expect) == nrow(model$biodiversity[[1]]$predictors)
-      )
+    )
     # --------------------------------------------------------------------- #
     #### Engine specific code starts below ####
     # --------------------------------------------------------------------- #
@@ -831,7 +810,7 @@
             "poisson" = ilink(new[], link = "log")
           )
           if(is.Waiver(model$offset)){
-            ofs <- as.data.frame(new, xy = TRUE)
+            ofs <- raster::as.data.frame(new, xy = TRUE)
             names(ofs)[which(names(ofs)==names(new))] <- "spatial_offset"
             model[['offset']] <- ofs
             # Also add offset object for faster extraction
@@ -841,7 +820,7 @@
             news <- sum( model[['offset_object']], new, na.rm = TRUE)
             news <- raster::mask(news, x$background)
             model[['offset_object']] <- news
-            ofs <- as.data.frame(news, xy = TRUE)
+            ofs <- raster::as.data.frame(news, xy = TRUE)
             names(ofs)[which(names(ofs)=="layer")] <- "spatial_offset"
             model[['offset']] <- ofs
             rm(news)
@@ -927,11 +906,11 @@
           new <- out$get_data("prediction")
           # Back transforming offset to linear scale
           new[] <- switch (model$biodiversity[[id]]$family,
-            "binomial" = ilink(new[], link = "logit"),
-            "poisson" = ilink(new[], link = "log")
+                           "binomial" = ilink(new[], link = "logit"),
+                           "poisson" = ilink(new[], link = "log")
           )
           if(is.Waiver(model$offset)){
-            ofs <- as.data.frame(new, xy = TRUE)
+            ofs <- raster::as.data.frame(new, xy = TRUE)
             names(ofs)[which(names(ofs)==names(new))] <- "spatial_offset"
             model[['offset']] <- ofs
             # Also add offset object for faster extraction
@@ -941,7 +920,7 @@
             news <- sum( model[['offset_object']], new, na.rm = TRUE)
             news <- raster::mask(news, x$background)
             model[['offset_object']] <- news
-            ofs <- as.data.frame(news, xy = TRUE)
+            ofs <- raster::as.data.frame(news, xy = TRUE)
             names(ofs)[which(names(ofs)=="layer")] <- "spatial_offset"
             model[['offset']] <- ofs
             rm(news)
@@ -1032,7 +1011,7 @@
           names(new)
<- "spatial_offset" if(is.Waiver(model$offset)){ - ofs <- as.data.frame(new, xy = TRUE) + ofs <- raster::as.data.frame(new, xy = TRUE) names(ofs)[which(names(ofs)==names(new))] <- "spatial_offset" model[['offset']] <- ofs # Also add offset object for faster extraction @@ -1043,7 +1022,7 @@ methods::setMethod( news <- raster::mask(news, x$background) names(news) <- "spatial_offset" model[['offset_object']] <- news - ofs <- as.data.frame(news, xy = TRUE) + ofs <- raster::as.data.frame(news, xy = TRUE) names(ofs)[which(names(ofs)=="layer")] <- "spatial_offset" model[['offset']] <- ofs rm(news) @@ -1080,102 +1059,102 @@ methods::setMethod( } else if (inherits(x$engine,"BREG-Engine") ){ - # ----------------------------------------------------------- # - #### BREG Engine #### - assertthat::assert_that( - !(method_integration == "offset" && any(types == "poipa")), - msg = "Due to engine limitations BREG models do not support offsets for presence-absence models!" - ) - # For each formula, process in sequence - for(id in ids){ + # ----------------------------------------------------------- # + #### BREG Engine #### + assertthat::assert_that( + !(method_integration == "offset" && any(types == "poipa")), + msg = "Due to engine limitations BREG models do not support offsets for presence-absence models!" + ) + # For each formula, process in sequence + for(id in ids){ - model$biodiversity[[id]]$equation <- built_formula_breg( model$biodiversity[[id]] ) + model$biodiversity[[id]]$equation <- built_formula_breg( model$biodiversity[[id]] ) - # Remove those not part of the modelling - model2 <- model - model2$biodiversity <- NULL; model2$biodiversity[[id]] <- model$biodiversity[[id]] + # Remove those not part of the modelling + model2 <- model + model2$biodiversity <- NULL; model2$biodiversity[[id]] <- model$biodiversity[[id]] - # Run the engine setup script - model2 <- x$engine$setup(model2, settings) + # Run the engine setup script + model2 <- x$engine$setup(model2, settings) - # Now train the model and create a predicted distribution model - settings2 <- settings - if(id != ids[length(ids)] && method_integration == "prior") { - # No need to make predictions if we use priors only - settings2$set('inference_only', TRUE) - } else if(id != ids[length(ids)]){ - # For predictors and offsets - settings2$set('inference_only', FALSE) - } else { - settings2$set('inference_only', inference_only) - } - out <- x$engine$train(model2, settings2) - - # Add Prediction of model to next object if multiple are supplied - if(length(ids)>1 && id != ids[length(ids)]){ - if(method_integration == "predictor"){ - # Add to predictors frame - new <- out$get_data("prediction")[["mean"]] - pred_name <- paste0(model$biodiversity[[id]]$type, "_", make.names(model$biodiversity[[id]]$name),"_mean") - names(new) <- pred_name - # Add the object to the overall prediction object - model$predictors_object$data <- raster::addLayer(model$predictors_object$get_data(), new) - - # Now for each biodiversity dataset and the overall predictors - # extract and add as variable - for(k in names(model$biodiversity)){ - env <- as.data.frame( - raster::extract(new, model$biodiversity[[k]]$observations[,c('x','y')]) ) - # Rename to current id dataset - names(env) <- pred_name - # Add - model$biodiversity[[k]]$predictors <- cbind(model$biodiversity[[k]]$predictors, env) - model$biodiversity[[k]]$predictors_names <- c(model$biodiversity[[k]]$predictors_names, - names(env) ) - model$biodiversity[[k]]$predictors_types <- rbind( - 
model$biodiversity[[k]]$predictors_types, - data.frame(predictors = names(env), type = c('numeric')) + # Now train the model and create a predicted distribution model + settings2 <- settings + if(id != ids[length(ids)] && method_integration == "prior") { + # No need to make predictions if we use priors only + settings2$set('inference_only', TRUE) + } else if(id != ids[length(ids)]){ + # For predictors and offsets + settings2$set('inference_only', FALSE) + } else { + settings2$set('inference_only', inference_only) + } + out <- x$engine$train(model2, settings2) + + # Add Prediction of model to next object if multiple are supplied + if(length(ids)>1 && id != ids[length(ids)]){ + if(method_integration == "predictor"){ + # Add to predictors frame + new <- out$get_data("prediction")[["mean"]] + pred_name <- paste0(model$biodiversity[[id]]$type, "_", make.names(model$biodiversity[[id]]$name),"_mean") + names(new) <- pred_name + # Add the object to the overall prediction object + model$predictors_object$data <- raster::addLayer(model$predictors_object$get_data(), new) + + # Now for each biodiversity dataset and the overall predictors + # extract and add as variable + for(k in names(model$biodiversity)){ + env <- as.data.frame( + raster::extract(new, model$biodiversity[[k]]$observations[,c('x','y')]) ) + # Rename to current id dataset + names(env) <- pred_name + # Add + model$biodiversity[[k]]$predictors <- cbind(model$biodiversity[[k]]$predictors, env) + model$biodiversity[[k]]$predictors_names <- c(model$biodiversity[[k]]$predictors_names, + names(env) ) + model$biodiversity[[k]]$predictors_types <- rbind( + model$biodiversity[[k]]$predictors_types, + data.frame(predictors = names(env), type = c('numeric')) + ) + } + # Add to overall predictors + model$predictors <- cbind(model$predictors, as.data.frame(new)) + model$predictors_names <- c(model$predictors_names, names(new)) + model$predictors_types <- rbind(model$predictors_types, + data.frame(predictors = names(new), type = c('numeric'))) + + } else if(method_integration == "offset"){ + # Adding the prediction as offset + new <- out$get_data("prediction") + # Back transforming offset to linear scale + new[] <- switch (model$biodiversity[[id]]$family, + "binomial" = ilink(new[], link = "logit"), + "poisson" = ilink(new[], link = "log") ) + if(is.Waiver(model$offset)){ + ofs <- raster::as.data.frame(new, xy = TRUE) + names(ofs)[which(names(ofs)==names(new))] <- "spatial_offset" + model[['offset']] <- ofs + # Also add offset object for faster extraction + model[['offset_object']] <- new + } else { + # New offset + news <- sum( model[['offset_object']], new, na.rm = TRUE) + news <- raster::mask(news, x$background) + model[['offset_object']] <- news + ofs <- raster::as.data.frame(news, xy = TRUE) + names(ofs)[which(names(ofs)=="layer")] <- "spatial_offset" + model[['offset']] <- ofs + rm(news) + } + rm(new) + } else if(method_integration == "prior"){ + # Use the previous model to define and set priors + po <- get_priors(out, x$engine$name) + model$priors <- po } - # Add to overall predictors - model$predictors <- cbind(model$predictors, as.data.frame(new)) - model$predictors_names <- c(model$predictors_names, names(new)) - model$predictors_types <- rbind(model$predictors_types, - data.frame(predictors = names(new), type = c('numeric'))) - - } else if(method_integration == "offset"){ - # Adding the prediction as offset - new <- out$get_data("prediction") - # Back transforming offset to linear scale - new[] <- switch (model$biodiversity[[id]]$family, - 
"binomial" = ilink(new[], link = "logit"), - "poisson" = ilink(new[], link = "log") - ) - if(is.Waiver(model$offset)){ - ofs <- as.data.frame(new, xy = TRUE) - names(ofs)[which(names(ofs)==names(new))] <- "spatial_offset" - model[['offset']] <- ofs - # Also add offset object for faster extraction - model[['offset_object']] <- new - } else { - # New offset - news <- sum( model[['offset_object']], new, na.rm = TRUE) - news <- raster::mask(news, x$background) - model[['offset_object']] <- news - ofs <- as.data.frame(news, xy = TRUE) - names(ofs)[which(names(ofs)=="layer")] <- "spatial_offset" - model[['offset']] <- ofs - rm(news) - } - rm(new) - } else if(method_integration == "prior"){ - # Use the previous model to define and set priors - po <- get_priors(out, x$engine$name) - model$priors <- po - } } # End of multiple ides - } + } } else if (inherits(x$engine,"GLMNET-Engine") ){ # ----------------------------------------------------------- # #### GLMNET Engine #### @@ -1245,7 +1224,7 @@ methods::setMethod( "poisson" = ilink(new[], link = "log") ) if(is.Waiver(model$offset)){ - ofs <- as.data.frame(new, xy = TRUE) + ofs <- raster::as.data.frame(new, xy = TRUE) names(ofs)[which(names(ofs)==names(new))] <- "spatial_offset" model[['offset']] <- ofs # Also add offset object for faster extraction @@ -1255,7 +1234,7 @@ methods::setMethod( news <- sum( model[['offset_object']], new, na.rm = TRUE) news <- raster::mask(news, x$background) model[['offset_object']] <- news - ofs <- as.data.frame(news, xy = TRUE) + ofs <- raster::as.data.frame(news, xy = TRUE) names(ofs)[which(names(ofs)=="layer")] <- "spatial_offset" model[['offset']] <- ofs rm(news) @@ -1271,24 +1250,24 @@ methods::setMethod( # End of GLMNET engine } else { stop('Specified Engine not implemented yet.') } - if(is.null(out)) return(NULL) + if(is.null(out)) return(NULL) - if(getOption('ibis.setupmessages')) myLog('[Done]','green',paste0('Completed after ', round( as.numeric(out$settings$duration()), 2),' ',attr(out$settings$duration(),'units') )) + if(getOption('ibis.setupmessages')) myLog('[Done]','green',paste0('Completed after ', round( as.numeric(out$settings$duration()), 2),' ',attr(out$settings$duration(),'units') )) - # Clip to limits again to be sure - if(!is.Waiver(x$limits)) { - if(settings$get('inference_only')==FALSE){ - out <- out$set_data("prediction", raster::mask(out$get_data("prediction"), model$background)) + # Clip to limits again to be sure + if(!is.Waiver(x$limits)) { + if(settings$get('inference_only')==FALSE){ + out <- out$set_data("prediction", raster::mask(out$get_data("prediction"), model$background)) + } + out$settings$set("has_limits", TRUE) + } else { + out$settings$set("has_limits", FALSE) } - out$settings$set("has_limits", TRUE) - } else { - out$settings$set("has_limits", FALSE) - } - # Stop logging if specified - if(!is.Waiver(x$log)) x$log$close() + # Stop logging if specified + if(!is.Waiver(x$log)) x$log$close() - # Return created object - return(out) + # Return created object + return(out) } ) diff --git a/R/utils-bart.R b/R/utils-bart.R index 9dcf96cf..b5304a9b 100644 --- a/R/utils-bart.R +++ b/R/utils-bart.R @@ -29,16 +29,16 @@ built_formula_bart <- function(obj){ if(getOption('ibis.setupmessages')) myLog('[Estimation]','yellow','Use custom model equation') form <- to_formula(obj$equation) # If response is missing, add manually - if(attr(terms(form), "response")==0){ + if(attr(stats::terms(form), "response")==0){ if(obj$type == "poipo"){ - form <- update.formula(form, "observed/w ~ .") + form <- 
stats::update.formula(form, "observed/w ~ .") } else { - form <- update.formula(form, "observed ~ .") + form <- stats::update.formula(form, "observed ~ .") } } assertthat::assert_that( is.formula(form), - attr(terms(form), "response")==1, # Has Response + attr(stats::terms(form), "response")==1, # Has Response all( all.vars(form) %in% c('observed','w', model[['predictors_names']]) ) ) } @@ -170,7 +170,7 @@ bart_partial_effect <- function (model, x.vars = NULL, equal = FALSE, ) ) names(ms) <- c("mean","sd", "q05", "q50", "q95", "mode") - if(transform) ms[,c("mean","q05","q50","q95","mode")] <- apply(ms[,c("mean","q05","q50","q95","mode")], 2, pnorm) + if(transform) ms[,c("mean","q05","q50","q95","mode")] <- apply(ms[,c("mean","q05","q50","q95","mode")], 2, stats::pnorm) ms$cv <- ms$sd / ms$mean ms$variable <- pd$xlbs[[i]] df <- cbind(df, ms) @@ -263,10 +263,10 @@ bart_partial_space <- function(model, envs, x.vars = NULL, equal = FALSE, smooth colnames(dfbin) <- c(0, 1) dfbin <- reshape2::melt(dfbin) if (transform == TRUE) { - dfbin$value <- pnorm(dfbin$value) + dfbin$value <- stats::pnorm(dfbin$value) } # FIXME: To replace with base::aggregate to get rid of dplyr dependency - dfbin <- dfbin %>% group_by(variable) %>% summarize(value = median(value)) %>% + dfbin <- dfbin |> dplyr::group_by(variable) |> dplyr::summarize(value = stats::median(value)) |> data.frame() colnames(dfbin) <- c("is", "becomes") dfbin$is <- as.numeric(as.character(dfbin$is)) @@ -287,8 +287,8 @@ bart_partial_space <- function(model, envs, x.vars = NULL, equal = FALSE, smooth } } else { # Nothing binary, calculate median - q50 <- pnorm(apply(pd$fd[[i]], 2, median)) - if (transform == TRUE) { q50 <- pnorm(q50) } + q50 <- stats::pnorm(apply(pd$fd[[i]], 2, median)) + if (transform == TRUE) { q50 <- stats::pnorm(q50) } df <- data.frame(x = pd$levs[[i]], med = q50) nmax <- length(df$x) xmeds <- (df$x[2:nmax] - df$x[1:(nmax - 1)])/2 + df$x[1:(nmax - 1)] diff --git a/R/utils-breg.R b/R/utils-breg.R index 62460060..22754d41 100644 --- a/R/utils-breg.R +++ b/R/utils-breg.R @@ -1,194 +1,194 @@ -#' Built formula for BREG model -#' -#' @description -#' This function built a formula for a `engine_breg()` model. -#' @param obj A [`list()`] object containing the prepared model data for a given biodiversity dataset. -#' @author Martin Jung -#' @note Function is not meant to be run outside the train() call. -#' @keywords internal -#' @noRd -built_formula_breg <- function(obj){ - assertthat::assert_that( - is.list(obj), - length(obj) > 0, - assertthat::has_name(obj, "observations"), - assertthat::has_name(obj, "equation"), - assertthat::has_name(obj, "predictors_names"), - msg = "Error in model object. This function is not meant to be called outside ouf train()." 
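# Editor's sketch (not package code): why bart_partial_effect() applies
# stats::pnorm() when transform = TRUE. Binary BART models (e.g. via dbarts)
# operate on the probit scale, so latent-scale summaries are mapped back to
# probabilities with the standard normal CDF. `latent` is hypothetical.
latent <- matrix(rnorm(500), ncol = 5)       # probit-scale posterior draws
summ   <- apply(latent, 2, mean)             # summarise per grid point first
probs  <- stats::pnorm(summ)                 # then back-transform the summaries
stopifnot(all(probs >= 0 & probs <= 1))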
- ) - - # Default equation found - if(obj$equation =='' || is.Waiver(obj$equation)){ - # Construct formula with all variables - form <- paste( 'observed' , ' ~ ') - # Add linear predictors - form <- paste(form, paste0(obj$predictors_names, collapse = ' + ')) - # Convert to formula - form <- to_formula(form) - } else{ - if(getOption('ibis.setupmessages')) myLog('[Estimation]','yellow','Use custom model equation') - form <- to_formula(obj$equation) - # If response is missing, add manually - if(attr(terms(form), "response")==0){ - form <- update.formula(form, "observed ~ .") - } - # Security checks - assertthat::assert_that( - is.formula(form), - attr(terms(form), "response")==1, # Has Response - all( all.vars(form) %in% c('observed', obj[['predictors_names']]) ) - ) - } - - return(form) -} - -#' Setup a prior for `Boom` engine model -#' -#' @description -#' For more information see helpfile of [`BREGPrior`] and the respective package -#' helpfiles -#' @param form A [`formula`] object. -#' @param data A [`data.frame`] with all variables (response and predictors) in the formula. -#' Needs to contain the observed y_hat variable as well. -#' @param priors A [`PriorList`] object with [`BREGPrior`] priors. -#' @param family A [`character`] object giving either `poisson` or `binomial`. -#' @param exposure A [`numeric`] vector giving the exposure for `poisson` family priors. -#' @returns A [`SpikeSlabPriorBase`] object for use with a [`Boom`] engine trained model -#' @keywords utils, internal -#' @noRd -setup_prior_boom <- function(form, data, priors, family, exposure = NULL){ - assertthat::assert_that( - is.formula(form), - is.data.frame(data), - inherits(priors, "PriorList"), - is.null(exposure) || is.numeric(exposure), - is.character(family) - ) - family <- match.arg(family, c("poisson", "binomial"), several.ok = FALSE) - - # Check that all terms are in the data.frame - assertthat::assert_that( - all( attr(terms.formula(form),"term.labels") %in% names(data) ), - "observed" %in% names(data) - ) - # Create model matrix - mm <- model.matrix.default(object = form, data = data) - - # Expected model size: - # Conservatively just assume the priors are relevant (Default is 1) - esize <- priors$length() - - # Get variable names - vars <- priors$varnames() - - # Optional coefficient estimates - # If any are set, define optional coefficient estimates - co <- vector(length = ncol(mm));co[] <- 0;names(co) <- colnames(mm) - co["(Intercept)"] <- mean( mean(data[["observed"]]) ) - for(val in vars){ - z <- priors$get(val, what = "value") - if(is.null(z)) next() - co[val] <- z - } - - # Option inclusion probabilities - # Probability of inclusion for each variable - if(esize < (ncol(mm)) ) { - #Specify default as each having equal probability - co.ip <- rep(esize/ncol(mm), ncol(mm)) - } else { - co.ip <- rep(1, ncol(mm)) - } - names(co.ip) <- colnames(mm) - # Now set priors for those where set - for(val in vars){ - z <- priors$get(val, what = "prob") - if(is.null(z)) next() - co.ip[val] <- z - } - - # Create priors depending on input family - if(family == "poisson"){ - assertthat::assert_that(!is.null(exposure)) - pp <- PoissonZellnerPrior(predictors = mm, - counts = data$observed, - exposure = exposure, - expected.model.size = esize, - optional.coefficient.estimate = co, - prior.inclusion.probabilities = co.ip - ) - } else { - # For binomial - pp <- LogitZellnerPrior(predictors = mm, - successes = data$observed, - trials = exposure, - expected.model.size = esize, - optional.coefficient.estimate = co, - 
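# Editor's sketch (not package code): the default-formula branch above can be
# expressed with base R's stats::reformulate(), which builds the same
# "observed ~ x1 + x2 + ..." formula from a character vector of predictors.
preds <- c("bio01", "bio12", "elevation")    # hypothetical predictor names
stats::reformulate(preds, response = "observed")
# observed ~ bio01 + bio12 + elevation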
prior.inclusion.probabilities = co.ip - ) - } - # Return the created prior object - assertthat::assert_that(inherits(pp, "SpikeSlabPriorBase")) - return(pp) -} - -#' Prediction with `Boom` package for breg models -#' -#' @note By Default 20% of the iterations are considered as burnin. -#' @param obj A [list] containing the fitted model -#' @param newdata A [`data.frame`] with all the predictor used for model fitting -#' @param fam A [`character`] denoting the family used for estimation. This -#' is necessary as prediction methods differ among the various functions. -#' @param params A [`list`] with paramters for estimation. Normally created during -#' model fitting. -#' @param w A [`numeric`] [`vector`] containing the exposure variables for PPMs. Can -#' be \code{NULL} if the model is not a PPM. -#' @returns A [`data.frame`] with the respective prediction -#' @keywords utils, internal -#' @noRd -predict_boom <- function(obj, newdata, fam, params, w = NULL) { - assertthat::assert_that( - is.list(obj), - is.data.frame(newdata) || inherits(newdata, "SpatialPixelsDataFrame"), - is.character(fam), - is.list(params), - is.null(w) || is.numeric(w) - ) - check_package("BoomSpikeSlab") - - # Make a prediction - if(fam == "poisson"){ - suppressWarnings( - pred_breg <- BoomSpikeSlab::predict.poisson.spike( - object = obj, - newdata = newdata, - exposure = w, - burn = ceiling(params$iter*0.2), - type = params$type, - mean.only = FALSE # Return full posterior - ) - ) - } else if(fam == "binomial"){ - suppressWarnings( - pred_breg <- BoomSpikeSlab::predict.logit.spike( - object = obj, - newdata = newdata, - burn = ceiling(params$iter*0.2), - type = params$type, - mean.only = FALSE # Return full posterior - ) - ) - } else { - suppressWarnings( - pred_breg <- BoomSpikeSlab::predict.lm.spike( - object = obj, - newdata = newdata, - burn = ceiling(params$iter*0.2), - type = params$type, - mean.only = FALSE # Return full posterior - ) - ) - } - return(pred_breg) -} +#' Built formula for BREG model +#' +#' @description +#' This function built a formula for a `engine_breg()` model. +#' @param obj A [`list()`] object containing the prepared model data for a given biodiversity dataset. +#' @author Martin Jung +#' @note Function is not meant to be run outside the train() call. +#' @keywords internal +#' @noRd +built_formula_breg <- function(obj){ + assertthat::assert_that( + is.list(obj), + length(obj) > 0, + assertthat::has_name(obj, "observations"), + assertthat::has_name(obj, "equation"), + assertthat::has_name(obj, "predictors_names"), + msg = "Error in model object. This function is not meant to be called outside ouf train()." 
+ ) + + # Default equation found + if(obj$equation =='' || is.Waiver(obj$equation)){ + # Construct formula with all variables + form <- paste( 'observed' , ' ~ ') + # Add linear predictors + form <- paste(form, paste0(obj$predictors_names, collapse = ' + ')) + # Convert to formula + form <- to_formula(form) + } else{ + if(getOption('ibis.setupmessages')) myLog('[Estimation]','yellow','Use custom model equation') + form <- to_formula(obj$equation) + # If response is missing, add manually + if(attr(stats::terms(form), "response")==0){ + form <- stats::update.formula(form, "observed ~ .") + } + # Security checks + assertthat::assert_that( + is.formula(form), + attr(stats::terms(form), "response")==1, # Has Response + all( all.vars(form) %in% c('observed', obj[['predictors_names']]) ) + ) + } + + return(form) +} + +#' Setup a prior for `Boom` engine model +#' +#' @description +#' For more information see helpfile of [`BREGPrior`] and the respective package +#' helpfiles +#' @param form A [`formula`] object. +#' @param data A [`data.frame`] with all variables (response and predictors) in the formula. +#' Needs to contain the observed y_hat variable as well. +#' @param priors A [`PriorList`] object with [`BREGPrior`] priors. +#' @param family A [`character`] object giving either `poisson` or `binomial`. +#' @param exposure A [`numeric`] vector giving the exposure for `poisson` family priors. +#' @returns A [`SpikeSlabPriorBase`] object for use with a [`Boom`] engine trained model +#' @keywords utils, internal +#' @noRd +setup_prior_boom <- function(form, data, priors, family, exposure = NULL){ + assertthat::assert_that( + is.formula(form), + is.data.frame(data), + inherits(priors, "PriorList"), + is.null(exposure) || is.numeric(exposure), + is.character(family) + ) + family <- match.arg(family, c("poisson", "binomial"), several.ok = FALSE) + + # Check that all terms are in the data.frame + assertthat::assert_that( + all( attr(stats::terms.formula(form),"term.labels") %in% names(data) ), + "observed" %in% names(data) + ) + # Create model matrix + mm <- stats::model.matrix.default(object = form, data = data) + + # Expected model size: + # Conservatively just assume the priors are relevant (Default is 1) + esize <- priors$length() + + # Get variable names + vars <- priors$varnames() + + # Optional coefficient estimates + # If any are set, define optional coefficient estimates + co <- vector(length = ncol(mm));co[] <- 0;names(co) <- colnames(mm) + co["(Intercept)"] <- mean( mean(data[["observed"]]) ) + for(val in vars){ + z <- priors$get(val, what = "value") + if(is.null(z)) next() + co[val] <- z + } + + # Option inclusion probabilities + # Probability of inclusion for each variable + if(esize < (ncol(mm)) ) { + #Specify default as each having equal probability + co.ip <- rep(esize/ncol(mm), ncol(mm)) + } else { + co.ip <- rep(1, ncol(mm)) + } + names(co.ip) <- colnames(mm) + # Now set priors for those where set + for(val in vars){ + z <- priors$get(val, what = "prob") + if(is.null(z)) next() + co.ip[val] <- z + } + + # Create priors depending on input family + if(family == "poisson"){ + assertthat::assert_that(!is.null(exposure)) + pp <- BoomSpikeSlab::PoissonZellnerPrior(predictors = mm, + counts = data$observed, + exposure = exposure, + expected.model.size = esize, + optional.coefficient.estimate = co, + prior.inclusion.probabilities = co.ip + ) + } else { + # For binomial + pp <- BoomSpikeSlab::LogitZellnerPrior(predictors = mm, + successes = data$observed, + trials = exposure, + 
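# Editor's sketch (not package code) of the inclusion-probability logic in
# setup_prior_boom(): with an expected model size `esize` over p model-matrix
# columns, every coefficient starts at a prior inclusion probability of
# esize/p, and variables carrying an explicit BREGPrior are then overwritten.
esize <- 2                         # e.g. two informative priors supplied
p     <- 10                        # columns of the model matrix
co.ip <- rep(esize / p, p)         # defaults sum to the expected model size
co.ip[3] <- 0.9                    # user-set inclusion probability
sum(co.ip)                         # new expected number of included terms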
expected.model.size = esize, + optional.coefficient.estimate = co, + prior.inclusion.probabilities = co.ip + ) + } + # Return the created prior object + assertthat::assert_that(inherits(pp, "SpikeSlabPriorBase")) + return(pp) +} + +#' Prediction with `Boom` package for breg models +#' +#' @note By Default 20% of the iterations are considered as burnin. +#' @param obj A [list] containing the fitted model +#' @param newdata A [`data.frame`] with all the predictor used for model fitting +#' @param fam A [`character`] denoting the family used for estimation. This +#' is necessary as prediction methods differ among the various functions. +#' @param params A [`list`] with paramters for estimation. Normally created during +#' model fitting. +#' @param w A [`numeric`] [`vector`] containing the exposure variables for PPMs. Can +#' be \code{NULL} if the model is not a PPM. +#' @returns A [`data.frame`] with the respective prediction +#' @keywords utils, internal +#' @noRd +predict_boom <- function(obj, newdata, fam, params, w = NULL) { + assertthat::assert_that( + is.list(obj), + is.data.frame(newdata) || inherits(newdata, "SpatialPixelsDataFrame"), + is.character(fam), + is.list(params), + is.null(w) || is.numeric(w) + ) + check_package("BoomSpikeSlab") + + # Make a prediction + if(fam == "poisson"){ + suppressWarnings( + pred_breg <- BoomSpikeSlab::predict.poisson.spike( + object = obj, + newdata = newdata, + exposure = w, + burn = ceiling(params$iter*0.2), + type = params$type, + mean.only = FALSE # Return full posterior + ) + ) + } else if(fam == "binomial"){ + suppressWarnings( + pred_breg <- BoomSpikeSlab::predict.logit.spike( + object = obj, + newdata = newdata, + burn = ceiling(params$iter*0.2), + type = params$type, + mean.only = FALSE # Return full posterior + ) + ) + } else { + suppressWarnings( + pred_breg <- BoomSpikeSlab::predict.lm.spike( + object = obj, + newdata = newdata, + burn = ceiling(params$iter*0.2), + type = params$type, + mean.only = FALSE # Return full posterior + ) + ) + } + return(pred_breg) +} diff --git a/R/utils-gdb.R b/R/utils-gdb.R index cecb405b..9f73f701 100644 --- a/R/utils-gdb.R +++ b/R/utils-gdb.R @@ -1,300 +1,300 @@ -#' Built formula for GDB model -#' -#' @description -#' This function built a formula for a `engine_gdb()` model. -#' @param model A [`list()`] object containing the prepared model data for a given biodiversity dataset. -#' @param x A [`BiodiversityDistribution`] object. -#' @param id The id for the species formula. -#' @param settings A [`Settings`] object. -#' @author Martin Jung -#' @note Function is not meant to be run outside the train() call. -#' @keywords internal -#' @noRd -built_formula_gdb <- function(model, id, x, settings){ - assertthat::assert_that( - is.list(model), - length(model) > 0, - assertthat::has_name(model, "predictors"), - inherits(x, "BiodiversityDistribution"), - inherits(settings, 'Settings'), - is.character(id) || is.Id(id), - msg = "Error in model object. This function is not meant to be called outside ouf train()." 
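# Editor's sketch (not package code): the burn-in rule used by predict_boom().
# With mean.only = FALSE the BoomSpikeSlab predict methods return all posterior
# draws; the first 20% of iterations are discarded via the `burn` argument.
# `draws` below is a hypothetical observations-by-iterations matrix.
iter <- 1000
burn <- ceiling(iter * 0.2)                  # 200 draws dropped as burn-in
draws <- matrix(rnorm(5 * iter), nrow = 5)
kept  <- draws[, -seq_len(burn), drop = FALSE]
rowMeans(kept)                               # posterior mean per observation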
- ) - # Get object for id - obj <- model$biodiversity[[id]] - # Extract basic stats from the model object - types <- as.character( sapply( model$biodiversity, function(x) x$type ) ) - fams <- as.character( sapply( model$biodiversity, function(z) z$family ) ) - bionames = sapply(model$biodiversity, function(x) x$name) - ids <- names(model$biodiversity) - priors <- model$priors - - # Default equation found - if(obj$equation == '' || is.Waiver(obj$equation)){ - # Construct formula with all variables - form <- "observed ~ " - - # Use only variables that have sufficient covariate range for training - # Finally check that a minimum of unique numbers are present in the predictor range and if not, remove them - covariates <- rm_insufficient_covs(model = obj, tr = 4) - if(is.null(covariates)) stop("Not enough variance in training data to fit a SDM!") - - if(!is.Waiver(priors)){ - # Loop through all provided GDB priors - supplied_priors <- as.vector(priors$varnames()) - for(v in supplied_priors){ - if(v %notin% covariates) next() # In case the variable has been removed - # First add linear effects - form <- paste(form, paste0('bmono(', v, - ', constraint = \'', priors$get(v) ,'\'', - ')', collapse = ' + ' ), ' + ' ) - } - # Add linear and smooth effects for all missing ones - miss <- covariates[covariates %notin% supplied_priors] - if(length(miss)>0){ - # Add linear predictors - form <- paste(form, paste0('bols(',miss,')', collapse = ' + ')) - if(!settings$get('only_linear')){ - # And smooth effects for all numeric data - miss <- miss[ miss %in% obj$predictors_types$predictors[which(obj$predictors_types$type=="numeric")] ] - form <- paste(form, ' + ', paste0('bbs(', miss,', knots = 4)', - collapse = ' + ' - )) - } - } - } else { - # Add linear predictors - form <- paste(form, paste0('bols(',covariates,')',collapse = ' + ')) - if(settings$get('only_linear') == FALSE){ - # And smooth effects - form <- paste(form, ' + ', paste0('bbs(', - covariates[which(covariates %in% obj$predictors_types$predictors[obj$predictors_types$type=="numeric"] )],', knots = 4)', - collapse = ' + ' - )) - } - # Add also random effect if there are any factors? THIS currently crashes when there are too few factors - # if(any(model$predictors_types$type=="factor")){ - # form <- paste(form, ' + ' ,paste0('brandom(', - # model$biodiversity[[id]]$predictors_types$predictors[which(model$biodiversity[[id]]$predictors_types$type == 'factor')], - # ')',collapse = " + ")) - # } - } - # Convert to formula - form <- to_formula(form) - # Add offset if specified - if(!is.Waiver(model$offset) ){ form <- update.formula(form, paste0('~ . + offset(spatial_offset)') ) } - if( length( grep('Spatial',x$get_latent() ) ) > 0 ){ - # Update with spatial term - form <- update.formula(form, paste0(" ~ . 
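# Editor's sketch (not package code): how the mboost base-learner formula above
# is assembled as a string. Variables with a monotonicity prior get bmono()
# with the prior's constraint; the remaining variables get linear bols() terms
# and, for non-linear fits, bbs() smooths. Names below are hypothetical.
with_prior <- c(temperature = "increasing")
free_vars  <- c("precipitation", "elevation")
form <- paste0(
  "observed ~ ",
  paste0("bmono(", names(with_prior), ", constraint = '", with_prior, "')", collapse = " + "),
  " + ", paste0("bols(", free_vars, ")", collapse = " + "),
  " + ", paste0("bbs(", free_vars, ", knots = 4)", collapse = " + ")
)
stats::as.formula(form)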
+ ", - x$engine$get_equation_latent_spatial()) - ) - } - } else{ - if(getOption('ibis.setupmessages')) myLog('[Estimation]','yellow','Use custom model equation') - form <- to_formula(obj$equation) - # If response is missing, add manually - if(attr(terms(form), "response")==0){ - form <- update.formula(form, "observed ~ .") - } - # Check that bols/bbs are in terms and if not add them for each variable - if( settings$get("only_linear") ){ - if( length(grep("bols",attr(terms(form2), "term.labels") ))==0 ){ - # Assume that each variable as none, so add - form <- as.formula(paste0("observed ~", paste0("bols(", obj$predictors_names, ")",collapse = " + "))) - } - } else { - if( length(grep("bbs",attr(terms(form2), "term.labels") ))==0 ){ - # Assume that each variable as none, so add - form <- as.formula(paste0("observed ~", paste0("bbs(", obj$predictors_names, ")",collapse = " + "))) - } - } - - assertthat::assert_that( - is.formula(form), - attr(terms(form), "response")==1, # Has Response - all( all.vars(form) %in% c('observed', obj[['predictors_names']]) ) - ) - } - - return(form) -} - -#' Use a fitted model for creating a new class prediction in raster form -#' -#' @param fit A fitted [`mboost`] model with [`binomial`] distribution -#' @param nd A new data.frame with all predictiors used in fit -#' @param template A [`Raster`] object that can be used as spatial template. -#' @returns A [`RasterLayer`] containing a presence-absence prediction. -#' @keywords utils -#' @noRd -predict_gdbclass <- function(fit, nd, template){ - assertthat::assert_that( - inherits(fit, 'mboost'), - !is.null( grep('Binomial', fit$family@name,ignore.case = TRUE) ), - is.data.frame(nd), - inherits(template,'RasterLayer') - ) - # Redo a template to be sure - template <- emptyraster(template) - - # Remove missing data in newdata data.frame - nd$cellid <- rownames(nd) - nd <- subset(nd, complete.cases(nd)) - - suppressWarnings( - pred_gdb <- mboost::predict.mboost(object = fit, newdata = nd, - type = 'class', aggregate = 'sum') - ) - # Fill output - prediction <- emptyraster(template) - prediction[as.numeric(nd$cellid)] <- pred_gdb - prediction[prediction < max(as.numeric(pred_gdb),na.rm = T)] <- 0; prediction[prediction >0] <- 1 - names(prediction) <- 'presence' - return(prediction) -} - -#' Sanitize mboost summary output -#' -#' @description -#' Extracts the coefficients and selection frequencies from a [`mboost`] model. -#' @param obj A fitted [`mboost`] object. -#' @noRd -#' @keywords internal -clean_mboost_summary <- function(obj){ - assertthat::assert_that( - inherits(obj, "mboost") - ) - - # Get Variable importance - vi <- mboost::varimp( obj ) - vi <- sort( vi[which(vi>0)],decreasing = TRUE ) - - # Get coefficients - co <- mboost:::extract(obj, "coefficient") - co <- co[names(vi)] - assertthat::assert_that(all(names(vi) == names(co))) - # Calculate coefficient. 
If smooth, calculate mean of the differential between knots - co <- sapply(co, function(x) { - if(length(x)>2){ - mean(diff(x)) - } else x[2] - }) - - # Now split up the names and types - types <- sapply(names(vi), function(z) strsplit(z, "\\(")[[1]][1]) |> as.vector() - vars <- sapply(names(vi), function(z) strsplit(z, "\\(")[[1]][2]) |> as.vector() - vars <- gsub(vars, pattern="\\)",replacement="") - - # Construct tibble - out <- tibble::tibble( - variable = vars, - type = types, - varimp = vi, - beta = co - ) - return(out) -} - -#' Check training predictor complexity -#' -#' @description -#' This internal function tests an existing set of covariates (usually the training data) -#' on the number of unique values within. -#' If fewer values than a given threshold (\code{'tr'}) is detected, then the predictor is removed, thus -#' reducing complexity. -#' @note Maybe in the future a more cleverer solution could be thought of, for instance using a singular value decompoistion? -#' @param model A [`list`] of a model object containing the various predictors and biodiversity occurence information. -#' @param tr A [`numeric`] value describing a threshold of minimum unique values to be retained. -#' @returns A [`vector`] with the variables that fullfill the threshold. -#' @keywords internal -#' @noRd -rm_insufficient_covs <- function(model, tr = 5){ - assertthat::assert_that( - is.list(model), - is.numeric(tr), - tr > 0 - ) - - # Check that biodiversity information is present - assertthat::assert_that( - assertthat::has_name(model, "observations"), - assertthat::has_name(model, "predictors_names") - ) - - # Now get all continuous ones - vars_num <- model$predictors_types$predictors[model$predictors_types$type=="numeric"] - vars_fac <- model$predictors_types$predictors[model$predictors_types$type=="factor"] - - vars_uniques <- apply(model$predictors[,vars_num], 2, function(x) length(unique(x,na.rm = TRUE)) ) - - # Get all variables smaller than the threshold and return the original data.frame without them - sufficient <- which(vars_uniques >= tr) - - # Get the factor variables in it as well - if(length(vars_fac)>0){ - vars_uniques <- apply(model$predictors[vars_fac], 2, function(x) length(unique(x,na.rm = TRUE)) ) - # Get all factor variables with at least 2 levels - sufficient_fac <- which(vars_uniques >= 2) - if(length(sufficient_fac)>0){ - sufficient <- c(names(sufficient), names(sufficient_fac)) - } - } else { - sufficient <- names(sufficient) - } - assertthat::assert_that(all(sufficient %in% model$predictors_names)) # This should return a character of covariate names - if(length(sufficient)==0){ - return(NULL) - } else { - return(sufficient) - } -} - -#' Calculate weights for Point Process models -#' -#' @param df The [`data.frame`] for which weights are to be calculated. -#' @param presence A [`vector`] with the observed species. Has to be in range \code{0} to \code{Inf}. -#' @param bg A background [`raster`] layer. -#' @param use_area A [`logical`] on whether area is to be used instead of grid counts. -#' @param weight A [`numeric`] weight to be used in down-weighted regressions. -#' @param type Accepting either “Infinitely weighted logistic regression” \code{'IWLR'} for use with binomial -#' logistic regressions or “Down-weighted Poisson regression” \code{"DWPR"} (Default). -#' @references -#' * Renner, I.W., Elith, J., Baddeley, A., Fithian, W., Hastie, T., Phillips, S.J., Popovic, G. and Warton, D.I., 2015. Point process models for presence‐only analysis. 
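# Editor's sketch (not package code) of the screening rule in
# rm_insufficient_covs(): numeric predictors need at least `tr` unique training
# values, factors at least two levels. `df` is a hypothetical training frame.
df <- data.frame(a = rnorm(20), b = rep(1, 20), d = rep(c(0, 1), 10))
tr <- 4
uniq <- apply(df, 2, function(x) length(unique(x)))
names(uniq)[uniq >= tr]   # only "a" survives; "b" and "d" are too coarse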
Methods in Ecology and Evolution, 6(4), pp.366-379. -#' * Fithian, W. & Hastie, T. (2013) Finite-sample equivalence in statistical models for presence-only data. The Annals of Applied Statistics 7, 1917–1939 -#' @return A vector with the weights -#' @keywords utils -#' @noRd -ppm_weights <- function(df, pa, bg, use_area = FALSE, weight = 1e-6, type = "DWPR"){ - assertthat::assert_that( - is.data.frame(df), - length(unique(pa)) > 1, - nrow(df) == length(pa), - is.logical(use_area), - is.numeric(weight), - is.character(type) - ) - type <- match.arg(type, c("DWPR", "IWLR"),several.ok = FALSE) - - if(use_area){ - suppressWarnings( ar <- raster::area(bg) ) - ar <- raster::mask(ar, bg) - nc <- cellStats(ar, sum) - } else { - # number of non-NA cells - nc <- cellStats(!is.na(bg), sum) - } - - # Set output weight as default - if(type == "DWPR"){ - w <- rep( weight, nrow(df) ) - w[which(pa == 0)] = nc / sum(pa == 0) - } else { - w = (10^6)^(1 - pa) - } - - assertthat::assert_that( - length(unique(w)) > 1, - length(w) == nrow(df) - ) - return(w) -} +#' Built formula for GDB model +#' +#' @description +#' This function built a formula for a `engine_gdb()` model. +#' @param model A [`list()`] object containing the prepared model data for a given biodiversity dataset. +#' @param x A [`BiodiversityDistribution`] object. +#' @param id The id for the species formula. +#' @param settings A [`Settings`] object. +#' @author Martin Jung +#' @note Function is not meant to be run outside the train() call. +#' @keywords internal +#' @noRd +built_formula_gdb <- function(model, id, x, settings){ + assertthat::assert_that( + is.list(model), + length(model) > 0, + assertthat::has_name(model, "predictors"), + inherits(x, "BiodiversityDistribution"), + inherits(settings, 'Settings'), + is.character(id) || is.Id(id), + msg = "Error in model object. This function is not meant to be called outside ouf train()." 
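# Editor's sketch (not package code) of the DWPR weights above: presences get
# a tiny weight while each background (quadrature) point gets the number of
# non-NA background cells divided by the number of background points, so the
# weights approximate an integral over the study region.
pa <- c(1, 1, 1, 0, 0, 0, 0)     # three presences, four background points
nc <- 100                        # non-NA cells in the background layer
w  <- rep(1e-6, length(pa))
w[pa == 0] <- nc / sum(pa == 0)
w                                # 1e-06 for presences, 25 for background
# The IWLR alternative instead sets w = (10^6)^(1 - pa)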
+ ) + # Get object for id + obj <- model$biodiversity[[id]] + # Extract basic stats from the model object + types <- as.character( sapply( model$biodiversity, function(x) x$type ) ) + fams <- as.character( sapply( model$biodiversity, function(z) z$family ) ) + bionames = sapply(model$biodiversity, function(x) x$name) + ids <- names(model$biodiversity) + priors <- model$priors + + # Default equation found + if(obj$equation == '' || is.Waiver(obj$equation)){ + # Construct formula with all variables + form <- "observed ~ " + + # Use only variables that have sufficient covariate range for training + # Finally check that a minimum of unique numbers are present in the predictor range and if not, remove them + covariates <- rm_insufficient_covs(model = obj, tr = 4) + if(is.null(covariates)) stop("Not enough variance in training data to fit a SDM!") + + if(!is.Waiver(priors)){ + # Loop through all provided GDB priors + supplied_priors <- as.vector(priors$varnames()) + for(v in supplied_priors){ + if(v %notin% covariates) next() # In case the variable has been removed + # First add linear effects + form <- paste(form, paste0('bmono(', v, + ', constraint = \'', priors$get(v) ,'\'', + ')', collapse = ' + ' ), ' + ' ) + } + # Add linear and smooth effects for all missing ones + miss <- covariates[covariates %notin% supplied_priors] + if(length(miss)>0){ + # Add linear predictors + form <- paste(form, paste0('bols(',miss,')', collapse = ' + ')) + if(!settings$get('only_linear')){ + # And smooth effects for all numeric data + miss <- miss[ miss %in% obj$predictors_types$predictors[which(obj$predictors_types$type=="numeric")] ] + form <- paste(form, ' + ', paste0('bbs(', miss,', knots = 4)', + collapse = ' + ' + )) + } + } + } else { + # Add linear predictors + form <- paste(form, paste0('bols(',covariates,')',collapse = ' + ')) + if(settings$get('only_linear') == FALSE){ + # And smooth effects + form <- paste(form, ' + ', paste0('bbs(', + covariates[which(covariates %in% obj$predictors_types$predictors[obj$predictors_types$type=="numeric"] )],', knots = 4)', + collapse = ' + ' + )) + } + # Add also random effect if there are any factors? THIS currently crashes when there are too few factors + # if(any(model$predictors_types$type=="factor")){ + # form <- paste(form, ' + ' ,paste0('brandom(', + # model$biodiversity[[id]]$predictors_types$predictors[which(model$biodiversity[[id]]$predictors_types$type == 'factor')], + # ')',collapse = " + ")) + # } + } + # Convert to formula + form <- to_formula(form) + # Add offset if specified + if(!is.Waiver(model$offset) ){ form <- stats::update.formula(form, paste0('~ . + offset(spatial_offset)') ) } + if( length( grep('Spatial',x$get_latent() ) ) > 0 ){ + # Update with spatial term + form <- stats::update.formula(form, paste0(" ~ . 
+ ", + x$engine$get_equation_latent_spatial()) + ) + } + } else{ + if(getOption('ibis.setupmessages')) myLog('[Estimation]','yellow','Use custom model equation') + form <- to_formula(obj$equation) + # If response is missing, add manually + if(attr(stats::terms(form), "response")==0){ + form <- stats::update.formula(form, "observed ~ .") + } + # Check that bols/bbs are in terms and if not add them for each variable + if( settings$get("only_linear") ){ + if( length(grep("bols",attr(stats::terms(form2), "term.labels") ))==0 ){ + # Assume that each variable as none, so add + form <- stats::as.formula(paste0("observed ~", paste0("bols(", obj$predictors_names, ")",collapse = " + "))) + } + } else { + if( length(grep("bbs",attr(stats::terms(form2), "term.labels") ))==0 ){ + # Assume that each variable as none, so add + form <- stats::as.formula(paste0("observed ~", paste0("bbs(", obj$predictors_names, ")",collapse = " + "))) + } + } + + assertthat::assert_that( + is.formula(form), + attr(stats::terms(form), "response")==1, # Has Response + all( all.vars(form) %in% c('observed', obj[['predictors_names']]) ) + ) + } + + return(form) +} + +#' Use a fitted model for creating a new class prediction in raster form +#' +#' @param fit A fitted [`mboost`] model with [`binomial`] distribution +#' @param nd A new data.frame with all predictiors used in fit +#' @param template A [`Raster`] object that can be used as spatial template. +#' @returns A [`RasterLayer`] containing a presence-absence prediction. +#' @keywords utils +#' @noRd +predict_gdbclass <- function(fit, nd, template){ + assertthat::assert_that( + inherits(fit, 'mboost'), + !is.null( grep('Binomial', fit$family@name,ignore.case = TRUE) ), + is.data.frame(nd), + inherits(template,'RasterLayer') + ) + # Redo a template to be sure + template <- emptyraster(template) + + # Remove missing data in newdata data.frame + nd$cellid <- rownames(nd) + nd <- subset(nd, stats::complete.cases(nd)) + + suppressWarnings( + pred_gdb <- mboost::predict.mboost(object = fit, newdata = nd, + type = 'class', aggregate = 'sum') + ) + # Fill output + prediction <- emptyraster(template) + prediction[as.numeric(nd$cellid)] <- pred_gdb + prediction[prediction < max(as.numeric(pred_gdb),na.rm = T)] <- 0; prediction[prediction >0] <- 1 + names(prediction) <- 'presence' + return(prediction) +} + +#' Sanitize mboost summary output +#' +#' @description +#' Extracts the coefficients and selection frequencies from a [`mboost`] model. +#' @param obj A fitted [`mboost`] object. +#' @noRd +#' @keywords internal +clean_mboost_summary <- function(obj){ + assertthat::assert_that( + inherits(obj, "mboost") + ) + + # Get Variable importance + vi <- mboost::varimp( obj ) + vi <- sort( vi[which(vi>0)],decreasing = TRUE ) + + # Get coefficients + co <- mboost::extract(obj, "coefficient") + co <- co[names(vi)] + assertthat::assert_that(all(names(vi) == names(co))) + # Calculate coefficient. 
If smooth, calculate mean of the differential between knots + co <- sapply(co, function(x) { + if(length(x)>2){ + mean(diff(x)) + } else x[2] + }) + + # Now split up the names and types + types <- sapply(names(vi), function(z) strsplit(z, "\\(")[[1]][1]) |> as.vector() + vars <- sapply(names(vi), function(z) strsplit(z, "\\(")[[1]][2]) |> as.vector() + vars <- gsub(vars, pattern="\\)",replacement="") + + # Construct tibble + out <- tibble::tibble( + variable = vars, + type = types, + varimp = vi, + beta = co + ) + return(out) +} + +#' Check training predictor complexity +#' +#' @description +#' This internal function tests an existing set of covariates (usually the training data) +#' on the number of unique values within. +#' If fewer values than a given threshold (\code{'tr'}) is detected, then the predictor is removed, thus +#' reducing complexity. +#' @note Maybe in the future a more cleverer solution could be thought of, for instance using a singular value decompoistion? +#' @param model A [`list`] of a model object containing the various predictors and biodiversity occurence information. +#' @param tr A [`numeric`] value describing a threshold of minimum unique values to be retained. +#' @returns A [`vector`] with the variables that fullfill the threshold. +#' @keywords internal +#' @noRd +rm_insufficient_covs <- function(model, tr = 5){ + assertthat::assert_that( + is.list(model), + is.numeric(tr), + tr > 0 + ) + + # Check that biodiversity information is present + assertthat::assert_that( + assertthat::has_name(model, "observations"), + assertthat::has_name(model, "predictors_names") + ) + + # Now get all continuous ones + vars_num <- model$predictors_types$predictors[model$predictors_types$type=="numeric"] + vars_fac <- model$predictors_types$predictors[model$predictors_types$type=="factor"] + + vars_uniques <- apply(model$predictors[,vars_num], 2, function(x) length(unique(x,na.rm = TRUE)) ) + + # Get all variables smaller than the threshold and return the original data.frame without them + sufficient <- which(vars_uniques >= tr) + + # Get the factor variables in it as well + if(length(vars_fac)>0){ + vars_uniques <- apply(model$predictors[vars_fac], 2, function(x) length(unique(x,na.rm = TRUE)) ) + # Get all factor variables with at least 2 levels + sufficient_fac <- which(vars_uniques >= 2) + if(length(sufficient_fac)>0){ + sufficient <- c(names(sufficient), names(sufficient_fac)) + } + } else { + sufficient <- names(sufficient) + } + assertthat::assert_that(all(sufficient %in% model$predictors_names)) # This should return a character of covariate names + if(length(sufficient)==0){ + return(NULL) + } else { + return(sufficient) + } +} + +#' Calculate weights for Point Process models +#' +#' @param df The [`data.frame`] for which weights are to be calculated. +#' @param presence A [`vector`] with the observed species. Has to be in range \code{0} to \code{Inf}. +#' @param bg A background [`raster`] layer. +#' @param use_area A [`logical`] on whether area is to be used instead of grid counts. +#' @param weight A [`numeric`] weight to be used in down-weighted regressions. +#' @param type Accepting either “Infinitely weighted logistic regression” \code{'IWLR'} for use with binomial +#' logistic regressions or “Down-weighted Poisson regression” \code{"DWPR"} (Default). +#' @references +#' * Renner, I.W., Elith, J., Baddeley, A., Fithian, W., Hastie, T., Phillips, S.J., Popovic, G. and Warton, D.I., 2015. Point process models for presence‐only analysis. 
Methods in Ecology and Evolution, 6(4), pp.366-379. +#' * Fithian, W. & Hastie, T. (2013) Finite-sample equivalence in statistical models for presence-only data. The Annals of Applied Statistics 7, 1917–1939 +#' @return A vector with the weights +#' @keywords utils +#' @noRd +ppm_weights <- function(df, pa, bg, use_area = FALSE, weight = 1e-6, type = "DWPR"){ + assertthat::assert_that( + is.data.frame(df), + length(unique(pa)) > 1, + nrow(df) == length(pa), + is.logical(use_area), + is.numeric(weight), + is.character(type) + ) + type <- match.arg(type, c("DWPR", "IWLR"),several.ok = FALSE) + + if(use_area){ + suppressWarnings( ar <- raster::area(bg) ) + ar <- raster::mask(ar, bg) + nc <- cellStats(ar, sum) + } else { + # number of non-NA cells + nc <- cellStats(!is.na(bg), sum) + } + + # Set output weight as default + if(type == "DWPR"){ + w <- rep( weight, nrow(df) ) + w[which(pa == 0)] = nc / sum(pa == 0) + } else { + w = (10^6)^(1 - pa) + } + + assertthat::assert_that( + length(unique(w)) > 1, + length(w) == nrow(df) + ) + return(w) +} diff --git a/R/utils-glmnet.R b/R/utils-glmnet.R index e6913237..6d1aa0e3 100644 --- a/R/utils-glmnet.R +++ b/R/utils-glmnet.R @@ -1,164 +1,164 @@ -#' Built formula for glmnet model -#' -#' @description -#' This function builds a formula for a `engine_glmnet()` model. -#' @param obj A [`list()`] object containing the prepared model data for a given biodiversity dataset. -#' @note Function is not meant to be run outside the train() call. -#' @author Martin Jung -#' @keywords internal -#' @noRd -built_formula_glmnet <- function(obj){ - assertthat::assert_that( - is.list(obj), - length(obj) > 0, - assertthat::has_name(obj, "observations"), - assertthat::has_name(obj, "equation"), - assertthat::has_name(obj, "predictors_names"), - msg = "Error in model object. This function is not meant to be called outside ouf train()." - ) - - # Default equation found - if(obj$equation =='' || is.Waiver(obj$equation)){ - # Construct formula with all variables - form <- paste( 'observed', ifelse(obj$family=='poisson', '/w', ''), ' ~ ') - # Add linear predictors - form <- paste(form, paste0(obj$predictors_names, collapse = ' + ')) - # Convert to formula - form <- to_formula(form) - } else{ - if(getOption('ibis.setupmessages')) myLog('[Estimation]','yellow','Use custom model equation') - form <- to_formula(obj$equation) - # If response is missing, add manually - if(attr(terms(form), "response")==0){ - form <- update.formula(form, "observed ~ .") - } - # Security checks - assertthat::assert_that( - is.formula(form), - attr(terms(form), "response")==1, # Has Response - all( all.vars(form) %in% c('observed', obj[['predictors_names']]) ) - ) - } - - return(form) -} - -#' Default regularization constant -#' -#' @description -#' This function was taken from the [`maxnet`] R-package to get some more informed -#' default lambda values for the regularization. -#' -#' @param p A [`vector`] of \code{1} (for presence) or \code{0} (for background). -#' @param m A [`model.matrix`] object -#' @references -#' * Phillips S (2021). _maxnet: Fitting 'Maxent' Species Distribution Models with 'glmnet'_. R package version 0.1.4, . 
-#' @source maxnet -#' @keywords internal, utils -#' @noRd -default.regularization <- function(p, m){ - isproduct <- function(x) grepl(":", x) & !grepl("\\(", x) - isquadratic <- function(x) grepl("^I\\(.*\\^2\\)", x) - ishinge <- function(x) grepl("^hinge\\(", x) - isthreshold <- function(x) grepl("^thresholds\\(", x) - iscategorical <- function(x) grepl("^categorical\\(", x) - regtable <- function(name, default) { - if (ishinge(name)) - return(list(c(0, 1), c(0.5, 0.5))) - if (iscategorical(name)) - return(list(c(0, 10, 17), c(0.65, 0.5, 0.25))) - if (isthreshold(name)) - return(list(c(0, 100), c(2, 1))) - default - } - lregtable <- list(c(0, 10, 30, 100), c(1, 1, 0.2, 0.05)) - qregtable <- list(c(0, 10, 17, 30, 100), c(1.3, 0.8, 0.5, - 0.25, 0.05)) - pregtable <- list(c(0, 10, 17, 30, 100), c(2.6, 1.6, 0.9, - 0.55, 0.05)) - mm <- m[p == 1, ] - np <- nrow(mm) - lqpreg <- lregtable - if (sum(isquadratic(colnames(mm)))) - lqpreg <- qregtable - if (sum(isproduct(colnames(mm)))) - lqpreg <- pregtable - classregularization <- sapply(colnames(mm), function(n) { - t <- regtable(n, lqpreg) - approx(t[[1]], t[[2]], np, rule = 2)$y - })/sqrt(np) - ishinge <- grepl("^hinge\\(", colnames(mm)) - hmindev <- sapply(1:ncol(mm), function(i) { - if (!ishinge[i]) - return(0) - avg <- mean(mm[, i]) - std <- max(sd(mm[, i]), 1/sqrt(np)) - std * 0.5/sqrt(np) - }) - tmindev <- sapply(1:ncol(mm), function(i) { - ifelse(isthreshold(colnames(mm)[i]) && (sum(mm[, i]) == - 0 || sum(mm[, i]) == nrow(mm)), 1, 0) - }) - pmax(0.001 * (apply(m, 2, max) - apply(m, 2, min)), hmindev, - tmindev, apply(as.matrix(mm), 2, sd) * classregularization) -} - -#' Determine best lambda -#' -#' @description -#' For glmnet fits the variables are often overregularized. This helper function picks -#' the best lambda estimate from the model. -#' By default use the one within 1 SE of minimum lambda, unless it falls on the very first value, -#' likely indicating an overregularized model. In this case take the minimum value -#' of all lambda's. -#' @param obj A \code{"glmnet"} object. -#' @keywords internal, utils -#' @noRd -determine_lambda <- function(obj){ - assertthat::assert_that( - inherits(obj, "cv.glmnet"), - is.numeric(obj$lambda.1se), - is.numeric(obj$lambda.min), - is.numeric(obj$lambda) - ) - if(obj$lambda.1se != obj$lambda[1]) { - la <- obj$lambda.1se - } else if (obj$lambda.min != obj$lambda[1]){ - la <- obj$lambda.min - } else { - la <- tail(obj$lambda ,1) - } - return(la) -} - -#' Summarize cross-validated glmnet model -#' -#' @description -#' This helper function summarizes the coefficients from a glmnet model. -#' The optimal lambda is determined through the [`determine_lambda`] function. -#' @param obj An object created with \code{'cv.glmnet'}. -#' @keywords internal, utils -#' @noRd -tidy_glmnet_summary <- function(obj){ - assertthat::assert_that( - inherits(obj, "cv.glmnet") - ) - # Determine best lambda - lambda <- determine_lambda(obj) - - # Summarise coefficients within 1 standard deviation - ms <- coef(obj, s = lambda) |> - as.matrix() |> as.data.frame() - names(ms) <- "mean" - ms$variable <- rownames(ms) - ms <- ms[,c("variable", "mean")] - ms <- subset(ms, mean != 0) # Remove regularized coefficients for some clean up. - if(nrow(ms)>0){ - # Reorder - ms <- ms[order(ms$mean,decreasing = TRUE),] # Sort - rownames(ms) <- NULL - } else { - ms <- data.frame() - } - return(ms) -} +#' Built formula for glmnet model +#' +#' @description +#' This function builds a formula for a `engine_glmnet()` model. 
+#' @param obj A [`list()`] object containing the prepared model data for a given biodiversity dataset.
+#' @note Function is not meant to be run outside the train() call.
+#' @author Martin Jung
+#' @keywords internal
+#' @noRd
+built_formula_glmnet <- function(obj){
+  assertthat::assert_that(
+    is.list(obj),
+    length(obj) > 0,
+    assertthat::has_name(obj, "observations"),
+    assertthat::has_name(obj, "equation"),
+    assertthat::has_name(obj, "predictors_names"),
+    msg = "Error in model object. This function is not meant to be called outside of train()."
+  )
+
+  # Default equation found
+  if(obj$equation == '' || is.Waiver(obj$equation)){
+    # Construct formula with all variables
+    form <- paste( 'observed', ifelse(obj$family=='poisson', '/w', ''), ' ~ ')
+    # Add linear predictors
+    form <- paste(form, paste0(obj$predictors_names, collapse = ' + '))
+    # Convert to formula
+    form <- to_formula(form)
+  } else{
+    if(getOption('ibis.setupmessages')) myLog('[Estimation]','yellow','Use custom model equation')
+    form <- to_formula(obj$equation)
+    # If response is missing, add manually
+    if(attr(stats::terms(form), "response")==0){
+      form <- stats::update.formula(form, "observed ~ .")
+    }
+    # Security checks
+    assertthat::assert_that(
+      is.formula(form),
+      attr(stats::terms(form), "response")==1, # Has Response
+      all( all.vars(form) %in% c('observed', obj[['predictors_names']]) )
+    )
+  }
+
+  return(form)
+}
+
+#' Default regularization constant
+#'
+#' @description
+#' This function was taken from the [`maxnet`] R-package to get more informed
+#' default lambda values for the regularization.
+#'
+#' @param p A [`vector`] of \code{1} (for presence) or \code{0} (for background).
+#' @param m A [`model.matrix`] object.
+#' @references
+#' * Phillips S (2021). _maxnet: Fitting 'Maxent' Species Distribution Models with 'glmnet'_. R package version 0.1.4, .
+#' @source maxnet +#' @keywords internal, utils +#' @noRd +default.regularization <- function(p, m){ + isproduct <- function(x) grepl(":", x) & !grepl("\\(", x) + isquadratic <- function(x) grepl("^I\\(.*\\^2\\)", x) + ishinge <- function(x) grepl("^hinge\\(", x) + isthreshold <- function(x) grepl("^thresholds\\(", x) + iscategorical <- function(x) grepl("^categorical\\(", x) + regtable <- function(name, default) { + if (ishinge(name)) + return(list(c(0, 1), c(0.5, 0.5))) + if (iscategorical(name)) + return(list(c(0, 10, 17), c(0.65, 0.5, 0.25))) + if (isthreshold(name)) + return(list(c(0, 100), c(2, 1))) + default + } + lregtable <- list(c(0, 10, 30, 100), c(1, 1, 0.2, 0.05)) + qregtable <- list(c(0, 10, 17, 30, 100), c(1.3, 0.8, 0.5, + 0.25, 0.05)) + pregtable <- list(c(0, 10, 17, 30, 100), c(2.6, 1.6, 0.9, + 0.55, 0.05)) + mm <- m[p == 1, ] + np <- nrow(mm) + lqpreg <- lregtable + if (sum(isquadratic(colnames(mm)))) + lqpreg <- qregtable + if (sum(isproduct(colnames(mm)))) + lqpreg <- pregtable + classregularization <- sapply(colnames(mm), function(n) { + t <- regtable(n, lqpreg) + stats::approx(t[[1]], t[[2]], np, rule = 2)$y + })/sqrt(np) + ishinge <- grepl("^hinge\\(", colnames(mm)) + hmindev <- sapply(1:ncol(mm), function(i) { + if (!ishinge[i]) + return(0) + avg <- mean(mm[, i]) + std <- max(sd(mm[, i]), 1/sqrt(np)) + std * 0.5/sqrt(np) + }) + tmindev <- sapply(1:ncol(mm), function(i) { + ifelse(isthreshold(colnames(mm)[i]) && (sum(mm[, i]) == + 0 || sum(mm[, i]) == nrow(mm)), 1, 0) + }) + pmax(0.001 * (apply(m, 2, max) - apply(m, 2, min)), hmindev, + tmindev, apply(as.matrix(mm), 2, sd) * classregularization) +} + +#' Determine best lambda +#' +#' @description +#' For glmnet fits the variables are often overregularized. This helper function picks +#' the best lambda estimate from the model. +#' By default use the one within 1 SE of minimum lambda, unless it falls on the very first value, +#' likely indicating an overregularized model. In this case take the minimum value +#' of all lambda's. +#' @param obj A \code{"glmnet"} object. +#' @keywords internal, utils +#' @noRd +determine_lambda <- function(obj){ + assertthat::assert_that( + inherits(obj, "cv.glmnet"), + is.numeric(obj$lambda.1se), + is.numeric(obj$lambda.min), + is.numeric(obj$lambda) + ) + if(obj$lambda.1se != obj$lambda[1]) { + la <- obj$lambda.1se + } else if (obj$lambda.min != obj$lambda[1]){ + la <- obj$lambda.min + } else { + la <- tail(obj$lambda ,1) + } + return(la) +} + +#' Summarize cross-validated glmnet model +#' +#' @description +#' This helper function summarizes the coefficients from a glmnet model. +#' The optimal lambda is determined through the [`determine_lambda`] function. +#' @param obj An object created with \code{'cv.glmnet'}. +#' @keywords internal, utils +#' @noRd +tidy_glmnet_summary <- function(obj){ + assertthat::assert_that( + inherits(obj, "cv.glmnet") + ) + # Determine best lambda + lambda <- determine_lambda(obj) + + # Summarise coefficients within 1 standard deviation + ms <- stats::coef(obj, s = lambda) |> + as.matrix() |> as.data.frame() + names(ms) <- "mean" + ms$variable <- rownames(ms) + ms <- ms[,c("variable", "mean")] + ms <- subset(ms, mean != 0) # Remove regularized coefficients for some clean up. 
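# Editor's sketch (not package code) of the selection rule in
# determine_lambda(): prefer lambda.1se; if it sits on the first (largest)
# lambda the fit is likely over-regularised, so fall back to lambda.min and
# finally to the smallest lambda on the path. `cvfit` mimics a cv.glmnet fit.
cvfit <- list(lambda = c(1, 0.5, 0.1, 0.01), lambda.1se = 1, lambda.min = 0.1)
la <- if (cvfit$lambda.1se != cvfit$lambda[1]) {
  cvfit$lambda.1se
} else if (cvfit$lambda.min != cvfit$lambda[1]) {
  cvfit$lambda.min
} else {
  utils::tail(cvfit$lambda, 1)
}
la   # 0.1: lambda.1se hit the boundary, so lambda.min is used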
+ if(nrow(ms)>0){ + # Reorder + ms <- ms[order(ms$mean,decreasing = TRUE),] # Sort + rownames(ms) <- NULL + } else { + ms <- data.frame() + } + return(ms) +} diff --git a/R/utils-inla.R b/R/utils-inla.R index 2daa4aa8..427fbc2f 100644 --- a/R/utils-inla.R +++ b/R/utils-inla.R @@ -139,11 +139,11 @@ built_formula_inla <- function(model, id, x, settings){ } form <- to_formula(form) # Convert to formula # Add offset if specified - if(!is.Waiver(x$offset) ){ form <- update.formula(form, paste0('~ . + offset(spatial_offset)') ) } + if(!is.Waiver(x$offset) ){ form <- stats::update.formula(form, paste0('~ . + offset(spatial_offset)') ) } if( length( grep('Spatial', x$get_latent() ) ) > 0 ){ if(attr(x$get_latent(), "method") != "poly"){ # Update with spatial term - form <- update.formula(form, paste0(" ~ . + ", + form <- stats::update.formula(form, paste0(" ~ . + ", x$engine$get_equation_latent_spatial( method = attr(x$get_latent(),'method'), vars = which(ids == id), @@ -158,13 +158,13 @@ built_formula_inla <- function(model, id, x, settings){ if(getOption('ibis.setupmessages')) myLog('[Estimation]','yellow','Use custom model equation') form <- to_formula( obj$equation ) # If response is missing, add manually - if(attr(terms(form), "response")==0){ - form <- update.formula(form, "observed ~ .") + if(attr(stats::terms(form), "response")==0){ + form <- stats::update.formula(form, "observed ~ .") } # Security checks assertthat::assert_that( is.formula(form), - attr(terms(form), "response")==1, # Has Response + attr(stats::terms(form), "response")==1, # Has Response all( all.vars(form) %in% c('observed', obj[['predictors_names']]) ) ) } @@ -193,11 +193,11 @@ built_formula_inla <- function(model, id, x, settings){ # Add offset if specified # TODO: Not quite sure if this formulation works for inlabru predictor expressions - if(!is.Waiver(x$offset) ){ form <- update.formula(form, paste0('~ . + offset(spatial_offset)') ) } + if(!is.Waiver(x$offset) ){ form <- stats::update.formula(form, paste0('~ . + offset(spatial_offset)') ) } if( length( grep('Spatial', x$get_latent() ) ) > 0 ){ if(attr(x$get_latent(), "method") != "poly"){ # Update with spatial term - form <- update.formula(form, paste0(" ~ . + ", + form <- stats::update.formula(form, paste0(" ~ . + ", # For SPDE components, simply add spatial.field paste0("spatial.field", which(ids == id)) ) @@ -214,11 +214,11 @@ built_formula_inla <- function(model, id, x, settings){ # Convert to formula to be safe form <- to_formula( obj$equation ) # Add generic Intercept if not set in formula - if("Intercept" %notin% all.vars(form)) form <- update.formula(form, ". ~ . + Intercept") + if("Intercept" %notin% all.vars(form)) form <- stats::update.formula(form, ". ~ . + Intercept") # If length of ids is larger than 1, add dataset specific intercept too # Check whether to use dataset specific intercepts if(length(types)>1 && obj$use_intercept){ - form <- update.formula(form, + form <- stats::update.formula(form, paste0(". ~ . + ", paste0('Intercept_', make.names(tolower( bionames )), '_', types) @@ -231,7 +231,7 @@ built_formula_inla <- function(model, id, x, settings){ if( length( grep('Spatial',x$get_latent() ) ) > 0 ){ if(attr(x$get_latent(), "method") != "poly"){ # Update with spatial term - form <- update.formula(form, paste0(" ~ . + ", + form <- stats::update.formula(form, paste0(" ~ . 
+ ", # For SPDE components, simply add spatial.field paste0("spatial.field",which(ids == id)) ) @@ -299,15 +299,15 @@ mesh_area = function(mesh, region.poly = NULL, variant = 'gpc', relative = FALSE SP = sp::SpatialPolygons(polys, proj4string = sp::CRS(sp::proj4string(x))) voronoi = sp::SpatialPolygonsDataFrame(SP, data = data.frame(x = crds[,1], y = crds[, 2], - area = sapply(slot(SP, "polygons"),slot, "area"), - row.names = sapply(slot(SP, "polygons"),slot, "ID"))) + area = sapply(methods::slot(SP, "polygons"), methods::slot, "area"), + row.names = sapply(methods::slot(SP, "polygons"), methods::slot, "ID"))) if (!is.null(bounding.polygon)) { bounding.polygon <- rgeos::gUnion(bounding.polygon, bounding.polygon) voronoi.clipped <- rgeos::gIntersection(voronoi, bounding.polygon, byid = TRUE, id = row.names(voronoi)) df <- data.frame(voronoi) - df$area <- sapply(slot(voronoi.clipped, "polygons"), - slot, "area") + df$area <- sapply(methods::slot(voronoi.clipped, "polygons"), + methods::slot, "area") voronoi <- sp::SpatialPolygonsDataFrame(voronoi.clipped, df) } @@ -320,14 +320,14 @@ mesh_area = function(mesh, region.poly = NULL, variant = 'gpc', relative = FALSE if(variant == 'gpc'){ # Try to convert to spatial already - if(!inherits(region.poly, 'Spatial')) region.poly <- as(region.poly,'Spatial') + if(!inherits(region.poly, 'Spatial')) region.poly <- methods::as(region.poly,'Spatial') - poly.gpc <- as(region.poly@polygons[[1]]@Polygons[[1]]@coords,'gpc.poly') - w <- sapply(tiles, function(p) rgeos::area.poly(rgeos::intersect(as(cbind(p$x, p$y), 'gpc.poly'), poly.gpc))) + poly.gpc <- methods::as(region.poly@polygons[[1]]@Polygons[[1]]@coords,'gpc.poly') + w <- sapply(tiles, function(p) rgeos::area.poly(rgeos::intersect(methods::as(cbind(p$x, p$y), 'gpc.poly'), poly.gpc))) if(relative) w <- w / sum(w) } else if (variant == 'gpc2'){ # Try to convert to spatial already - if(!inherits(region.poly, 'Spatial')) region.poly <- as(region.poly,'Spatial') + if(!inherits(region.poly, 'Spatial')) region.poly <- methods::as(region.poly,'Spatial') tiles <- voronoi.polygons(sp::SpatialPoints(mesh$loc[, 1:2])) w <- sapply(1:length(tiles), function(p) { aux <- tiles[p, ] @@ -351,7 +351,7 @@ mesh_area = function(mesh, region.poly = NULL, variant = 'gpc', relative = FALSE # Calculate area of each polygon in km2 w <- sf::st_area( sf::st_as_sf(polys) - ) %>% units::set_units(km^2) %>% as.numeric() + ) |> units::set_units(km^2) |> as.numeric() # Relative area if(relative) w <- w / sum(w) } @@ -378,17 +378,17 @@ mesh_as_sf <- function(mesh) { # Retrieve the vertex coordinates of the current triangle cur <- pointindex[index, ] # Construct a Polygons object to contain the triangle - Polygons(list( - sp::Polygon( points[c(cur, cur[1]), ], hole = FALSE)), - ID = index - ) - }, points = mesh$loc[, c(1, 2)], pointindex = tv) %>% + sp::Polygons(list( + sp::Polygon( points[c(cur, cur[1]), ], hole = FALSE)), + ID = index + ) + }, points = mesh$loc[, c(1, 2)], pointindex = tv) |> # Convert the polygons to a SpatialPolygons object - sp::SpatialPolygons(., proj4string = mesh$crs) %>% + sp::SpatialPolygons(., proj4string = mesh$crs) |> # Convert to sf sf::st_as_sf(.) 
# Calculate and add area to the polygon - dp$areakm2 <- sf::st_area(dp) %>% units::set_units(km^2) %>% as.numeric() + dp$areakm2 <- sf::st_area(dp) |> units::set_units(km^2) |> as.numeric() dp$relarea <- dp$areakm2 / sum(dp$areakm2,na.rm = TRUE) return(dp) } @@ -631,7 +631,7 @@ post_prediction <- function(mod, nsamples = 100, preds <- mod$model$predictors # Set any other existing intercept variables preds[,grep('Intercept',rownames(model$summary.fixed),value = TRUE)] <- 1 - preds <- SpatialPixelsDataFrame(preds[,c('x','y')],data=preds) + preds <- sp::SpatialPixelsDataFrame(preds[,c('x','y')],data=preds) preds_names <- mod$model$predictors_names preds_types <- mod$model$predictors_types ofs <- mod$model$offset @@ -720,7 +720,7 @@ post_prediction <- function(mod, nsamples = 100, myLog('[Summary]','green',paste('Formatted', length(vals), 'posterior samples')) # evaluate_model Function - A <- inlabru::amatrix_eval(model, data = preds) + A <- inlabru:::amatrix_eval(model, data = preds) A <- x$engine$data$stk_pred$stk_proj$A effects <- evaluate_effect_multi( @@ -735,7 +735,7 @@ post_prediction <- function(mod, nsamples = 100, return(effects) } - values <- evaluate_predictor( + values <- inlabru::evaluate_predictor( model, state = state, data = data, @@ -1044,7 +1044,7 @@ inla_make_projection_stack <- function(stk_resp, model, mesh, mesh.area, type, ) # Buffer the region to be sure - suppressWarnings( background.g <- rgeos::gBuffer(as(background,'Spatial'), width = 0) ) + suppressWarnings( background.g <- rgeos::gBuffer(methods::as(background,'Spatial'), width = 0) ) # # Get and append coordinates from each polygon # background.bdry <- unique( # do.call('rbind', lapply(background.g@polygons[[1]]@Polygons, function(x) return(x@coords) ) ) @@ -1054,12 +1054,12 @@ inla_make_projection_stack <- function(stk_resp, model, mesh, mesh.area, type, # cbind(background.bdry[,1], background.bdry[,2])) # Get only those points from the projection grid that are on the background - # projpoints <- projgrid$lattice$loc %>% as.data.frame() %>% sf::st_as_sf(coords = c(1,2),crs = st_crs(background)) + # projpoints <- projgrid$lattice$loc |> as.data.frame() |> sf::st_as_sf(coords = c(1,2),crs = st_crs(background)) # TODO: Try and find an alternative to the splancs package to remove this dependent package suppressWarnings( cellsIn <- !is.na(sp::over(x = sp::SpatialPoints(projgrid$lattice$loc, - proj4string = as(background.g,'Spatial')@proj4string), + proj4string = methods::as(background.g,'Spatial')@proj4string), y = background.g)) ) @@ -1211,12 +1211,12 @@ inla_predpoints <- function( mesh, background, cov, proj_stepsize = NULL, spatia dims = Nxy) # Convert background to buffered land suppressWarnings( - background.g <- rgeos::gBuffer(as(background,'Spatial'), + background.g <- rgeos::gBuffer(methods::as(background,'Spatial'), width = 0) ) suppressWarnings( cellsIn <- !is.na(sp::over(x = sp::SpatialPoints(projgrid$lattice$loc, - proj4string = as(background.g,'Spatial')@proj4string), + proj4string = methods::as(background.g,'Spatial')@proj4string), y = background.g)) ) # Get the cells that are in @@ -1252,10 +1252,10 @@ inla_predpoints <- function( mesh, background, cov, proj_stepsize = NULL, spatia proj4string = mesh$crs ) # Remove missing data - preds <- subset(preds, complete.cases(preds@data)) - preds <- as(preds, 'SpatialPixelsDataFrame') + preds <- subset(preds, stats::complete.cases(preds@data)) + preds <- methods::as(preds, 'SpatialPixelsDataFrame') } else { - preds <- subset(preds, complete.cases(preds)) + 
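# Editor's note: this patch swaps magrittr's %>% for the native |> pipe
# (requires R >= 4.1) as part of dropping the magrittr dependency. A minimal
# sketch of the area pattern above, using a hypothetical sf polygon:
library(sf)
poly <- st_as_sf(st_sfc(
  st_polygon(list(rbind(c(0, 0), c(1, 0), c(1, 1), c(0, 0)))),
  crs = 4326
))
st_area(poly) |> units::set_units(km^2) |> as.numeric()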
preds <- subset(preds, stats::complete.cases(preds)) } return(preds) @@ -1282,8 +1282,8 @@ tidy_inla_summary <- function(m, what = 'fixed',...){ assertthat::assert_that(length(w2)==1) # Format the output - o <- m[[w2]] %>% - tibble::rownames_to_column('variable') %>% + o <- m[[w2]] |> + tibble::rownames_to_column('variable') |> tibble::as_tibble() if(what == "fixed"){ names(o) <- c("variable", "mean", "sd", "q05", "q50", "q95", "mode", "kld") @@ -1301,15 +1301,15 @@ plot_inla_marginals = function(inla.model, what = 'fixed'){ assertthat::assert_that(inherits(inla.model,'inla'), is.character(what), what %in% c('fixed','hyper')) - par.ori <- par(no.readonly = TRUE) - par(mfrow = c(4,3), mar = c(3,3,1,0.3), mgp = c(2,1,0)) + par.ori <- graphics::par(no.readonly = TRUE) + graphics::par(mfrow = c(4,3), mar = c(3,3,1,0.3), mgp = c(2,1,0)) if(what == 'fixed'){ varnames <- names(inla.model$marginals.fixed) for(i in 1: length(varnames)){ var.mar <- data.frame(inla.model$marginals.fixed[i]) plot(x = var.mar[,1], y=var.mar[, 2], type="l", xlab=paste(names(var.mar)[1]), ylab=paste(names(var.mar)[2])) - abline(v=0, col="red") + graphics::abline(v=0, col="red") } } else { varnames <- names(inla.model$marginals.hyperpar) @@ -1319,7 +1319,7 @@ plot_inla_marginals = function(inla.model, what = 'fixed'){ xlab=paste(names(var.mar)[1]), ylab=paste(names(var.mar)[2])) } } - par(par.ori) + graphics::par(par.ori) } #' Additional INLA priors not already available. @@ -1397,7 +1397,7 @@ inla.backstep <- function(master_form, if(!is.null(keep)){ te <- te[grep(pattern = paste0(keep,collapse = '|'),x = te, invert = TRUE, fixed = TRUE )] # Also remove keep from master_form as we won't use them below - master_form <- as.formula(paste0(response,' ~ ', paste0(te,collapse = " + ")," - 1")) + master_form <- stats::as.formula(paste0(response,' ~ ', paste0(te,collapse = " + ")," - 1")) } assertthat::assert_that(length(te)>0, !is.null(response), all(keep %notin% te )) @@ -1490,7 +1490,7 @@ inla.backstep <- function(master_form, best_found <- o } else { # Get best model - test_form <- as.formula(oo$form[which.min(oo$cpo)]) + test_form <- stats::as.formula(oo$form[which.min(oo$cpo)]) } rm(o,oo) } else { diff --git a/R/utils-predictors.R b/R/utils-predictors.R index ee90ee2d..4f1c650f 100644 --- a/R/utils-predictors.R +++ b/R/utils-predictors.R @@ -1,543 +1,815 @@ -#' @include utils.R utils-spatial.R -NULL - -#' Spatial adjustment of environmental predictors and raster stacks -#' -#' @description -#' This function allows the transformation of provided environmental predictors (in [`Raster`] format). -#' A common use case is for instance the standardization (or scaling) of all predictors prior to model fitting. -#' This function works both with [`Raster`] as well as with [`stars`] objects. -#' @details -#' Available options are: -#' * \code{'none'} The original layer(s) are returned. -#' * \code{'scale'} This run the [`scale()`] function with default settings (1 Standard deviation) across all predictors. -#' A sensible default to for most model fitting. -#' * \code{'norm'} This normalizes all predictors to a range from \code{0-1}. -#' * \code{'windsor'} This applies a 'windsorization' to an existing raster layer by setting the lowest, respectively -#' largest values to the value at a certain percentage level (e.g. 95%). Those can be set via the parameter \code{"windsor_props"}. 
-#' * \code{'windsor_thresh'} Same as option 'windsor', however in this case values are clamped to a thresholds -#' rather than certain percentages calculated on the data. -#' * \code{'percentile'} This converts and bins all values into percentiles, e.g. the top 10% or lowest 10% of values and so on. -#' * \code{'pca'} This option runs a principal component decomposition of all predictors (via [`prcomp()`]). -#' It returns new predictors resembling all components in order of the most important ones. Can be useful to -#' reduce collinearity, however note that this changes all predictor names to 'PCX', where X is the number of the component. -#' The parameter \code{'pca.var'} can be modified to specify the minimum variance to be covered by the axes. -#' * \code{'revjack'} Removes outliers from the supplied stack via a reverse jackknife procedure. -#' Identified outliers are by default set to \code{NA}. -#' -#' @param env A [`Raster`] object. -#' @param option A [`vector`] stating whether predictors should be preprocessed in any way (Options: \code{'none'}, -#' \code{'scale'}, \code{'norm'}, \code{'windsor'}, \code{'windsor_thresh'}, \code{'percentile'} \code{'pca'}, \code{'revjack'}). See Details. -#' @param windsor_props A [`numeric`] vector specifying the proportions to be clipped for windsorization (Default: \code{c(.05,.95)}). -#' @param pca.var A [`numeric`] value between \code{>0} and \code{1} stating the minimum amount of variance to be covered (Default: \code{0.8}). -#' @param method As \code{'option'} for more intuitive method setting. Can be left empty (in this case option has to be set). -#' @param ... other options (Non specified). -#' @returns Returns a adjusted [`Raster`] object of identical resolution. -#' @seealso predictor_derivate -#' @examples -#' \dontrun{ -#' # Where x is a rasterstack -#' new_x <- predictor_transform(x, option = 'scale') -#' } -#' @keywords utils -#' @export -predictor_transform <- function(env, option, windsor_props = c(.05,.95), pca.var = 0.8, method = NULL, ...){ - assertthat::assert_that( - inherits(env,'Raster') || inherits(env, 'stars'), - # Support multiple options - is.numeric(windsor_props) & length(windsor_props)==2, - is.numeric(pca.var) - ) - # Convenience function - if(missing(option)){ - assertthat::assert_that(!is.null(method)) - option <- method - } - assertthat::assert_that( - is.character(option), - base::length(option) == 1 - ) - # Match option - option <- match.arg(option, c('none','pca', 'scale', 'norm','windsor', 'windsor_thresh', 'revjack', 'percentile'), several.ok = FALSE) - - # Nothing to be done - if(option == 'none') return(env) - - # If stars see if we can convert it to a stack - if(inherits(env, 'stars')){ - lyrs <- names(env) # Names of predictors - times <- stars::st_get_dimension_values(env, which = 3) # Assume this being the time attribute - dims <- stars::st_dimensions(env) - # Convert to list - env_list <- list() - for(name in lyrs) env_list[[name]] <- as(env[name], 'Raster') - } else { - # Get times in case a stack is supplied (this can get lost depending on transformation) - times <- raster::getZ(env) - } - - # Normalization - if(option == 'norm'){ - if(is.Raster(env)){ - out <- (env - raster::cellStats(env, stat="min")) / - (raster::cellStats(env, stat="max") - - raster::cellStats(env, stat="min")) - } else { - out <- lapply(env_list, function(x) { - (x - raster::cellStats(x, stat="min")) / - (raster::cellStats(x, stat="max") - - raster::cellStats(x, stat="min")) - }) - } - } - # Scaling - if(option == 'scale'){ - 
if(is.Raster(env)){ - out <- raster::scale(env, center = TRUE, scale = TRUE) - } else { - out <- lapply(env_list, function(x) raster::scale(x, center = TRUE, scale = TRUE)) - } - } - - # Percentile cutting - if(option == 'percentile'){ - if(is.Raster(env)){ - perc <- raster::quantile(env, seq(0,1, length.out = 11)) - perc <- unique(perc) - out <- raster::cut(env, perc) - } else { - out <- lapply(env_list, function(x) { - perc <- raster::quantile(x, seq(0,1, length.out = 11)) - perc <- unique(perc) - raster::cut(x, perc) - }) - } - } - - # Windsorization - if(option == 'windsor'){ - win <- function(x, windsor_props){ - xq <- stats::quantile(x = x[], probs = windsor_props, na.rm = TRUE) - min.value <- xq[1] - max.value <- xq[2] - if(is.vector(env)) out <- units::drop_units(env) else out <- env - out[out < min.value] <- min.value - out[out > max.value] <- max.value - out - } - if(is.Raster(env)){ - out <- win(env, windsor_props ) - } else { - out <- lapply(env_list, function(x) win(x, windsor_props)) - } - } else if(option == 'windsor_thresh'){ - win <- function(x, windsor_thresh){ - if(is.vector(env)) out <- units::drop_units(env) else out <- env - out[out < windsor_thresh[1]] <- windsor_thresh[1] - out[out > windsor_thresh[2]] <- windsor_thresh[2] - out - } - if(is.Raster(env)){ - out <- win(env, windsor_props ) - } else { - out <- lapply(env_list, function(x) win(x, windsor_props)) - } - } - - # Reverse jackknife removal of outliers - if(option == 'revjack'){ - rj <- function(x){ - o <- emptyraster(x) - o[] <- rm_outlier_revjack(x[], procedure = "missing") - return(o) - } - if(is.Raster(env)){ - out <- raster::stack() - for(n in 1:nlayers(env)){ - out <- raster::addLayer(out, rj(env[[n]]) ) - } - } else { - out <- lapply(env_list, function(x) rj(x)) - } - } - - # Principle component separation of variables - # Inspiration taken from RSToolbox package - if(option == 'pca'){ - if(is.Raster(env)){ - assertthat::assert_that(raster::nlayers(env)>=2,msg = 'Need at least two predictors to calculate PCA.') - - # FIXME: Allow a reduction to few components than nr of layers? - nComp <- nlayers(env) - # Construct mask of all cells - envMask <- !sum(raster::calc(env, is.na)) - assertthat::assert_that(cellStats(envMask, sum)>0,msg = 'A predictor is either NA only or no valid values across all layers') - env <- raster::mask(env, envMask, maskvalue = 0) - - # Sample covariance from stack and fit PCA - covMat <- raster::layerStats(env, stat = "cov", na.rm = TRUE) - pca <- stats::princomp(covmat = covMat[[1]], cor = FALSE) - # Add means and grid cells - pca$center <- covMat$mean - pca$n.obs <- raster::ncell(env) - - # Check how many components are requested: - if(pca.var<1){ - sums <- loadings( summary(pca) )[] - props <- cumsum(colSums(sums^2) / nrow(sums)) # Cumulative explained variance - nComp <- length( which(props <= pca.var) ) - } - # Predict principle components - out <- raster::predict(env, pca,na.rm = TRUE, index = 1:nComp) - names(out) <- paste0("PC", 1:nComp) - - return(out) - } else { - # TODO: - stop("Principal component transformation for stars objects is not yet implemented. 
Pre-process externally!") - } - } - - # If stars convert back to stars object - if(inherits(env, 'stars')){ - # Convert list back to stars - out <- do.call( - stars:::c.stars, - lapply(out, function(x) stars::st_as_stars(x)) - ) - # Reset names of attributes - names(out) <- lyrs - # FIXME: Hacky solution, but breaks other scenarios otherwise - out2 <- try({stars::st_set_dimensions(out, which = 3, values = times, names = "time")},silent = TRUE) - if(inherits(out2, "try-error")){ - # This happens when a stars provided layer has only a single time band - out <- stars::st_redimension(out, new_dims = dims) # use the previously saved dimensions - } else { out <- out2; out2} - } else { - # Final security checks - assertthat::assert_that( - raster::nlayers(env) == raster::nlayers(out), - is_comparable_raster(out, env) - ) - # Reset times - if(!is.null(times)) out <- raster::setZ(out, times) - - return(out) - } -} - -#' Create spatial derivative of raster stacks -#' -#' @description -#' This function creates derivatives of existing covariates and returns them in Raster format. -#' Derivative variables can in the machine learning literature commonly be understood as one aspect of feature -#' engineering. They can be particularly powerful in introducing non-linearities in otherwise linear models, -#' for example is often done in the popular Maxent framework. -#' @details -#' Available options are: -#' * \code{'none'} - The original layer(s) are returned. -#' * \code{'quadratic'} - A quadratic transformation (\eqn{x^{2}}) is created of the provided layers. -#' * \code{'hinge'} - Creates hinge transformation of covariates, which set all values lower than a set threshold to \code{0} -#' and all others to a range of \eqn{[0,1]}. The number of thresholds and thus new derivates is specified -#' via the parameter \code{'nknots'} (Default: \code{4}). -#' * \code{'interaction'} - Creates interactions between variables. Target variables have to be specified via \code{"int_variables"}. -#' * \code{'thresh'} - A threshold transformation of covariates, which sets all values lower than a set threshold ot -#' \code{0} and those larger to \code{1}. -#' The number of thresholds and thus new derivates is specified via the parameter \code{'nknots'} (Default: \code{4}). -#' * \code{'bin'} - Creates a factor representation of a covariates by cutting the range of covariates by their percentiles. -#' The number of percentile cuts and thus new derivates is specified via the parameter \code{'nknots'} (Default: \code{4}). -#' @param env A [`Raster`] object. -#' @param option A [`vector`] stating whether predictors should be preprocessed in any way -#' (Options: \code{'none'}, \code{'quadratic'}, \code{'hinge'}, \code{'thresh'}, \code{'bin'}). -#' @param nknots The number of knots to be used for the transformation (Default: \code{4}). -#' @param deriv A [`vector`] with [`characters`] of specific derivates to create (Default: \code{NULL}). -#' @param int_variables A [`vector`] with length greater or equal than \code{2} specifying the covariates (Default: \code{NULL}). -#' @param method As \code{'option'} for more intuitive method setting. Can be left empty (in this case option has to be set). -#' @param ... other options (Non specified). -#' @return Returns the derived adjusted [`Raster`] objects of identical resolution. -#' @seealso predictor_derivate -#' @examples -#' \dontrun{ -#' # Create a hinge transformation of one or multiple RasterLayers. 
-#' predictor_derivate(covs, option = "hinge", knots = 4) -#' } -#' @keywords utils -#' @export -predictor_derivate <- function(env, option, nknots = 4, deriv = NULL, int_variables = NULL, method = NULL, ...){ - assertthat::assert_that( - inherits(env,'Raster') || inherits(env, "stars"), - !missing(env), - is.numeric(nknots) && nknots > 1, - is.null(deriv) || is.character(deriv), - is.null(int_variables) || is.vector(int_variables) - ) - # Convenience function - if(missing(option)){ - assertthat::assert_that(!is.null(method)) - option <- method - } - assertthat::assert_that( - is.character(option), - base::length(option) == 1 - ) - # Match argument. - option <- match.arg(option, c('none','quadratic', 'hinge', 'thresh', 'bin', 'interaction'), several.ok = FALSE) - - # None, return as is - if(option == 'none') return(env) - - # If stars see if we can convert it to a stack - if(inherits(env, 'stars')){ - assertthat::assert_that(!is.null(deriv),msg = "Derivate names could not be found!") - # Decompose derivate variable names if set - deriv <- grep(paste0(option, "__"), deriv, value = TRUE) - if(length(deriv)==0){ - if(getOption('ibis.setupmessages')) myLog('[Setup]','red','Predictors with derivates not found!') - return(NULL) - } - cutoffs <- do.call(rbind,strsplit(deriv, "__")) |> as.data.frame() - cutoffs$deriv <- deriv - - lyrs <- names(env) # Names of predictors - times <- stars::st_get_dimension_values(env, which = 3) # Time attribute - # Create a list to house the results - env_list <- list() - for(name in cutoffs$deriv){ - env_list[[name]] <- as(env[cutoffs[which(cutoffs$deriv==name),2]], 'Raster') # Specify original raster - } - assertthat::assert_that(length(env_list) > 0) - } else {cutoffs <- NULL} - - # Simple quadratic transformation - if(option == 'quadratic'){ - if(is.Raster(env)){ - if(raster::nlayers(env)==1){ - new_env <- env^2 - } else { - new_env <- raster::calc(env, function(x) I(x^2)) - } - names(new_env) <- paste0('quad__', names(env)) - } else { - # Stars processing - new_env <- lapply(env_list, function(x) { - raster::calc(x, function(z) I(z^2)) - }) - } - } - - # Hinge transformation - # From`maxnet` package - if(option == 'hinge'){ - if(is.Raster(env)){ - # Build new stacks - new_env <- raster::stack() - for(val in names(env)){ - o <- makeHinge(env[[val]], n = val, nknots = nknots, cutoffs = cutoffs) - if(is.null(o)) next() - new_env <- raster::addLayer(new_env, - fill_rasters(o, emptyraster(env) ) - ) - rm(o) - } - } else { - # Stars object - for(val in names(env_list)){ - # Format cutoffs - cu <- cutoffs[which(cutoffs$deriv == val), 3] - cu <- strsplit(cu, "_") |> unlist() - # Remove any leading points - if(any(substr(cu,1, 1)==".")){ - cu[which(substr(cu,1, 1)==".")] <- gsub("^.","",cu[which(substr(cu,1, 1)==".")]) - } - cu <- as.numeric(cu) - assertthat::assert_that(!anyNA(cu), is.numeric(cu)) - for(k in 1:nlayers(env_list[[val]])){ - o <- emptyraster(env_list[[val]][[k]]) - o[] <- hingeval(env_list[[val]][[k]][], cu[1], cu[2]) - env_list[[val]][[k]] <- o - rm(o) - } - } - invisible(gc()) - } - } - - # For thresholds - # Take functionality in maxnet package - if(option == 'thresh'){ - if(is.Raster(env)){ - new_env <- raster::stack() - for(val in names(env)){ - o <- makeThresh(env[[val]],n = val,nknots = nknots, cutoffs = cutoffs) - if(is.null(o)) next() - new_env <- raster::addLayer(new_env, - fill_rasters(o, emptyraster(env)) - ) - rm(o) - } - } else { - # For stats layers - for(val in names(env_list)){ - # Format cutoffs - cu <- cutoffs[which(cutoffs$deriv == 
val), 3] - cu <- strsplit(cu, "_") |> unlist() - # Remove any leading points - if(any(substr(cu,1, 1)==".")){ - cu[which(substr(cu,1, 1)==".")] <- gsub("^.","",cu[which(substr(cu,1, 1)==".")]) - } - cu <- as.numeric(cu) - assertthat::assert_that(!anyNA(cu), is.numeric(cu)) - for(k in 1:nlayers(env_list[[val]])){ - o <- emptyraster(env_list[[val]][[k]]) - o[] <- thresholdval(env_list[[val]][[k]][], cu) - env_list[[val]][[k]] <- o - rm(o) - } - } - invisible(gc()) - } - } - - # For binning, calculate cuts of thresholds - if(option == 'bin'){ - if(is.Raster(env)){ - new_env <- raster::stack() - for(val in names(env)){ - o <- makeBin(env[[val]],n = val,nknots = nknots, cutoffs = cutoffs) - if(is.null(o)) next() - new_env <- raster::addLayer(new_env, o) - rm(o) - } - } else { - # For stats layers - for(val in names(env_list)){ - # Format cutoffs - cu <- cutoffs[which(cutoffs$deriv == val), 3] - cu <- strsplit(cu, "_") |> unlist() - # Remove any leading points - if(any(substr(cu,1, 1)==".")){ - cu[which(substr(cu,1, 1)==".")] <- gsub("^.","",cu[which(substr(cu,1, 1)==".")]) - } - cu <- as.numeric(cu) - assertthat::assert_that(!anyNA(cu), is.numeric(cu)) - for(k in 1:nlayers(env_list[[val]])){ - o <- emptyraster(env_list[[val]][[k]]) - o <- raster::cut(env_list[[val]][[k]], cu) - o[is.na(o)] <- 0 - o <- raster::mask(o, env_list[[val]][[k]] ) - env_list[[val]][[k]] <- o - rm(o) - } - } - invisible(gc()) - } - } - - # Create interaction variables - if(option == 'interaction'){ - # Check whether interaction is provided or an attribute - if(is.null(int_variables)){ - int_variables <- attr(env, "int_variables") - } - assertthat::assert_that(is.vector(int_variables)) - - if(is.Raster(env)){ - # Make unique combinations - ind <- combn(int_variables, 2) - - # Now for each combination build new variable - new_env <- raster::stack() - - for(i in 1:ncol(ind)){ - # Multiply first with second entry - o <- env[[ind[1,1]]] * env[[ind[2,1]]] - names(o) <- paste0('inter__', names(env)[ind[1,1]],".",names(env)[ind[2,1]]) - new_env <- raster::addLayer(new_env,o) - rm(o) - } - } else { - # Stars processing - stop("Not yet implemented!") - } - } - - # If stars convert back to stars object - if(inherits(env, 'stars')){ - # Add the original layers back - for(name in names(env)){ - env_list[[name]] <- as(env[name], 'Raster') # Specify original raster - } - - # Convert list back to stars - new_env <- do.call( - stars:::c.stars, - lapply(env_list, function(x) stars::st_as_stars(x)) - ) - # Reset names of attributes - names(new_env) <- c( cutoffs$deriv, names(env)) - new_env <- stars::st_set_dimensions(new_env, which = 3, values = times, names = "time") - } - return(new_env) -} - -#' Homogenize NA values across a set of predictors. -#' -#' @description This method allows the homogenization of missing data across a set of environmental predictors. -#' It is by default called when predictors are added to [´BiodiversityDistribution´] object. Only grid cells with NAs that contain -#' values at some raster layers are homogenized. -#' Additional parameters allow instead of homogenization to fill the missing data with neighbouring values -#' @param env A [`Raster`] object with the predictors -#' @param fill A [`logical`] value indicating whether missing data are to be filled (Default: FALSE). 
-#' @param fill_method A [`character`] of the method for filling gaps to be used (Default: 'ngb')
-#' @param return_na_cells A [`logical`] value of whether the ids of grid cells with NA values is to be returned instead (Default: FALSE)
-#' @returns A [`Raster`] object with the same number of layers as the input.
-#' @keywords utils
-#' @export
-predictor_homogenize_na <- function(env, fill = FALSE, fill_method = 'ngb', return_na_cells = FALSE){
-  assertthat::assert_that(
-    is.Raster(env) || inherits(env, 'stars'),
-    is.logical(fill),
-    is.character(fill_method), fill_method %in% c('ngb'),
-    is.logical(return_na_cells)
-  )
-  # Workflow for raster layers
-  if(is.Raster(env)){
-    nl <- raster::nlayers(env)
-    # If the number of layers is 1, no need for homogenization
-    if(nl > 1){
-      # Calculate number of NA grid cells per stack
-      mask_na <- sum( is.na(env) )
-      # Remove grid cells that are equal to the number of layers (all values NA)
-      none_area <- mask_na == nl
-      none_area[none_area == 0 ] <- NA
-      mask_na <- raster::mask(mask_na,
-                              mask = none_area,inverse = TRUE)
-
-      # Should any fill be conducted?
-      if(fill){
-        stop('Not yet implemented!')
-      } else {
-        # Otherwise just homogenize NA values across predictors
-        if(cellStats(mask_na,'max')>0){
-          mask_all <- mask_na == 0; mask_all[mask_all == 0] <- NA
-          env <- raster::mask(env, mask = mask_all)
-        }
-      }
-      # Should NA coordinates of cells where 1 or more predictor is NA be returned?
-      # FIXME: One could directly return a data.frame with the predictor names to allow easier lookup.
-      if(return_na_cells){
-        vals <- which((mask_na>0)[])
-        env <- list(cells_na = vals, env = env)
-      }
-      rm(mask_na, none_area) # Cleanup
-    }
-  } else if(inherits(env, 'stars')){
-    stop('Not implemented yet.')
-  }
-  # Security checks
-  assertthat::assert_that(
-    is.Raster(env) || is.list(env) || inherits(env, 'stars')
-  )
-  # Return the result
-  return(env)
-}
+#' @include utils.R utils-spatial.R
+NULL
+
+#' Spatial adjustment of environmental predictors and raster stacks
+#'
+#' @description
+#' This function allows the transformation of provided environmental predictors (in [`Raster`] format).
+#' A common use case is for instance the standardization (or scaling) of all predictors prior to model fitting.
+#' This function works with [`Raster`] as well as with [`stars`] objects.
+#' @details
+#' Available options are:
+#' * \code{'none'} The original layer(s) are returned.
+#' * \code{'scale'} This runs the [`scale()`] function with default settings (1 Standard deviation) across all predictors.
+#' A sensible default for most model fitting.
+#' * \code{'norm'} This normalizes all predictors to a range from \code{0-1}.
+#' * \code{'windsor'} This applies a 'windsorization' to an existing raster layer by setting the lowest, respectively
+#' largest, values to the value at a certain percentage level (e.g. 95%). Those can be set via the parameter \code{"windsor_props"}.
+#' * \code{'windsor_thresh'} Same as option 'windsor', however in this case values are clamped to fixed thresholds
+#' rather than to percentages calculated from the data.
+#' * \code{'percentile'} This converts and bins all values into percentiles, e.g. the top 10% or lowest 10% of values and so on.
+#' * \code{'pca'} This option runs a principal component decomposition of all predictors (via [`prcomp()`]).
+#' It returns new predictors resembling all components in order of the most important ones. It can be useful to
+#' reduce collinearity; however, note that this changes all predictor names to 'PCX', where X is the number of the component.
+#' The parameter \code{'pca.var'} can be modified to specify the minimum variance to be covered by the axes.
+#' * \code{'revjack'} Removes outliers from the supplied stack via a reverse jackknife procedure.
+#' Identified outliers are by default set to \code{NA}.
+#'
+#' @param env A [`Raster`] object.
+#' @param option A [`vector`] stating whether predictors should be preprocessed in any way (Options: \code{'none'},
+#' \code{'scale'}, \code{'norm'}, \code{'windsor'}, \code{'windsor_thresh'}, \code{'percentile'}, \code{'pca'}, \code{'revjack'}). See Details.
+#' @param windsor_props A [`numeric`] vector specifying the proportions to be clipped for windsorization (Default: \code{c(.05,.95)}).
+#' @param pca.var A [`numeric`] value between \code{>0} and \code{1} stating the minimum amount of variance to be covered (Default: \code{0.8}).
+#' @param method Same as \code{'option'}; allows more intuitive method setting. Can be left empty (in this case option has to be set).
+#' @param ... other options (not specified).
+#' @returns Returns an adjusted [`Raster`] object of identical resolution.
+#' @seealso predictor_derivate
+#' @examples
+#' \dontrun{
+#' # Where x is a RasterStack
+#' new_x <- predictor_transform(x, option = 'scale')
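+#' # The other options follow the same interface; for instance a normalization
+#' # or a PCA keeping the components that cover 80% of the variance
+#' # (a sketch, still assuming x is a RasterStack):
+#' new_norm <- predictor_transform(x, option = 'norm')
+#' new_pca <- predictor_transform(x, option = 'pca', pca.var = 0.8)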
+#' }
+#' @keywords utils
+#' @export
+predictor_transform <- function(env, option, windsor_props = c(.05,.95), pca.var = 0.8, method = NULL, ...){
+  assertthat::assert_that(
+    inherits(env,'Raster') || inherits(env, 'stars'),
+    # Support multiple options
+    is.numeric(windsor_props) & length(windsor_props)==2,
+    is.numeric(pca.var)
+  )
+  # Convenience function
+  if(missing(option)){
+    assertthat::assert_that(!is.null(method))
+    option <- method
+  }
+  assertthat::assert_that(
+    is.character(option),
+    base::length(option) == 1
+  )
+  # Match option
+  option <- match.arg(option, c('none','pca', 'scale', 'norm','windsor', 'windsor_thresh', 'revjack', 'percentile'), several.ok = FALSE)
+
+  # Nothing to be done
+  if(option == 'none') return(env)
+
+  # If stars see if we can convert it to a stack
+  if(inherits(env, 'stars')){
+    lyrs <- names(env) # Names of predictors
+    times <- stars::st_get_dimension_values(env, which = 3) # Assume this being the time attribute
+    dims <- stars::st_dimensions(env)
+    # Convert to list
+    env_list <- list()
+    for(name in lyrs) env_list[[name]] <- methods::as(env[name], 'Raster')
+  } else {
+    # Get times in case a stack is supplied (this can get lost depending on transformation)
+    times <- raster::getZ(env)
+  }
+
+  # Normalization
+  if(option == 'norm'){
+    if(is.Raster(env)){
+      out <- (env - raster::cellStats(env, stat="min")) /
+        (raster::cellStats(env, stat="max") -
+           raster::cellStats(env, stat="min"))
+    } else {
+      out <- lapply(env_list, function(x) {
+        (x - raster::cellStats(x, stat="min")) /
+          (raster::cellStats(x, stat="max") -
+             raster::cellStats(x, stat="min"))
+      })
+    }
+  }
+  # Scaling
+  if(option == 'scale'){
+    if(is.Raster(env)){
+      out <- raster::scale(env, center = TRUE, scale = TRUE)
+    } else {
+      out <- lapply(env_list, function(x) raster::scale(x, center = TRUE, scale = TRUE))
+    }
+  }
+
+  # Percentile cutting
+  if(option == 'percentile'){
+    if(is.Raster(env)){
+      perc <- raster::quantile(env, seq(0,1, length.out = 11))
+      perc <- unique(perc)
+      out <- raster::cut(env, perc)
+    } else {
+      out <- lapply(env_list, function(x) {
+        perc <- raster::quantile(x, seq(0,1, length.out = 11))
+        perc <- unique(perc)
+        raster::cut(x, perc)
+      })
+    }
+  }
+
+  # Windsorization
+  if(option == 'windsor'){
+    win <- function(x, windsor_props){
+      xq <- stats::quantile(x = x[], probs = windsor_props, na.rm = TRUE)
+      min.value <- xq[1]
+      max.value <- xq[2]
+      if(is.vector(x)) out <- units::drop_units(x) else out <- x
+      out[out < min.value] <- min.value
+      out[out > max.value] <- max.value
+      out
+    }
+    if(is.Raster(env)){
+      out <- win(env, windsor_props )
+    } else {
+      out <- lapply(env_list, function(x) win(x, windsor_props))
+    }
+  } else if(option == 'windsor_thresh'){
+    win <- function(x, windsor_thresh){
+      if(is.vector(x)) out <- units::drop_units(x) else out <- x
+      out[out < windsor_thresh[1]] <- windsor_thresh[1]
+      out[out > windsor_thresh[2]] <- windsor_thresh[2]
+      out
+    }
+    if(is.Raster(env)){
+      out <- win(env, windsor_props )
+    } else {
+      out <- lapply(env_list, function(x) win(x, windsor_props))
+    }
+  }
+
+  # Reverse jackknife removal of outliers
+  if(option == 'revjack'){
+    rj <- function(x){
+      o <- emptyraster(x)
+      o[] <- rm_outlier_revjack(x[], procedure = "missing")
+      return(o)
+    }
+    if(is.Raster(env)){
+      out <- raster::stack()
+      for(n in 1:nlayers(env)){
+        out <- raster::addLayer(out, rj(env[[n]]) )
+      }
+    } else {
+      out <- lapply(env_list, function(x) rj(x))
+    }
+  }
+
+  # Principal component separation of variables
+  # Inspiration taken from RSToolbox package
+  if(option == 'pca'){
+    if(is.Raster(env)){
+      assertthat::assert_that(raster::nlayers(env)>=2,msg = 'Need at least two predictors to calculate PCA.')
+
+      # FIXME: Allow a reduction to fewer components than the number of layers?
+      nComp <- nlayers(env)
+      # Construct mask of all cells
+      envMask <- !sum(raster::calc(env, is.na))
+      assertthat::assert_that(cellStats(envMask, sum)>0,msg = 'A predictor is either NA only or has no valid values across all layers')
+      env <- raster::mask(env, envMask, maskvalue = 0)
+
+      # Sample covariance from stack and fit PCA
+      covMat <- raster::layerStats(env, stat = "cov", na.rm = TRUE)
+      pca <- stats::princomp(covmat = covMat[[1]], cor = FALSE)
+      # Add means and grid cells
+      pca$center <- covMat$mean
+      pca$n.obs <- raster::ncell(env)
+
+      # Check how many components are requested:
+      if(pca.var<1){
+        sums <- stats::loadings( summary(pca) )[]
+        props <- cumsum(colSums(sums^2) / nrow(sums)) # Cumulative explained variance
+        nComp <- length( which(props <= pca.var) )
+      }
+      # Predict principal components
+      out <- raster::predict(env, pca,na.rm = TRUE, index = 1:nComp)
+      names(out) <- paste0("PC", 1:nComp)
+
+      return(out)
+    } else {
+      # TODO:
+      stop("Principal component transformation for stars objects is not yet implemented. Pre-process externally!")
Pre-process externally!") + } + } + + # If stars convert back to stars object + if(inherits(env, 'stars')){ + # Convert list back to stars + out <- do.call( + stars:::c.stars, + lapply(out, function(x) stars::st_as_stars(x)) + ) + # Reset names of attributes + names(out) <- lyrs + # FIXME: Hacky solution, but breaks other scenarios otherwise + out2 <- try({stars::st_set_dimensions(out, which = 3, values = times, names = "time")},silent = TRUE) + if(inherits(out2, "try-error")){ + # This happens when a stars provided layer has only a single time band + out <- stars::st_redimension(out, new_dims = dims) # use the previously saved dimensions + } else { out <- out2; out2} + } else { + # Final security checks + assertthat::assert_that( + raster::nlayers(env) == raster::nlayers(out), + is_comparable_raster(out, env) + ) + # Reset times + if(!is.null(times)) out <- raster::setZ(out, times) + + return(out) + } +} + +#' Create spatial derivative of raster stacks +#' +#' @description +#' This function creates derivatives of existing covariates and returns them in Raster format. +#' Derivative variables can in the machine learning literature commonly be understood as one aspect of feature +#' engineering. They can be particularly powerful in introducing non-linearities in otherwise linear models, +#' for example is often done in the popular Maxent framework. +#' @details +#' Available options are: +#' * \code{'none'} - The original layer(s) are returned. +#' * \code{'quadratic'} - A quadratic transformation (\eqn{x^{2}}) is created of the provided layers. +#' * \code{'hinge'} - Creates hinge transformation of covariates, which set all values lower than a set threshold to \code{0} +#' and all others to a range of \eqn{[0,1]}. The number of thresholds and thus new derivates is specified +#' via the parameter \code{'nknots'} (Default: \code{4}). +#' * \code{'interaction'} - Creates interactions between variables. Target variables have to be specified via \code{"int_variables"}. +#' * \code{'thresh'} - A threshold transformation of covariates, which sets all values lower than a set threshold ot +#' \code{0} and those larger to \code{1}. +#' The number of thresholds and thus new derivates is specified via the parameter \code{'nknots'} (Default: \code{4}). +#' * \code{'bin'} - Creates a factor representation of a covariates by cutting the range of covariates by their percentiles. +#' The number of percentile cuts and thus new derivates is specified via the parameter \code{'nknots'} (Default: \code{4}). +#' @param env A [`Raster`] object. +#' @param option A [`vector`] stating whether predictors should be preprocessed in any way +#' (Options: \code{'none'}, \code{'quadratic'}, \code{'hinge'}, \code{'thresh'}, \code{'bin'}). +#' @param nknots The number of knots to be used for the transformation (Default: \code{4}). +#' @param deriv A [`vector`] with [`characters`] of specific derivates to create (Default: \code{NULL}). +#' @param int_variables A [`vector`] with length greater or equal than \code{2} specifying the covariates (Default: \code{NULL}). +#' @param method As \code{'option'} for more intuitive method setting. Can be left empty (in this case option has to be set). +#' @param ... other options (Non specified). +#' @return Returns the derived adjusted [`Raster`] objects of identical resolution. +#' @seealso predictor_derivate +#' @examples +#' \dontrun{ +#' # Create a hinge transformation of one or multiple RasterLayers. 
+#' }
+#' @keywords utils
+#' @export
+predictor_derivate <- function(env, option, nknots = 4, deriv = NULL, int_variables = NULL, method = NULL, ...){
+  assertthat::assert_that(
+    inherits(env,'Raster') || inherits(env, "stars"),
+    !missing(env),
+    is.numeric(nknots) && nknots > 1,
+    is.null(deriv) || is.character(deriv),
+    is.null(int_variables) || is.vector(int_variables)
+  )
+  # Convenience function
+  if(missing(option)){
+    assertthat::assert_that(!is.null(method))
+    option <- method
+  }
+  assertthat::assert_that(
+    is.character(option),
+    base::length(option) == 1
+  )
+  # Match argument.
+  option <- match.arg(option, c('none','quadratic', 'hinge', 'thresh', 'bin', 'interaction'), several.ok = FALSE)
+
+  # None, return as is
+  if(option == 'none') return(env)
+
+  # If stars see if we can convert it to a stack
+  if(inherits(env, 'stars')){
+    assertthat::assert_that(!is.null(deriv),msg = "Derivate names could not be found!")
+    # Decompose derivate variable names if set
+    deriv <- grep(paste0(option, "__"), deriv, value = TRUE)
+    if(length(deriv)==0){
+      if(getOption('ibis.setupmessages')) myLog('[Setup]','red','Predictors with derivates not found!')
+      return(NULL)
+    }
+    cutoffs <- do.call(rbind,strsplit(deriv, "__")) |> as.data.frame()
+    cutoffs$deriv <- deriv
+
+    lyrs <- names(env) # Names of predictors
+    times <- stars::st_get_dimension_values(env, which = 3) # Time attribute
+    # Create a list to house the results
+    env_list <- list()
+    for(name in cutoffs$deriv){
+      env_list[[name]] <- methods::as(env[cutoffs[which(cutoffs$deriv==name),2]], 'Raster') # Specify original raster
+    }
+    assertthat::assert_that(length(env_list) > 0)
+  } else {cutoffs <- NULL}
+
+  # Simple quadratic transformation
+  if(option == 'quadratic'){
+    if(is.Raster(env)){
+      if(raster::nlayers(env)==1){
+        new_env <- env^2
+      } else {
+        new_env <- raster::calc(env, function(x) I(x^2))
+      }
+      names(new_env) <- paste0('quad__', names(env))
+    } else {
+      # Stars processing
+      new_env <- lapply(env_list, function(x) {
+        raster::calc(x, function(z) I(z^2))
+      })
+    }
+  }
+
+  # Hinge transformation
+  # From the `maxnet` package
+  if(option == 'hinge'){
+    if(is.Raster(env)){
+      # Build new stacks
+      new_env <- raster::stack()
+      for(val in names(env)){
+        o <- makeHinge(env[[val]], n = val, nknots = nknots, cutoffs = cutoffs)
+        if(is.null(o)) next()
+        new_env <- raster::addLayer(new_env,
+                                    fill_rasters(o, emptyraster(env) )
+        )
+        rm(o)
+      }
+    } else {
+      # Stars object
+      for(val in names(env_list)){
+        # Format cutoffs
+        cu <- cutoffs[which(cutoffs$deriv == val), 3]
+        cu <- strsplit(cu, "_") |> unlist()
+        # Remove any leading points
+        if(any(substr(cu,1, 1)==".")){
+          cu[which(substr(cu,1, 1)==".")] <- gsub("^.","",cu[which(substr(cu,1, 1)==".")])
+        }
+        cu <- as.numeric(cu)
+        assertthat::assert_that(!anyNA(cu), is.numeric(cu))
+        for(k in 1:nlayers(env_list[[val]])){
+          o <- emptyraster(env_list[[val]][[k]])
+          o[] <- hingeval(env_list[[val]][[k]][], cu[1], cu[2])
+          env_list[[val]][[k]] <- o
+          rm(o)
+        }
+      }
+      invisible(gc())
+    }
+  }
+
+  # For thresholds
+  # Takes functionality from the maxnet package
+  if(option == 'thresh'){
+    if(is.Raster(env)){
+      new_env <- raster::stack()
+      for(val in names(env)){
+        o <- makeThresh(env[[val]],n = val,nknots = nknots, cutoffs = cutoffs)
+        if(is.null(o)) next()
+        new_env <- raster::addLayer(new_env,
+                                    fill_rasters(o, emptyraster(env))
+        )
+        rm(o)
+      }
+    } else {
+      # For stars layers
+      for(val in names(env_list)){
+        # Format cutoffs
+        cu <- cutoffs[which(cutoffs$deriv == val), 3]
+        cu <- strsplit(cu, "_") |> unlist()
+        # Remove any leading points
+        if(any(substr(cu,1, 1)==".")){
+          cu[which(substr(cu,1, 1)==".")] <- gsub("^.","",cu[which(substr(cu,1, 1)==".")])
+        }
+        cu <- as.numeric(cu)
+        assertthat::assert_that(!anyNA(cu), is.numeric(cu))
+        for(k in 1:nlayers(env_list[[val]])){
+          o <- emptyraster(env_list[[val]][[k]])
+          o[] <- thresholdval(env_list[[val]][[k]][], cu)
+          env_list[[val]][[k]] <- o
+          rm(o)
+        }
+      }
+      invisible(gc())
+    }
+  }
+
+  # For binning, calculate cuts of thresholds
+  if(option == 'bin'){
+    if(is.Raster(env)){
+      new_env <- raster::stack()
+      for(val in names(env)){
+        o <- makeBin(env[[val]],n = val,nknots = nknots, cutoffs = cutoffs)
+        if(is.null(o)) next()
+        new_env <- raster::addLayer(new_env, o)
+        rm(o)
+      }
+    } else {
+      # For stars layers
+      for(val in names(env_list)){
+        # Format cutoffs
+        cu <- cutoffs[which(cutoffs$deriv == val), 3]
+        cu <- strsplit(cu, "_") |> unlist()
+        # Remove any leading points
+        if(any(substr(cu,1, 1)==".")){
+          cu[which(substr(cu,1, 1)==".")] <- gsub("^.","",cu[which(substr(cu,1, 1)==".")])
+        }
+        cu <- as.numeric(cu)
+        assertthat::assert_that(!anyNA(cu), is.numeric(cu))
+        for(k in 1:nlayers(env_list[[val]])){
+          o <- emptyraster(env_list[[val]][[k]])
+          o <- raster::cut(env_list[[val]][[k]], cu)
+          o[is.na(o)] <- 0
+          o <- raster::mask(o, env_list[[val]][[k]] )
+          env_list[[val]][[k]] <- o
+          rm(o)
+        }
+      }
+      invisible(gc())
+    }
+  }
+
+  # Create interaction variables
+  if(option == 'interaction'){
+    # Check whether interaction is provided or an attribute
+    if(is.null(int_variables)){
+      int_variables <- attr(env, "int_variables")
+    }
+    assertthat::assert_that(is.vector(int_variables))
+
+    if(is.Raster(env)){
+      # Make unique combinations
+      ind <- utils::combn(int_variables, 2)
+
+      # Now for each combination build new variable
+      new_env <- raster::stack()
+
+      for(i in 1:ncol(ind)){
+        # Multiply first with second entry
+        o <- env[[ind[1,i]]] * env[[ind[2,i]]]
+        names(o) <- paste0('inter__', names(env)[ind[1,i]],".",names(env)[ind[2,i]])
+        new_env <- raster::addLayer(new_env,o)
+        rm(o)
+      }
+    } else {
+      # Stars processing
+      stop("Not yet implemented!")
+    }
+  }
+
+  # If stars convert back to stars object
+  if(inherits(env, 'stars')){
+    # Add the original layers back
+    for(name in names(env)){
+      env_list[[name]] <- methods::as(env[name], 'Raster') # Specify original raster
+    }
+
+    # Convert list back to stars
+    new_env <- do.call(
+      stars:::c.stars,
+      lapply(env_list, function(x) stars::st_as_stars(x))
+    )
+    # Reset names of attributes
+    names(new_env) <- c( cutoffs$deriv, names(env))
+    new_env <- stars::st_set_dimensions(new_env, which = 3, values = times, names = "time")
+  }
+  return(new_env)
+}
+
+#' Homogenize NA values across a set of predictors.
+#'
+#' @description This method allows the homogenization of missing data across a set of environmental predictors.
+#' It is by default called when predictors are added to a [`BiodiversityDistribution`] object. Only grid cells with NAs that contain
+#' values in some raster layers are homogenized.
+#' Additional parameters allow filling the missing data with neighbouring values instead of homogenizing them.
+#' @param env A [`Raster`] object with the predictors.
+#' @param fill A [`logical`] value indicating whether missing data are to be filled (Default: FALSE).
+#' @param fill_method A [`character`] of the method for filling gaps to be used (Default: 'ngb').
+#' @param return_na_cells A [`logical`] value indicating whether the ids of grid cells with NA values are to be returned instead (Default: FALSE).
+#' @returns A [`Raster`] object with the same number of layers as the input.
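+#' @examples
+#' \dontrun{
+#' # A minimal sketch, assuming env is a RasterStack of predictors:
+#' # homogenize NA cells across all layers
+#' env <- predictor_homogenize_na(env)
+#' # or additionally return the ids of cells with any NA value
+#' res <- predictor_homogenize_na(env, return_na_cells = TRUE)
+#' }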
+#' @keywords utils
+#' @export
+predictor_homogenize_na <- function(env, fill = FALSE, fill_method = 'ngb', return_na_cells = FALSE){
+  assertthat::assert_that(
+    is.Raster(env) || inherits(env, 'stars'),
+    is.logical(fill),
+    is.character(fill_method), fill_method %in% c('ngb'),
+    is.logical(return_na_cells)
+  )
+  # Workflow for raster layers
+  if(is.Raster(env)){
+    nl <- raster::nlayers(env)
+    # If the number of layers is 1, no need for homogenization
+    if(nl > 1){
+      # Calculate number of NA grid cells per stack
+      mask_na <- sum( is.na(env) )
+      # Remove grid cells that are equal to the number of layers (all values NA)
+      none_area <- mask_na == nl
+      none_area[none_area == 0 ] <- NA
+      mask_na <- raster::mask(mask_na,
+                              mask = none_area,inverse = TRUE)
+
+      # Should any fill be conducted?
+      if(fill){
+        stop('Not yet implemented!')
+      } else {
+        # Otherwise just homogenize NA values across predictors
+        if(cellStats(mask_na,'max')>0){
+          mask_all <- mask_na == 0; mask_all[mask_all == 0] <- NA
+          env <- raster::mask(env, mask = mask_all)
+        }
+      }
+      # Should NA coordinates of cells where 1 or more predictor is NA be returned?
+      # FIXME: One could directly return a data.frame with the predictor names to allow easier lookup.
+      if(return_na_cells){
+        vals <- which((mask_na>0)[])
+        env <- list(cells_na = vals, env = env)
+      }
+      rm(mask_na, none_area) # Cleanup
+    }
+  } else if(inherits(env, 'stars')){
+    stop('Not implemented yet.')
+  }
+  # Security checks
+  assertthat::assert_that(
+    is.Raster(env) || is.list(env) || inherits(env, 'stars')
+  )
+  # Return the result
+  return(env)
+}
+
+#### Filter predictor functions ----
+
+#' Filter a set of correlated predictors to fewer ones
+#'
+#' @description
+#' This function helps to remove highly correlated variables from a set of predictors. It supports multiple options,
+#' some of which require both environmental predictors and observations, while others require only predictors.
+#'
+#' Some of the options require different packages to be pre-installed, such as [ranger] or [Boruta].
+#'
+#' @details
+#' Available options are:
+#'
+#' * \code{"none"} No prior variable removal is performed (Default).
+#' * \code{"pearson"}, \code{"spearman"} or \code{"kendall"} Makes use of pairwise comparisons to identify and
+#' remove highly collinear predictors (Pearson's \code{r >= 0.7}).
+#' * \code{"abess"} A-priori adaptive best subset selection of covariates via the [abess] package (see References). Note that this
+#' effectively fits a separate generalized linear model to reduce the number of covariates.
+#' * \code{"boruta"} Uses the [Boruta] package to identify non-informative features.
+#'
+#' @note
+#' Using this function on predictors effectively means that a separate model is fitted on the data,
+#' with all the assumptions that come with it (e.g. linearity, appropriateness of response, normality, etc.).
+#'
+#' @param env A [`data.frame`] or [`matrix`] with extracted environmental covariates for a given species.
+#' @param obs A [`vector`] with observational records to use for determining variable importance. Can be \code{NULL}.
+#' @param keep A [`vector`] with variables to keep regardless. These are usually variables for which prior
+#' information is known.
+#' @param method Which method to use for constructing the correlation matrix (Options: \code{'pearson'} (Default),
+#' \code{'spearman'}, \code{'kendall'}, \code{"abess"}, or \code{"boruta"}).
+#' @param ... Other options for a specific method.
+#'
+#' @keywords utils
+#' @return A [`character`] [`vector`] of variable names to be excluded.
+#' If the function fails for some reason, \code{NULL} is returned.
+#' @examples
+#' \dontrun{
+#' # Remove highly correlated predictors
+#' env <- predictor_filter(env, method = "pearson")
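+#' # The other filtering methods use the same interface, e.g. a Boruta-based
+#' # screening (a sketch; requires the Boruta package and passes the
+#' # observation vector obs on to the internal routine):
+#' env <- predictor_filter(env, method = "boruta", observed = obs)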
+#' }
+#' @export
+predictor_filter <- function( env, keep = NULL, method = "pearson", ...){
+  assertthat::assert_that(
+    is.data.frame(env) || is.matrix(env),
+    ncol(env) >2,
+    is.null(keep) || is.vector(keep),
+    is.character(method)
+  )
+  # Match the method
+  method <- match.arg(method,
+                      c("none", "pearson", "spearman", "kendall", "abess", "boruta"),
+                      several.ok = FALSE)
+
+  # Now apply the filter depending on the method
+  if(method == "none"){
+    co <- NULL
+  } else if(method %in% c("pearson", "spearman", "kendall")){
+    # Simple collinearity check based on collinear predictors
+    co <- predictors_filter_collinearity(
+      env, keep = keep, method = method, ...
+    )
+  } else if(method == "abess"){
+    if(getOption('ibis.setupmessages')) myLog('[Estimation]','yellow','Applying abess method to reduce predictors...')
+    co <- predictors_filter_abess(
+      env = env, keep = keep, method = method, ...
+    )
+  } else if(method == "boruta"){
+    check_package("Boruta")
+    co <- predictors_filter_boruta(
+      env = env, keep = keep, method = method, ...
+    )
+
+  } else {
+    stop("Method not yet implemented!")
+  }
+
+  # Security checks and return
+  assertthat::assert_that(is.null(co) || is.character(co))
+  return(co)
+}
+
+#' Identify collinear predictors
+#'
+#' @inheritParams predictor_filter
+#' @param cutoff A [`numeric`] variable specifying the maximal correlation cutoff.
+#' @concept Code inspired by the [`caret`] package
+#' @keywords utils, internal
+#' @returns [`vector`] of variable names to exclude
+predictors_filter_collinearity <- function( env, keep = NULL, cutoff = getOption('ibis.corPred'), method = 'pearson', ...){
+  # Security checks
+  assertthat::assert_that(is.data.frame(env),
+                          is.character(method),
+                          is.numeric(cutoff),
+                          is.null(keep) || is.vector(keep)
+  )
+  keep <- keep[keep %in% names(env)] # Remove those not in the data.frame, for instance if a spatial effect is selected
+  if(!is.null(keep) && length(keep) > 0) x <- env |> dplyr::select(-dplyr::all_of(keep)) else x <- env
+
+  # Removing non-numeric columns
+  non.numeric.columns <- colnames(x)[!sapply(x, is.numeric)]
+  x <- x[, !(colnames(x) %in% non.numeric.columns)]
+
+  # Get all variables that are singular or unique in value
+  singular_var <- which(round( apply(x, 2, var),4) == 0)
+  if(length(singular_var)>0) x <- x[,-singular_var]
+
+  # Calculate correlation matrix
+  cm <- stats::cor(x, method = method)
+
+  # Copied from the \code{caret} package to avoid further dependencies
+  if (any(!stats::complete.cases(cm))) stop("The correlation matrix has some missing values.")
+  averageCorr <- colMeans(abs(cm))
+  averageCorr <- as.numeric(as.factor(averageCorr))
+  cm[lower.tri(cm, diag = TRUE)] <- NA
+
+  # Determine combinations over cutoff
+  combsAboveCutoff <- which(abs(cm) > cutoff)
+  colsToCheck <- ceiling(combsAboveCutoff/nrow(cm))
+  rowsToCheck <- combsAboveCutoff%%nrow(cm)
+
+  # Exclude columns with variables over average correlation
+  colsToDiscard <- averageCorr[colsToCheck] > averageCorr[rowsToCheck]
+  rowsToDiscard <- !colsToDiscard
+
+  # Get columns to discard
+  deletecol <- c(colsToCheck[colsToDiscard], rowsToCheck[rowsToDiscard])
+  deletecol <- unique(deletecol)
+
+  # Which variables to discard
+  o <- names(env)[deletecol]
+  if(length(singular_var)>0) o <- unique( c(o, names(singular_var) ) )
+  o
+}
+
+#' Apply the adaptive best subset selection framework on a set of predictors
+#'
+#' @description
+#' This is a wrapper function to fit the adaptive subset selection procedure outlined
+#' in Zhu et al. (2021) and Zhu et al. (2020).
+#' @inheritParams predictor_filter
+#'
+#' @param family A [`character`] indicating the family the observational data originates from.
+#' @param tune.type [`character`] indicating the type used for subset evaluation.
+#' Options are \code{c("gic", "ebic", "bic", "aic", "cv")} as listed in [abess].
+#' @param lambda A [`numeric`] single lambda value for regularized best subset selection (Default: \code{0}).
+#' @param weight Observation weights. When weight = \code{NULL}, we set weight = \code{1} for each observation as default.
+#' @references
+#' * abess: A Fast Best Subset Selection Library in Python and R. Jin Zhu, Liyuan Hu, Junhao Huang, Kangkang Jiang, Yanhang Zhang, Shiyun Lin, Junxian Zhu, Xueqin Wang (2021). arXiv preprint arXiv:2110.09697.
+#' * A polynomial algorithm for best-subset selection problem. Junxian Zhu, Canhong Wen, Jin Zhu, Heping Zhang, Xueqin Wang. Proceedings of the National Academy of Sciences Dec 2020, 117 (52) 33117-33123; doi: 10.1073/pnas.2014241117
+#' @keywords utils, internal
+#' @returns A [`vector`] of variable names to exclude.
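+#' @examples
+#' \dontrun{
+#' # Sketch of a direct call of this internal helper, assuming env is a
+#' # data.frame of covariates and obs a binary observation vector:
+#' out <- predictors_filter_abess(env, observed = obs, method = "abess",
+#'                                family = "binomial")
+#' }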
+predictors_filter_abess <- function( env, observed, method, family, tune.type = "cv", lambda = 0,
+                                     weight = NULL, keep = NULL, ...){
+  # Security checks
+  assertthat::assert_that(is.data.frame(env) || is.matrix(env),
+                          is.vector(observed),
+                          is.numeric(lambda),
+                          is.character(tune.type),
+                          is.null(keep) || is.vector(keep),
+                          is.null(weight) || is.vector(weight)
+  )
+  assertthat::assert_that(
+    length(observed) == nrow(env), msg = "Number of observations unequal to the number of covariate rows."
+  )
+  # Match family and type
+  family <- match.arg(family, c("gaussian", "binomial", "poisson", "cox", "mgaussian", "multinomial",
+                                "gamma"), several.ok = FALSE)
+  tune.type <- match.arg(tune.type, c("gic", "ebic", "bic", "aic", "cv"), several.ok = FALSE)
+
+  # Check that abess package is available
+  check_package("abess")
+  if(!isNamespaceLoaded("abess")) { attachNamespace("abess");requireNamespace('abess') }
+
+  # Build model
+  abess_fit <- abess::abess(x = env,
+                            y = observed,
+                            family = family,
+                            tune.type = tune.type,
+                            weight = weight,
+                            lambda = lambda,
+                            always.include = keep,
+                            nfolds = 100, # Increase from default 5
+                            num.threads = 0
+  )
+
+  if(anyNA(stats::coef(abess_fit)[,1]) ) {
+    # Refit with minimum support size
+    abess_fit <- abess::abess(x = env,
+                              y = observed,
+                              family = family,
+                              lambda = lambda,
+                              tune.type = tune.type,
+                              weight = weight,
+                              always.include = keep,
+                              nfolds = 100, # Increase from default 5
+                              # Minimum support size of 10% of the number of covariates
+                              support.size = ceiling(ncol(env) * 0.1),
+                              num.threads = 0
+    )
+
+  }
+  # Get best vars
+  co <- stats::coef(abess_fit, support.size = abess_fit[["best.size"]])
+  co <- names( which(co[,1] != 0))
+  co <- co[grep("Intercept", co, ignore.case = TRUE, invert = TRUE)]
+  # Make some checks on the list of reduced variables
+  if(length(co) <= 2) {
+    warning("Abess was likely too rigorous. Likely too low a signal-to-noise ratio.")
+    return(NULL)
+  } else {
+    co
+  }
+}
+
+#' All relevant feature selection using Boruta
+#'
+#' @description
+#' This function uses the [Boruta] package to identify predictor variables with little information content. It iteratively
+#' compares importances of attributes with importances of shadow attributes, created by shuffling original ones.
+#' Attributes that have significantly worse importance than shadow ones are consecutively dropped.
+#'
+#' @note
+#' This function relies on the [ranger] package to iteratively fit randomForest models.
+#'
+#' @inheritParams predictor_filter
+#' @param iter A [`numeric`] giving the maximal number of runs (Default: \code{100}). Increase if too many tentative attributes are left.
+#' @param verbose [`logical`] whether to be chatty.
+#' @references
+#' * Miron B. Kursa, Witold R. Rudnicki (2010). Feature Selection with the Boruta Package. Journal of Statistical Software, 36(11), 1-13. URL https://doi.org/10.18637/jss.v036.i11.
+#' @keywords utils, internal
+#' @returns A [`vector`] of variable names to exclude.
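+#' @examples
+#' \dontrun{
+#' # Sketch of a direct call of this internal helper, assuming env is a
+#' # data.frame of covariates and obs an observation vector:
+#' exclude <- predictors_filter_boruta(env, observed = obs, method = "boruta")
+#' }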
+predictors_filter_boruta <- function( env, observed, method, keep = NULL,
+                                      iter = 100, verbose = getOption('ibis.setupmessages'), ...){
+  # Security checks
+  assertthat::assert_that(is.data.frame(env) || is.matrix(env),
+                          is.null(observed) || is.vector(observed),
+                          is.null(keep) || is.vector(keep),
+                          is.numeric(iter), iter>10,
+                          is.logical(verbose)
+  )
+  check_package("Boruta")
+
+  # Get all variable names to test
+  vars <- names(env)
+
+  # Remove kept variables
+  if(!is.null(keep)){
+    keep <- keep[keep %in% vars] # Remove those not in the data.frame, for instance if a spatial effect is selected
+    if(!is.null(keep) && length(keep) > 0) {
+      env <- env |> dplyr::select(-dplyr::all_of(keep))
+      vars <- names(env)
+    }
+  }
+
+  # Check for other common variables unlikely to be important
+  if("Intercept" %in% vars) vars <- vars[-which(vars == "Intercept")]
+  if("intercept" %in% vars) vars <- vars[-which(vars == "intercept")]
+  if("ID" %in% vars) vars <- vars[-which(vars == "ID")]
+
+  # Apply boruta
+  bo_test <- Boruta::Boruta(env, y = observed,
+                            maxRuns = iter,
+                            # Verbosity
+                            doTrace = ifelse(verbose, 1, 0))
+
+  # Get from the bo_test object all variables that are clearly rejected
+  res <- bo_test$finalDecision
+  co <- names(res)[which(res == "Rejected")]
+  if(length(co)==0) co <- NULL
+  return(co)
+}
diff --git a/R/utils-scenario.R b/R/utils-scenario.R
index 7d6bf817..09a8d41a 100644
--- a/R/utils-scenario.R
+++ b/R/utils-scenario.R
@@ -1,609 +1,610 @@
-#' Approximate missing time steps between dates
-#'
-#' @description
-#' This function linearly approximates shares between time steps, so that gaps for instance
-#' between 2010 and 2020 are filled with data for 2010, 2011, 2012, etc.
-#' @param env A [`stars`] object.
-#' @param date_interpolation [`character`] on how missing dates between events should be interpolated. See [`project()`].
-#' @return [`logical`] indicating if the two [`Raster-class`] objects have the same
-#' @keywords scenario
-#' @noRd
-approximate_gaps <- function(env, date_interpolation = "annual"){
-  assertthat::assert_that(
-    inherits(env, "stars"),
-    is.character(date_interpolation)
-  )
-  check_package("dplyr")
-  date_interpolation <- match.arg(date_interpolation, c("none", "yearly", "annual", "monthly", "daily"), several.ok = FALSE)
-  if(date_interpolation=="none") return(env)
-
-  stop("Still in progress")
-  # --- #
-  # Get individual time steps at interval
-  times <- stars::st_get_dimension_values(env, which = names(dim(env))[3], center = TRUE)
-  times <- to_POSIXct(times)
-  tzone <- attr(as.POSIXlt(times), "tzone")[2] # Get timezone
-  assertthat::assert_that(tzone != "", length(times)>=2)
-  # Interpolate time steps
-  inc <- switch (date_interpolation,
-                 "yearly" = "year",
-                 "annual" = "year",
-                 "monthly" = "month",
-                 "daily" = "day"
-  )
-  new_times <- seq.Date(from = as.Date(times[1],tz = tzone), to = as.Date(times[length(times)],tz = tzone), by = inc)
-  new_times <- to_POSIXct(new_times)
-
-  # Linearly approximate all attributes for new object
-  # FIXME: Probably terribly memory inefficient but works
-  new <- as.data.frame(env)
-  assertthat::assert_that(assertthat::has_name(new,c("x","y","time")))
-  new <- dplyr::right_join(new, expand.grid(x = unique(new$x), y = unique(new$y), time = new_times),
-                           by = c("x", "y","time"))
-  # Sort by time
-  new <- new[order(new$time),]
-  # Now linearly interpolate the missing values per grid cell
-  new2 <- apply(new[,4:ncol(new)], 2, function(z){
-    # if(inherits(z, "POSIXct")) return(z)
-    if(all(is.na(z))) return(z)
-    stats::approx(y = z, x = as.numeric(new$time), method = "linear")
-  })
-
-  # Steps:
-  # empty_stars
-  # Join with existing one
-  # approxNA
-
-  tt <- as.numeric(new_times)
-  # Calc pixel-wise linear slope
-  out <- stars::st_apply(
-    env,
-    1:2,
-    function(x) {
-      if (anyNA(x))
-        NA_real_
-      else
-        stats::lm.fit(cbind(1, tt), x)$coefficients[2]
-    }
-  )
-
-  # new <- stars::st_redimension(out, along = list(time = new_times))
-}
-
-#' Aggregate stars variables across dimensions
-#'
-#' @description
-#' Small helper function that acts a wrapper to combine
-#' 2 or more variables in a
`stars` object together. -#' @note Currently only works via matrix manipulation -#' @param obj A [`stars`] object or a [`list`] that can be coerced to one. -#' @param vars A [`vector`] describing the variables to be combined. Has to be of -#' length two or greater. -#' @param newname A [`character`] with the new name for the variable. -#' @param fun A function how the respective layers should be combined. -#' @examples -#' \dontrun{ -#' st_reduce(obj, vars = c('forestShare', 'forestShare.2'), -#' newname = "forest",fun = "sum") -#' } -#' @keywords scenario, internal -st_reduce <- function(obj, vars, newname, fun = 'sum'){ - assertthat::assert_that( - is.list(obj) || inherits(obj, 'stars'), - is.vector(vars) && is.character(vars), - length(vars) >=2, - is.character(fun), - is.character(newname) - ) - fun <- match.arg(fun, c("sum", "multiply", "divide", "subtract", "mean"), several.ok = FALSE) - check_package('stars') - # Convert to stars if not already - if(!inherits(obj, 'stars')) obj <- stars::st_as_stars(obj) - # Check that variables are present - assertthat::assert_that( - all(vars %in% names(obj)) - ) - # Future? - if(foreach:::getDoParRegistered()){ - ibis_future(cores = getOption("ibis.nthread"), strategy = getOption("ibis.futurestrategy")) - fut <- TRUE - } else { fut <- FALSE } - # --- # - # First get all target variables and non-target variables - target <- stars:::select.stars(obj, vars) - non_target <- stars:::select.stars(obj, -vars) - - # Now apply the function on the target - # FIXME: Not working as intended and horribly slow - # target <- stars::st_apply(target, dims, fun, - # PROGRESS = TRUE, - # FUTURE = fut, - # ... - # ) - what <- switch (fun, - "sum" = "+", - "multiply" = "*", - "divide" = "/", - "subtract" = "-", - "mean" = "+" - ) - new <- stars::st_as_stars( Reduce(what, target) ) - if(what == "mean") new <- new / length(target) - stars::st_dimensions(new) <- stars::st_dimensions(target) - # Rename to newname - names(new) <- newname - - # Combine the result again with non-target - out <- c(non_target, new) - rm(new, non_target, target) - - return( - out - ) -} - -#' Converts a stars object to list of rasters -#' -#' @description -#' This is a small helper function to convert a [`stars`] object -#' to a [`Raster`] object. It is possible to select the time frame as well. -#' If multiple \code{"which"} entries are specified, then a [`list`] will be returned. -#' @param obj A [`stars`] object with a \code{"time"} dimension at least. -#' @param which The time entry to use for subsetting. Can be single [`numeric`] or a [`vector`] -#' of numeric time entries corresponding to the time dimension (Default: \code{NULL}). -#' @param template An optional [`Raster`] template to which the output should be aligned too. -#' @returns A [`list`] containing [`Raster`] objects. 
-#' @keywords scenario, internal -stars_to_raster <- function(obj, which = NULL, template = NULL){ - assertthat::assert_that( - inherits(obj, 'stars'), - is.null(which) || is.numeric(which), - is.null(template) || is.Raster(template) - ) - # Take name of third band, assuming it to be time - time_band <- names(dim(obj))[3] - - assertthat::assert_that( - length(which) <= dim(obj)[time_band] - ) - # Get time dimension and correct if specific entries are requested - times <- stars::st_get_dimension_values(obj, time_band, center = TRUE) - if(is.null(which)) { - which <- 1:length(times) # Use default length - } - - # Output type raster - out <- list() - for(tt in which){ - # Slice to a specific time frame for each - o <- obj %>% stars:::slice.stars({{time_band}}, tt) |> - as("Raster") - - # Reset times to the correct ones - o <- raster::setZ(o, rep(times[tt], raster::nlayers(o))) - - # Now transform the out put if template is set - if(!is.null(template)){ - if(is.Raster(template)){ - # Check again if necessary to rotate - if(!raster::compareCRS(o, template)){ - o <- raster::projectRaster(from = o, crs = template, method = "ngb") - names(o) <- names(obj) - } - # Now crop and resample to target extent if necessary - if(!compareRaster(o, template, stopiffalse = FALSE)){ - o <- raster::crop(o, template) - o2 <- try({alignRasters(data = o, - template = template, - method = "ngb", - func = "mean", cl = FALSE) - },silent = TRUE) - if(inherits(o2,"try-error")){ - o <- raster::resample(o, template, - method = "ngb") - } else { o <- o2; rm(o2)} - } - } - } # End of template adjustments - out[[paste0("time",times[tt])]] <- o - } - return( out ) -} - -#' Converts a raster object to stars -#' -#' @description -#' This is a small helper function to convert a to a [`Raster`] object. -#' @param obj A [`Raster`] object with a \code{"time"} dimension at least (checked via [`getZ`]). -#' @returns A [`stars`] object with the formatted data -#' @seealso `stars_to_raster` -#' @keywords scenario, internal -raster_to_stars <- function(obj){ - assertthat::assert_that( - is.Raster(obj) - ) - # Check that time dimension exist - assertthat::assert_that( !is.null( raster::getZ(obj) ), - msg = "The supplied object requires a z dimension! Preferably provide a stars object.") - assertthat::assert_that(!is.na(raster::crs(obj)), - msg = "Uniform projection for input raster is missing!") - - # Get time dimension - times <- raster::getZ(obj) - if(!all(inherits(times,"Date"))) times <- as.Date(times) - prj <- sf::st_crs(raster::crs(obj)) - - # Convert to RasterStack and reset time dimension - obj <- raster::stack(obj) - obj <- raster::setZ(obj, times) - # stars::make_intervals(times[1], times[2]) # For making intervals from start to end - - # Convert to stars step by step - new_env <- list() - for(i in 1:raster::nlayers(obj)){ - suppressWarnings( o <- stars::st_as_stars(obj[[i]]) ) - # If CRS is NA - if(is.na(sf::st_crs(o))) sf::st_crs(o) <- prj - - # Some hacky stuff since stars is not behaving as intended - dims <- stars::st_dimensions(o) - dims$time <- stars:::create_dimension(values = times[i]) - o <- stars::st_redimension(o,new_dims = dims) - - new_env[[names(obj)[i]]] <- o - } - - new_env <- do.call(stars:::c.stars, new_env) - assertthat::assert_that(inherits(new_env, "stars"), - stars::st_dimensions(new_env) |> length() == 3) - - return(new_env) -} - -#' This function add layers from a RasterStack to a stars object -#' -#' @description -#' Often it is necessary to add static variables to existing stars objects. 
-#' These will be replicated across the time dimension. This function is a small helper function -#' that allows the addition of said raster stacks to a stars object. -#' @param obj A [`stars`] object with a time dimension (\code{"time"}). -#' @param new A [`RasterStack`] object with additional covariates to be added. -#' @returns A [`stars`] object with the names of the [`Raster`] object added. -#' @keywords scenario, internal -st_add_raster <- function(obj, new){ - assertthat::assert_that( - inherits(obj, "stars"), - is.Raster(new), - raster::nlayers(new) >= 1 - ) - - # Check whether there are any variables in the stars object already, if so drop - if(any(names(new) %in% names(obj))){ - myLog("[Starting]", "yellow", "Duplicate variables in stars and new objects.") - new <- raster::dropLayer(new, which( names(new) %in% names(obj) ) ) - } - - full_dims <- stars::st_dimensions(obj) - # Get times objects - time_name <- names(full_dims)[3] - times <- rep(stars::st_get_dimension_values(obj, time_name)) - - # Now loop through each layer and add it to the target file - for(lyr in names(new)){ - s <- raster::stack(replicate(length(times), new[[lyr]])) |> - stars::st_as_stars() - names(s) <- lyr - - stars::st_dimensions(s) <- full_dims - obj <- c(obj, s) - } - assertthat::assert_that( - all(all(names(new) %in% names(obj))) - ) - return(obj) -} - -#' Summarize results from scenario projection object -#' -#' @description -#' This is a wrapper function to summarize the output of a scenario projection. The -#' output will contain the average change in the layer per time step. -#' A parameter called \code{"relative"} can be set to calculate relative change instead. -#' @param scenario A [`stars`] object with a time dimension. -#' @param relative A [`logical`] check whether to calculate relative changes instead. 
-#' @keywords internal, scenario -#' @noRd -summarise_projection <- function(scenario, fun = "mean", relative = TRUE){ - assertthat::assert_that( - is.list(scenario) || inherits(scenario, "stars"), - length(dim(scenario))==3, - is.logical(relative) - ) - fun <- match.arg(fun, c("mean", "sum"),several.ok = FALSE) - - # Convert to scenarios to data.frame - df <- stars:::as.data.frame.stars(stars:::st_as_stars(scenario)) %>% subset(., complete.cases(.)) - names(df) <- c("x", "y", "band", "suitability") - # Add grid cell grouping - df <- df %>% dplyr::group_by(x,y) %>% dplyr::mutate(id = dplyr::cur_group_id()) %>% - dplyr::ungroup() %>% dplyr::select(-x,-y) %>% - dplyr::arrange(id, band) - - # Summarize the overall moments - if(fun == "mean"){ - # Check if has unit, if so deparse - if(inherits(df$suitability, 'units')) df$suitability <- as.numeric(df$suitability) - out <- df %>% - dplyr::filter(suitability > 0) %>% - dplyr::group_by(band) %>% - dplyr::summarise(suitability_mean = mean(suitability, na.rm = TRUE), - suitability_q25 = quantile(suitability, .25), - suitability_q50 = quantile(suitability, .5), - suitability_q75 = quantile(suitability, .75)) - # Total amount of area lost / gained / stable since previous time step - totchange_occ <- df %>% - dplyr::group_by(id) %>% - dplyr::mutate(change = (suitability - dplyr::lag(suitability)) ) %>% dplyr::ungroup() - o <- totchange_occ %>% dplyr::group_by(band) %>% - dplyr::summarise(suitability_avggain = mean(change[change > 0]), - suitability_avgloss = mean(change[change < 0])) - - out <- out %>% dplyr::left_join(o, by = "band") - if(relative){ - # Finally calculate relative change to baseline (first entry) for all entries where this is possible - relChange <- function(v, fac = 100) (((v- v[1]) / v[1]) *fac) - out[,c("suitability_mean","suitability_q25", "suitability_q50", "suitability_q75")] <- apply( - out[,c("suitability_mean","suitability_q25", "suitability_q50", "suitability_q75")], 2, relChange) - } - } else if(fun == "sum") { - # Check if has unit, if so deparse - if(inherits(df$suitability, 'units')) df$suitability <- as.numeric(df$suitability) - out <- df %>% - dplyr::filter(suitability > 0) %>% - dplyr::group_by(band) %>% - dplyr::summarise(suitability_sum = sum(suitability, na.rm = TRUE), - suitability_q25 = quantile(suitability, .25), - suitability_q50 = quantile(suitability, .5), - suitability_q75 = quantile(suitability, .75)) - # Total amount of area lost / gained / stable since previous time step - totchange_occ <- df %>% - dplyr::group_by(id) %>% - dplyr::mutate(change = (suitability - dplyr::lag(suitability)) ) %>% dplyr::ungroup() - o <- totchange_occ %>% dplyr::group_by(band) %>% - dplyr::summarise(suitability_avggain = sum(change[change > 0]), - suitability_avgloss = sum(change[change < 0])) - - out <- out %>% dplyr::left_join(o, by = "band") - if(relative){ - # Finally calculate relative change to baseline (first entry) for all entries where this is possible - relChange <- function(v, fac = 100) (((v- v[1]) / v[1]) *fac) - out[,c("suitability_sum","suitability_q25", "suitability_q50", "suitability_q75")] <- apply( - out[,c("suitability_sum","suitability_q25", "suitability_q50", "suitability_q75")], 2, relChange) - } - } - - # Return output - return(out) -} - -#' Summarize change before to after -#' -#' @description -#' This is a wrapper function to summarize the output of a scenario projection, but specifically -#' calculates statistics of change for two time steps, a before and after step. 
-#' @param scenario A [`stars`] object with a time dimension. -#' @references -#' * Godsoe, W. (2014). Inferring the similarity of species distributions using Species’ Distribution Models. Ecography, 37(2), 130-136. -#' @keywords internal, scenario -#' @noRd -summarise_change <- function(scenario){ - assertthat::assert_that( - inherits(scenario, "stars") - ) - check_package("geosphere") - - # Get the current and future - ss <- stars_to_raster(scenario) - # Time period - times <- stars::st_get_dimension_values(scenario, 3,center = TRUE) - current <- ss[[1]] - future <- ss[[length(ss)]] - times_length <- round(as.numeric(difftime(times[length(times)], times[1], units = "weeks"))/52.25,0) - rm(ss) - - # Calculate the area and units - ar <- st_area(scenario) - ar_unit <- units::deparse_unit(ar$area) - if(ar_unit == "m2"){ - ar_unit <- "ha" - mult <- 0.0001 - } else { mult <- 1} - ar <- as(ar, "Raster") - - # --- # - val <- c("Current range", "Future range", "Unsuitable", - "Loss", "Gain", "Stable", "Percent loss", - "Percent gain", "Range change", "Percent change", - "Sorensen index", "Centroid distance", "Centroid change direction") - change <- data.frame(category = val, - period = c(times[1] |> as.character(), - times[length(times)] |> as.character(), rep(paste0(times_length, " years"), 11 ) ), - value = NA, - unit = c(rep(ar_unit,6), "%", "%", ar_unit, "%", "similarity", NA, "deg")) - change$value[1] <- raster::cellStats((current) * raster::area(current), "sum") * mult - change$value[2] <- raster::cellStats((future) * raster::area(future), "sum") * mult - - # Check that is binary thresholded - rr <- raster::overlay(current, future, fun = function(x, y){x + y * 2}) - change$value[3] <- raster::cellStats((rr == 0) * raster::area(current), "sum") * mult - change$value[4] <- raster::cellStats((rr == 1) * raster::area(current), "sum") * mult - change$value[5] <- raster::cellStats((rr == 2) * raster::area(current), "sum") * mult - change$value[6] <- raster::cellStats((rr == 3) * raster::area(current), "sum") * mult - change$value[7] <- change$value[4] / change$value[1] * 100 - change$value[8] <- change$value[5] / change$value[1] * 100 - change$value[9] <- change$value[2] - change$value[1] - change$value[10] <- change$value[9] / sum(c(change$value[3], change$value[4])) * 100 - - # Sorensen similarity index - change$value[11] <- 2 * raster::cellStats(rr == 3, "sum") / (raster::cellStats(current, "sum") + raster::cellStats(future, "sum")) - - # Calculate distance between centroids - sf1 <- calculate_range_centre(current, spatial = TRUE) - sf2 <- calculate_range_centre(future, spatial = TRUE) - dis <- sf::st_distance(sf1, sf2, by_element = FALSE) - dis_unit <- units::deparse_unit(dis) - # Convert units if meter - if( dis_unit == "m") {mult <- 0.001; dis_unit = "km" } else { mult <- 1} - change$value[12] <- as.vector(dis) * mult - change$unit[12] <- dis_unit - - # Calculate direction between centroids - change$value[13] <- geosphere::finalBearing(as_Spatial(sf1 |> sf::st_transform(crs = sf::st_crs(4326))), - as_Spatial(sf2 |> sf::st_transform(crs = sf::st_crs(4326)))) - - change <- change |> tibble::as_tibble() - return(change) -} - -#' Crop and project a stars raster `HACK` -#' -#' @description -#' The reprojection of WGS84 currently fails due to some unforeseen bug. -#' This function is meant to reproject back the lasyer -#' @param obj A ['stars'] object to be clipped and cropped. -#' @param template A ['Raster'] or ['sf'] object to which the object should be projected. 
-#' @keywords internal, scenario -#' @noRd -hack_project_stars <- function(obj, template){ - assertthat::assert_that( - inherits(obj, "stars"), - is.Raster(template) || inherits(template, "sf") - ) - # Get tempdir - td <- raster::tmpDir() - - # Get resolution - bg <- stars::st_as_stars(template) - - # Get full dis - full_dis <- stars::st_dimensions(obj) - assertthat::assert_that(length(full_dis)<=3,msg = "Stars object can only have x,y,z dimension.") - - # Output - out <- c() - for(v in names(obj)){ - sub <- obj[v] - stars::write_stars(sub, file.path(td, "ReprojectedStars.tif")) - - suppressWarnings( - gdalUtils::gdalwarp(srcfile = file.path(td, "ReprojectedStars.tif"), - dstfile = file.path(td, "ReprojectedStars_temp.tif"), - s_srs = "EPSG:4296", - tr = raster::res(template), - te = raster::bbox(template), - t_srs = sp::proj4string(template)) - ) - oo <- stars::read_stars(file.path(td, "ReprojectedStars_temp.tif"),proxy = F) - names(oo) <- v # Rename - - # provide to output - out <- c(out, oo) - rm(oo) - try({file.remove(file.path(td, "ReprojectedStars.tif"), - file.path(td, "ReprojectedStars_temp.tif"))},silent = TRUE) - } - # Reformat again - out <- stars::st_as_stars(out) - assertthat::assert_that( - length(stars::st_get_dimension_values(bg, "x")) == length(stars::st_get_dimension_values(out, "x")) - ) - # Now reset the dimensions and add to output - dims <- stars::st_dimensions(out) - # Replace the band variable with the original one - names(dims)[3] <- "time" - dims$time <- full_dis$time - # And the x-y dimensions by the template values - bg_dim <- stars::st_dimensions(bg) - dims$x <- bg_dim$x; dims$y <- bg_dim$y - stars::st_dimensions(out) <- dims - out <- stars::st_set_dimensions(out, xy = c("x","y")) - assertthat::assert_that( - length(out) == length(obj), - stars:::is_regular_grid(out) - ) - return(out) -} - -#' Quick handy function to calculate an area-weighted centre of a range -#' -#' @param layer A [`RasterLayer`] or [`sf`] object for which the centre of the range is to be calculated. -#' If the distribution is continuous, then the centre is calculated as the value centre to all non-NA values. -#' @param spatial A [`logical`] of whether outputs should be returned as spatial -#' @keywords scenario, internal -#' @noRd -calculate_range_centre <- function(layer, spatial = TRUE) { - assertthat::assert_that( - is.Raster(layer) || inherits(layer, "sf") - ) - - # If layer is a raster - if(is.Raster(layer)){ - assertthat::assert_that( - length( unique(layer) ) == 2, - raster::cellStats(layer, 'max') == 1 - ) - # Calculate area-weighted centre - r_wt <- raster::area(layer) - values(r_wt)[is.na(values(layer))] <- NA - - # Make a spatial point layer - spdf <- raster::rasterToPoints( raster::stack(layer, r_wt), spatial = TRUE) |> sf::st_as_sf() - spdf <- spdf[which(spdf[[1]]>0), ] # Get only non-zero values - - if(is.na(sf::st_crs(spdf))) stop("Unprojected layer found. 
Check projections throughout!") - # If long-latitude, convert to google mercator for calculating the centroids - if(sf::st_is_longlat(spdf) ){ - ori.proj <- sf::st_crs(spdf) - spdf <- sf::st_transform( spdf, crs = sf::st_crs(3857)) - } else { ori.proj <- sf::st_crs(spdf) } - - p <- sf::st_drop_geometry(spdf[, names(spdf)[2] ])[,1] - # Calculate weighted centroid - Xw <- sum(sf::st_coordinates(spdf)[,1] * p) - Yw <- sum(sf::st_coordinates(spdf)[,2] * p) - wX <- Xw/sum(p) - wY <- Yw/sum(p) - xy <- data.frame(ID = 1, name = names(layer), X=wX, Y=wY) - cent <- sf::st_as_sf(xy, coords = c("X", "Y"), - crs = sf::st_crs(spdf), agr = "constant") - # Convert back to original projection - cent <- sf::st_transform(cent, ori.proj) - - } else { - if(is.na(sf::st_crs(layer))) stop("Unprojected layer found. Check projections throughout!") - # If long-latitude, convert to google mercator for calculating the centroids - if(sf::st_is_longlat(layer) ){ - ori.proj <- sf::st_crs(layer) - layer <- sf::st_transform( layer, crs = sf::st_crs(3857)) - } else { ori.proj <- sf::st_crs(layer) } - - if(unique(sf::st_geometry_type(layer)) %in% c("POLYGON", "MULTIPOLYGON")){ - # Cast them into a multi-polygon - cent <- sf::st_combine(layer) |> sf::st_centroid() |> sf::st_as_sf() - } else if(unique(sf::st_geometry_type(layer)) %in% c("POINT", "MULTIPOINT")){ - cent <- sf::st_combine(layer) |> sf::st_centroid() |> sf::st_as_sf() - } else { - stop("Centroid calculations not implemented!") - } - # Convert back to original projection - cent <- sf::st_transform(cent, ori.proj) - cent$ID = 1 - } - - if(!spatial){ - cent$X <- sf::st_coordinates(cent)[,1] - cent$Y <- sf::st_coordinates(cent)[,2] - cent <- sf::st_drop_geometry(cent) - } - return(cent) -} +#' Approximate missing time steps between dates +#' +#' @description +#' This function linearly approximates shares between time steps, so that gaps for instance +#' between 2010 and 2020 are filled with data for 2010, 2011, 2012, etc. +#' @param env A [`stars`] object. +#' @param date_interpolation [`character`] on how missing dates between events should be interpolated. See [`project()`]. +#' @return [`logical`] indicating if the two [`Raster-class`] objects have the same +#' @keywords scenario +#' @noRd +approximate_gaps <- function(env, date_interpolation = "annual"){ + assertthat::assert_that( + inherits(env, "stars"), + is.character(date_interpolation) + ) + check_package("dplyr") + date_interpolation <- match.arg(date_interpolation, c("none", "yearly", "annual", "monthly", "daily"), several.ok = FALSE) + if(date_interpolation=="none") return(env) + + stop("Still in progress") + # --- # + # Get individual time steps at interval + times <- stars::st_get_dimension_values(env, which = names(dim(env))[3], center = TRUE) + times <- to_POSIXct(times) + tzone <- attr(as.POSIXlt(times), "tzone")[2] # Get timezone + assertthat::assert_that(tzone != "", length(times)>=2) + # Interpolate time steps + inc <- switch (date_interpolation, + "yearly" = "year", + "annual" = "year", + "monthly" = "month", + "daily" = "day" + ) + new_times <- seq.Date(from = as.Date(times[1],tz = tzone), to = as.Date(times[length(times)],tz = tzone), by = inc) + new_times <- to_POSIXct(new_times) + + # Linearly approximate all attributes for new object + # FIXME: Probably terribly memory inefficient but works + # MH: Should this be stars:::as.data.frame.stars? 
+  new <- as.data.frame(env)
+  assertthat::assert_that(assertthat::has_name(new,c("x","y","time")))
+  new <- dplyr::right_join(new, expand.grid(x = unique(new$x), y = unique(new$y), time = new_times),
+                           by = c("x", "y","time"))
+  # Sort by time
+  new <- new[order(new$time),]
+  # Now linearly interpolate the missing values per grid cell
+  new2 <- apply(new[,4:ncol(new)], 2, function(z){
+    # if(inherits(z, "POSIXct")) return(z)
+    if(all(is.na(z))) return(z)
+    stats::approx(y = z, x = as.numeric(new$time), method = "linear")
+  })
+
+  # Steps:
+  # empty_stars
+  # Join with existing one
+  # approxNA
+
+  tt <- as.numeric(new_times)
+  # Calc pixel-wise linear slope
+  out <- stars::st_apply(
+    env,
+    1:2,
+    function(x) {
+      if (anyNA(x))
+        NA_real_
+      else
+        stats::lm.fit(cbind(1, tt), x)$coefficients[2]
+    }
+  )
+
+  # new <- stars::st_redimension(out, along = list(time = new_times))
+}
+
+#' Aggregate stars variables across dimensions
+#'
+#' @description
+#' Small helper function that acts as a wrapper to combine
+#' 2 or more variables in a `stars` object together.
+#' @note Currently only works via matrix manipulation
+#' @param obj A [`stars`] object or a [`list`] that can be coerced to one.
+#' @param vars A [`vector`] describing the variables to be combined. Has to be of
+#' length two or greater.
+#' @param newname A [`character`] with the new name for the variable.
+#' @param fun A function specifying how the respective layers should be combined.
+#' @examples
+#' \dontrun{
+#' st_reduce(obj, vars = c('forestShare', 'forestShare.2'),
+#'           newname = "forest", fun = "sum")
+#' }
+#' @keywords scenario, internal
+st_reduce <- function(obj, vars, newname, fun = 'sum'){
+  assertthat::assert_that(
+    is.list(obj) || inherits(obj, 'stars'),
+    is.vector(vars) && is.character(vars),
+    length(vars) >= 2,
+    is.character(fun),
+    is.character(newname)
+  )
+  fun <- match.arg(fun, c("sum", "multiply", "divide", "subtract", "mean"), several.ok = FALSE)
+  check_package('stars')
+  # Convert to stars if not already
+  if(!inherits(obj, 'stars')) obj <- stars::st_as_stars(obj)
+  # Check that variables are present
+  assertthat::assert_that(
+    all(vars %in% names(obj))
+  )
+  # Future?
+  if(foreach::getDoParRegistered()){
+    ibis_future(cores = getOption("ibis.nthread"), strategy = getOption("ibis.futurestrategy"))
+    fut <- TRUE
+  } else { fut <- FALSE }
+  # --- #
+  # First get all target variables and non-target variables
+  target <- stars:::select.stars(obj, vars)
+  non_target <- stars:::select.stars(obj, -vars)
+
+  # Now apply the function on the target
+  # FIXME: Not working as intended and horribly slow
+  # target <- stars::st_apply(target, dims, fun,
+  #                           PROGRESS = TRUE,
+  #                           FUTURE = fut,
+  #                           ...
+  # )
+  what <- switch (fun,
+                  "sum" = "+",
+                  "multiply" = "*",
+                  "divide" = "/",
+                  "subtract" = "-",
+                  "mean" = "+"
+  )
+  new <- stars::st_as_stars( Reduce(what, target) )
+  # Divide the summed layers by their number to obtain the mean
+  if(fun == "mean") new <- new / length(target)
+  stars::st_dimensions(new) <- stars::st_dimensions(target)
+  # Rename to newname
+  names(new) <- newname
+
+  # Combine the result again with non-target
+  out <- c(non_target, new)
+  rm(new, non_target, target)
+
+  return(
+    out
+  )
+}
+
+#' Converts a stars object to a list of rasters
+#'
+#' @description
+#' This is a small helper function to convert a [`stars`] object
+#' to a [`Raster`] object. It is possible to select the time frame as well.
+#' If multiple \code{"which"} entries are specified, then a [`list`] will be returned.
+#' @param obj A [`stars`] object with a \code{"time"} dimension at least.
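+#' The third dimension is assumed to be the time dimension.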
+#' @param which The time entry to use for subsetting. Can be a single [`numeric`] or a [`vector`]
+#' of numeric time entries corresponding to the time dimension (Default: \code{NULL}).
+#' @param template An optional [`Raster`] template to which the output should be aligned.
+#' @returns A [`list`] containing [`Raster`] objects.
+#' @keywords scenario, internal
+stars_to_raster <- function(obj, which = NULL, template = NULL){
+  assertthat::assert_that(
+    inherits(obj, 'stars'),
+    is.null(which) || is.numeric(which),
+    is.null(template) || is.Raster(template)
+  )
+  # Take name of third band, assuming it to be time
+  time_band <- names(dim(obj))[3]
+
+  assertthat::assert_that(
+    length(which) <= dim(obj)[time_band]
+  )
+  # Get time dimension and correct if specific entries are requested
+  times <- stars::st_get_dimension_values(obj, time_band, center = TRUE)
+  if(is.null(which)) {
+    which <- 1:length(times) # Use default length
+  }
+
+  # Output type raster
+  out <- list()
+  for(tt in which){
+    # Slice to a specific time frame for each
+    o <- obj |> stars:::slice.stars({{time_band}}, tt) |>
+      methods::as("Raster")
+
+    # Reset times to the correct ones
+    o <- raster::setZ(o, rep(times[tt], raster::nlayers(o)))
+
+    # Now transform the output if a template is set
+    if(!is.null(template)){
+      if(is.Raster(template)){
+        # Check again if necessary to rotate
+        if(!raster::compareCRS(o, template)){
+          o <- raster::projectRaster(from = o, crs = template, method = "ngb")
+          names(o) <- names(obj)
+        }
+        # Now crop and resample to target extent if necessary
+        if(!compareRaster(o, template, stopiffalse = FALSE)){
+          o <- raster::crop(o, template)
+          o2 <- try({alignRasters(data = o,
+                                  template = template,
+                                  method = "ngb",
+                                  func = "mean", cl = FALSE)
+          },silent = TRUE)
+          if(inherits(o2,"try-error")){
+            o <- raster::resample(o, template,
+                                  method = "ngb")
+          } else { o <- o2; rm(o2)}
+        }
+      }
+    } # End of template adjustments
+    out[[paste0("time",times[tt])]] <- o
+  }
+  return( out )
+}
+
+#' Converts a raster object to stars
+#'
+#' @description
+#' This is a small helper function to convert a [`Raster`] object to a [`stars`] object.
+#' @param obj A [`Raster`] object with a \code{"time"} dimension at least (checked via [`getZ`]).
+#' @returns A [`stars`] object with the formatted data.
+#' @seealso `stars_to_raster`
+#' @keywords scenario, internal
+raster_to_stars <- function(obj){
+  assertthat::assert_that(
+    is.Raster(obj)
+  )
+  # Check that the time dimension exists
+  assertthat::assert_that( !is.null( raster::getZ(obj) ),
+                           msg = "The supplied object requires a z dimension! Preferably provide a stars object.")
+  assertthat::assert_that(!is.na(raster::crs(obj)),
+                          msg = "Uniform projection for input raster is missing!")
+
+  # Get time dimension
+  times <- raster::getZ(obj)
+  if(!all(inherits(times,"Date"))) times <- as.Date(times)
+  prj <- sf::st_crs(raster::crs(obj))
+
+  # Convert to RasterStack and reset time dimension
+  obj <- raster::stack(obj)
+  obj <- raster::setZ(obj, times)
+  # stars::make_intervals(times[1], times[2]) # For making intervals from start to end
+
+  # Convert to stars step by step
+  new_env <- list()
+  for(i in 1:raster::nlayers(obj)){
+    suppressWarnings( o <- stars::st_as_stars(obj[[i]]) )
+    # If CRS is NA
+    if(is.na(sf::st_crs(o))) sf::st_crs(o) <- prj
+
+    # Some hacky stuff since stars is not behaving as intended
+    dims <- stars::st_dimensions(o)
+    dims$time <- stars:::create_dimension(values = times[i])
+    o <- stars::st_redimension(o, new_dims = dims)
+
+    new_env[[names(obj)[i]]] <- o
+  }
+
+  new_env <- do.call(stars:::c.stars, new_env)
+  assertthat::assert_that(inherits(new_env, "stars"),
+                          stars::st_dimensions(new_env) |> length() == 3)
+
+  return(new_env)
+}
+
+#' Add layers from a RasterStack to a stars object
+#'
+#' @description
+#' Often it is necessary to add static variables to existing stars objects.
+#' These will be replicated across the time dimension. This function is a small helper
+#' that allows the addition of said raster stacks to a stars object.
+#' @param obj A [`stars`] object with a time dimension (\code{"time"}).
+#' @param new A [`RasterStack`] object with additional covariates to be added.
+#' @returns A [`stars`] object with the names of the [`Raster`] object added.
+#' @keywords scenario, internal
+st_add_raster <- function(obj, new){
+  assertthat::assert_that(
+    inherits(obj, "stars"),
+    is.Raster(new),
+    raster::nlayers(new) >= 1
+  )
+
+  # Check whether any of the new variables are already in the stars object; if so, drop them
+  if(any(names(new) %in% names(obj))){
+    myLog("[Starting]", "yellow", "Duplicate variables in stars and new objects.")
+    new <- raster::dropLayer(new, which( names(new) %in% names(obj) ) )
+  }
+
+  full_dims <- stars::st_dimensions(obj)
+  # Get times objects
+  time_name <- names(full_dims)[3]
+  times <- rep(stars::st_get_dimension_values(obj, time_name))
+
+  # Now loop through each layer and add it to the target object
+  for(lyr in names(new)){
+    s <- raster::stack(replicate(length(times), new[[lyr]])) |>
+      stars::st_as_stars()
+    names(s) <- lyr
+
+    stars::st_dimensions(s) <- full_dims
+    obj <- c(obj, s)
+  }
+  assertthat::assert_that(
+    all(names(new) %in% names(obj))
+  )
+  return(obj)
+}
+
+#' Summarize results from scenario projection object
+#'
+#' @description
+#' This is a wrapper function to summarize the output of a scenario projection. The
+#' output will contain the average change in the layer per time step.
+#' A parameter called \code{"relative"} can be set to calculate relative change instead.
+#' @param scenario A [`stars`] object with a time dimension.
+#' @param relative A [`logical`] flag whether to calculate relative changes instead.
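+#' @param fun A [`character`] specifying the summary statistic, either \code{"mean"} or \code{"sum"} (Default: \code{"mean"}).
+#' @examples
+#' \dontrun{
+#' # Hypothetical use: relative mean change per time step of a projection 'sc'
+#' summarise_projection(sc, fun = "mean", relative = TRUE)
+#' }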
+#' @keywords internal, scenario
+#' @noRd
+summarise_projection <- function(scenario, fun = "mean", relative = TRUE){
+  assertthat::assert_that(
+    is.list(scenario) || inherits(scenario, "stars"),
+    length(dim(scenario))==3,
+    is.logical(relative)
+  )
+  fun <- match.arg(fun, c("mean", "sum"), several.ok = FALSE)
+
+  # Convert the scenario to a data.frame
+  df <- stars:::as.data.frame.stars(stars::st_as_stars(scenario)) |> (\(.) subset(., stats::complete.cases(.)))()
+  names(df) <- c("x", "y", "band", "suitability")
+  # Add grid cell grouping
+  df <- df |> dplyr::group_by(x,y) |> dplyr::mutate(id = dplyr::cur_group_id()) |>
+    dplyr::ungroup() |> dplyr::select(-x,-y) |>
+    dplyr::arrange(id, band)
+
+  # Summarize the overall moments
+  if(fun == "mean"){
+    # Check if values have a unit, if so deparse
+    if(inherits(df$suitability, 'units')) df$suitability <- as.numeric(df$suitability)
+    out <- df |>
+      dplyr::filter(suitability > 0) |>
+      dplyr::group_by(band) |>
+      dplyr::summarise(suitability_mean = mean(suitability, na.rm = TRUE),
+                       suitability_q25 = quantile(suitability, .25),
+                       suitability_q50 = quantile(suitability, .5),
+                       suitability_q75 = quantile(suitability, .75))
+    # Total amount of area lost / gained / stable since previous time step
+    totchange_occ <- df |>
+      dplyr::group_by(id) |>
+      dplyr::mutate(change = (suitability - dplyr::lag(suitability)) ) |> dplyr::ungroup()
+    o <- totchange_occ |> dplyr::group_by(band) |>
+      dplyr::summarise(suitability_avggain = mean(change[change > 0]),
+                       suitability_avgloss = mean(change[change < 0]))
+
+    out <- out |> dplyr::left_join(o, by = "band")
+    if(relative){
+      # Finally calculate relative change to baseline (first entry) for all entries where this is possible
+      relChange <- function(v, fac = 100) (((v - v[1]) / v[1]) * fac)
+      out[,c("suitability_mean","suitability_q25", "suitability_q50", "suitability_q75")] <- apply(
+        out[,c("suitability_mean","suitability_q25", "suitability_q50", "suitability_q75")], 2, relChange)
+    }
+  } else if(fun == "sum") {
+    # Check if values have a unit, if so deparse
+    if(inherits(df$suitability, 'units')) df$suitability <- as.numeric(df$suitability)
+    out <- df |>
+      dplyr::filter(suitability > 0) |>
+      dplyr::group_by(band) |>
+      dplyr::summarise(suitability_sum = sum(suitability, na.rm = TRUE),
+                       suitability_q25 = quantile(suitability, .25),
+                       suitability_q50 = quantile(suitability, .5),
+                       suitability_q75 = quantile(suitability, .75))
+    # Total amount of area lost / gained / stable since previous time step
+    totchange_occ <- df |>
+      dplyr::group_by(id) |>
+      dplyr::mutate(change = (suitability - dplyr::lag(suitability)) ) |> dplyr::ungroup()
+    o <- totchange_occ |> dplyr::group_by(band) |>
+      dplyr::summarise(suitability_avggain = sum(change[change > 0]),
+                       suitability_avgloss = sum(change[change < 0]))
+
+    out <- out |> dplyr::left_join(o, by = "band")
+    if(relative){
+      # Finally calculate relative change to baseline (first entry) for all entries where this is possible
+      relChange <- function(v, fac = 100) (((v - v[1]) / v[1]) * fac)
+      out[,c("suitability_sum","suitability_q25", "suitability_q50", "suitability_q75")] <- apply(
+        out[,c("suitability_sum","suitability_q25", "suitability_q50", "suitability_q75")], 2, relChange)
+    }
+  }
+
+  # Return output
+  return(out)
+}
+
+#' Summarize change from before to after
+#'
+#' @description
+#' This is a wrapper function to summarize the output of a scenario projection, but specifically
+#' calculates statistics of change for two time steps, a before and after step.
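+#' Both the first and the last time step are expected to contain binary
+#' (thresholded) layers, from which the area, overlap and centroid statistics are calculated.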
+#' @param scenario A [`stars`] object with a time dimension.
+#' @references
+#' * Godsoe, W. (2014). Inferring the similarity of species distributions using Species’ Distribution Models. Ecography, 37(2), 130-136.
+#' @keywords internal, scenario
+#' @noRd
+summarise_change <- function(scenario){
+  assertthat::assert_that(
+    inherits(scenario, "stars")
+  )
+  check_package("geosphere")
+
+  # Get the current and future
+  ss <- stars_to_raster(scenario)
+  # Time period
+  times <- stars::st_get_dimension_values(scenario, 3, center = TRUE)
+  current <- ss[[1]]
+  future <- ss[[length(ss)]]
+  times_length <- round(as.numeric(difftime(times[length(times)], times[1], units = "weeks"))/52.25, 0)
+  rm(ss)
+
+  # Calculate the area and units
+  ar <- st_area(scenario)
+  ar_unit <- units::deparse_unit(ar$area)
+  if(ar_unit == "m2"){
+    ar_unit <- "ha"
+    mult <- 0.0001
+  } else { mult <- 1}
+  ar <- methods::as(ar, "Raster")
+
+  # --- #
+  val <- c("Current range", "Future range", "Unsuitable",
+           "Loss", "Gain", "Stable", "Percent loss",
+           "Percent gain", "Range change", "Percent change",
+           "Sorensen index", "Centroid distance", "Centroid change direction")
+  change <- data.frame(category = val,
+                       period = c(times[1] |> as.character(),
+                                  times[length(times)] |> as.character(), rep(paste0(times_length, " years"), 11 ) ),
+                       value = NA,
+                       unit = c(rep(ar_unit,6), "%", "%", ar_unit, "%", "similarity", NA, "deg"))
+  change$value[1] <- raster::cellStats((current) * raster::area(current), "sum") * mult
+  change$value[2] <- raster::cellStats((future) * raster::area(future), "sum") * mult
+
+  # Cross-tabulate the two binary layers (assumes thresholded input)
+  rr <- raster::overlay(current, future, fun = function(x, y){x + y * 2})
+  change$value[3] <- raster::cellStats((rr == 0) * raster::area(current), "sum") * mult
+  change$value[4] <- raster::cellStats((rr == 1) * raster::area(current), "sum") * mult
+  change$value[5] <- raster::cellStats((rr == 2) * raster::area(current), "sum") * mult
+  change$value[6] <- raster::cellStats((rr == 3) * raster::area(current), "sum") * mult
+  change$value[7] <- change$value[4] / change$value[1] * 100
+  change$value[8] <- change$value[5] / change$value[1] * 100
+  change$value[9] <- change$value[2] - change$value[1]
+  change$value[10] <- change$value[9] / sum(c(change$value[3], change$value[4])) * 100
+
+  # Sorensen similarity index
+  change$value[11] <- 2 * raster::cellStats(rr == 3, "sum") / (raster::cellStats(current, "sum") + raster::cellStats(future, "sum"))
+
+  # Calculate distance between centroids
+  sf1 <- calculate_range_centre(current, spatial = TRUE)
+  sf2 <- calculate_range_centre(future, spatial = TRUE)
+  dis <- sf::st_distance(sf1, sf2, by_element = FALSE)
+  dis_unit <- units::deparse_unit(dis)
+  # Convert units if in meters
+  if( dis_unit == "m") {mult <- 0.001; dis_unit = "km" } else { mult <- 1}
+  change$value[12] <- as.vector(dis) * mult
+  change$unit[12] <- dis_unit
+
+  # Calculate direction between centroids
+  change$value[13] <- geosphere::finalBearing(as_Spatial(sf1 |> sf::st_transform(crs = sf::st_crs(4326))),
+                                              as_Spatial(sf2 |> sf::st_transform(crs = sf::st_crs(4326))))
+
+  change <- change |> tibble::as_tibble()
+  return(change)
+}
+
+#' Crop and project a stars raster `HACK`
+#'
+#' @description
+#' The reprojection from WGS84 currently fails due to an unresolved bug.
+#' This function is meant to reproject the layer back.
+#' @param obj A [`stars`] object to be clipped and cropped.
+#' @param template A [`Raster`] or [`sf`] object to which the object should be projected.
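+#' @returns A [`stars`] object warped and cropped to the template's grid.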
+#' @keywords internal, scenario +#' @noRd +hack_project_stars <- function(obj, template){ + assertthat::assert_that( + inherits(obj, "stars"), + is.Raster(template) || inherits(template, "sf") + ) + # Get tempdir + td <- raster::tmpDir() + + # Get resolution + bg <- stars::st_as_stars(template) + + # Get full dis + full_dis <- stars::st_dimensions(obj) + assertthat::assert_that(length(full_dis)<=3,msg = "Stars object can only have x,y,z dimension.") + + # Output + out <- c() + for(v in names(obj)){ + sub <- obj[v] + stars::write_stars(sub, file.path(td, "ReprojectedStars.tif")) + + suppressWarnings( + gdalUtils::gdalwarp(srcfile = file.path(td, "ReprojectedStars.tif"), + dstfile = file.path(td, "ReprojectedStars_temp.tif"), + s_srs = "EPSG:4296", + tr = raster::res(template), + te = raster::bbox(template), + t_srs = sp::proj4string(template)) + ) + oo <- stars::read_stars(file.path(td, "ReprojectedStars_temp.tif"),proxy = F) + names(oo) <- v # Rename + + # provide to output + out <- c(out, oo) + rm(oo) + try({file.remove(file.path(td, "ReprojectedStars.tif"), + file.path(td, "ReprojectedStars_temp.tif"))},silent = TRUE) + } + # Reformat again + out <- stars::st_as_stars(out) + assertthat::assert_that( + length(stars::st_get_dimension_values(bg, "x")) == length(stars::st_get_dimension_values(out, "x")) + ) + # Now reset the dimensions and add to output + dims <- stars::st_dimensions(out) + # Replace the band variable with the original one + names(dims)[3] <- "time" + dims$time <- full_dis$time + # And the x-y dimensions by the template values + bg_dim <- stars::st_dimensions(bg) + dims$x <- bg_dim$x; dims$y <- bg_dim$y + stars::st_dimensions(out) <- dims + out <- stars::st_set_dimensions(out, xy = c("x","y")) + assertthat::assert_that( + length(out) == length(obj), + stars:::is_regular_grid(out) + ) + return(out) +} + +#' Quick handy function to calculate an area-weighted centre of a range +#' +#' @param layer A [`RasterLayer`] or [`sf`] object for which the centre of the range is to be calculated. +#' If the distribution is continuous, then the centre is calculated as the value centre to all non-NA values. +#' @param spatial A [`logical`] of whether outputs should be returned as spatial +#' @keywords scenario, internal +#' @noRd +calculate_range_centre <- function(layer, spatial = TRUE) { + assertthat::assert_that( + is.Raster(layer) || inherits(layer, "sf") + ) + + # If layer is a raster + if(is.Raster(layer)){ + assertthat::assert_that( + length( unique(layer) ) == 2, + raster::cellStats(layer, 'max') == 1 + ) + # Calculate area-weighted centre + r_wt <- raster::area(layer) + values(r_wt)[is.na(values(layer))] <- NA + + # Make a spatial point layer + spdf <- raster::rasterToPoints( raster::stack(layer, r_wt), spatial = TRUE) |> sf::st_as_sf() + spdf <- spdf[which(spdf[[1]]>0), ] # Get only non-zero values + + if(is.na(sf::st_crs(spdf))) stop("Unprojected layer found. 
Check projections throughout!") + # If long-latitude, convert to google mercator for calculating the centroids + if(sf::st_is_longlat(spdf) ){ + ori.proj <- sf::st_crs(spdf) + spdf <- sf::st_transform( spdf, crs = sf::st_crs(3857)) + } else { ori.proj <- sf::st_crs(spdf) } + + p <- sf::st_drop_geometry(spdf[, names(spdf)[2] ])[,1] + # Calculate weighted centroid + Xw <- sum(sf::st_coordinates(spdf)[,1] * p) + Yw <- sum(sf::st_coordinates(spdf)[,2] * p) + wX <- Xw/sum(p) + wY <- Yw/sum(p) + xy <- data.frame(ID = 1, name = names(layer), X=wX, Y=wY) + cent <- sf::st_as_sf(xy, coords = c("X", "Y"), + crs = sf::st_crs(spdf), agr = "constant") + # Convert back to original projection + cent <- sf::st_transform(cent, ori.proj) + + } else { + if(is.na(sf::st_crs(layer))) stop("Unprojected layer found. Check projections throughout!") + # If long-latitude, convert to google mercator for calculating the centroids + if(sf::st_is_longlat(layer) ){ + ori.proj <- sf::st_crs(layer) + layer <- sf::st_transform( layer, crs = sf::st_crs(3857)) + } else { ori.proj <- sf::st_crs(layer) } + + if(unique(sf::st_geometry_type(layer)) %in% c("POLYGON", "MULTIPOLYGON")){ + # Cast them into a multi-polygon + cent <- sf::st_combine(layer) |> sf::st_centroid() |> sf::st_as_sf() + } else if(unique(sf::st_geometry_type(layer)) %in% c("POINT", "MULTIPOINT")){ + cent <- sf::st_combine(layer) |> sf::st_centroid() |> sf::st_as_sf() + } else { + stop("Centroid calculations not implemented!") + } + # Convert back to original projection + cent <- sf::st_transform(cent, ori.proj) + cent$ID = 1 + } + + if(!spatial){ + cent$X <- sf::st_coordinates(cent)[,1] + cent$Y <- sf::st_coordinates(cent)[,2] + cent <- sf::st_drop_geometry(cent) + } + return(cent) +} diff --git a/R/utils-spatial.R b/R/utils-spatial.R index 2797b106..6a1cbedf 100644 --- a/R/utils-spatial.R +++ b/R/utils-spatial.R @@ -1,1267 +1,1267 @@ -#' Are rasters comparable? -#' -#' This function checks if two `Raster-class` objects -#' are comparable. -#' -#' @param x [`Raster-class`] object. -#' @param y [`Raster-class`] object. -#' @keywords internal, utils -#' @return [`logical`] indicating if the two [`Raster-class`] objects have the same -#' resolution, extent, dimensionality, and coordinate system. -#' @noRd -is_comparable_raster <- function(x, y) { - assertthat::assert_that(inherits(x, "Raster"), inherits(y, "Raster")) && - sf::st_crs(x@crs) == sf::st_crs(y@crs) && - raster::compareRaster(x, y, crs = FALSE, res = TRUE, tolerance = 1e-5, - stopiffalse = FALSE) -} - -#' Do extents intersect? -#' -#' Verify if the extents of two spatial objects intersect or not. -#' -#' @param x [`Raster-class`], [`Spatial-class`] or [`sf::sf()`] object. -#' @param y [`Raster-class`], [`Spatial-class`] or [`sf::sf()`] object. -#' @keywords internal, utils -#' @return [`logical`]. -#' @noRd -intersecting_extents <- function(x, y) { - assertthat::assert_that( - inherits(x, c("Raster", "Spatial", "sf")), - inherits(y, c("Raster", "Spatial", "sf"))) - isTRUE(sf::st_intersects( - sf::st_as_sf(methods::as(raster::extent(x), "SpatialPolygons")), - sf::st_as_sf(methods::as(raster::extent(y), "SpatialPolygons")), - sparse = FALSE)[[1]]) -} - - -#' Extract polygon data from intersecting point data -#' @param poly A [sf] object. -#' @param points A [`Spatial`] or [sf] object. -#' @param coords A [vector] pointing to the coordinate columns. 
(Default: \code{c("x", "y")}) -#' @keywords utils, internal -#' @return An object with the spatial intersection -#' @noRd -point_in_polygon <- function(poly, points, coords = c('x','y')){ - assertthat::assert_that( - inherits(poly,'sf'), - inherits(points,'sf') || inherits(points,'Spatial') || inherits(points,'data.frame'), - length(coords)>0 - ) - # Convert to sf - points <- sf::st_as_sf(points, coords = coords, crs = sf::st_crs(poly)) - assertthat::assert_that( - sf::st_crs(poly) == sf::st_crs(points) - ) - - # Parallize if number of points large and allowed - if(getOption("ibis.runparallel") && nrow(points) > 5000){ - # Within test - # Paralise any simple features analysis. - # @source https://www.spatialanalytics.co.nz/post/2018/04/01/fixing-st-par/ - st_parallel <- function(sf_df, sf_func, n_cores, ...){ - - # Create a vector to split the data set up by. - split_vector <- rep(1:n_cores, each = nrow(sf_df) / n_cores, length.out = nrow(sf_df)) - - # FIXME: - # MC.cores does not work properly on windows. To be replaced with future - if(Sys.info()['sysname']=="Windows") n_cores <- 1 - # Perform GIS analysis - split_results <- split(sf_df, split_vector) %>% - parallel::mclapply(function(x) sf_func(x, ...), mc.cores = n_cores) - - # Define the output_class. If length is greater than two, then grab the second variable. - output_class <- class(split_results[[1]]) - if (length(output_class) == 2){ - output_class <- output_class[2] - } - - # Combine results back together. Method of combining depends on the output from the function. - if (output_class == "matrix"){ - result <- do.call("rbind", split_results) - names(result) <- NULL - } else if (output_class == "sfc") { - result <- do.call("c", split_results) - result <- sf_func(result) # do.call combines the list but there are still n_cores of the geometry which had been split up. Running st_union or st_collect gathers them up into one, as is the expected output of these two functions. - } else if (output_class %in% c('list', 'sgbp') ){ - result <- do.call("c", split_results) - names(result) <- NULL - } else if (output_class == "data.frame" ){ - result <- do.call("rbind", split_results) - } else { - stop("Unknown class. st_parallel only accepts the following outputs at present: sfc, list, sf, matrix, sgbp.") - } - - # Return result - return(result) - } - ov <- st_parallel(points, function(x) sf::st_join(x, poly, join = st_within), getOption("ibis.nthread")) - } else { - # Within test - ov <- suppressMessages( sf::st_join(points, poly, join = st_within) ) - } - return(ov) -} - -#' Create mask based on a zonal layer -#' -#' @description -#' This function has options to create a mask based on provided point data. It is identical in functionality to -#' the parameter \code{'limit'} in `train()`. Currently it has two available options: -#' -#' [*] It is either possible to provide a categorical zonal raster layer takes available point data and intersects it with a -#' zonal layer. The output is a [`RasterLayer`] object with only those classes in which a point occurrence fell. -#' Typical example for instance is layer of the distribution of Biomes, Ecoregions or Climatic Zones. -#' -#' [*] Buffer, in which case a buffer in the units of the geographic projection are created. Buffer width have to -#' be supplied as non-NULL parameter to \code{'buffer_width'}. The output mask thus allows to limit the prediction -#' to a spatial buffer of provided extent within any geovalid occurrence record. -#' -#' @param df A [`sf`] object with point information. 
-#' @param zones A [`sf`] or [`RasterLayer`] object with polygons of the zones to be used for occurrence masking. -#' @param buffer_width A [`numeric`] value specifying the buffer width. Ignored if a Zones layer is provided. -#' @param column A [`character`] giving the column in which zonal ids are found. Only used when zones is of -#' type [`sf`]. (Default: \code{"limits"}). -#' @param template An optional [`RasterLayer`] object on which which the zones should be rasterized (Default: \code{NULL}). -#' @returns A [`sf`] or [`RasterLayer`] object. -#' @keywords utils, internal -#' @noRd -create_zonaloccurrence_mask <- function(df, zones = NULL, buffer_width = NULL, column = "limits", template = NULL){ - assertthat::assert_that( - inherits(df, "sf"), - unique(sf::st_geometry_type(df)) %in% "POINT", - is.character(column), - is.null(zones) || (inherits(zones, "sf") || is.Raster(zones)), - is.null(buffer_width) || is.numeric(buffer_width), - is.null(template) || is.Raster(template), - # Can't have both set - !(is.null(zones) && is.null(buffer_width)) - ) - # Make zones mask - if(!is.null(zones)){ - # If zones is sf, check that it is of type polygon - if(inherits(zones, "sf")) assertthat::assert_that( all( unique(sf::st_geometry_type(zones)) %in% c("POLYGON", "MULTIPOLYGON") ) ) - - if(inherits(zones, "sf")){ - if(sf::st_crs(df)!=sf::st_crs(zones)){ - zones <- zones |> sf::st_transform(crs = sf::st_crs(df)) - } - - # Get zones from the limiting area, e.g. those intersecting with input - suppressMessages( - suppressWarnings( - zones <- sf::st_intersection(df, zones) - ) - ) - # Extract values from zonal raster layer - limit <- raster::extract(zones, df) |> unique() - - # Limit zones - zones <- subset(zones, limit %in% unique(zones[[column]]) ) - - # Finally rasterize if template is set - if(!is.null(template)) zones <- raster::rasterize(zones, template, field = column) - } else { - # Extract values from zonal raster layer - ex <- raster::extract(zones, df) |> unique() - # Remove NA if found - if(anyNA(ex)) ex <- ex[-which(is.na(ex))] - - # Now create copy of zonal raster and set all values other than ex to NA - new <- emptyraster(zones) - new[zones %in% ex] <- 1 - zones <- new - # Align with template if set - if(!is.null(template)){ - if(raster::compareRaster(zones, template,stopiffalse = FALSE)){ - zones <- raster::resample(zones, template, method = "ngb", func = raster::modal) - } - } - } - } else { - assertthat::assert_that( - is.Raster(template),msg = "A background layer has to be provided for this function to work!" 
- ) - # Buffer points width provided layer - suppressWarnings( - buf <- sf::st_buffer(x = df, dist = buffer_width, nQuadSegs = 50) - ) - # Rasterize - zones <- raster::rasterize(buf, background, field = 1, background = 0) - zones <- raster::mask(zones, background) - # Ratify - zones <- raster::ratify(zones) - } - return(zones) -} - -#' Converts a bounding box to a Well Known Text (WKT) polygon -#' -#' @param minx Minimum x value, or the most western longitude -#' @param miny Minimum y value, or the most southern latitude -#' @param maxx Maximum x value, or the most eastern longitude -#' @param maxy Maximum y value, or the most northern latitude -#' @param all A [`vector`] of length 4, with the elements: minx, miny, maxx, maxy -#' @return An object of class [`character`], a Well Known Text (WKT) string of the form -#' 'POLYGON((minx miny, maxx miny, maxx maxy, minx maxy, minx miny))' -#' @keywords internal, utils -#' @noRd -bbox2wkt <- function(minx=NA, miny=NA, maxx=NA, maxy=NA, bbox=NULL){ - if(is.null(bbox)) bbox <- c(minx, miny, maxx, maxy) - assertthat::assert_that(length(bbox)==4) #check for 4 digits - assertthat::assert_that(noNA(bbox)) #check for NAs - assertthat::assert_that(is.numeric(as.numeric(bbox))) #check for numeric-ness - - paste('POLYGON((', - sprintf('%s %s',bbox[1],bbox[2]), ',', sprintf('%s %s',bbox[3],bbox[2]), ',', - sprintf('%s %s',bbox[3],bbox[4]), ',', sprintf('%s %s',bbox[1],bbox[4]), ',', - sprintf('%s %s',bbox[1],bbox[2]), - '))', sep="") -} - -#' Expand an extent by a certain number -#' @param e an [`extent`] object -#' @param f [`numeric`] value to increase the extent (Default: \code{0.1}) -#' @keywords utils, internal -#' @return Returns the unified total [`extent`] object -#' @noRd -extent_expand <- function(e,f=0.1){ - assertthat::assert_that(inherits(e,'Extent')) - xi <- (e@xmax-e@xmin)*(f/2) - yi <- (e@ymax-e@ymin)*(f/2) - - xmin <- e@xmin-xi - xmax <- e@xmax+xi - ymin <- e@ymin-yi - ymax <- e@ymax+yi - - return(extent(c(xmin,xmax,ymin,ymax))) -} - -#' Helper function rename the geometry of a provided -#' -#' @param g A [`sf`] object containing some data. -#' @param name A [`character`] with the new name for the geometry. -#' @source https://gis.stackexchange.com/questions/386584/sf-geometry-column-naming-differences-r -#' @keywords internal, utils -#' @noRd -rename_geometry <- function(g, name){ - assertthat::assert_that( - inherits(g, "sf"), - is.character(name) - ) - current = attr(g, "sf_column") - if(current == name) return(g) - names(g)[names(g)==current] = name - sf::st_geometry(g)=name - g -} - -#' Convert a data.frame or tibble to simple features -#' -#' @description This function tries to guess the coordinate field and converts a data.frame -#' to a simple feature. -#' @param df A [`data.frame`], [`tibble`] or [`sf`] object. -#' @param geom_name A [`character`] indicating the name of the geometry column (Default: \code{'geometry'}). 
-#' @keywords internal, utils -#' @noRd -guess_sf <- function(df, geom_name = 'geometry'){ - assertthat::assert_that( - inherits(df,'data.frame') || inherits(df, 'sf') || inherits(df, 'tibble') - ) - # If sf, return immediately - if(inherits(df, 'sf')) return(df) - # If there is an attribute, but for some reason the file is not sf, use that one - if(!is.null(attr(df, "sf_column"))) { - df <- sf::st_as_sf(df) - if(attr(df, "sf_column") != geom_name){ - names(df)[which(names(df) == attr(df, "sf_column"))] <- geom_name - sf::st_geometry(df) <- geom_name - } - return(df) - } - # Commonly used column names - nx = c("x","X","lon","longitude") - ny = c("y", "Y", "lat", "latitude") - ng = c("geom", "geometry", "geometry") - - # Check if geom is present - if(any( ng %in% names(df) )){ - attr(df, "sf_column") <- ng[which(ng %in% names(df))] - df <- sf::st_as_sf(df) - } - # Finally check if any commonly used coordinate name exist - if(any( nx %in% names(df))){ - df <- sf::st_as_sf(df, coords = c(nx[which(nx %in% names(df))], - ny[which(ny %in% names(df))]) - ) - } - # If at this point df is still not a sf object, then it is unlikely to be converted - assertthat::assert_that(inherits(df, 'sf'), - msg = "Point object could not be converted to an sf object.") - if(attr(df, "sf_column") != geom_name){ - names(df)[which(names(df) == attr(df, "sf_column"))] <- geom_name - sf::st_geometry(df) <- geom_name - } - return(df) -} - -#' Kernel density estimation of coordinates -#' -#' @description -#' Takes input point coordinates as [`sf`] layer and estimates the Gaussian Kernel density over -#' a specified bandwidth for constructing a bivariate Gaussian kernel (see also [`MASS::kde2d()`]). -#' @details -#' Requires the `MASS` R-package to be installed! -#' @param points A \code{POINTS} [`sf`] object. -#' @param background A template [`Raster`] object describing the background. -#' @param bandwidth A [`numeric`] of the input bandwidth (Default \code{2}). -#' @returns A [`RasterLayer`] with the density of point observations. -#' @keywords utils, intenral -#' @noRd -st_kde <- function(points, background, bandwidth = 3){ - assertthat::assert_that( - inherits(points, "sf"), - is.numeric(bandwidth) - ) - check_package("MASS") - - # Get extent and cellsize - cellsize <- raster::res(background)[1] - extent_vec <- sf::st_bbox(background)[c(1,3,2,4)] - - n_y <- ceiling((extent_vec[4]-extent_vec[3])/cellsize) - n_x <- ceiling((extent_vec[2]-extent_vec[1])/cellsize) - - extent_vec[2] <- extent_vec[1]+(n_x*cellsize)-cellsize - extent_vec[4] <- extent_vec[3]+(n_y*cellsize)-cellsize - - # Make - coords <- sf::st_coordinates(points) - matrix <- MASS::kde2d(coords[,1],coords[,2], - h = bandwidth, n = c(n_x, n_y), lims = extent_vec) - out <- raster::raster(matrix) - - # Resample output for small point mismatches - if(!raster::compareRaster(out, background,stopiffalse = FALSE)){ - out <- raster::resample(out, background) - } - out <- raster::mask(out, background) - names(out) <- "kde__coordinates" - rm(matrix, coords) - return( out ) -} - -#' Polygon to points -#' -#' @description -#' Converts a polygon [`sf`] layer to a point layer by rasterizing it -#' over a provided Raster layer. -#' @param poly A \code{POLYGON} or \code{MULTIPOLYGON} [`sf`] object. -#' @param template A template [`Raster`] object. -#' @param field_occurrence A [`character`] specifying the occurrence field. Should contain information on the type. 
-#' @keywords utils, internal -#' @noRd -polygon_to_points <- function(poly, template, field_occurrence ) { - assertthat::assert_that( - inherits(poly, 'sf'), - is.Raster(template), - is.character(field_occurrence), - assertthat::has_name(poly, field_occurrence) - ) - - # Rasterize the polygon to - out <- raster::rasterize(poly, template, field = field_occurrence) - - # Construct new point data - co <- raster::xyFromCell(out, cell = which(!is.na(out[])) ) |> as.data.frame() - co[[field_occurrence]] <- out[!is.na(out[])] - co <- guess_sf(co) # Convert to sf and add coordinates - co$x <- sf::st_coordinates(co)[,1] - co$y <- sf::st_coordinates(co)[,2] - sf::st_crs(co) <- sf::st_crs(template) - - assertthat::assert_that(inherits(co, 'sf')) - return(co) -} - -#' Calculate the dimensions from a provided extent object -#' -#' @description Calculate the dimensions of an extent -#' (either an extent object or four-element vector in the right order), either in projected or spherical space -#' @param ex Either a [`vector`], a [`extent`] or alternatively a [`Raster`],[`Spatial*`] or [`sf`] object -#' @param lonlat A [`logical`] indication whether the extent is WGS 84 projection (Default: TRUE) -#' @param output_unit [`character`] determining the units. Allowed is 'm' and 'km' (Default: 'km') -#' @keywords utils, internal -#' @noRd -extent_dimensions <- function(ex, lonlat = TRUE, output_unit = 'km') { - assertthat::assert_that(inherits(ex, 'Extent') || inherits(ex, 'numeric') || inherits(ex, 'sf') || inherits(ex, 'Raster') || inherits(ex, 'Spatial'), - is.logical(lonlat), - is.character(output_unit) && output_unit %in% c('m','km')) - # Coerce to vector if necessary - if(is.Raster(ex)) ex <- raster::extent(ex) - if(is.vector(ex)) assertthat::assert_that(length(ex)==4, is.numeric(ex),msg = 'No valid extent object supplied!') - - # Convert to vector - ex <- switch(class(ex)[1], - Extent = as.vector(ex), - Raster = as.vector( raster::extent(ex) ), - sf = as.vector( raster::extent(ex) ), - numeric = ex - ) - # Rename the vector - names(ex) <- c("xmin", "xmax", "ymin", "ymax") - - # Procedures for longlat raster - if(lonlat) { - # Dimensions in degrees - height <- as.numeric( abs(diff(ex[1:2])) ) - width <- as.numeric( abs(diff(cos(ex[3:4]))) ) - # Scaling to get spherical surface area in km2 - scaling <- (6371 ^ 2 * pi) / 180 - surface_area <- width * height * scaling - - # Ratio between GCD height and width - # Get ratio between height and width in great circle distance, given an extent vector in lat/long - lonLatRatio <- function(extent) { - # lower left point - p1 <- matrix(extent[c(1, 3)], nrow = 1) - # upper left and lower right points - p2 <- rbind(extent[c(1, 4)], extent[c(2, 3)]) - # get ratio between distances - dists <- raster::pointDistance(p1,p2,lonlat = TRUE) - ratio <- dists[1] / dists[2] - return (ratio) - } - ratio <- lonLatRatio( as.vector(ex) ) - # calculate equivalent dimensions in km - w <- sqrt(surface_area / ratio) - dim <- c(w, w * ratio) - if(output_unit == 'm') dim * 1000 - } else { - # else assume a rectangle in m and convert to km - dim <- abs(diff(extent)[c(1, 3)]) - if(output_unit=='km'){ - dim <- dim * 0.1 ^ 3 - } - } - return(dim) -} - -#' Align a [`Raster-class`] object to another by harmonizing geometry and extend. -#' -#' If the data is not in the same projection as the template, the alignment -#' will be computed by reprojection only. 
If the data has already the same -#' projection, the data set will be cropped and aggregated prior to resampling -#' in order to reduce computation time. -#' -#' @param data [`Raster-class`] object to be resampled. -#' @param template [`Raster-class`] or [`Spatial-class`] object from which geometry can be extracted. -#' @param method method for resampling (Options: \code{"ngb"} or \code{"bilinear"}). -#' @param func function for resampling (Default: [mean]). -#' @param cl [`logical`] value if multicore computation should be used (Default: \code{TRUE}). -#' @keywords utils -#' @details -#' Nearest Neighbour resampling (ngb) is recommended for discrete and Bilinear -#' resampling for continuous data. -#' @return New [`Raster`] object aligned to the supplied template layer -#' @examples -#' \dontrun{ -#' # Align one raster to another -#' ras1 <- alignRasters( ras1, ras2, method = "ngb", cl = FALSE) -#' } -#' @export -alignRasters <- function(data, template, method = "bilinear",func = mean,cl = TRUE){ - # Security checks - assertthat::assert_that( - inherits(data,'Raster'), inherits(template, c("Raster", "Spatial", "sf")), - is.character(method), - is.logical(cl) - ) - method <- match.arg(method, c("bilinear", "ngb"),several.ok = FALSE) - - # Start cluster if necessary - if(cl) raster::beginCluster(parallel::detectCores()-1) - if(raster::projection(data) == raster::projection(template)){ - # Crop raster to template - data <- raster::crop(data, template, snap = "out") - if(inherits(template, "RasterLayer")){ - # Aggregate to minimal scale - if(data@ncols / template@ncols >= 2){ - factor <- floor(data@ncols/template@ncols) - data <- aggregate(data, fact = factor, fun = func, - expand=TRUE) - } - # Resample with target method - data <- raster::resample(data, template, method = method) - } - } else { - # Project Raster layer - data <- projectRaster(data, template, method = method) - } - # Stop cluster - if(cl) endCluster() - return(data) -} - -#' @title Create an empty \code{RasterLayer} based on a template -#' -#' @description -#' This function creates an empty copy of a provided \code{RasterLayer} object. It -#' is primarily used in the package to create the outputs for the predictions. -#' @param x a \code{Raster*} object corresponding. -#' @param ... other arguments that can be passed to \code{\link{raster}} -#' @return an empty raster, i.e. all cells are \code{NA}. -#' @import raster -#' @keywords raster, utils -#' @examples -#' require(raster) -#' r <- raster(matrix(1:100, 5, 20)) -#' emptyraster(r) -#' @export -emptyraster <- function(x, ...) { # add name, filename, - assertthat::assert_that(is.Raster(x)) - raster::raster(nrows = nrow(x), ncols = ncol(x), - crs = x@crs, - ext = raster::extent(x), ...) -} - -#' Function to extract nearest neighbour predictor values of provided points -#' -#' @description -#' This function performs nearest neighbour matching between biodiversity observations and independent -#' predictors, and operates directly on provided data.frames. -#' **Note that despite being parallized this function can be rather slow for large data volumes of data!** -#' @param coords A [`matrix`], [`data.frame`] or [`sf`] object. 
-#' @param env A [`data.frame`] object with the predictors -#' @param longlat A [`logical`] variable indicating whether the projection is long-lat -#' @param field_space A [`vector`] highlight the columns from which coordinates are to be extracted (default: \code{c('x','y')}) -#' @param cheap A [`logical`] variable whether the dataset is considered to be large and faster computation could help. -#' @param ... other options. -#' @return A [`data.frame`] with the extracted covariate data from each provided data point. -#' @details Nearest neighbour matching is done via the [geodist] R-package (\code{geodist::geodist}) -#' @note If multiple values are of equal distance during the nearest neighbour check, then the results is by default averaged. -#' @references -#' * Mark Padgham and Michael D. Sumner (2021). geodist: Fast, Dependency-Free Geodesic Distance Calculations. R package version 0.0.7. https://CRAN.R-project.org/package=geodist -#' @keywords utils -#' @export -get_ngbvalue <- function(coords, env, longlat = TRUE, field_space = c('x','y'), cheap = FALSE, ...) { - # Security checks - assertthat::assert_that( - is.data.frame(coords) || inherits(coords,'sf') || inherits(coords,'matrix'), - assertthat::is.flag(longlat), - is.data.frame(env),assertthat::has_name(env, field_space), - length(field_space) == 2, is.vector(field_space) - ) - # Convert to matrices - coords <- as.matrix(coords) - coords_env <- as.matrix(env[,field_space]) - - # If either of the matrices are larger than 10000 records, process in parallel - if(is.null( getOption('ibis.runparallel') ) || getOption('ibis.runparallel') == TRUE ){ - process_in_parallel = ifelse(nrow(coords) > 10000 || nrow(coords_env) > 100000, TRUE, FALSE) - } else { - process_in_parallel = FALSE - } - - # Pairwise distance function - # FIXME: Potentially evaluate whether sf::st_distance is of similar speed for very large matrices. 
- # Thus making this dependency suggested and optional - # disfun <- geosphere::distHaversine - if(longlat){ - disfun <- function(x1,x2, m = ifelse(cheap,'cheap','haversine')) geodist::geodist(x1,x2, measure = m) - } else { - disfun <- function(x1, x2) raster::pointDistance(x1, x2, lonlat = longlat) - } - - if(process_in_parallel){ - check_package("doParallel") - suppressPackageStartupMessages(require(doParallel)) - - # Split coordinates into equal size batches of 10 - coords_split <- ggplot2::cut_width(1:nrow(coords),10,boundary=0) - - cl <- doParallel::registerDoParallel(cores = getOption('ibis.nthread')) - out <- foreach(z = iterators::iter(unique(coords_split)), - .combine = 'rbind', - .inorder = FALSE, - .multicombine = TRUE, - .errorhandling = 'stop', - .export = c('coords','coords_env','coords_split', 'disfun'), - .packages = c('geodist') - ) %dopar% { - o <- - apply(coords[which(coords_split==z),], 1, function(xy1, xy2){ - dists <- disfun(xy2, xy1) - # In a few cases these can be multiple in equal distance - d <- which(dists==min(dists)) - if(length(d)>=2){ - # Average them both - o <- as.data.frame( - t( - apply(env[d, ,drop = FALSE], 2, function(x) mean(x, na.rm = TRUE) ) - ) - ) - return(o) - } else return( env[d, ,drop = FALSE] ) - }, xy2 = coords_env) - return(do.call(rbind, o)) - } - doParallel::stopImplicitCluster() - rm(cl) - } else { - env_sub <- apply(coords, 1, function(xy1, xy2) { - dists <- disfun(xy2, xy1) - # In a few cases these can be multiple in equal distance - d <- which(dists==min(dists)) - if(length(d)>=2){ - # Average them both - o <- as.data.frame( - t( - apply(env[d, ,drop = FALSE], 2, function(x) mean(x, na.rm = TRUE) ) - ) - ) - return(o) - } else return( env[d, ,drop = FALSE] ) - }, xy2 = coords_env) - # Combine - out <- do.call(rbind, env_sub) - out[,field_space] <- as.data.frame(coords) # Ensure that coordinates are back in - } - return(out) -} - -#' Function to extract directly the raster value of provided points -#' -#' @description -#' This function simply extracts the values from a provided [`RasterLayer`], -#' [`RasterStack`] or [`RasterBrick`] object. For points where or NA values were extracted -#' a small buffer is applied to try and obtain the remaining values. -#' @details -#' It is essentially a wrapper for [`terra::extract`]. -#' @param coords A [`Spatial`], [`data.frame`], [`matrix`] or [`sf`] object. -#' @param env A [`Raster`] object with the provided predictors. -#' @param rm.na [`logical`] parameter which - if set - removes all rows with a missing data point (\code{NA}) from the result. -#' @return A [`data.frame`] with the extracted covariate data from each provided data point. 
-#' @keywords utils -#' @examples -#' \dontrun{ -#' # Extract values -#' vals <- get_rastervalue(coords, env) -#' } -#' @export -get_rastervalue <- function(coords, env, rm.na = FALSE){ - assertthat::assert_that( - inherits(coords,"sf") || inherits(coords, "Spatial") || (is.data.frame(coords) || is.matrix(coords)), - is.Raster(env), - is.logical(rm.na) - ) - - # Try an extraction - try({ex <- raster::extract(x = env, - y = coords, - method = "simple", - df = TRUE)},silent = FALSE) - if(inherits(ex, "try-error")) stop(paste("Raster extraction failed: ", ex)) - # Find those that have NA in there - check_again <- apply(ex, 1, function(x) anyNA(x)) - if(any(check_again)){ - # Re-extract but with a small buffer - coords_sub <- coords[which(check_again),] - try({ex_sub <- raster::extract(x = env, - y = coords_sub, - method = "simple", - small = TRUE, - df = TRUE)},silent = FALSE) - if(inherits(ex_sub, "try-error")) stop(paste("Raster extraction failed: ", ex_sub)) - ex[which(check_again),] <- ex_sub - } - # Add coordinate fields to the predictors as these might be needed later - if(!any(assertthat::has_name(ex, c("x", "y")))){ - if(inherits(coords,"sf")) coords <- sf::st_coordinates(coords) - ex[["x"]] <- as.numeric(coords[,1]); ex[["y"]] <- as.numeric(coords[,2]) - } - # Convert to factor if any - if(any(is.factor(env))){ - ex[,names(env)[which(is.factor(env))]] <- factor(ex[,names(env)[which(is.factor(env))]]) - } - - if(rm.na){ - ex <- subset(ex, complete.cases(ex)) - } - assertthat::assert_that(is.data.frame(ex), - nrow(ex)>0, - msg = "Something went wrong with the extraction or all points had missing data.") - return(ex) -} - -#' Hinge transformation of a given predictor -#' -#' @description -#' This function transforms a provided predictor variable with a hinge transformation, -#' e.g. a new range of values where any values lower than a certain knot are set to \code{0}, -#' while the remainder is left at the original values. -#' @param v A [`Raster`] object. -#' @param n A [`character`] describing the name of the variable. Used as basis for new names. -#' @param nknots The number of knots to be used for the transformation (Default: \code{4}). -#' @param cutoffs A [`numeric`] vector of optionally used cutoffs to be used instead (Default: \code{NULL}). -#' @keywords utils, internal -#' @concept Concept taken from the [maxnet] package. -#' @returns A hinge transformed [`data.frame`]. 
-#' @noRd -makeHinge <- function(v, n, nknots = 4, cutoffs = NULL){ - assertthat::assert_that(is.Raster(v), - is.character(n), - is.numeric(nknots), - is.numeric(cutoffs) || is.null(cutoffs)) - # Get stats - v.min <- raster::cellStats(v, min) - v.max <- raster::cellStats(v, max) - if(is.null(cutoffs)){ - k <- seq(v.min, v.max, length = nknots) - } else { - k <- cutoffs - } - if(length(k)<=1) return(NULL) - - # Hinge up to max - lh <- outer(v[], utils::head(k, -1), function(w, h) hingeval(w,h, v.max)) - # Hinge starting from min - rh <- outer(v[], k[-1], function(w, h) hingeval(w, v.min, h)) - colnames(lh) <- paste0("hinge__",n,'__', round( utils::head(k, -1), 2),'_', round(v.max, 2)) - colnames(rh) <- paste0("hinge__",n,'__', round( v.min, 2),'_', round(k[-1], 2)) - o <- as.data.frame( - cbind(lh, rh) - ) - # Kick out first (min) and last (max) col as those are perfectly correlated - o <- o[,-c(1,ncol(o))] - attr(o, "deriv.hinge") <- k - return(o) -} - -#' Threshold transformation of a given predictor -#' -#' @description -#' This function transforms a provided predictor variable with a threshold transformation, -#' e.g. a new range of values where any values lower than a certain knot are set to 0, -#' while the remainder is set to 1. -#' @param v A [`Raster`] object. -#' @param n A [`character`] describing the name of the variable. Used as basis for new names. -#' @param nknots The number of knots to be used for the transformation (Default: \code{4}). -#' @param cutoffs A [`numeric`] vector of optionally used cutoffs to be used instead (Default: \code{NULL}). -#' @keywords utils, internal -#' @concept Concept taken from the [maxnet] package. -#' @returns A threshold transformed [`data.frame`]. -#' @noRd -makeThresh <- function(v, n, nknots = 4, cutoffs = NULL){ - assertthat::assert_that(is.Raster(v), - is.character(n), - is.numeric(nknots), - is.numeric(cutoffs) || is.null(cutoffs)) - if(is.null(cutoffs)){ - # Get min max - v.min <- raster::cellStats(v,min) - v.max <- raster::cellStats(v,max) - k <- seq(v.min, v.max, length = nknots + 2)[2:nknots + 1] - } else { - k <- cutoffs - } - if(length(k)<=1) return(NULL) - f <- outer(v[], k, function(w, t) ifelse(w >= t, 1, 0)) - colnames(f) <- paste0("thresh__", n, "__", round(k, 2)) - f <- as.data.frame(f) - attr(f, "deriv.thresh") <- k - return(f) -} - -#' Binned transformation of a given predictor -#' -#' @description -#' This function takes predictor values and 'bins' them into categories based on a -#' percentile split. -#' @param v A [`Raster`] object. -#' @param n A [`character`] describing the name of the variable. Used as basis for new names. -#' @param nknots The number of knots to be used for the transformation (Default: \code{4}). -#' @param cutoffs A [`numeric`] vector of optionally used cutoffs to be used instead (Default: \code{NULL}). -#' @keywords utils, internal -#' @returns A binned transformed [`data.frame`] with columns representing each bin. -#' @noRd -makeBin <- function(v, n, nknots, cutoffs = NULL){ - assertthat::assert_that(is.Raster(v), - is.character(n), - is.numeric(nknots), - is.numeric(cutoffs) || is.null(cutoffs)) - if(is.null(cutoffs)){ - # Calculate cuts - cu <- raster::quantile(v, probs = seq(0, 1, by = 1/nknots) ) - } else { cu <- cutoffs} - - if(anyDuplicated(cu)){ - # If duplicated quantiles (e.g. 
0, 0, 0.2..), sample from a larger number - cu <- raster::quantile(v, probs = seq(0, 1, by = 1/(nknots*2)) ) - cu <- cu[-which(duplicated(cu))] # Remove duplicated cuts - if(length(cu)<=2) return( NULL ) - if(length(cu) > nknots){ - cu <- cu[(length(cu)-(nknots)):length(cu)] - } - } - # Make cuts and explode - out <- explode_factorized_raster( - raster::ratify( - raster::cut(v, cu) - ) - ) - # Format threshold names - cu.brk <- as.character(cut(cu[-1], cu)) - cu.brk <- gsub(",","_",cu.brk) - cu.brk <- gsub("\\(|\\]", "", cu.brk) - # names(out) <- paste0("bin__",n, "__", gsub(x = names(cu)[-1], pattern = "\\D", replacement = ""),"__", cu.brk ) - names(out) <- paste0("bin__",n, "__", cu.brk ) - for(i in 1:nlayers(out)){ - attr(out[[i]], "deriv.bin") <- cu[i:(i+1)] - } - return(out) -} - -#' Create new raster stack from a given data.frame -#' -#' @param post A data.frame -#' @param background A [`Raster-class`] object for the background raster -#' @keywords internal, utils -#' @return A [`Raster-class`] object with number of columns equal to ncol(post) -#' @noRd -fill_rasters <- function(post, background){ - assertthat::assert_that( - is.data.frame(post),ncol(post)>1, - inherits(background,'Raster'), - nrow(post) == ncell(background) - ) - # Make names to be sure - names(post) <- base::make.names(names(post)) - - # If only one raster - if(ncol(post)==1){ - out <- emptyraster(background) - out[] <- post[,1] - } else { - # Loop through each column - out <- raster::stack() - for(co in 1:ncol(post)){ - o <- emptyraster(background) - o[] <- post[,co] # Assign values - # Add to stack - out <- raster::addLayer(out, o) - rm(o) - } - } - # Assign names - names(out) <- names(post) - - # Check that derivate attributes if existing are passed - if(length( grep("deriv", names(attributes(post)) ))>0){ - attr(out, grep("deriv", names(attributes(post)),value = TRUE) ) <- attr(post, grep("deriv", names(attributes(post)),value = TRUE) ) - } - - # Final check - assertthat::assert_that( - inherits(out,'Raster'), - nlayers(out) == ncol(post) - ) - return(out) -} - -#' Create a polynomial transformation from coordinates -#' -#' @description This function transforms the coordinates of a supplied file through a polynomial transform. -#' By default it applies weights and a QR decomposition for numerical stability. -#' @param coords A [`data.frame`], [`matrix`] or [`sf`] object with coordinates (2 columns named x-y). -#' @param degree The number of degrees used for polynominal transformation (Default: \code{2}). -#' @param weights Set by default to the inverse of the number of coordinates. -#' @returns A data.frame with transformed coordinates. -#' @keywords utils -#' @keywords internal -#' @references Dray S., Plissier R., Couteron P., Fortin M.J., Legendre P., Peres-Neto P.R., Bellier E., Bivand R., Blanchet F.G., De Caceres M., Dufour A.B., Heegaard E., Jombart T., Munoz F., Oksanen J., Thioulouse J., Wagner H.H. (2012). Community ecology in the age of multivariate multiscale spatial analysis. Ecological Monographs 82, 257–275. 
-#' @noRd -polynominal_transform <- function(coords, degree = 2, weights = rep(1/nrow(coords), nrow(coords)) ){ - assertthat::assert_that( - inherits(coords, 'data.frame') || inherits(coords, 'matrix') || inherits(coords, 'sf'), - is.numeric(degree), - !is.null(weights) && length(weights) == nrow(coords) - ) - # If spatial get coordinates - if(inherits(coords, 'sf')){ - coords <- sf::st_coordinates(coords) - } - # Polynomial transform - a0 <- poly(x = as.matrix( coords ), degree = degree, simple = TRUE) - # Standardize colnames - poly.names <- colnames(a0) # Column names for later - poly.names <- paste0("spatialtrend_", gsub("\\.","_",poly.names) ) - - # Standardize the weights - weights <- weights/sum(weights) - a0 <- cbind(weights, a0) # Add to polynominal transform - a0 <- base::qr.Q(base::qr(a0)) # QR decomposition for better numerical stability - a0 <- as.data.frame(a0[, -1])/sqrt(weights) # Weighting - - # Rename - colnames(a0) <- poly.names - return(a0) -} - -#' Clean up raster layer from disk -#' -#' Completely deletes for instance a temporary created raster file. -#' @param A [`raster`] object. -#' @param verbose Print progress (Default: \code{FALSE}) -#' @keywords utils -#' @noRd -clean_rasterfile <- function(x, verbose = FALSE) -{ - stopifnot(grepl("Raster", class(x))) - if (!fromDisk(x)) - return(NULL) - sink(tempfile()) - tdir = rasterOptions()[["tmpdir"]] - sink(NULL) - if (inherits(x, "RasterLayer")) - files = basename(x@file@name) - if (inherits(x, "RasterStack")) - files = do.call(c, lapply(methods::slot(x, "layers"), - function(x) x@file@name)) - files = files[file.exists(files)] - if (length(files) == 0) - return(NULL) - lapply(files, function(f) { - if (fromDisk(x) & file.exists(f)) - file.remove(f, sub("grd", "gri", f)) - if (verbose) { - print(paste("Deleted: ", f)) - print(paste("Deleted: ", sub("grd", "gri", - f))) - } - }) - parent.var.name <- deparse(substitute(x)) - rm(list = parent.var.name, envir = sys.frame(-1)) -} - -#' Split raster factor levels to stack -#' -#' @description Takes a single raster that is a [`factor`] and creates -#' a new [`RasterStack`] that contains the individual levels. -#' @param ras A [`RasterLayer`] object that is a [`factor`]. Alternatively a [`RasterStack`] object -#' can be supplied in which only factor variables are 'exploded' -#' @param name An optional [`character`] name for the [raster]. -#' @param ... Other parameters (not used). 
-#' @returns A [`RasterStack`] object -#' @keywords utils, internal -#' @noRd -explode_factorized_raster <- function(ras, name = NULL, ...){ - assertthat::assert_that(is.Raster(ras), - is.null(name) || is.character(name)) - - # Simply return the input if there are no factors - if(!any(is.factor(ras))) return(ras) - - # If input is a RasterLayer - if(inherits(ras, 'RasterLayer')){ - # Get name - # Create output template - temp <- emptyraster(ras) - if(is.null(name)) name <- names(ras) - - # Extract data - o <- data.frame(val = values(ras));names(o) <- name;o[[name]] <- factor(o[[name]]) - - # Make function that converts all factors to split rasters - f <- as.data.frame( - outer(o[[name]], levels(o[[name]]), function(w, f) ifelse(w == f, 1, 0)) - ) - - # Fill template rasters - out <- fill_rasters(f,temp) - names(out) <- paste(name, levels(o[[name]]), sep = ".") - - } else if(inherits(ras, 'RasterStack') || inherits(ras, 'RasterBrick')){ - # Alternatively if input is stack - fcts <- is.factor(ras) - - # Get non-factor variables - out <- ras[[which(!fcts)]] - for(k in which(fcts)){ - - sub <- ras[[k]] - - temp <- emptyraster(sub) - if(is.null(name)) new_name <- names(sub) - - # Extract data - o <- data.frame(val = values(sub));names(o) <- new_name;o[[new_name]] <- factor(o[[new_name]]) - - # Make function that converts all factors to split rasters - f <- as.data.frame( - outer(o[[new_name]], levels(o[[new_name]]), function(w, f) ifelse(w == f, 1, 0)) - ) - - # Fill template rasters - new <- fill_rasters(f, temp) - names(new) <- paste(new_name, levels(o[[new_name]]), sep = ".") - out <- raster::addLayer(out, new) - } - } - return(out) # Return the result -} - -#' Functionality for geographic and environmental thinning -#' -#' @description -#' For most species distribution modelling approaches it is assumed that occurrence records are unbiased, which -#' is rarely the case. While model-based control can alleviate some of the effects of sampling bias, it can often be -#' desirable to account for some sampling biases through spatial thinning (Aiello‐Lammens et al. 2015). This -#' is an approach based on the assumption that oversampled grid cells contribute little more than bias, rather than -#' strengthing any environmental responses. -#' This function provides some methods to apply spatial thinning approaches. Note that this effectively removes -#' data prior to any estimation and its use should be considered with care (see also Steen et al. 2021). -#' -#' @details -#' Currently implemented thinning methods: -#' -#' * \code{"random"}: Samples at random up to number of \code{"minpoints"} across all occupied grid cells. -#' Does not account for any spatial or environmental distance between observations. -#' * \code{"bias"}: This option removed explicitly points that are considered biased (parameter \code{"env"}) only. -#' Points are preferentially thinned from grid cells which are in the 25% most biased (larger values assumed greater bias) -#' and have high point density. Thins the observations up to \code{"minpoints"}. -#' * \code{"zones"}: Assesses for each observation that it falls with a maximum of \code{"minpoints"} into -#' each occupied zone. Careful: If the zones are relatively wide this can remove quite a few observations. -#' * \code{"environmental"}: This approach creates an observation-wide clustering (k-means) under the assumption -#' that the full environmental niche has been comprehensively sampled and is covered by the provided covariates \code{env}. 
-#' We then obtain an number equal to (\code{"minpoints"}) of observations for each cluster. -#' * \code{"spatial"}: Calculates the spatial distance between all observations. Then points are removed -#' iteratively until the minimum distance between points is crossed. The \code{"mindistance"} parameter has to -#' be set for this function to work. -#' -#' @param df A [`sf`] or [`data.frame`] object with observed occurrence points. All methods threat presence-only -#' and presence-absence occurrence points equally. -#' @param background A [`RasterLayer`] object with the background of the study region. Use for assessing point density. -#' @param env A [`Raster`] object with environmental covariates. Needed when method is set to \code{"environmental"} -#' or \code{"bias"} (Default: \code{NULL}). -#' @param method A [`character`] of the method to be applied (Default: \code{"random"}). -#' @param minpoints A [`numeric`] giving the number of data points at minimum to take (Default: \code{10}). -#' @param mindistance A [`numeric`] for the minimum distance of neighbouring observations (Default: \code{NULL}). -#' @param zones A [`RasterLayer`] to be supplied when option \code{"method"} is chosen (Default: \code{NULL}). -#' @param verbose [`logical`] of whether to print some statistics about the thinning outcome (Default: \code{TRUE}). -#' @examples -#' \dontrun{ -#' # Thin a certain number of observations -#' # At random -#' thin_points <- thin_observations(points, background, method = "random") -#' # using a bias layer -#' thin_points <- thin_observations(points, background, method = "bias", env = bias) -#' } -#' @references -#' * Aiello‐Lammens, M. E., Boria, R. A., Radosavljevic, A., Vilela, B., & Anderson, R. P. (2015). spThin: an R package for spatial thinning of species occurrence records for use in ecological niche models. Ecography, 38(5), 541-545. -#' * Steen, V. A., Tingley, M. W., Paton, P. W., & Elphick, C. S. (2021). Spatial thinning and class balancing: Key choices lead to variation in the performance of species distribution models with citizen science data. Methods in Ecology and Evolution, 12(2), 216-226. -#' @keywords utils -#' @export -thin_observations <- function(df, background, env = NULL, method = "random", minpoints = 10, mindistance = NULL, - zones = NULL, verbose = TRUE){ - assertthat::assert_that( - inherits(df, "sf") || inherits(df, "data.frame"), - nrow(df) > 0, - is.Raster(background), - is.Raster(env) || is.null(env), - is.character(method), - is.numeric(minpoints) && minpoints > 0, - is.null(mindistance) || is.numeric(mindistance), - is.Raster(zones) || is.null(zones) - ) - check_package("dplyr") - # Match method - method <- match.arg(method, choices = c("random", "spatial", "bias", "environmental", "zones"), several.ok = FALSE) - - # Label background with id - bg <- background - bg[] <- 1:raster::ncell(bg) - bg <- raster::mask(bg, background) - - # Check that environment has the same projection - if(is.Raster(env) && method == "environmental"){ - assertthat::assert_that( raster::compareRaster(bg, env) ) - } - # Check that CRS is the same as background - if(sf::st_crs(df) != sf::st_crs(bg)){ - message("Projection is different from input data. 
Reprojecting!") - df <- df |> sf::st_transform(crs = sf::st_crs(bg)) - } - - # Take coordinates of supplied data and rasterize - coords <- sf::st_coordinates( df ) - ras <- raster::rasterize(coords, bg) # Get the number of observations per grid cell - - # Bounds for thining - totake <- c(lower = minpoints, upper = max(raster::cellStats(ras,"min"), minpoints)) - - # -- # - if(method == "random"){ - # For each unique grid cell id, get the minimum value up to a maximum of the points - # by sampling at random from the occupied grid cells - - # Output vector - sel <- vector() - - ex <- data.frame(id = 1:nrow(coords), - cid = raster::extract(bg, coords) - ) - ex <- subset(ex, complete.cases(ex)) # Don't need missing points - - ex <- dplyr::left_join(ex, - ex %>% dplyr::group_by(cid) %>% dplyr::summarise(N = dplyr::n()), - by = "cid" - ) - # Points to take - sel <- append(sel, ex$id[which(ex$N <= min(totake))] ) - - # For those where we have more than the minimum, take at random the upper limits of observations - ex$oversampled <- ifelse(ex$N >= totake["upper"], 1, 0) - if(dplyr::n_distinct(ex$oversampled) > 1){ - # If there any oversampled - # Now sample at random up to the maximum amount. Got tired of doing this outside tidyverse - o <- ex %>% dplyr::filter(oversampled == 1) %>% - dplyr::group_by(cid) %>% - dplyr::slice_sample(n = min(totake)) - if(nrow(o)>0) sel <- append(sel, o$id) - rm(o) - } - if(anyDuplicated(sel)) sel <- unique(sel) - - try({rm(ex)},silent = TRUE) - } else if(method == "bias"){ - assertthat::assert_that(is.Raster(env), - raster::nlayers(env)==1, - msg = "Bias requires a single Raster layer provided to env.") - - sel <- vector() - - # Convert bias layer into percentile (largest being) - bias_perc <- raster::quantile(env, c(.75)) - - # Now extract - ex <- data.frame(id = 1:nrow(coords), - cid = raster::extract(bg, coords), - pres = raster::extract(ras, coords), - bias = raster::extract(env, coords) - ) - ex <- subset(ex, complete.cases(ex)) # Don't need missing points - # Now identify those to be thinned - ex$tothin <- ifelse((ex$bias >= bias_perc) & (ex$pres > totake[1]), 1, 0) - assertthat::assert_that(dplyr::n_distinct(ex$tothin) == 2) - # Now thin those points that are to be thinned - ss <- ex |> dplyr::filter(tothin == 1) |> - dplyr::group_by(cid) |> - dplyr::slice_sample(n = totake[1], weight_by = bias, replace = T) |> - dplyr::distinct() - - # Points to take - sel <- append(sel, ex$id[ex$tothin==0] ) - sel <- append(sel, ss$id ) - - try({rm(ss, ex)},silent = TRUE) - } else if(method == "zones"){ - # Thinning by zones - assertthat::assert_that(is.Raster(zones), - is.factor(zones)) - - if(!raster::compareRaster(bg, zones,stopiffalse = FALSE)){ - zones <- alignRasters(zones, bg, method = "ngb", func = raster::modal, cl = FALSE) - } - - # Output vector - sel <- vector() - - ex <- data.frame(id = 1:nrow(coords), - cid = raster::extract(bg, coords), - zones = raster::extract(zones, coords) - ) - # Now for each zone, take the minimum amount at random - ss <- ex |> - dplyr::group_by(zones) |> - dplyr::slice_sample(n = max(totake[1]), replace = TRUE) |> - dplyr::distinct() - - # Take the zone data points - sel <- append(sel, ss$id ) - try({rm(ss, ex)},silent = TRUE) - - } else if(method == "environmental"){ - # Environmental clustering - - if(!raster::compareRaster(bg, env,stopiffalse = FALSE)){ - env <- alignRasters(env, bg, method = "ngb", func = raster::modal, cl = FALSE) - } - # If there are any factors, explode - if(any(is.factor(env))){ - env <- 
explode_factorized_raster(env) - } - - # Output vector - sel <- vector() - - # Get a matrix of all environmental data, also with coordinates - # However first normalize all data - stk <- raster::as.data.frame( - predictor_transform(env, option = "norm"), - xy = TRUE) - - stk$cid <- 1:nrow(stk) - stk <- subset(stk, complete.cases(stk)) - - # Cluster - E <- kmeans(x = subset(stk, select = -cid), - centers = ncol(stk)-1, iter.max = 10) - - stk$cluster <- E$cluster - - # Now fill an empty raster and re-xtract - new <- emptyraster(env) - new[stk$cid] <- stk$cluster - - # Now re-extract and sampling points - ex <- data.frame(id = 1:nrow(coords), - cid = raster::extract(bg, coords), - zones = raster::extract(new, coords) - ) - - # Now for each zone, take the minimum amount at random - ss <- ex |> - dplyr::group_by(zones) |> - dplyr::slice_sample(n = max(totake[1]), replace = TRUE) |> - dplyr::distinct() - - # Take the zone data points - sel <- append(sel, ss$id ) - - try({rm(new, stk, ss, ex, E)},silent = TRUE) - } else if(method == "spatial"){ - # Spatial thinning - stop("Not yet implemented!") - - } - - # Return subsampled coordinates - out <- df[sel,] - if(nrow(out)==0) { - message("Thinning failed for some reason") - return(df) - } else { - if(verbose){ - message(paste0( - "(", method, ")", " thinning completed! \n", - "Original number of records: ", nrow(df), "\n", - "Number of retained records: ", nrow(out)) - ) - } - return(out) - } -} +#' Are rasters comparable? +#' +#' This function checks if two `Raster-class` objects +#' are comparable. +#' +#' @param x [`Raster-class`] object. +#' @param y [`Raster-class`] object. +#' @keywords internal, utils +#' @return [`logical`] indicating if the two [`Raster-class`] objects have the same +#' resolution, extent, dimensionality, and coordinate system. +#' @noRd +is_comparable_raster <- function(x, y) { + assertthat::assert_that(inherits(x, "Raster"), inherits(y, "Raster")) && + sf::st_crs(x@crs) == sf::st_crs(y@crs) && + raster::compareRaster(x, y, crs = FALSE, res = TRUE, tolerance = 1e-5, + stopiffalse = FALSE) +} + +#' Do extents intersect? +#' +#' Verify if the extents of two spatial objects intersect or not. +#' +#' @param x [`Raster-class`], [`Spatial-class`] or [`sf::sf()`] object. +#' @param y [`Raster-class`], [`Spatial-class`] or [`sf::sf()`] object. +#' @keywords internal, utils +#' @return [`logical`]. +#' @noRd +intersecting_extents <- function(x, y) { + assertthat::assert_that( + inherits(x, c("Raster", "Spatial", "sf")), + inherits(y, c("Raster", "Spatial", "sf"))) + isTRUE(sf::st_intersects( + sf::st_as_sf(methods::as(raster::extent(x), "SpatialPolygons")), + sf::st_as_sf(methods::as(raster::extent(y), "SpatialPolygons")), + sparse = FALSE)[[1]]) +} + + +#' Extract polygon data from intersecting point data +#' @param poly A [sf] object. +#' @param points A [`Spatial`] or [sf] object. +#' @param coords A [vector] pointing to the coordinate columns. 
+#' (Default: \code{c("x", "y")})
+#' @keywords utils, internal
+#' @return An object with the spatial intersection
+#' @noRd
+point_in_polygon <- function(poly, points, coords = c('x','y')){
+  assertthat::assert_that(
+    inherits(poly,'sf'),
+    inherits(points,'sf') || inherits(points,'Spatial') || inherits(points,'data.frame'),
+    length(coords)>0
+  )
+  # Convert to sf
+  points <- sf::st_as_sf(points, coords = coords, crs = sf::st_crs(poly))
+  assertthat::assert_that(
+    sf::st_crs(poly) == sf::st_crs(points)
+  )
+
+  # Parallelize if the number of points is large and parallel processing is allowed
+  if(isTRUE(getOption("ibis.runparallel")) && nrow(points) > 5000){
+    # Parallelise any simple features analysis.
+    # @source https://www.spatialanalytics.co.nz/post/2018/04/01/fixing-st-par/
+    st_parallel <- function(sf_df, sf_func, n_cores, ...){
+
+      # Create a vector to split the data set up by.
+      split_vector <- rep(1:n_cores, each = nrow(sf_df) / n_cores, length.out = nrow(sf_df))
+
+      # FIXME:
+      # mc.cores does not work properly on Windows. To be replaced with future
+      if(Sys.info()['sysname']=="Windows") n_cores <- 1
+      # Perform GIS analysis
+      split_results <- split(sf_df, split_vector) |>
+        parallel::mclapply(function(x) sf_func(x, ...), mc.cores = n_cores)
+
+      # Define the output_class. If the class vector has two elements, grab the second one.
+      output_class <- class(split_results[[1]])
+      if (length(output_class) == 2){
+        output_class <- output_class[2]
+      }
+
+      # Combine results back together. Method of combining depends on the output from the function.
+      if (output_class == "matrix"){
+        result <- do.call("rbind", split_results)
+        names(result) <- NULL
+      } else if (output_class == "sfc") {
+        result <- do.call("c", split_results)
+        # do.call combines the list, but there are still n_cores copies of the
+        # geometry that had been split up; applying the function once more
+        # gathers them into one, as is the expected output of these functions.
+        result <- sf_func(result)
+      } else if (output_class %in% c('list', 'sgbp') ){
+        result <- do.call("c", split_results)
+        names(result) <- NULL
+      } else if (output_class == "data.frame" ){
+        result <- do.call("rbind", split_results)
+      } else {
+        stop("Unknown class. st_parallel only accepts the following outputs at present: matrix, sfc, list, sgbp, data.frame.")
+      }
+
+      # Return result
+      return(result)
+    }
+    # Point-in-polygon test, run in parallel batches
+    ov <- st_parallel(points, function(x) sf::st_join(x, poly, join = sf::st_within), getOption("ibis.nthread"))
+  } else {
+    # Point-in-polygon test
+    ov <- suppressMessages( sf::st_join(points, poly, join = sf::st_within) )
+  }
+  return(ov)
+}
+
+#' Create mask based on a zonal layer
+#'
+#' @description
+#' This function has options to create a mask based on provided point data. It is identical in functionality to
+#' the parameter \code{'limit'} in `train()`. Currently it has two available options:
+#'
+#' [*] A categorical zonal raster layer can be provided, which is then intersected with the available point data.
+#' The output is a [`RasterLayer`] object retaining only those classes in which a point occurrence fell.
+#' A typical example is a layer of the distribution of biomes, ecoregions or climatic zones.
+#'
+#' [*] A buffer, in which case a buffer in the units of the geographic projection is created. The buffer width has to
+#' be supplied as a non-NULL parameter to \code{'buffer_width'}. The output mask thus limits the prediction
+#' to a spatial buffer of the provided width around any geovalid occurrence record.
+#'
+#' @param df A [`sf`] object with point information.
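+#' @examples
+#' \dontrun{
+#' # Illustrative sketch only (added for clarity, not shipped with this patch):
+#' # `occ_pts`, `ecoregions_ras` and `background_ras` are hypothetical inputs.
+#' # Mask by zones in which at least one occurrence falls:
+#' mask <- create_zonaloccurrence_mask(df = occ_pts, zones = ecoregions_ras)
+#' # Or mask by a 50 km buffer (in projection units) around each record:
+#' mask <- create_zonaloccurrence_mask(df = occ_pts, buffer_width = 50000,
+#'                                     template = background_ras)
+#' }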
+#' @param zones A [`sf`] or [`RasterLayer`] object with polygons of the zones to be used for occurrence masking.
+#' @param buffer_width A [`numeric`] value specifying the buffer width. Ignored if a zones layer is provided.
+#' @param column A [`character`] giving the column in which zonal ids are found. Only used when zones is of
+#' type [`sf`]. (Default: \code{"limits"}).
+#' @param template An optional [`RasterLayer`] object on which the zones should be rasterized (Default: \code{NULL}).
+#' @returns A [`sf`] or [`RasterLayer`] object.
+#' @keywords utils, internal
+#' @noRd
+create_zonaloccurrence_mask <- function(df, zones = NULL, buffer_width = NULL, column = "limits", template = NULL){
+  assertthat::assert_that(
+    inherits(df, "sf"),
+    all( unique(sf::st_geometry_type(df)) %in% "POINT" ),
+    is.character(column),
+    is.null(zones) || (inherits(zones, "sf") || is.Raster(zones)),
+    is.null(buffer_width) || is.numeric(buffer_width),
+    is.null(template) || is.Raster(template),
+    # At least one of zones or buffer_width has to be set
+    !(is.null(zones) && is.null(buffer_width))
+  )
+  # Make zones mask
+  if(!is.null(zones)){
+    # If zones is sf, check that it is of type polygon
+    if(inherits(zones, "sf")) assertthat::assert_that( all( unique(sf::st_geometry_type(zones)) %in% c("POLYGON", "MULTIPOLYGON") ) )
+
+    if(inherits(zones, "sf")){
+      if(sf::st_crs(df)!=sf::st_crs(zones)){
+        zones <- zones |> sf::st_transform(crs = sf::st_crs(df))
+      }
+
+      # Get the zones of the limiting area, e.g. those intersecting with the input points
+      suppressMessages(
+        suppressWarnings(
+          zone_hits <- sf::st_intersection(df, zones)
+        )
+      )
+      # Limit zones to those in which at least one point falls
+      zones <- subset(zones, zones[[column]] %in% unique(zone_hits[[column]]) )
+
+      # Finally rasterize if template is set
+      if(!is.null(template)) zones <- raster::rasterize(zones, template, field = column)
+    } else {
+      # Extract values from zonal raster layer
+      ex <- raster::extract(zones, df) |> unique()
+      # Remove NA if found
+      if(anyNA(ex)) ex <- ex[-which(is.na(ex))]
+
+      # Now create copy of zonal raster and set all values other than ex to NA
+      new <- emptyraster(zones)
+      new[zones %in% ex] <- 1
+      zones <- new
+      # Align with template if set
+      if(!is.null(template)){
+        if(!raster::compareRaster(zones, template, stopiffalse = FALSE)){
+          zones <- raster::resample(zones, template, method = "ngb")
+        }
+      }
+    }
+  } else {
+    assertthat::assert_that(
+      is.Raster(template), msg = "A template layer has to be provided for this function to work!"
+    )
+    # Buffer the points with the provided buffer width
+    suppressWarnings(
+      buf <- sf::st_buffer(x = df, dist = buffer_width, nQuadSegs = 50)
+    )
+    # Rasterize
+    zones <- raster::rasterize(buf, template, field = 1, background = 0)
+    zones <- raster::mask(zones, template)
+    # Ratify
+    zones <- raster::ratify(zones)
+  }
+  return(zones)
+}
+
+#' Converts a bounding box to a Well Known Text (WKT) polygon
+#'
+#' @param minx Minimum x value, or the most western longitude
+#' @param miny Minimum y value, or the most southern latitude
+#' @param maxx Maximum x value, or the most eastern longitude
+#' @param maxy Maximum y value, or the most northern latitude
+#' @param bbox A [`vector`] of length 4, with the elements minx, miny, maxx, maxy. Overrides the
+#' individual coordinates if supplied.
+#' @return An object of class [`character`], a Well Known Text (WKT) string of the form
+#' 'POLYGON((minx miny, maxx miny, maxx maxy, minx maxy, minx miny))'
+#' @keywords internal, utils
+#' @noRd
+bbox2wkt <- function(minx=NA, miny=NA, maxx=NA, maxy=NA, bbox=NULL){
+  if(is.null(bbox)) bbox <- c(minx, miny, maxx, maxy)
+  assertthat::assert_that(length(bbox)==4) # check for 4 elements
+  assertthat::assert_that(assertthat::noNA(bbox)) # check for NAs
+  assertthat::assert_that(is.numeric(as.numeric(bbox))) # check for numeric-ness
+
+  paste('POLYGON((',
+        sprintf('%s %s',bbox[1],bbox[2]), ',', sprintf('%s %s',bbox[3],bbox[2]), ',',
+        sprintf('%s %s',bbox[3],bbox[4]), ',', sprintf('%s %s',bbox[1],bbox[4]), ',',
+        sprintf('%s %s',bbox[1],bbox[2]),
+        '))', sep="")
+}
+
+#' Expand an extent by a certain fraction
+#' @param e an [`extent`] object
+#' @param f [`numeric`] fraction by which to increase the extent (Default: \code{0.1})
+#' @keywords utils, internal
+#' @return Returns the expanded [`extent`] object
+#' @noRd
+extent_expand <- function(e, f = 0.1){
+  assertthat::assert_that(inherits(e,'Extent'))
+  xi <- (e@xmax-e@xmin)*(f/2)
+  yi <- (e@ymax-e@ymin)*(f/2)
+
+  xmin <- e@xmin-xi
+  xmax <- e@xmax+xi
+  ymin <- e@ymin-yi
+  ymax <- e@ymax+yi
+
+  return(raster::extent(c(xmin,xmax,ymin,ymax)))
+}
+
+#' Helper function to rename the geometry column of a provided [`sf`] object
+#'
+#' @param g A [`sf`] object containing some data.
+#' @param name A [`character`] with the new name for the geometry column.
+#' @source https://gis.stackexchange.com/questions/386584/sf-geometry-column-naming-differences-r
+#' @keywords internal, utils
+#' @noRd
+rename_geometry <- function(g, name){
+  assertthat::assert_that(
+    inherits(g, "sf"),
+    is.character(name)
+  )
+  current <- attr(g, "sf_column")
+  if(current == name) return(g)
+  names(g)[names(g) == current] <- name
+  sf::st_geometry(g) <- name
+  g
+}
+
+#' Convert a data.frame or tibble to simple features
+#'
+#' @description This function tries to guess the coordinate field and converts a data.frame
+#' to a simple feature.
+#' @param df A [`data.frame`], [`tibble`] or [`sf`] object.
+#' @param geom_name A [`character`] indicating the name of the geometry column (Default: \code{'geometry'}).
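+#' @examples
+#' \dontrun{
+#' # Minimal sketch (added for illustration): a plain data.frame with
+#' # commonly named coordinate columns is converted to an sf POINT layer.
+#' df <- data.frame(x = c(5, 10), y = c(48, 51), observed = c(1, 0))
+#' pts <- guess_sf(df)
+#' }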
+#' @keywords internal, utils
+#' @noRd
+guess_sf <- function(df, geom_name = 'geometry'){
+  assertthat::assert_that(
+    inherits(df,'data.frame') || inherits(df, 'sf') || inherits(df, 'tibble')
+  )
+  # If sf, return immediately
+  if(inherits(df, 'sf')) return(df)
+  # If there is an attribute, but for some reason the object is not sf, use that one
+  if(!is.null(attr(df, "sf_column"))) {
+    df <- sf::st_as_sf(df)
+    if(attr(df, "sf_column") != geom_name){
+      names(df)[which(names(df) == attr(df, "sf_column"))] <- geom_name
+      sf::st_geometry(df) <- geom_name
+    }
+    return(df)
+  }
+  # Commonly used column names
+  nx <- c("x", "X", "lon", "longitude")
+  ny <- c("y", "Y", "lat", "latitude")
+  ng <- c("geom", "geometry")
+
+  # Check if geom is present
+  if(any( ng %in% names(df) )){
+    attr(df, "sf_column") <- ng[which(ng %in% names(df))]
+    df <- sf::st_as_sf(df)
+  }
+  # Finally check if any commonly used coordinate name exist
+  if(any( nx %in% names(df))){
+    df <- sf::st_as_sf(df, coords = c(nx[which(nx %in% names(df))],
+                                      ny[which(ny %in% names(df))])
+    )
+  }
+  # If at this point df is still not a sf object, then it is unlikely to be converted
+  assertthat::assert_that(inherits(df, 'sf'),
+                          msg = "Point object could not be converted to an sf object.")
+  if(attr(df, "sf_column") != geom_name){
+    names(df)[which(names(df) == attr(df, "sf_column"))] <- geom_name
+    sf::st_geometry(df) <- geom_name
+  }
+  return(df)
+}
+
+#' Kernel density estimation of coordinates
+#'
+#' @description
+#' Takes input point coordinates as an [`sf`] layer and estimates a bivariate Gaussian kernel
+#' density with a specified bandwidth (see also [`MASS::kde2d()`]).
+#' @details
+#' Requires the `MASS` R-package to be installed!
+#' @param points A \code{POINTS} [`sf`] object.
+#' @param background A template [`Raster`] object describing the background.
+#' @param bandwidth A [`numeric`] of the input bandwidth (Default: \code{3}).
+#' @returns A [`RasterLayer`] with the density of point observations.
+#' @keywords utils, internal
+#' @noRd
+st_kde <- function(points, background, bandwidth = 3){
+  assertthat::assert_that(
+    inherits(points, "sf"),
+    is.numeric(bandwidth)
+  )
+  check_package("MASS")
+
+  # Get extent and cellsize
+  cellsize <- raster::res(background)[1]
+  extent_vec <- sf::st_bbox(background)[c(1,3,2,4)]
+
+  n_y <- ceiling((extent_vec[4]-extent_vec[3])/cellsize)
+  n_x <- ceiling((extent_vec[2]-extent_vec[1])/cellsize)
+
+  extent_vec[2] <- extent_vec[1]+(n_x*cellsize)-cellsize
+  extent_vec[4] <- extent_vec[3]+(n_y*cellsize)-cellsize
+
+  # Estimate the kernel density
+  coords <- sf::st_coordinates(points)
+  dens <- MASS::kde2d(coords[,1], coords[,2],
+                      h = bandwidth, n = c(n_x, n_y), lims = extent_vec)
+  out <- raster::raster(dens)
+
+  # Resample output for small point mismatches
+  if(!raster::compareRaster(out, background, stopiffalse = FALSE)){
+    out <- raster::resample(out, background)
+  }
+  out <- raster::mask(out, background)
+  names(out) <- "kde__coordinates"
+  rm(dens, coords)
+  return( out )
+}
+
+#' Polygon to points
+#'
+#' @description
+#' Converts a polygon [`sf`] layer to a point layer by rasterizing it
+#' over a provided Raster layer.
+#' @param poly A \code{POLYGON} or \code{MULTIPOLYGON} [`sf`] object.
+#' @param template A template [`Raster`] object.
+#' @param field_occurrence A [`character`] specifying the occurrence field. It should contain information on the observation type.
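+#' @examples
+#' \dontrun{
+#' # Hedged sketch: `range_poly` (an sf POLYGON layer) and `template_ras`
+#' # (a Raster) are hypothetical inputs, not objects shipped with the package.
+#' pts <- polygon_to_points(poly = range_poly, template = template_ras,
+#'                          field_occurrence = "observed")
+#' }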
+#' @keywords utils, internal
+#' @noRd
+polygon_to_points <- function(poly, template, field_occurrence ) {
+  assertthat::assert_that(
+    inherits(poly, 'sf'),
+    is.Raster(template),
+    is.character(field_occurrence),
+    assertthat::has_name(poly, field_occurrence)
+  )
+
+  # Rasterize the polygon onto the template
+  out <- raster::rasterize(poly, template, field = field_occurrence)
+
+  # Construct new point data
+  co <- raster::xyFromCell(out, cell = which(!is.na(out[])) ) |> as.data.frame()
+  co[[field_occurrence]] <- out[!is.na(out[])]
+  co <- guess_sf(co) # Convert to sf and add coordinates
+  co$x <- sf::st_coordinates(co)[,1]
+  co$y <- sf::st_coordinates(co)[,2]
+  sf::st_crs(co) <- sf::st_crs(template)
+
+  assertthat::assert_that(inherits(co, 'sf'))
+  return(co)
+}
+
+#' Calculate the dimensions from a provided extent object
+#'
+#' @description Calculate the dimensions of an extent
+#' (either an extent object or a four-element vector in the right order), either in projected or spherical space
+#' @param ex Either a [`vector`], an [`extent`] or alternatively a [`Raster`], [`Spatial*`] or [`sf`] object
+#' @param lonlat A [`logical`] indicating whether the extent is in WGS 84 projection (Default: \code{TRUE})
+#' @param output_unit [`character`] determining the units. Allowed is 'm' and 'km' (Default: 'km')
+#' @keywords utils, internal
+#' @noRd
+extent_dimensions <- function(ex, lonlat = TRUE, output_unit = 'km') {
+  assertthat::assert_that(inherits(ex, 'Extent') || inherits(ex, 'numeric') || inherits(ex, 'sf') || inherits(ex, 'Raster') || inherits(ex, 'Spatial'),
+                          is.logical(lonlat),
+                          is.character(output_unit) && output_unit %in% c('m','km'))
+  # Coerce to vector if necessary
+  if(is.Raster(ex)) ex <- raster::extent(ex)
+  if(is.vector(ex)) assertthat::assert_that(length(ex)==4, is.numeric(ex), msg = 'No valid extent object supplied!')
+
+  # Convert to vector
+  ex <- switch(class(ex)[1],
+               Extent = as.vector(ex),
+               Raster = as.vector( raster::extent(ex) ),
+               sf = as.vector( raster::extent(ex) ),
+               numeric = ex
+  )
+  # Rename the vector
+  names(ex) <- c("xmin", "xmax", "ymin", "ymax")
+
+  # Procedures for longlat raster
+  if(lonlat) {
+    # Longitudinal span in degrees and latitudinal span as difference in the sine of latitude
+    width <- as.numeric( abs(diff(ex[1:2])) )
+    height <- as.numeric( abs(diff(sin(ex[3:4] * pi / 180))) )
+    # Scaling to get spherical surface area in km2
+    scaling <- (6371 ^ 2 * pi) / 180
+    surface_area <- width * height * scaling
+
+    # Ratio between GCD height and width
+    # Get ratio between height and width in great circle distance, given an extent vector in lat/long
+    lonLatRatio <- function(extent) {
+      # lower left point
+      p1 <- matrix(extent[c(1, 3)], nrow = 1)
+      # upper left and lower right points
+      p2 <- rbind(extent[c(1, 4)], extent[c(2, 3)])
+      # get ratio between distances
+      dists <- raster::pointDistance(p1, p2, lonlat = TRUE)
+      ratio <- dists[1] / dists[2]
+      return(ratio)
+    }
+    ratio <- lonLatRatio( as.vector(ex) )
+    # calculate equivalent dimensions in km
+    w <- sqrt(surface_area / ratio)
+    dim <- c(w, w * ratio)
+    if(output_unit == 'm') dim <- dim * 1000
+  } else {
+    # else assume a rectangle in m and convert to km
+    dim <- abs(diff(ex)[c(1, 3)])
+    if(output_unit=='km'){
+      dim <- dim * 0.1 ^ 3
+    }
+  }
+  return(dim)
+}
+
+#' Align a [`Raster-class`] object to another by harmonizing geometry and extent.
+#'
+#' If the data is not in the same projection as the template, the alignment
+#' will be computed by reprojection only.
+#' If the data already has the same
+#' projection, the data set will be cropped and aggregated prior to resampling
+#' in order to reduce computation time.
+#'
+#' @param data [`Raster-class`] object to be resampled.
+#' @param template [`Raster-class`] or [`Spatial-class`] object from which geometry can be extracted.
+#' @param method Method for resampling (Options: \code{"ngb"} or \code{"bilinear"}).
+#' @param func Function for resampling (Default: [mean]).
+#' @param cl [`logical`] value indicating whether multicore computation should be used (Default: \code{TRUE}).
+#' @keywords utils
+#' @details
+#' Nearest neighbour resampling (\code{"ngb"}) is recommended for discrete data and bilinear
+#' resampling for continuous data.
+#' @return New [`Raster`] object aligned to the supplied template layer
+#' @examples
+#' \dontrun{
+#' # Align one raster to another
+#' ras1 <- alignRasters( ras1, ras2, method = "ngb", cl = FALSE)
+#' }
+#' @export
+alignRasters <- function(data, template, method = "bilinear", func = mean, cl = TRUE){
+  # Security checks
+  assertthat::assert_that(
+    inherits(data,'Raster'), inherits(template, c("Raster", "Spatial", "sf")),
+    is.character(method),
+    is.logical(cl)
+  )
+  method <- match.arg(method, c("bilinear", "ngb"), several.ok = FALSE)
+
+  # Start cluster if necessary
+  if(cl) raster::beginCluster(parallel::detectCores()-1)
+  if(raster::projection(data) == raster::projection(template)){
+    # Crop raster to template
+    data <- raster::crop(data, template, snap = "out")
+    if(inherits(template, "RasterLayer")){
+      # Aggregate to minimal scale
+      if(data@ncols / template@ncols >= 2){
+        factor <- floor(data@ncols/template@ncols)
+        data <- raster::aggregate(data, fact = factor, fun = func,
+                                  expand = TRUE)
+      }
+      # Resample with target method
+      data <- raster::resample(data, template, method = method)
+    }
+  } else {
+    # Project Raster layer
+    data <- raster::projectRaster(data, template, method = method)
+  }
+  # Stop cluster
+  if(cl) raster::endCluster()
+  return(data)
+}
+
+#' @title Create an empty \code{RasterLayer} based on a template
+#'
+#' @description
+#' This function creates an empty copy of a provided \code{RasterLayer} object. It
+#' is primarily used in the package to create the outputs for the predictions.
+#' @param x a \code{Raster*} object to be used as template.
+#' @param ... other arguments that can be passed to \code{\link{raster}}
+#' @return an empty raster, i.e. all cells are \code{NA}.
+#' @import raster
+#' @keywords raster, utils
+#' @examples
+#' require(raster)
+#' r <- raster(matrix(1:100, 5, 20))
+#' emptyraster(r)
+#' @export
+emptyraster <- function(x, ...) { # add name, filename,
+  assertthat::assert_that(is.Raster(x))
+  raster::raster(nrows = nrow(x), ncols = ncol(x),
+                 crs = x@crs,
+                 ext = raster::extent(x), ...)
+}
+
+#' Function to extract nearest neighbour predictor values of provided points
+#'
+#' @description
+#' This function performs nearest neighbour matching between biodiversity observations and independent
+#' predictors, and operates directly on provided data.frames.
+#' **Note that despite being parallelized this function can be rather slow for large volumes of data!**
+#' @param coords A [`matrix`], [`data.frame`] or [`sf`] object.
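+#' @examples
+#' \dontrun{
+#' # Illustrative only: `occ_xy` is a hypothetical data.frame of point
+#' # coordinates and `env_df` a data.frame of predictors carrying "x"/"y" columns.
+#' vals <- get_ngbvalue(coords = occ_xy, env = env_df, longlat = TRUE)
+#' }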
+#' @param env A [`data.frame`] object with the predictors.
+#' @param longlat A [`logical`] variable indicating whether the projection is long-lat.
+#' @param field_space A [`vector`] highlighting the columns from which coordinates are to be extracted (Default: \code{c('x','y')}).
+#' @param cheap A [`logical`] variable indicating whether a cheaper, approximate distance measure should be used, which can speed up computation for large datasets.
+#' @param ... other options.
+#' @return A [`data.frame`] with the extracted covariate data from each provided data point.
+#' @details Nearest neighbour matching is done via the [geodist] R-package (\code{geodist::geodist}).
+#' @note If multiple values are of equal distance during the nearest neighbour check, then the result is by default averaged.
+#' @references
+#' * Mark Padgham and Michael D. Sumner (2021). geodist: Fast, Dependency-Free Geodesic Distance Calculations. R package version 0.0.7. https://CRAN.R-project.org/package=geodist
+#' @keywords utils
+#' @export
+get_ngbvalue <- function(coords, env, longlat = TRUE, field_space = c('x','y'), cheap = FALSE, ...) {
+  # Security checks
+  assertthat::assert_that(
+    is.data.frame(coords) || inherits(coords,'sf') || inherits(coords,'matrix'),
+    assertthat::is.flag(longlat),
+    is.data.frame(env), assertthat::has_name(env, field_space),
+    length(field_space) == 2, is.vector(field_space)
+  )
+  # Convert to matrices
+  coords <- as.matrix(coords)
+  coords_env <- as.matrix(env[,field_space])
+
+  # Process in parallel if the inputs are large (more than 10,000 points or 100,000 predictor rows)
+  if(is.null( getOption('ibis.runparallel') ) || getOption('ibis.runparallel') == TRUE ){
+    process_in_parallel <- nrow(coords) > 10000 || nrow(coords_env) > 100000
+  } else {
+    process_in_parallel <- FALSE
+  }
+
+  # Pairwise distance function
+  # FIXME: Potentially evaluate whether sf::st_distance is of similar speed for very large matrices.
+  # Thus making this dependency suggested and optional
+  # disfun <- geosphere::distHaversine
+  if(longlat){
+    disfun <- function(x1, x2, m = ifelse(cheap, 'cheap', 'haversine')) geodist::geodist(x1, x2, measure = m)
+  } else {
+    disfun <- function(x1, x2) raster::pointDistance(x1, x2, lonlat = longlat)
+  }
+
+  if(process_in_parallel){
+
+    check_package("doParallel")
+
+    # Split coordinates into batches of roughly 10 points each
+    coords_split <- ggplot2::cut_width(1:nrow(coords), 10, boundary = 0)
+
+    cl <- doParallel::registerDoParallel(cores = getOption('ibis.nthread'))
+    out <- foreach::foreach(z = iterators::iter(unique(coords_split)),
+                            .combine = 'rbind',
+                            .inorder = FALSE,
+                            .multicombine = TRUE,
+                            .errorhandling = 'stop',
+                            .export = c('coords','coords_env','coords_split', 'disfun'),
+                            .packages = c('geodist')
+    ) %dopar% {
+      o <-
+        apply(coords[which(coords_split==z),], 1, function(xy1, xy2){
+          dists <- disfun(xy2, xy1)
+          # In a few cases there can be multiple values in equal distance
+          d <- which(dists==min(dists))
+          if(length(d)>=2){
+            # Average them
+            o <- as.data.frame(
+              t(
+                apply(env[d, ,drop = FALSE], 2, function(x) mean(x, na.rm = TRUE) )
+              )
+            )
+            return(o)
+          } else return( env[d, ,drop = FALSE] )
+        }, xy2 = coords_env)
+      return(do.call(rbind, o))
+    }
+    doParallel::stopImplicitCluster()
+    rm(cl)
+  } else {
+    env_sub <- apply(coords, 1, function(xy1, xy2) {
+      dists <- disfun(xy2, xy1)
+      # In a few cases there can be multiple values in equal distance
+      d <- which(dists==min(dists))
+      if(length(d)>=2){
+        # Average them
+        o <- as.data.frame(
+          t(
+            apply(env[d, ,drop = FALSE], 2, function(x) mean(x, na.rm = TRUE) )
+          )
+        )
+        return(o)
+      } else return( env[d, ,drop = FALSE] )
+    }, xy2 = coords_env)
+    # Combine
+    out <- do.call(rbind, env_sub)
+    out[,field_space] <- as.data.frame(coords) # Ensure that coordinates are back in
+  }
+  return(out)
+}
+
+#' Function to extract directly the raster value of provided points
+#'
+#' @description
+#' This function simply extracts the values from a provided [`RasterLayer`],
+#' [`RasterStack`] or [`RasterBrick`] object. For points where \code{NA} values were extracted,
+#' a small buffer is applied to try to obtain the remaining values.
+#' @details
+#' It is essentially a wrapper for [`raster::extract`].
+#' @param coords A [`Spatial`], [`data.frame`], [`matrix`] or [`sf`] object.
+#' @param env A [`Raster`] object with the provided predictors.
+#' @param rm.na [`logical`] parameter which - if set - removes all rows with a missing data point (\code{NA}) from the result.
+#' @return A [`data.frame`] with the extracted covariate data from each provided data point.
+#' @keywords utils
+#' @examples
+#' \dontrun{
+#' # Extract values
+#' vals <- get_rastervalue(coords, env)
+#' }
+#' @export
+get_rastervalue <- function(coords, env, rm.na = FALSE){
+  assertthat::assert_that(
+    inherits(coords,"sf") || inherits(coords, "Spatial") || (is.data.frame(coords) || is.matrix(coords)),
+    is.Raster(env),
+    is.logical(rm.na)
+  )
+
+  # Try an extraction
+  ex <- try({raster::extract(x = env,
+                             y = coords,
+                             method = "simple",
+                             df = TRUE)}, silent = FALSE)
+  if(inherits(ex, "try-error")) stop(paste("Raster extraction failed: ", ex))
+  # Find those that have NA in there
+  check_again <- apply(ex, 1, function(x) anyNA(x))
+  if(any(check_again)){
+    # Re-extract but with a small buffer
+    coords_sub <- coords[which(check_again),]
+    ex_sub <- try({raster::extract(x = env,
+                                   y = coords_sub,
+                                   method = "simple",
+                                   small = TRUE,
+                                   df = TRUE)}, silent = FALSE)
+    if(inherits(ex_sub, "try-error")) stop(paste("Raster extraction failed: ", ex_sub))
+    ex[which(check_again),] <- ex_sub
+  }
+  # Add coordinate fields to the predictors as these might be needed later
+  if(!any(assertthat::has_name(ex, c("x", "y")))){
+    if(inherits(coords,"sf")) coords <- sf::st_coordinates(coords)
+    ex[["x"]] <- as.numeric(coords[,1]); ex[["y"]] <- as.numeric(coords[,2])
+  }
+  # Convert to factor if any
+  if(any(is.factor(env))){
+    ex[,names(env)[which(is.factor(env))]] <- factor(ex[,names(env)[which(is.factor(env))]])
+  }
+
+  if(rm.na){
+    ex <- subset(ex, stats::complete.cases(ex))
+  }
+  assertthat::assert_that(is.data.frame(ex),
+                          nrow(ex)>0,
+                          msg = "Something went wrong with the extraction or all points had missing data.")
+  return(ex)
+}
+
+#' Hinge transformation of a given predictor
+#'
+#' @description
+#' This function transforms a provided predictor variable with a hinge transformation,
+#' i.e. a new range of values where any values lower than a certain knot are set to \code{0},
+#' while the remainder is left at the original values.
+#' @param v A [`Raster`] object.
+#' @param n A [`character`] describing the name of the variable. Used as basis for new names.
+#' @param nknots The number of knots to be used for the transformation (Default: \code{4}).
+#' @param cutoffs A [`numeric`] vector of optional cutoffs to be used instead (Default: \code{NULL}).
+#' @keywords utils, internal
+#' @concept Concept taken from the [maxnet] package.
+#' @returns A hinge transformed [`data.frame`].
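+#' @examples
+#' \dontrun{
+#' # Hypothetical sketch: derive hinge features from a single covariate.
+#' # `bio1` stands in for any RasterLayer; knots are spread over its range.
+#' hinges <- makeHinge(bio1, n = "bio1", nknots = 4)
+#' }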
+#' @noRd
+makeHinge <- function(v, n, nknots = 4, cutoffs = NULL){
+  assertthat::assert_that(is.Raster(v),
+                          is.character(n),
+                          is.numeric(nknots),
+                          is.numeric(cutoffs) || is.null(cutoffs))
+  # Get stats
+  v.min <- raster::cellStats(v, min)
+  v.max <- raster::cellStats(v, max)
+  if(is.null(cutoffs)){
+    k <- seq(v.min, v.max, length = nknots)
+  } else {
+    k <- cutoffs
+  }
+  if(length(k)<=1) return(NULL)
+
+  # Hinge up to max
+  lh <- outer(v[], utils::head(k, -1), function(w, h) hingeval(w,h, v.max))
+  # Hinge starting from min
+  rh <- outer(v[], k[-1], function(w, h) hingeval(w, v.min, h))
+  colnames(lh) <- paste0("hinge__",n,'__', round( utils::head(k, -1), 2),'_', round(v.max, 2))
+  colnames(rh) <- paste0("hinge__",n,'__', round( v.min, 2),'_', round(k[-1], 2))
+  o <- as.data.frame(
+    cbind(lh, rh)
+  )
+  # Kick out first (min) and last (max) col as those are perfectly correlated
+  o <- o[,-c(1,ncol(o))]
+  attr(o, "deriv.hinge") <- k
+  return(o)
+}
+
+#' Threshold transformation of a given predictor
+#'
+#' @description
+#' This function transforms a provided predictor variable with a threshold transformation,
+#' i.e. a new range of values where any values lower than a certain knot are set to 0,
+#' while the remainder is set to 1.
+#' @param v A [`Raster`] object.
+#' @param n A [`character`] describing the name of the variable. Used as basis for new names.
+#' @param nknots The number of knots to be used for the transformation (Default: \code{4}).
+#' @param cutoffs A [`numeric`] vector of optional cutoffs to be used instead (Default: \code{NULL}).
+#' @keywords utils, internal
+#' @concept Concept taken from the [maxnet] package.
+#' @returns A threshold transformed [`data.frame`].
+#' @noRd
+makeThresh <- function(v, n, nknots = 4, cutoffs = NULL){
+  assertthat::assert_that(is.Raster(v),
+                          is.character(n),
+                          is.numeric(nknots),
+                          is.numeric(cutoffs) || is.null(cutoffs))
+  if(is.null(cutoffs)){
+    # Get min max
+    v.min <- raster::cellStats(v,min)
+    v.max <- raster::cellStats(v,max)
+    # Take only the interior knots (note the parentheses around nknots + 1,
+    # otherwise R's operator precedence drops the first interior knot)
+    k <- seq(v.min, v.max, length = nknots + 2)[2:(nknots + 1)]
+  } else {
+    k <- cutoffs
+  }
+  if(length(k)<=1) return(NULL)
+  f <- outer(v[], k, function(w, t) ifelse(w >= t, 1, 0))
+  colnames(f) <- paste0("thresh__", n, "__", round(k, 2))
+  f <- as.data.frame(f)
+  attr(f, "deriv.thresh") <- k
+  return(f)
+}
+
+#' Binned transformation of a given predictor
+#'
+#' @description
+#' This function takes predictor values and 'bins' them into categories based on a
+#' percentile split.
+#' @param v A [`Raster`] object.
+#' @param n A [`character`] describing the name of the variable. Used as basis for new names.
+#' @param nknots The number of knots to be used for the transformation (Default: \code{4}).
+#' @param cutoffs A [`numeric`] vector of optional cutoffs to be used instead (Default: \code{NULL}).
+#' @keywords utils, internal
+#' @returns A binned [`RasterStack`] with one layer per bin.
+#' @noRd
+makeBin <- function(v, n, nknots = 4, cutoffs = NULL){
+  assertthat::assert_that(is.Raster(v),
+                          is.character(n),
+                          is.numeric(nknots),
+                          is.numeric(cutoffs) || is.null(cutoffs))
+  if(is.null(cutoffs)){
+    # Calculate cuts
+    cu <- raster::quantile(v, probs = seq(0, 1, by = 1/nknots) )
+  } else { cu <- cutoffs}
+
+  if(anyDuplicated(cu)){ # If duplicated quantiles (e.g. 0, 0, 0.2, ...), sample from a larger number
+    cu <- raster::quantile(v, probs = seq(0, 1, by = 1/(nknots*2)) )
+    cu <- cu[-which(duplicated(cu))] # Remove duplicated cuts
+    if(length(cu)<=2) return( NULL )
+    if(length(cu) > nknots){
+      cu <- cu[(length(cu)-(nknots)):length(cu)]
+    }
+  }
+  # Make cuts and explode
+  out <- explode_factorized_raster(
+    raster::ratify(
+      raster::cut(v, cu)
+    )
+  )
+  # Format threshold names
+  cu.brk <- as.character(cut(cu[-1], cu))
+  cu.brk <- gsub(",","_",cu.brk)
+  cu.brk <- gsub("\\(|\\]", "", cu.brk)
+  # names(out) <- paste0("bin__",n, "__", gsub(x = names(cu)[-1], pattern = "\\D", replacement = ""),"__", cu.brk )
+  names(out) <- paste0("bin__",n, "__", cu.brk )
+  for(i in 1:raster::nlayers(out)){
+    attr(out[[i]], "deriv.bin") <- cu[i:(i+1)]
+  }
+  return(out)
+}
+
+#' Create new raster stack from a given data.frame
+#'
+#' @param post A data.frame
+#' @param background A [`Raster-class`] object for the background raster
+#' @keywords internal, utils
+#' @return A [`Raster-class`] object with a number of layers equal to \code{ncol(post)}
+#' @noRd
+fill_rasters <- function(post, background){
+  assertthat::assert_that(
+    is.data.frame(post), ncol(post)>=1,
+    inherits(background,'Raster'),
+    nrow(post) == raster::ncell(background)
+  )
+  # Make names to be sure
+  names(post) <- base::make.names(names(post))
+
+  # If only one raster
+  if(ncol(post)==1){
+    out <- emptyraster(background)
+    out[] <- post[,1]
+  } else {
+    # Loop through each column
+    out <- raster::stack()
+    for(co in 1:ncol(post)){
+      o <- emptyraster(background)
+      o[] <- post[,co] # Assign values
+      # Add to stack
+      out <- raster::addLayer(out, o)
+      rm(o)
+    }
+  }
+  # Assign names
+  names(out) <- names(post)
+
+  # Check that derivate attributes, if existing, are passed on
+  if(length( grep("deriv", names(attributes(post)) ))>0){
+    attr(out, grep("deriv", names(attributes(post)),value = TRUE) ) <- attr(post, grep("deriv", names(attributes(post)),value = TRUE) )
+  }
+
+  # Final check
+  assertthat::assert_that(
+    inherits(out,'Raster'),
+    raster::nlayers(out) == ncol(post)
+  )
+  return(out)
+}
+
+#' Create a polynomial transformation from coordinates
+#'
+#' @description This function transforms the coordinates of a supplied object through a polynomial transform.
+#' By default it applies weights and a QR decomposition for numerical stability.
+#' @param coords A [`data.frame`], [`matrix`] or [`sf`] object with coordinates (2 columns named x-y).
+#' @param degree The number of degrees used for the polynomial transformation (Default: \code{2}).
+#' @param weights Set by default to the inverse of the number of coordinates.
+#' @returns A data.frame with transformed coordinates.
+#' @keywords utils
+#' @keywords internal
+#' @references Dray, S., Pélissier, R., Couteron, P., Fortin, M.J., Legendre, P., Peres-Neto, P.R., Bellier, E., Bivand, R., Blanchet, F.G., De Caceres, M., Dufour, A.B., Heegaard, E., Jombart, T., Munoz, F., Oksanen, J., Thioulouse, J., Wagner, H.H. (2012). Community ecology in the age of multivariate multiscale spatial analysis. Ecological Monographs 82, 257–275.
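+#' @examples
+#' \dontrun{
+#' # A minimal sketch with random coordinates:
+#' xy <- data.frame(x = runif(50), y = runif(50))
+#' st <- polynominal_transform(xy, degree = 2)
+#' head(st) # spatialtrend_* columns
+#' }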
+#' @noRd
+polynominal_transform <- function(coords, degree = 2, weights = rep(1/nrow(coords), nrow(coords)) ){
+  assertthat::assert_that(
+    inherits(coords, 'data.frame') || inherits(coords, 'matrix') || inherits(coords, 'sf'),
+    is.numeric(degree),
+    !is.null(weights) && length(weights) == nrow(coords)
+  )
+  # If spatial get coordinates
+  if(inherits(coords, 'sf')){
+    coords <- sf::st_coordinates(coords)
+  }
+  # Polynomial transform
+  a0 <- stats::poly(x = as.matrix( coords ), degree = degree, simple = TRUE)
+  # Standardize colnames
+  poly.names <- colnames(a0) # Column names for later
+  poly.names <- paste0("spatialtrend_", gsub("\\.","_",poly.names) )
+
+  # Standardize the weights
+  weights <- weights/sum(weights)
+  a0 <- cbind(weights, a0) # Add to the polynomial transform
+  a0 <- base::qr.Q(base::qr(a0)) # QR decomposition for better numerical stability
+  a0 <- as.data.frame(a0[, -1])/sqrt(weights) # Weighting
+
+  # Rename
+  colnames(a0) <- poly.names
+  return(a0)
+}
+
+#' Clean up raster layer from disk
+#'
+#' Completely deletes a temporarily created raster file from the disk.
+#' @param x A [`Raster`] object.
+#' @param verbose Print progress (Default: \code{FALSE})
+#' @keywords utils
+#' @noRd
+clean_rasterfile <- function(x, verbose = FALSE) {
+  stopifnot(grepl("Raster", class(x)))
+  if (!raster::fromDisk(x)) return(NULL)
+  sink(tempfile())
+  tdir <- raster::rasterOptions()[["tmpdir"]]
+  sink(NULL)
+  if (inherits(x, "RasterLayer")) {
+    files <- basename(x@file@name)
+  }
+  if (inherits(x, "RasterStack")) {
+    files <- do.call(c, lapply(methods::slot(x, "layers"),
+                               function(x) x@file@name))
+  }
+  files <- files[file.exists(files)]
+  if (length(files) == 0) return(NULL)
+  lapply(files, function(f) {
+    if (raster::fromDisk(x) && file.exists(f)) {
+      file.remove(f, sub("grd", "gri", f))
+    }
+    if (verbose) {
+      print(paste("Deleted: ", f))
+      print(paste("Deleted: ", sub("grd", "gri", f)))
+    }
+  })
+  parent.var.name <- deparse(substitute(x))
+  rm(list = parent.var.name, envir = sys.frame(-1))
+}
+
+#' Split raster factor levels to stack
+#'
+#' @description Takes a single raster that is a [`factor`] and creates
+#' a new [`RasterStack`] that contains the individual levels.
+#' @param ras A [`RasterLayer`] object that is a [`factor`]. Alternatively a [`RasterStack`] object
+#' can be supplied, in which case only factor variables are 'exploded'.
+#' @param name An optional [`character`] name for the [raster].
+#' @param ... Other parameters (not used).
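+#' @examples
+#' \dontrun{
+#' # A minimal sketch; `lc` is a hypothetical categorical RasterLayer.
+#' lvl <- explode_factorized_raster(raster::ratify(lc))
+#' names(lvl) # one binary indicator layer per factor level
+#' }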
+#' @returns A [`RasterStack`] object
+#' @keywords utils, internal
+#' @noRd
+explode_factorized_raster <- function(ras, name = NULL, ...){
+  assertthat::assert_that(is.Raster(ras),
+                          is.null(name) || is.character(name))
+
+  # Simply return the input if there are no factors
+  if(!any(is.factor(ras))) return(ras)
+
+  # If input is a RasterLayer
+  if(inherits(ras, 'RasterLayer')){
+    # Create output template
+    temp <- emptyraster(ras)
+    if(is.null(name)) name <- names(ras)
+
+    # Extract data
+    o <- data.frame(val = raster::values(ras))
+    names(o) <- name
+    o[[name]] <- factor(o[[name]])
+
+    # Convert the factor levels into binary indicator columns
+    f <- as.data.frame(
+      outer(o[[name]], levels(o[[name]]), function(w, f) ifelse(w == f, 1, 0))
+    )
+
+    # Fill template rasters
+    out <- fill_rasters(f,temp)
+    names(out) <- paste(name, levels(o[[name]]), sep = ".")
+
+  } else if(inherits(ras, 'RasterStack') || inherits(ras, 'RasterBrick')){
+    # Alternatively if input is stack
+    fcts <- is.factor(ras)
+
+    # Get non-factor variables
+    out <- ras[[which(!fcts)]]
+    for(k in which(fcts)){
+
+      sub <- ras[[k]]
+
+      temp <- emptyraster(sub)
+      if(is.null(name)) new_name <- names(sub) else new_name <- name
+
+      # Extract data
+      o <- data.frame(val = raster::values(sub))
+      names(o) <- new_name
+      o[[new_name]] <- factor(o[[new_name]])
+
+      # Convert the factor levels into binary indicator columns
+      f <- as.data.frame(
+        outer(o[[new_name]], levels(o[[new_name]]), function(w, f) ifelse(w == f, 1, 0))
+      )
+
+      # Fill template rasters
+      new <- fill_rasters(f, temp)
+      names(new) <- paste(new_name, levels(o[[new_name]]), sep = ".")
+      out <- raster::addLayer(out, new)
+    }
+  }
+  return(out) # Return the result
+}
+
+#' Functionality for geographic and environmental thinning
+#'
+#' @description
+#' For most species distribution modelling approaches it is assumed that occurrence records are unbiased, which
+#' is rarely the case. While model-based control can alleviate some of the effects of sampling bias, it can often be
+#' desirable to account for some sampling biases through spatial thinning (Aiello‐Lammens et al. 2015). This
+#' is an approach based on the assumption that oversampled grid cells contribute little more than bias, rather than
+#' strengthening any environmental responses.
+#' This function provides some methods to apply spatial thinning approaches. Note that this effectively removes
+#' data prior to any estimation and its use should be considered with care (see also Steen et al. 2021).
+#'
+#' @details
+#' Currently implemented thinning methods:
+#'
+#' * \code{"random"}: Samples at random up to \code{"minpoints"} observations across all occupied grid cells.
+#' Does not account for any spatial or environmental distance between observations.
+#' * \code{"bias"}: This option explicitly removes only those points that are considered biased (parameter \code{"env"}).
+#' Points are preferentially thinned from grid cells which are among the 25% most biased (larger values assumed to indicate greater bias)
+#' and have high point density. Thins the observations down to \code{"minpoints"}.
+#' * \code{"zones"}: Ensures that at most \code{"minpoints"} observations fall into each occupied zone.
+#' Careful: If the zones are relatively wide this can remove quite a few observations.
+#' * \code{"environmental"}: This approach creates an observation-wide clustering (k-means) under the assumption
+#' that the full environmental niche has been comprehensively sampled and is covered by the provided covariates \code{env}.
+#' We then retain a number of observations equal to \code{"minpoints"} for each cluster.
+#' * \code{"spatial"}: Calculates the spatial distance between all observations. Then points are removed
+#' iteratively until the minimum distance between points is crossed. The \code{"mindistance"} parameter has to
+#' be set for this method to work. Note that this method is not yet implemented.
+#'
+#' @param df A [`sf`] or [`data.frame`] object with observed occurrence points. All methods treat presence-only
+#' and presence-absence occurrence points equally.
+#' @param background A [`RasterLayer`] object with the background of the study region. Used for assessing point density.
+#' @param env A [`Raster`] object with environmental covariates. Needed when method is set to \code{"environmental"}
+#' or \code{"bias"} (Default: \code{NULL}).
+#' @param method A [`character`] of the method to be applied (Default: \code{"random"}).
+#' @param minpoints A [`numeric`] giving the minimum number of data points to take (Default: \code{10}).
+#' @param mindistance A [`numeric`] for the minimum distance of neighbouring observations (Default: \code{NULL}).
+#' @param zones A [`RasterLayer`] to be supplied when \code{method = "zones"} is chosen (Default: \code{NULL}).
+#' @param verbose [`logical`] of whether to print some statistics about the thinning outcome (Default: \code{TRUE}).
+#' @examples
+#' \dontrun{
+#' # Thin a certain number of observations
+#' # At random
+#' thin_points <- thin_observations(points, background, method = "random")
+#' # using a bias layer
+#' thin_points <- thin_observations(points, background, method = "bias", env = bias)
+#' }
+#' @references
+#' * Aiello‐Lammens, M. E., Boria, R. A., Radosavljevic, A., Vilela, B., & Anderson, R. P. (2015). spThin: an R package for spatial thinning of species occurrence records for use in ecological niche models. Ecography, 38(5), 541-545.
+#' * Steen, V. A., Tingley, M. W., Paton, P. W., & Elphick, C. S. (2021). Spatial thinning and class balancing: Key choices lead to variation in the performance of species distribution models with citizen science data. Methods in Ecology and Evolution, 12(2), 216-226.
+#' @keywords utils
+#' @export
+thin_observations <- function(df, background, env = NULL, method = "random", minpoints = 10, mindistance = NULL,
+                              zones = NULL, verbose = TRUE){
+  assertthat::assert_that(
+    inherits(df, "sf") || inherits(df, "data.frame"),
+    nrow(df) > 0,
+    is.Raster(background),
+    is.Raster(env) || is.null(env),
+    is.character(method),
+    is.numeric(minpoints) && minpoints > 0,
+    is.null(mindistance) || is.numeric(mindistance),
+    is.Raster(zones) || is.null(zones)
+  )
+  check_package("dplyr")
+  # Match method
+  method <- match.arg(method, choices = c("random", "spatial", "bias", "environmental", "zones"), several.ok = FALSE)
+
+  # Label background with id
+  bg <- background
+  bg[] <- 1:raster::ncell(bg)
+  bg <- raster::mask(bg, background)
+
+  # Check that environment has the same projection
+  if(is.Raster(env) && method == "environmental"){
+    assertthat::assert_that( raster::compareRaster(bg, env) )
+  }
+  # Check that CRS is the same as background
+  if(sf::st_crs(df) != sf::st_crs(bg)){
+    message("Projection is different from input data. Reprojecting!")
+    df <- df |> sf::st_transform(crs = sf::st_crs(bg))
+  }
+
+  # Take coordinates of supplied data and rasterize
+  coords <- sf::st_coordinates( df )
+  ras <- raster::rasterize(coords, bg) # Get the number of observations per grid cell
+
+  # Bounds for thinning
+  totake <- c(lower = minpoints, upper = max(raster::cellStats(ras,"min"), minpoints))
+
+  # -- #
+  if(method == "random"){
+    # For each unique grid cell id, get the minimum value up to a maximum of the points
+    # by sampling at random from the occupied grid cells
+
+    # Output vector
+    sel <- vector()
+
+    ex <- data.frame(id = 1:nrow(coords),
+                     cid = raster::extract(bg, coords)
+    )
+    ex <- subset(ex, stats::complete.cases(ex)) # Don't need missing points
+
+    ex <- dplyr::left_join(ex,
+                           ex |> dplyr::group_by(cid) |> dplyr::summarise(N = dplyr::n()),
+                           by = "cid"
+    )
+    # Points to take
+    sel <- append(sel, ex$id[which(ex$N <= min(totake))] )
+
+    # For those where we have more than the minimum, take at random the upper limits of observations
+    ex$oversampled <- ifelse(ex$N >= totake["upper"], 1, 0)
+    if(dplyr::n_distinct(ex$oversampled) > 1){
+      # If there are any oversampled cells,
+      # sample at random up to the maximum amount. Got tired of doing this outside tidyverse
+      o <- ex |> dplyr::filter(oversampled == 1) |>
+        dplyr::group_by(cid) |>
+        dplyr::slice_sample(n = min(totake))
+      if(nrow(o)>0) sel <- append(sel, o$id)
+      rm(o)
+    }
+    if(anyDuplicated(sel)) sel <- unique(sel)
+
+    try({rm(ex)},silent = TRUE)
+  } else if(method == "bias"){
+    assertthat::assert_that(is.Raster(env),
+                            raster::nlayers(env)==1,
+                            msg = "Bias requires a single Raster layer provided to env.")
+
+    sel <- vector()
+
+    # Convert bias layer into a percentile cutoff (largest values = most biased)
+    bias_perc <- raster::quantile(env, c(.75))
+
+    # Now extract
+    ex <- data.frame(id = 1:nrow(coords),
+                     cid = raster::extract(bg, coords),
+                     pres = raster::extract(ras, coords),
+                     bias = raster::extract(env, coords)
+    )
+    ex <- subset(ex, stats::complete.cases(ex)) # Don't need missing points
+    # Now identify those to be thinned
+    ex$tothin <- ifelse((ex$bias >= bias_perc) & (ex$pres > totake[1]), 1, 0)
+    assertthat::assert_that(dplyr::n_distinct(ex$tothin) == 2)
+    # Now thin those points that are to be thinned
+    ss <- ex |> dplyr::filter(tothin == 1) |>
+      dplyr::group_by(cid) |>
+      dplyr::slice_sample(n = totake[1], weight_by = bias, replace = TRUE) |>
+      dplyr::distinct()
+
+    # Points to take
+    sel <- append(sel, ex$id[ex$tothin==0] )
+    sel <- append(sel, ss$id )
+
+    try({rm(ss, ex)},silent = TRUE)
+  } else if(method == "zones"){
+    # Thinning by zones
+    assertthat::assert_that(is.Raster(zones),
+                            is.factor(zones))
+
+    if(!raster::compareRaster(bg, zones, stopiffalse = FALSE)){
+      zones <- alignRasters(zones, bg, method = "ngb", func = raster::modal, cl = FALSE)
+    }
+
+    # Output vector
+    sel <- vector()
+
+    ex <- data.frame(id = 1:nrow(coords),
+                     cid = raster::extract(bg, coords),
+                     zones = raster::extract(zones, coords)
+    )
+    # Now for each zone, take the minimum amount at random
+    ss <- ex |>
+      dplyr::group_by(zones) |>
+      dplyr::slice_sample(n = max(totake[1]), replace = TRUE) |>
+      dplyr::distinct()
+
+    # Take the zone data points
+    sel <- append(sel, ss$id )
+    try({rm(ss, ex)},silent = TRUE)
+
+  } else if(method == "environmental"){
+    # Environmental clustering
+
+    if(!raster::compareRaster(bg, env, stopiffalse = FALSE)){
+      env <- alignRasters(env, bg, method = "ngb", func = raster::modal, cl = FALSE)
+    }
+    # If there are any factors, explode
+    if(any(is.factor(env))){
+      env <-
explode_factorized_raster(env) + } + + # Output vector + sel <- vector() + + # Get a matrix of all environmental data, also with coordinates + # However first normalize all data + stk <- raster::as.data.frame( + predictor_transform(env, option = "norm"), + xy = TRUE) + + stk$cid <- 1:nrow(stk) + stk <- subset(stk, stats::complete.cases(stk)) + + # Cluster + E <- stats::kmeans(x = subset(stk, select = -cid), + centers = ncol(stk)-1, iter.max = 10) + + stk$cluster <- E$cluster + + # Now fill an empty raster and re-xtract + new <- emptyraster(env) + new[stk$cid] <- stk$cluster + + # Now re-extract and sampling points + ex <- data.frame(id = 1:nrow(coords), + cid = raster::extract(bg, coords), + zones = raster::extract(new, coords) + ) + + # Now for each zone, take the minimum amount at random + ss <- ex |> + dplyr::group_by(zones) |> + dplyr::slice_sample(n = max(totake[1]), replace = TRUE) |> + dplyr::distinct() + + # Take the zone data points + sel <- append(sel, ss$id ) + + try({rm(new, stk, ss, ex, E)},silent = TRUE) + } else if(method == "spatial"){ + # Spatial thinning + stop("Not yet implemented!") + + } + + # Return subsampled coordinates + out <- df[sel,] + if(nrow(out)==0) { + message("Thinning failed for some reason") + return(df) + } else { + if(verbose){ + message(paste0( + "(", method, ")", " thinning completed! \n", + "Original number of records: ", nrow(df), "\n", + "Number of retained records: ", nrow(out)) + ) + } + return(out) + } +} diff --git a/R/utils-stan.R b/R/utils-stan.R index 0c530b9f..8edc09af 100644 --- a/R/utils-stan.R +++ b/R/utils-stan.R @@ -1,512 +1,512 @@ -#' Built formula for STAN model -#' -#' @description -#' This function built a formula for a `engine_stan()` model. -#' @param model A [`list()`] object containing the prepared model data for a given biodiversity dataset. -#' @param x A [`BiodiversityDistribution`] object. -#' @param id The id for the species formula. -#' @param settings A [`Settings`] object. -#' @author Martin Jung -#' @note Function is not meant to be run outside the train() call. -#' @keywords internal -#' @noRd -built_formula_stan <- function(model, id, x, settings){ - assertthat::assert_that( - is.list(model), - length(model) > 0, - assertthat::has_name(model, "predictors"), - inherits(x, "BiodiversityDistribution"), - inherits(settings, 'Settings'), - is.character(id) || is.Id(id), - msg = "Error in model object. This function is not meant to be called outside ouf train()." - ) - # Get object for id - obj <- model$biodiversity[[id]] - # Extract basic stats from the model object - types <- as.character( sapply( model$biodiversity, function(x) x$type ) ) - fams <- as.character( sapply( model$biodiversity, function(z) z$family ) ) - bionames = sapply(model$biodiversity, function(x) x$name) - ids <- names(model$biodiversity) - priors <- model$priors - - # Default equation found (e.g. 
no separate specification of effects) - if(model$biodiversity[[id]]$equation==''){ - - # Go through each variable and build formula for likelihood - form <- to_formula(paste("observed", - " ~ ", "Intercept + ", - ifelse(model$biodiversity[[id]]$family=='poisson', " offset(log(w)) + ", ""), # Use log area as offset - paste(model$biodiversity[[id]]$predictors_names,collapse = " + "), - # Check whether a single dataset is provided, otherwise add other intercepts - ifelse(length(types)==1, - '', - paste('+',paste0('Intercept_', - make.names(tolower(sapply( model$biodiversity, function(x) x$name ))),'_', # Make intercept from name - sapply( model$biodiversity, function(x) x$type ),collapse = ' + ') - ) - ), - # # If multiple datasets, don't use intercept - # ifelse(length(ids)>1,"-1", ""), - collapse = " ") - ) - - # Add offset if specified - if(!is.Waiver(x$offset)){ form <- update.formula(form, paste0('~ . + offset(spatial_offset)') ) } - # if( length( grep('Spatial',x$get_latent() ) ) > 0 ) {} # Possible to be implemented for CAR models - } else { - if(getOption('ibis.setupmessages')) myLog('[Estimation]','yellow','Use custom model equation.') - form <- to_formula(model$biodiversity[[1]]$equation) - # Update formula to weights if forgotten - if(model$biodiversity[[1]]$family=='poisson') form <- update.formula(form, 'observed ~ .') - assertthat::assert_that( - all( all.vars(form) %in% c('observed','w', model[['predictors_names']]) ) - ) - } - return(form) -} - -#' Logistic (invlogit) transformation function -#' @param x A [`numeric`] value -#' @keywords utils -#' @noRd -logistic <- function(a){ - if(is.data.frame(a)){ - apply(a, 2, function(x) ilink(x, link = "logit") ) - } else { - ilink(a, link = "logit") - } -} - -#' Logit transformation function -#' @param x A [`numeric`] value -#' @keywords utils -#' @noRd -logit <- function(a){ - if(is.data.frame(a)){ - apply(a, 2, function(x) log(x/(1-x)) ) - } else { - log(a/(1-a)) - } -} - -#' Inverse transformation function for the link function -#' @description back transforms a [numeric] vector using the appropriate link function -#' @param x A [`numeric`] vector generated by a model -#' @param link [`character`] indicating the link function to use. Default: "log" -#' @noRd -ilink <- function(x, link = "log"){ - assertthat::assert_that(is.numeric(x), - is.character(link) - ) - link <- match.arg(link, c("identity", "log","logm1","log1p", - "inverse", "sqrt", "logit", "probit", - "cauchit", "cloglog") , several.ok = FALSE) - switch (link, - identity = x, - log = exp(x), - logm1 = expp1(x), - log1p = expm1(x), - inverse = 1/x, - sqrt = x^2, - logit = ( 1 / (1 + exp(-x)) ), - probit = pnorm(x), - cauchit = pcauchy(x), - cloglog = (1 - exp(-exp(x))) - ) -} - -#' Checks whether cmdstanr is available and otherwise tries to install it -#' -#' @param install A [`logical`] factor to indicate whether [cmdstanr] should be directly installed (Default: \code{TRUE}). -#' @param ask [`logical`] whether the cmdstanr package is to be installed (Default: \code{FALSE}). 
-#' @keywords stan, utils, internal -stan_check_cmd <- function(install = TRUE, ask = FALSE){ - assertthat::assert_that( - is.logical(install), is.logical(ask) - ) - # Check if available - if(!requireNamespace("cmdstanr", quietly = TRUE)){ - if(install){ - if(ask){ a <- askYesNo("Install cmdstanr?") } else { a <- TRUE} - if(a){ - install.packages("cmdstanr", repos = c("https://mc-stan.org/r-packages/", getOption("repos"))) - check_cmdstan_toolchain() - install_cmdstan(cores = 2) - } - } else { - check_package("cmdstanr") - } - } else { - invisible() - } -} - -#' Wrap a list with stan model code -#' -#' @description [engine_stan] builds a list with stan model code. This function -#' concatenates them together. -#' @param sm_code A [list] object with exactly 7 entries. -#' @returns A [character] object. -#' @keywords stan, utils -wrap_stanmodel <- function(sm_code){ - assertthat::assert_that(is.list(sm_code), - length(sm_code)==7) - out <- character(0) - - # Functions - out <- paste0("functions {") - for(i in sm_code$functions) out <- paste0(out, i, "\n") - out <- paste0(out, "\n}\n") - # Data - out <- paste(out, "data {") - for(i in sm_code$data) out <- paste0(out, i, "\n") - out <- paste0(out, "\n}\n") - # Transformed data - out <- paste(out, "transformed data {") - for(i in sm_code$transformed_data) out <- paste0(out, i, "\n") - out <- paste0(out, "\n}\n") - # Parameters - out <- paste(out, "parameters {") - for(i in sm_code$parameters) out <- paste0(out, i, "\n") - out <- paste0(out, "\n}\n") - # Transformed parameters - out <- paste(out, "transformed parameters {") - for(i in sm_code$transformed_parameters) out <- paste0(out, i, "\n") - out <- paste0(out, "\n}\n") - # Model - out <- paste(out, "model {") - for(i in sm_code$model) out <- paste0(out, i, "\n") - out <- paste0(out, "\n}\n") - # Generated quantities - out <- paste(out, "generated quantities {") - for(i in sm_code$generated_quantities) out <- paste0(out, i, "\n") - out <- paste0(out, "}") - - assertthat::assert_that(is.character(out), length(out)>0) - return(out) -} - -#' Write a cmdstanr model output to a specific file -#' -#' @description Write a [cmdstanr] model output to a specific destination -#' @param mod A supplied [cmdstanr] model -#' @param dir The model directory where the model chould be written. Should be a character / existing dir. -#' @keywords stan, utils -write_stanmodel <- function( mod, dir = tempdir() ) { - assertthat::assert_that( - dir.exists(dir) - ) - fname <- file.path( dir , paste0("rt_cmdstanr_", digest::digest(mod,"md5")) ) - file_stan <- paste0( fname, ".stan" ) - fileConn <- file( file_stan ) - writeLines( mod , fileConn ) - close(fileConn) - return(file_stan) -} - -#' Fit [cmdstanr] model and convert to [rstan] object -#' -#' @description This function fits a stan model using the light-weight interface provided -#' by [cmdstanr]. The code was adapted from McElreath [rethinking] package. -#' @param model_code A [`character`] pointing to the stan modelling code. -#' @param data A [`list`] with all the parameters required to run the [model_code] in stan. -#' @param algorithm A [`character`] giving the algorithm to use. Either \code{'sampling'} (Default), \code{'optimize'} or \code{'variational'} for penalized likelihood estimation. -#' @param chains A [`numeric`] indicating the number of chains to use for estimation. -#' @param cores Number of threads for sampling. Default set to \code{'getOption("ibis.nthread")'}. See [ibis_options()]. 
-#' @param threads [`numeric`] giving the number of threads to be run per chain. Has to be specified in accordance with cores. -#' @param iter A [`numeric`] value giving the number of MCMC samples to generate. -#' @param warmup [`numeric`] for the number of warm-up samples for MCMC. Default set to 1/2 of iter. -#' @param control A [`list`] with further control options for stan. -#' @param cpp_options A [`list`] with options for the Cpp compiling. -#' @param force [`logical`] indication whether to force recompile the model (Default: \code{FALSE}). -#' @param path [`character`] indicating a path to be made available to the stan compiler. -#' @param save_warmup A [`logical`] flag whether to save the warmup samples. -#' @param ... Other non-specified parameters. -#' @seealso [rethinking] R package -#' @returns A [rstan] object -#' @keywords misc, stan -#' @export -run_stan <- function( model_code, data = list(), - algorithm = "sampling", - chains = 4, cores = getOption("ibis.nthread"), - threads = 1, - iter = 1000, warmup = floor(iter / 2), - control = list(adapt_delta = 0.95), - cpp_options = list(), - force = FALSE, - path = base::getwd(), - save_warmup = TRUE, ... ) { - assertthat::assert_that( - is.numeric(chains), is.numeric(cores), - is.numeric(iter), is.numeric(warmup), - is.numeric(threads), - threads < cores, - is.list(data), - is.list(control), is.list(cpp_options), - is.logical(save_warmup), - is.logical(force) - ) - # Check that cmdstanr is available - check_package("cmdstanr") - cmdstanr::check_cmdstan_toolchain(quiet = TRUE) - - # Match the algorithm to be used - algorithm <- match.arg(algorithm, c("sampling", "optimize", "variational"), several.ok = FALSE) - - if( threads > 1 ) cpp_options[['stan_threads']] <- TRUE - - # Check extension - assertthat::assert_that( - is.character(model_code), - assertthat::has_extension(model_code, "stan") - ) - - # Now compile the model - mod <- cmdstanr::cmdstan_model( model_code, - compile = TRUE, - force_recompile = force, - cpp_options = cpp_options, - include_paths = path - # stanc_options = list("O1") # Can result in substantial speedups! - ) - - # Final parameters for sampling - samp <- iter - warmup - warm <- warmup - - # pull out any control arguments - carg_adapt_delta <- 0.95 - if ( !is.null( control[['adapt_delta']] ) ) - carg_adapt_delta <- as.numeric(control[['adapt_delta']]) - carg_max_treedepth <- 11 - if ( !is.null( control[['max_treedepth']] ) ) - carg_max_treedepth <- as.numeric(control[['max_treedepth']]) - - if(algorithm == "sampling"){ - # Sample - if ( threads > 1 ) { - cmdstanfit <- mod$sample( data = data, - chains = chains, - parallel_chains = cores, - iter_sampling = samp, iter_warmup = warm, - adapt_delta = carg_adapt_delta, - max_treedepth = carg_max_treedepth, - threads_per_chain = threads, - save_warmup = save_warmup, - ... ) - # coerce to stanfit object - stanfit <- rstan::read_stan_csv( cmdstanfit$output_files() ) - - } else { - cmdstanfit <- mod$sample( data = data, - chains = chains, - parallel_chains = cores, - iter_sampling = samp , iter_warmup = warm, - adapt_delta = carg_adapt_delta, - max_treedepth = carg_max_treedepth, - save_warmup = save_warmup, - ... 
) - } - # coerce to stanfit object - stanfit <- rstan::read_stan_csv( cmdstanfit$output_files() ) - - } else if(algorithm == "optimize"){ - # Optimize for getting point estimates - stanfit <- mod$optimize(data = data, - #seed = seed, # This could be passed on - threads = threads - ) - } else if(algorithm == "variational") { - # Variational for approximating the posterior - stanfit <- mod$variational(data = data, - # seed = seed, - threads = threads - ) - } - - return(stanfit) -} - -#' Create a posterior prediction from a rstanfit object -#' -#' @description This function does simulates from the posterior -#' of a created stan model, therefore providing a fast and efficient way -#' to project coefficients obtained from Bayesian models to new/novel contexts. -#' -#' @param obj A [`stanfit`] object (as used by [rstan]). -#' @param form A [`formula`] object created for the [ibis.iSDM::DistributionModel]. -#' @param newdata A [data.frame] with new data to be used for prediction. -#' @param mode A [`character`] of whether the linear `predictor` or the `response` is to be summarized. -#' @param family A [`character`] giving the family for simulating linear response values (Default: \code{NULL}) -#' @param offset A [vector] with an optionally specified offset. -#' @param draws [numeric] indicating whether a specific number of draws should be taken. -#' @import posterior -#' @references -#' * [https://medium.com/@alex.pavlakis/making-predictions-from-stan-models-in-r-3e349dfac1ed](https://medium.com/@alex.pavlakis/making-predictions-from-stan-models-in-r-3e349dfac1ed). -#' * The [brms] R-package. -#' @export -posterior_predict_stanfit <- function(obj, form, newdata, mode = "predictor", family = NULL, offset = NULL, draws = NULL){ - assertthat::assert_that( - inherits(obj, "stanfit") || inherits(obj, "CmdStanFit"), - is.formula(form), - is.data.frame(newdata), - is.null(family) || is.character(family), - is.null(draws) || is.numeric(draws), - is.null(offset) || (length(offset) == nrow(newdata)) - ) - mode <- match.arg(mode, c("predictor", "response"), several.ok = FALSE) - # Build model matrix - # Note: This removes all NA cells from matrix - A <- model.matrix(object = delete.response(terms(form)), - data = newdata) - assertthat::assert_that(nrow(A)>0, inherits(A, "matrix") || inherits(A, "dgCMatrix")) - # Remove intercept unless set - if(attr(terms(form),"intercept") == 1) { - if(length(grep("Intercept", colnames(A), ignore.case = TRUE))>0){ - A <- A[,-(grep("(Intercept)", colnames(A),fixed = T))] - } - } - - # Draw from the posterior - if(inherits(obj, "stanfit")) { - pp <- posterior::as_draws_df(obj) - } else { - pp <- obj$draws() |> as.data.frame() - } - # Create a subset? 
- if (!is.null(draws)) { - pp <- pp[sample.int(nrow(pp), draws),] - } - # Get only beta coefficients and Intercept if set - if("Intercept" %in% colnames(pp)) what <- "beta|Intercept" else what <- "beta" - suppressWarnings( pp <- pp[ c(grep(what, colnames(pp), value = TRUE)) ] ) - if(hasName(pp, "b_Intercept")) pp <- pp[ grep("b_Intercept",colnames(pp), invert = T)] - - # Prepare offset if set - if(!is.null(offset)) { - # Get only the rows in the A matrix (minus NA) - offset <- offset[as.numeric(row.names(A))] - } else { offset <- rep(0, nrow(A) ) } - - # Security checks - assertthat::assert_that( - nrow(A)>0, nrow(pp) > 0, - ncol(pp) == ncol(A), - is.numeric(offset) - ) - - # 16/01/2023 - Change towards matrix multiplication by default (below) - # if(mode == "predictor"){ - # # Summarize the coefficients from the posterior - # pp <- posterior::summarise_draws(pp) |> - # subset(select = c("variable", "mean", "q5", "median", "q95", "sd")) |> - # as.data.frame() - # # --- # - # pp$variable <- colnames(A) - # # Calculate b*X + offset if set - # preds <- cbind( - # A %*% pp[,"mean"] + ifelse(is.null(offset),0, offset), - # A %*% pp[,"q5"] + ifelse(is.null(offset),0, offset), - # A %*% pp[,"median"] + ifelse(is.null(offset),0, offset), - # A %*% pp[,"q95"] + ifelse(is.null(offset),0, offset), - # A %*% pp[,"sd"] + ifelse(is.null(offset),0, offset) - # ) - # - # # Add random noise equivalent to the posterior length and sd of the posterior - # # Necessary since we already summarize the moment above - # .rnorm_matrix <- function(mean, sd) { - # stopifnot(length(dim(mean)) == 2) - # error <- matrix(rnorm(length(mean), 0, sd), ncol = ncol(mean), byrow=TRUE) - # mean + error - # } - # preds <- .rnorm_matrix(preds, pp[,"sd"]) # FIXME: This only makes sense for mean. Apply mad to median? - # - # # Apply ilink - # if(!is.null(family)){ - # preds <- switch (family, - # "poisson" = ilink(preds, link = "log"), - # "binomial" = ilink(preds, link = "logit"), - # ilink(preds, link = "log") - # ) - # } - # - # } else { - # Simulate linear response approximating poisson_rng in stan - out <- vector("list", nrow(pp)) - # TODO: Parallelize over threads? 
-  pb <- progress::progress_bar$new(total = nrow(pp),
-                                   format = "Simulating posterior samples (:spin) [:bar] :percent")
-  for(i in 1:nrow(pp)){
-    pb$tick()
-    # Build eta as additive beta with the A matrix row
-    eta <- 0 + base::tcrossprod(as.matrix(pp)[i,] |> base::unname(), A) + offset
-    out[[i]] <- base::unname(eta)
-  }
-
-  # Combine link
-  a <- do.call(rbind, out)
-  colnames(a) <- rownames(a) <- NULL
-
-  # Backtransformation
-  if(mode == "response"){
-    if(family == "poisson"){
-      a <- apply(a, 2, function(lambda) ilink(lambda, link = "log"))
-    } else if(family == "binomial") {
-      a <- apply(a, 2, function(mu) ilink(mu, link = "logit"))
-    }
-  }
-  # # Draw random variable for each draw and lambda value
-  # if(family == "poisson"){
-  #   a <- suppressWarnings( lapply(out, function(lambda) rpois(nrow(A), ilink(lambda, link = "log")) ) )
-  # } else if(family == "binomial") {
-  #   a <- suppressWarnings( lapply(out, function(mu) rbinom(nrow(A), size = 1, prob = ilink(mu, link = "logit")) ) )
-  # } else {
-  #   stop("Not yet implemented method for prediction the linear response.")
-  # }
-
-  # Finally summarize
-  preds <- cbind(
-    matrixStats::colMeans2(a, na.rm = TRUE),
-    matrixStats::colQuantiles(a, probs = c(.05,.5,.95), na.rm = TRUE),
-    matrixStats::colSds(a, na.rm = TRUE)
-  )
-
-  # ---- #
-  # Create output with cellid
-  out <- tibble::rowid_to_column(newdata, var = "cellid")["cellid"] |> as.data.frame()
-  out$cv <- out$q95 <- out$q50 <- out$q05 <- out$sd <- out$mean <- NA
-  out$mean[as.numeric(row.names(A))] <- preds[,1]
-  out$sd[as.numeric(row.names(A))] <- preds[,5]
-  out$q05[as.numeric(row.names(A))] <- preds[,2]
-  out$q50[as.numeric(row.names(A))] <- preds[,3]
-  out$q95[as.numeric(row.names(A))] <- preds[,4]
-  out$cv[as.numeric(row.names(A))] <- preds[,5] / preds[,1]
-  out$cellid <- NULL
-
-  return(out)
-}
-
-#' Show the stan code from a trained model
-#'
-#' @description
-#' This helper function shows the code from a trained [DistributionModel]
-#' using the [`engine_stan`].
-#' This function is emulated after a similar functionality in the [brms] R-package.
-#' **It only works with models inferred with stan!**
-#' @param obj Any prepared object.
-#' @param ... not used.
-#'
-#' @return None.
-#' @keywords engine
-#' @seealso [rstan], [cmdstanr], [brms]
-#' @name stancode
-NULL
-methods::setGeneric("stancode",
-                    signature = methods::signature("obj"),
-                    function(obj, ...) standardGeneric("stancode"))
-
-#' @rdname stancode
-#' @method stancode DistributionModel
-#' @keywords engine
-#' @export
-stancode.DistributionModel <- function(x, ...) x$stancode()
+#' Build formula for STAN model
+#'
+#' @description
+#' This function builds a formula for an `engine_stan()` model.
+#' @param model A [`list()`] object containing the prepared model data for a given biodiversity dataset.
+#' @param x A [`BiodiversityDistribution`] object.
+#' @param id The id for the species formula.
+#' @param settings A [`Settings`] object.
+#' @author Martin Jung
+#' @note Function is not meant to be run outside the train() call.
+#' @keywords internal
+#' @noRd
+built_formula_stan <- function(model, id, x, settings){
+  assertthat::assert_that(
+    is.list(model),
+    length(model) > 0,
+    assertthat::has_name(model, "predictors"),
+    inherits(x, "BiodiversityDistribution"),
+    inherits(settings, 'Settings'),
+    is.character(id) || is.Id(id),
+    msg = "Error in model object. This function is not meant to be called outside of train()."
+ ) + # Get object for id + obj <- model$biodiversity[[id]] + # Extract basic stats from the model object + types <- as.character( sapply( model$biodiversity, function(x) x$type ) ) + fams <- as.character( sapply( model$biodiversity, function(z) z$family ) ) + bionames = sapply(model$biodiversity, function(x) x$name) + ids <- names(model$biodiversity) + priors <- model$priors + + # Default equation found (e.g. no separate specification of effects) + if(model$biodiversity[[id]]$equation==''){ + + # Go through each variable and build formula for likelihood + form <- to_formula(paste("observed", + " ~ ", "Intercept + ", + ifelse(model$biodiversity[[id]]$family=='poisson', " offset(log(w)) + ", ""), # Use log area as offset + paste(model$biodiversity[[id]]$predictors_names,collapse = " + "), + # Check whether a single dataset is provided, otherwise add other intercepts + ifelse(length(types)==1, + '', + paste('+',paste0('Intercept_', + make.names(tolower(sapply( model$biodiversity, function(x) x$name ))),'_', # Make intercept from name + sapply( model$biodiversity, function(x) x$type ),collapse = ' + ') + ) + ), + # # If multiple datasets, don't use intercept + # ifelse(length(ids)>1,"-1", ""), + collapse = " ") + ) + + # Add offset if specified + if(!is.Waiver(x$offset)){ form <- stats::update.formula(form, paste0('~ . + offset(spatial_offset)') ) } + # if( length( grep('Spatial',x$get_latent() ) ) > 0 ) {} # Possible to be implemented for CAR models + } else { + if(getOption('ibis.setupmessages')) myLog('[Estimation]','yellow','Use custom model equation.') + form <- to_formula(model$biodiversity[[1]]$equation) + # Update formula to weights if forgotten + if(model$biodiversity[[1]]$family=='poisson') form <- stats::update.formula(form, 'observed ~ .') + assertthat::assert_that( + all( all.vars(form) %in% c('observed','w', model[['predictors_names']]) ) + ) + } + return(form) +} + +#' Logistic (invlogit) transformation function +#' @param x A [`numeric`] value +#' @keywords utils +#' @noRd +logistic <- function(a){ + if(is.data.frame(a)){ + apply(a, 2, function(x) ilink(x, link = "logit") ) + } else { + ilink(a, link = "logit") + } +} + +#' Logit transformation function +#' @param x A [`numeric`] value +#' @keywords utils +#' @noRd +logit <- function(a){ + if(is.data.frame(a)){ + apply(a, 2, function(x) log(x/(1-x)) ) + } else { + log(a/(1-a)) + } +} + +#' Inverse transformation function for the link function +#' @description back transforms a [numeric] vector using the appropriate link function +#' @param x A [`numeric`] vector generated by a model +#' @param link [`character`] indicating the link function to use. Default: "log" +#' @noRd +ilink <- function(x, link = "log"){ + assertthat::assert_that(is.numeric(x), + is.character(link) + ) + link <- match.arg(link, c("identity", "log","logm1","log1p", + "inverse", "sqrt", "logit", "probit", + "cauchit", "cloglog") , several.ok = FALSE) + switch (link, + identity = x, + log = exp(x), + logm1 = exp(x) + 1, + log1p = expm1(x), + inverse = 1/x, + sqrt = x^2, + logit = ( 1 / (1 + exp(-x)) ), + probit = stats::pnorm(x), + cauchit = stats::pcauchy(x), + cloglog = (1 - exp(-exp(x))) + ) +} + +#' Checks whether cmdstanr is available and otherwise tries to install it +#' +#' @param install A [`logical`] factor to indicate whether [cmdstanr] should be directly installed (Default: \code{TRUE}). +#' @param ask [`logical`] whether the cmdstanr package is to be installed (Default: \code{FALSE}). 
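+#' @examples
+#' \dontrun{
+#' # A minimal sketch: check the cmdstanr toolchain and install it if missing,
+#' # without prompting the user.
+#' stan_check_cmd(install = TRUE, ask = FALSE)
+#' }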
+#' @keywords stan, utils, internal
+stan_check_cmd <- function(install = TRUE, ask = FALSE){
+  assertthat::assert_that(
+    is.logical(install), is.logical(ask)
+  )
+  # Check if available
+  if(!requireNamespace("cmdstanr", quietly = TRUE)){
+    if(install){
+      if(ask){ a <- utils::askYesNo("Install cmdstanr?") } else { a <- TRUE}
+      if(a){
+        utils::install.packages("cmdstanr", repos = c("https://mc-stan.org/r-packages/", getOption("repos")))
+        cmdstanr::check_cmdstan_toolchain()
+        cmdstanr::install_cmdstan(cores = 2)
+      }
+    } else {
+      check_package("cmdstanr")
+    }
+  } else {
+    invisible()
+  }
+}
+
+#' Wrap a list with stan model code
+#'
+#' @description [engine_stan] builds a list with stan model code. This function
+#' concatenates the list entries together into a single model code string.
+#' @param sm_code A [list] object with exactly 7 entries.
+#' @returns A [character] object.
+#' @keywords stan, utils
+wrap_stanmodel <- function(sm_code){
+  assertthat::assert_that(is.list(sm_code),
+                          length(sm_code)==7)
+  out <- character(0)
+
+  # Functions
+  out <- paste0("functions {")
+  for(i in sm_code$functions) out <- paste0(out, i, "\n")
+  out <- paste0(out, "\n}\n")
+  # Data
+  out <- paste(out, "data {")
+  for(i in sm_code$data) out <- paste0(out, i, "\n")
+  out <- paste0(out, "\n}\n")
+  # Transformed data
+  out <- paste(out, "transformed data {")
+  for(i in sm_code$transformed_data) out <- paste0(out, i, "\n")
+  out <- paste0(out, "\n}\n")
+  # Parameters
+  out <- paste(out, "parameters {")
+  for(i in sm_code$parameters) out <- paste0(out, i, "\n")
+  out <- paste0(out, "\n}\n")
+  # Transformed parameters
+  out <- paste(out, "transformed parameters {")
+  for(i in sm_code$transformed_parameters) out <- paste0(out, i, "\n")
+  out <- paste0(out, "\n}\n")
+  # Model
+  out <- paste(out, "model {")
+  for(i in sm_code$model) out <- paste0(out, i, "\n")
+  out <- paste0(out, "\n}\n")
+  # Generated quantities
+  out <- paste(out, "generated quantities {")
+  for(i in sm_code$generated_quantities) out <- paste0(out, i, "\n")
+  out <- paste0(out, "}")
+
+  assertthat::assert_that(is.character(out), length(out)>0)
+  return(out)
+}
+
+#' Write a cmdstanr model output to a specific file
+#'
+#' @description Write a [cmdstanr] model output to a specific destination
+#' @param mod A supplied [cmdstanr] model
+#' @param dir The model directory where the model should be written. Must be a [`character`] pointing to an existing directory.
+#' @keywords stan, utils
+write_stanmodel <- function( mod, dir = tempdir() ) {
+  assertthat::assert_that(
+    dir.exists(dir)
+  )
+  fname <- file.path( dir , paste0("rt_cmdstanr_", digest::digest(mod,"md5")) )
+  file_stan <- paste0( fname, ".stan" )
+  fileConn <- file( file_stan )
+  writeLines( mod , fileConn )
+  close(fileConn)
+  return(file_stan)
+}
+
+#' Fit [cmdstanr] model and convert to [rstan] object
+#'
+#' @description This function fits a stan model using the light-weight interface provided
+#' by [cmdstanr]. The code was adapted from the [rethinking] package by McElreath.
+#' @param model_code A [`character`] pointing to the stan modelling code.
+#' @param data A [`list`] with all the parameters required to run the [model_code] in stan.
+#' @param algorithm A [`character`] giving the algorithm to use. Either \code{'sampling'} (Default), \code{'optimize'} or \code{'variational'} for penalized likelihood estimation.
+#' @param chains A [`numeric`] indicating the number of chains to use for estimation.
+#' @param cores Number of threads for sampling. Default set to \code{'getOption("ibis.nthread")'}. See [ibis_options()].
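+#' @examples
+#' \dontrun{
+#' # A minimal, untested sketch: "model.stan" and the data list are placeholders
+#' # that must match your own Stan program.
+#' fit <- run_stan(model_code = "model.stan",
+#'                 data = list(N = 10L, y = rnorm(10)),
+#'                 chains = 2, iter = 1000)
+#' }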
+#' @param threads [`numeric`] giving the number of threads to be run per chain. Has to be specified in accordance with cores. +#' @param iter A [`numeric`] value giving the number of MCMC samples to generate. +#' @param warmup [`numeric`] for the number of warm-up samples for MCMC. Default set to 1/2 of iter. +#' @param control A [`list`] with further control options for stan. +#' @param cpp_options A [`list`] with options for the Cpp compiling. +#' @param force [`logical`] indication whether to force recompile the model (Default: \code{FALSE}). +#' @param path [`character`] indicating a path to be made available to the stan compiler. +#' @param save_warmup A [`logical`] flag whether to save the warmup samples. +#' @param ... Other non-specified parameters. +#' @seealso [rethinking] R package +#' @returns A [rstan] object +#' @keywords misc, stan +#' @export +run_stan <- function( model_code, data = list(), + algorithm = "sampling", + chains = 4, cores = getOption("ibis.nthread"), + threads = 1, + iter = 1000, warmup = floor(iter / 2), + control = list(adapt_delta = 0.95), + cpp_options = list(), + force = FALSE, + path = base::getwd(), + save_warmup = TRUE, ... ) { + assertthat::assert_that( + is.numeric(chains), is.numeric(cores), + is.numeric(iter), is.numeric(warmup), + is.numeric(threads), + threads < cores, + is.list(data), + is.list(control), is.list(cpp_options), + is.logical(save_warmup), + is.logical(force) + ) + # Check that cmdstanr is available + check_package("cmdstanr") + cmdstanr::check_cmdstan_toolchain(quiet = TRUE) + + # Match the algorithm to be used + algorithm <- match.arg(algorithm, c("sampling", "optimize", "variational"), several.ok = FALSE) + + if( threads > 1 ) cpp_options[['stan_threads']] <- TRUE + + # Check extension + assertthat::assert_that( + is.character(model_code), + assertthat::has_extension(model_code, "stan") + ) + + # Now compile the model + mod <- cmdstanr::cmdstan_model( model_code, + compile = TRUE, + force_recompile = force, + cpp_options = cpp_options, + include_paths = path + # stanc_options = list("O1") # Can result in substantial speedups! + ) + + # Final parameters for sampling + samp <- iter - warmup + warm <- warmup + + # pull out any control arguments + carg_adapt_delta <- 0.95 + if ( !is.null( control[['adapt_delta']] ) ) + carg_adapt_delta <- as.numeric(control[['adapt_delta']]) + carg_max_treedepth <- 11 + if ( !is.null( control[['max_treedepth']] ) ) + carg_max_treedepth <- as.numeric(control[['max_treedepth']]) + + if(algorithm == "sampling"){ + # Sample + if ( threads > 1 ) { + cmdstanfit <- mod$sample( data = data, + chains = chains, + parallel_chains = cores, + iter_sampling = samp, iter_warmup = warm, + adapt_delta = carg_adapt_delta, + max_treedepth = carg_max_treedepth, + threads_per_chain = threads, + save_warmup = save_warmup, + ... ) + # coerce to stanfit object + stanfit <- rstan::read_stan_csv( cmdstanfit$output_files() ) + + } else { + cmdstanfit <- mod$sample( data = data, + chains = chains, + parallel_chains = cores, + iter_sampling = samp , iter_warmup = warm, + adapt_delta = carg_adapt_delta, + max_treedepth = carg_max_treedepth, + save_warmup = save_warmup, + ... 
)
+    }
+    # coerce to stanfit object
+    stanfit <- rstan::read_stan_csv( cmdstanfit$output_files() )
+
+  } else if(algorithm == "optimize"){
+    # Optimize for getting point estimates
+    stanfit <- mod$optimize(data = data,
+                            #seed = seed, # This could be passed on
+                            threads = threads
+    )
+  } else if(algorithm == "variational") {
+    # Variational for approximating the posterior
+    stanfit <- mod$variational(data = data,
+                               # seed = seed,
+                               threads = threads
+    )
+  }
+
+  return(stanfit)
+}
+
+#' Create a posterior prediction from a stanfit object
+#'
+#' @description This function simulates from the posterior
+#' of a created stan model, providing a fast and efficient way
+#' to project coefficients obtained from Bayesian models to new/novel contexts.
+#'
+#' @param obj A [`stanfit`] object (as used by [rstan]).
+#' @param form A [`formula`] object created for the [ibis.iSDM::DistributionModel].
+#' @param newdata A [data.frame] with new data to be used for prediction.
+#' @param mode A [`character`] of whether the linear `predictor` or the `response` is to be summarized.
+#' @param family A [`character`] giving the family for simulating linear response values (Default: \code{NULL})
+#' @param offset A [vector] with an optionally specified offset.
+#' @param draws [numeric] giving the number of posterior draws to subsample; if \code{NULL} (default) all draws are used.
+#' @import posterior
+#' @references
+#' * [https://medium.com/@alex.pavlakis/making-predictions-from-stan-models-in-r-3e349dfac1ed](https://medium.com/@alex.pavlakis/making-predictions-from-stan-models-in-r-3e349dfac1ed).
+#' * The [brms] R-package.
+#' @export
+posterior_predict_stanfit <- function(obj, form, newdata, mode = "predictor", family = NULL, offset = NULL, draws = NULL){
+  assertthat::assert_that(
+    inherits(obj, "stanfit") || inherits(obj, "CmdStanFit"),
+    is.formula(form),
+    is.data.frame(newdata),
+    is.null(family) || is.character(family),
+    is.null(draws) || is.numeric(draws),
+    is.null(offset) || (length(offset) == nrow(newdata))
+  )
+  mode <- match.arg(mode, c("predictor", "response"), several.ok = FALSE)
+  # Build model matrix
+  # Note: This removes all NA cells from matrix
+  A <- stats::model.matrix(object = stats::delete.response(stats::terms(form)),
+                           data = newdata)
+  assertthat::assert_that(nrow(A)>0, inherits(A, "matrix") || inherits(A, "dgCMatrix"))
+  # Remove intercept unless set
+  if(attr(stats::terms(form),"intercept") == 1) {
+    if(length(grep("Intercept", colnames(A), ignore.case = TRUE))>0){
+      A <- A[,-(grep("(Intercept)", colnames(A), fixed = TRUE))]
+    }
+  }
+
+  # Draw from the posterior
+  if(inherits(obj, "stanfit")) {
+    pp <- posterior::as_draws_df(obj)
+  } else {
+    pp <- obj$draws() |> as.data.frame()
+  }
+  # Create a subset?
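+  # If a number of draws is given, subsample that many posterior draws at
+  # random; this speeds up prediction at the cost of extra Monte Carlo error.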
+ if (!is.null(draws)) { + pp <- pp[sample.int(nrow(pp), draws),] + } + # Get only beta coefficients and Intercept if set + if("Intercept" %in% colnames(pp)) what <- "beta|Intercept" else what <- "beta" + suppressWarnings( pp <- pp[ c(grep(what, colnames(pp), value = TRUE)) ] ) + if(utils::hasName(pp, "b_Intercept")) pp <- pp[ grep("b_Intercept",colnames(pp), invert = T)] + + # Prepare offset if set + if(!is.null(offset)) { + # Get only the rows in the A matrix (minus NA) + offset <- offset[as.numeric(row.names(A))] + } else { offset <- rep(0, nrow(A) ) } + + # Security checks + assertthat::assert_that( + nrow(A)>0, nrow(pp) > 0, + ncol(pp) == ncol(A), + is.numeric(offset) + ) + + # 16/01/2023 - Change towards matrix multiplication by default (below) + # if(mode == "predictor"){ + # # Summarize the coefficients from the posterior + # pp <- posterior::summarise_draws(pp) |> + # subset(select = c("variable", "mean", "q5", "median", "q95", "sd")) |> + # as.data.frame() + # # --- # + # pp$variable <- colnames(A) + # # Calculate b*X + offset if set + # preds <- cbind( + # A %*% pp[,"mean"] + ifelse(is.null(offset),0, offset), + # A %*% pp[,"q5"] + ifelse(is.null(offset),0, offset), + # A %*% pp[,"median"] + ifelse(is.null(offset),0, offset), + # A %*% pp[,"q95"] + ifelse(is.null(offset),0, offset), + # A %*% pp[,"sd"] + ifelse(is.null(offset),0, offset) + # ) + # + # # Add random noise equivalent to the posterior length and sd of the posterior + # # Necessary since we already summarize the moment above + # .rnorm_matrix <- function(mean, sd) { + # stopifnot(length(dim(mean)) == 2) + # error <- matrix(rnorm(length(mean), 0, sd), ncol = ncol(mean), byrow=TRUE) + # mean + error + # } + # preds <- .rnorm_matrix(preds, pp[,"sd"]) # FIXME: This only makes sense for mean. Apply mad to median? + # + # # Apply ilink + # if(!is.null(family)){ + # preds <- switch (family, + # "poisson" = ilink(preds, link = "log"), + # "binomial" = ilink(preds, link = "logit"), + # ilink(preds, link = "log") + # ) + # } + # + # } else { + # Simulate linear response approximating poisson_rng in stan + out <- vector("list", nrow(pp)) + # TODO: Parallelize over threads? 
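+  # For each retained posterior draw, compute the linear predictor
+  # eta = beta %*% t(A) + offset (via tcrossprod) across all prediction cells.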
+ pb <- progress::progress_bar$new(total = nrow(pp), + format = "Simulating posterior samples (:spin) [:bar] :percent") + for(i in 1:nrow(pp)){ + pb$tick() + # Build eta as additive beta with the A matrix row + eta <- 0 + base::tcrossprod(as.matrix(pp)[i,] |> base::unname(), A) + offset + out[[i]] <- base::unname(eta) + } + + # Combine link + a <- do.call(rbind, out) + colnames(a) <- rownames(a) <- NULL + + # Backtransformation + if(mode == "response"){ + if(family == "poisson"){ + a <- apply(a, 2, function(lambda) ilink(lambda, link = "log")) + } else if(family == "binomial") { + a <- apply(a, 2, function(mu) ilink(mu, link = "logit")) + } + } + # # Draw random variable for each draw and lambda value + # if(family == "poisson"){ + # a <- suppressWarnings( lapply(out, function(lambda) rpois(nrow(A), ilink(lambda, link = "log")) ) ) + # } else if(family == "binomial") { + # a <- suppressWarnings( lapply(out, function(mu) rbinom(nrow(A), size = 1, prob = ilink(mu, link = "logit")) ) ) + # } else { + # stop("Not yet implemented method for prediction the linear response.") + # } + + # Finally summarize + preds <- cbind( + matrixStats::colMeans2(a, na.rm = TRUE), + matrixStats::colQuantiles(a, probs = c(.05,.5,.95), na.rm = TRUE), + matrixStats::colSds(a, na.rm = TRUE) + ) + + # ---- # + # Create output with cellid + out <- tibble::rowid_to_column(newdata, var = "cellid")["cellid"] |> as.data.frame() + out$cv <- out$q95 <- out$q50 <- out$q05 <- out$sd <- out$mean <- NA + out$mean[as.numeric(row.names(A))] <- preds[,1] + out$sd[as.numeric(row.names(A))] <- preds[,5] + out$q05[as.numeric(row.names(A))] <- preds[,2] + out$q50[as.numeric(row.names(A))] <- preds[,3] + out$q95[as.numeric(row.names(A))] <- preds[,4] + out$cv[as.numeric(row.names(A))] <- preds[,5] / preds[,1] + out$cellid <- NULL + + return(out) +} + +#' Show the stan code from a trained model +#' +#' @description +#' This helper function shows the code from a trained [DistributionModel] +#' using the [`engine_stan`]. +#' This function is emulated after a similar functionality in the [brms] R-package. +#' **It only works with models inferred with stan!** +#' @param obj Any prepared object. +#' @param ... not used. +#' +#' @return None. +#' @keywords engine +#' @seealso [rstan], [cmdstanr], [brms] +#' @name stancode +NULL +methods::setGeneric("stancode", + signature = methods::signature("obj"), + function(obj, ...) standardGeneric("stancode")) + +#' @rdname stancode +#' @method stancode DistributionModel +#' @keywords engine +#' @export +stancode.DistributionModel <- function(x, ...) x$stancode() diff --git a/R/utils.R b/R/utils.R index d20ed941..27352440 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,863 +1,728 @@ -#' Inverse of in call for convenience -#' Calculates the set of entries not present in the second vector -#' -#' @param a First [`vector`] object. -#' @param b Second [`vector`] object. -#' @keywords internal, utils -#' @noRd -`%notin%` = function(a, b){!(a %in% b)} - -#' Custom messaging function for scripts -#' -#' @description -#' This functions prints a message with a custom header and colour. -#' @param title The title in the log output -#' @param col A [`character`] indicating the text colour to be used. Supported are 'green' / 'yellow' / 'red' -#' @param ... Any additional outputs or words for display -#' @examples -#' myLog("[Setup]", "red", "Some error occurred during data preparation.") -#' @keywords internal, utils -#' @export -myLog <- function(title = "[Processing]", col = 'green', ...) 
{ - assertthat::assert_that(col %in% c('green','yellow','red')) - textwrap <- switch (col, - 'green' = text_green, - 'yellow' = text_yellow, - 'red' = text_red - ) - message(textwrap( - paste0(title,' ', Sys.time(), " | ", ...) - ) - ) -} - -#' Colour helpers for message logs -#' @param text A [`character`]. -#' @keywords internal, utils -#' @aliases text_red -#' @noRd -text_red <- function(text) { paste0('\033[31m',text,'\033[39m') } -#' @inheritParams text_red -#' @aliases text_yellow -text_yellow <- function(text) { paste0('\033[33m',text,'\033[39m') } -#' @inheritParams text_red -#' @aliases text_green -text_green <- function(text) { paste0('\033[32m',text,'\033[39m') } - -#' Calculate the mode -#' @param A [`vector`] of values or characters. -#' @keywords utils -#' @noRd -mode <- function(x) { - ux <- unique(x) - ux[which.max(tabulate(match(x, ux)))] -} -#' Check whether function exist in name space -#' -#' @param x The [character] name of a package from which a function is needed. -#' @keywords internal, utils -#' @noRd -check_package <- function(x) { - assertthat::assert_that(is.character(x)) - if (!requireNamespace(x, quietly = TRUE)) { - stop(paste0("Package \"",x,"\" needed for this function to work. Please install it."), - call. = FALSE) - } -} - -#' Camel case conversion of a string -#' -#' @param x A [`vector`] or [`character`] object. -#' @keywords internal, utils -#' @noRd -to_camelcase <- function(x){ - assertthat::assert_that(is.character(x) || is.vector(x)) - substr(x, 1, 1) <- toupper( - substr(x, 1, 1) - ) - x -} - -#' Atomic representation of a name -#' -#' Return a pretty character representation of an object with elements and -#' names. -#' @param x A [`vector`] object -#' @return [`character`] object. -#' @concept function taken from `prioritizr` package -#' @keywords internal, utils -#' @examples -#' name_atomic(letters) -#' name_atomic(letters, "characters") -#' @noRd -name_atomic <- function(x, description = "") { - n <- length(x) - if (nchar(description) > 0) - description <- paste0(" ", description) - if (length(x) <= 4) { - x <- x[seq_len(min(length(x), 4))] - } else { - x <- c(x[seq_len(min(length(x), 3))], "...") - } - paste0(paste(x, collapse = ", "), " (", n, description, ")") -} - -#' Aligns text with new characters -#' -#' Format text by adding a certain number of spaces after new line characters. -#' -#' @param x [`character`] text. -#' @param n [`integer`] number of spaces. -#' @return [`character`]. -#' @concept function taken from `prioritizr` package -#' -#' @examples -#' # make some text -#' original_text <- "animals: horse\npig\nbear" -#' -#' # print text -#' message(original_text) -#' -#' # this look really ugly so we will align it -#' aligned_text <- align_text(original_text, 9) -#' -#' # print aligned text -#' message(aligned_text) -#' -#' @keywords utils -#' @noRd -align_text <- function(x, n) { - assertthat::assert_that(assertthat::is.string(x), assertthat::is.count(n)) - if (!grepl("\n", x)) - return(x) - return(gsub("\n", paste0("\n", paste(rep(" ", n), collapse = "")), x, - fixed = TRUE)) -} - -#' Convert character to capital text -#' -#' @param x [`character`] text. 
-#' @examples -#' capitalize_text('presence') -#' capitalize_text('ducks are the best birds') -#' -#' @keywords utils -#' @noRd -capitalize_text <- function(x) { - assertthat::assert_that(is.character(x)) - s <- strsplit(x, " ")[[1]] - paste(toupper(substring(s, 1,1)), substring(s, 2), - sep="", collapse=" ") -} - -#' Convert character to formula object -#' -#' @param x [`character`] text. -#' @keywords utils -#' @noRd -to_formula <- function(formula){ - # Convert to formula object - if(!is.null(formula)) { - formula = as.formula(formula) - } else { - # Asign a new waiver object - formula = new_waiver() - } - return(formula) -} - -#' Guess time to Posix -#' -#' @description -#' This little wrapper converts and ensures that a vector of time objects are in POSIXct format. -#' @param vec A [`vector`] with [`numeric`] or [`Posixct`] data -#' @keywords utils -#' @noRd -to_POSIXct <- function(vec){ - # Check th - # Parse differently depending on time - if(inherits(vec, "POSIXct")){ - out <- vec - } else if(inherits(vec, "units") || inherits(vec, "Date")){ - check_package("units") - # Try and format directly to posixct - out <- as.POSIXct(vec) - assertthat::assert_that(any(!is.na.POSIXlt(out))) - } else if(inherits(vec, "numeric")){ - if(all(nchar(vec)==4)){ - # Assume that the numeric is a year - vec <- paste0(vec, "-01-01") - out <- as.POSIXct(vec) - } - } else if(inherits(vec, "character")){ - # Try and convert to posix directly - out <- as.POSIXct(vec) - if(any(is.na.POSIXlt(out))){ - # Situation not yet encountered. To be added when use cases are known. - message("Date formats probably need some more prior handling.") - } - } - return(out) -} - -#' Hingeval transformation -#' @param x A [`vector`] with numeric values. -#' @param min [`numeric`] minimum value for the hinge transformation -#' @param max [`numeric`] maximum value for the hinge transformation -#' @keywords internal -#' @noRd -hingeval <- function (x, min, max) ifelse(is.na(x),NA, pmin(1, pmax(0, (x - min)/(max - min),na.rm = TRUE),na.rm = TRUE)) - -#' Threshold transformation -#' @param x A [`vector`] with numeric values. -#' @param knot [`numeric`] threshold value as cutoff. -#' @keywords internal -#' @noRd -thresholdval <- function(x, knot) { - ifelse(x >= knot, 1, 0) -} - -#' Parallel computation of function -#' -#' @description -#' Some computations take considerable amount of time to execute. This -#' function provides a helper wrapper for running functions of the [`apply`] -#' family to specified outputs. -#' @details -#' By default, the [parallel] package is used for parallel computation, -#' however an option exists to use the [future] package instead. -#' @param X A [`list`], [`data.frame`] or [`matrix`] object to be fed to a single core or parallel [apply] call. -#' @param FUN A [`function`] passed on for computation. -#' @param cores A [numeric] of the number of cores to use (Default: \code{1}). -#' @param approach [`character`] for the parallelization approach taken (Options: \code{"parallel"} or \code{"future"}). -#' @param export_package A [`vector`] with packages to export for use on parallel nodes (Default: \code{NULL}). -#' @examples -#' \dontrun{ -#' run_par(list, mean, cores = 4) -#' } -#' @keywords utils -#' @noRd -run_parallel <- function (X, FUN, cores = 1, approach = "parallel", export_packages = NULL, ...) 
{ - assertthat::assert_that( - is.list(X) || is.data.frame(X) || is.matrix(X), - is.function(FUN), - is.numeric(cores), - is.null(export_packages) || is.character(export_packages) - ) - # Match approach - approach <- match.arg(approach, c("parallel", "future"), several.ok = FALSE) - - # Collect dots - dots <- list(...) - - if(!is.list(X)){ - # Convert input object to a list of split parameters - n_vars <- nrow(X) - chunk_size <- ceiling(n_vars / cores) - n_chunks <- ceiling(n_vars / chunk_size) - chunk_list <- vector(length = n_chunks, mode = "list") - - for (i in seq_len(n_chunks)) { - if ((chunk_size * (i - 1) + 1) <= n_vars) { - chunk <- (chunk_size * (i - 1) + 1):(min(c(chunk_size * - i, n_vars))) - chunk_list[[i]] <- X[chunk, ] - } - } - assertthat::assert_that(sum(sapply(chunk_list, nrow)) == nrow(X)) - X <- chunk_list;rm(chunk_list) - input_type = "data.frame" # Save to aggregate later again - } else { input_type = "list"} - - # Process depending on cores - if (cores == 1) { - out <- lapply(X, FUN, ...) - } else { - if(approach == "parallel"){ - # check_package('doParallel') - # require(foreach) - # isTRUE(Sys.info()[["sysname"]] == "Windows") - # Other operating systems - if(!isTRUE(Sys.info()[["sysname"]] == "Windows") && is.list(X)) { - out <- parallel::mclapply(X = X, FUN = FUN, mc.cores = cores, - ...) - } else { - # Other operating systems - cl <- parallel::makePSOCKcluster(cores) - on.exit(parallel::stopCluster(cl)) - if(!is.null(export_packages)){ - # Send all specified packages to the cluster - for(val in export_packages){ - parallel::clusterExport(cl, varlist = package_function_names(val), - envir = as.environment(asNamespace(val))) - } - } - out <- parallel::parLapply(cl = cl, X = X, fun = FUN, ...) - } - # out <- foreach::foreach(z = iterators::iter(X), - # .combine = ifelse(input_type!="list", "rbind", foreach:::defcombine), - # .inorder = FALSE, - # .multicombine = TRUE, - # .errorhandling = 'stop', - # .export = c("FUN"), - # .packages = export_packages, - # ... - # ) %dopar% { return( FUN(z, ...) ) } - } else { - # Check that future is loaded - check_package('future.apply') - # Check that plan for future has been set up! - assertthat::assert_that( getOption("ibis.use_future") == TRUE, - msg = "Set up a future plan via [ibis_future] to use this approach.") - out <- future.apply::future_lapply(cl = cl, X = X, fun = FUN, ...) - } - } - # If input data was not a list, combine again - if(input_type != "list" && is.list(out)){ - out <- do.call(rbind, out) - } - return( out ) -} - -#' Clamp a predictor matrix by given values -#' -#' @description -#' To limit extreme extrapolation it is possible to \code{'clamp'} an existing projection to the range -#' of predictor values observed during model training. -#' This function takes an internal model matrix and restricts the values seen in the predictor matrix -#' to those observed during training. -#' @note This function is meant to be used within a certain [`engine`] or within [`project`]. -#' @param model A [`list`] with the input data used for inference. Created during model setup. -#' @param pred An optional [`data.frame`] of the prediction container. -#' @returns A [`data.frame`] with the clamped predictors. -#' @keywords utils -#' @keywords internal -#' @references Phillips, S. J., Anderson, R. P., Dudík, M., Schapire, R. E., & Blair, M. E. (2017). Opening the black box: An open-source release of Maxent. Ecography. 
https://doi.org/10.1111/ecog.03049 -clamp_predictions <- function(model, pred){ - assertthat::assert_that( - is.list(model), - assertthat::has_name(model, "biodiversity"), - (is.data.frame(pred) || is.matrix(pred)) || missing(pred) - ) - - # For each biodiversity dataset, calculate the range of predictors observed - vars_clamp <- data.frame() - for(ds in model$biodiversity){ - # Calculate range for each variable - rr <- apply(ds$predictors[,ds$predictors_names], 2, function(z) range(z, na.rm = TRUE)) |> - t() |> as.data.frame() |> tibble::rownames_to_column("variable") - names(rr) <- c("variable", "min", "max") - vars_clamp <- rbind(vars_clamp, rr) - rm(rr) - } - # Aggregate if multiple variables - if(anyDuplicated(vars_clamp$variable)){ - o1 <- aggregate(variable ~ min, data = vars_clamp, - FUN = function(x) min(x) ) - o2 <- aggregate(variable ~ max, data = vars_clamp, - FUN = function(x) max(x) ) - vars_clamp <- merge(o1,o2) - } - # --- # - # Now clamp either predictors - if(missing(pred)) pred <- model$predictors - - # Now clamp the prediction matrix with the clamped variables - for (v in intersect(vars_clamp$variable, names(pred))) { - pred[, v] <- pmin( - pmax(pred[, v], vars_clamp$min[vars_clamp==v] ), - vars_clamp$max[vars_clamp==v]) - } - - assertthat::assert_that( is.data.frame(pred) || is.matrix(pred), - nrow(pred)>0) - return(pred) -} - -#' Create formula matrix -#' -#' Function to create list of formulas with all possible combinations of variables -#' @param form An input [`formula`] object. -#' @param response A [`character`] object giving the response. (Default: \code{NULL}) -#' @param type Currently implemented are \code{'inla'} (variable groups), -#' \code{'All'} (All possible combinations) or \code{'forward'}. -#' @returns A [`vector`] object with [`formula`] objects. 
-#' @examples \dontrun{ -#' formula_combinations(form) -#' } -#' @keywords utils -#' @noRd -formula_combinations <- function(form, response = NULL, type= 'forward'){ - assertthat::assert_that(is.formula(form), - is.character(response) || is.null(response), - tolower(type) %in% c('inla','forward','all')) - # --- # - # Response - if(is.null(response)) response <- all.vars(form)[1] - # Formula terms - te <- attr(stats::terms.formula(form),'term.label') - # Varnames - varnames <- all.vars(form) - varnames <- varnames[varnames %notin% c('spde','spatial.field','observed','Intercept')] # Exclude things not necessarily needed in there - # Variable length - fl <- length(varnames) - # --- # - assertthat::assert_that(fl>0, !is.null(response)) - - if(tolower(type) == 'inla'){ - # INLA modelling groups - # Instead of selecting variables piece by piece, consider individual groups - form_temp <- c() - val_int <- grep(pattern = 'Intercept',x = te, value = T) - val_lin <- grep(pattern = 'linear',x = te, value = T) - val_rw1 <- grep(pattern = 'rw1',x = te,value = TRUE) - # Alternative quadratic variables in case rw1 fails - if(length(val_rw1)>0){ - val_quad <- all.vars(as.formula(paste('observed ~ ', paste0(val_rw1,collapse = '+'))))[-1] - } else { val_quad <- all.vars(as.formula(paste('observed ~ ', paste0(val_lin,collapse = '+'))))[-1] } - val_spde <- grep(pattern = 'spde',x = te,value = TRUE) - val_ofs <- grep(pattern = 'offset',x = te,value = TRUE) - - # Construct formulas --- - # Original form - form_temp <- c(form_temp, deparse1(form)) - - # Intercept only - form_temp <- c(form_temp, - paste0(response,' ~ 0 +', paste(val_int,collapse = ' + ') )) - - # Add all linear variables as base - form_temp <- c(form_temp, - paste0(response,' ~ 0 +', paste(val_int,collapse = ' + '), - '+', paste0(varnames, collapse = ' + ') )) - - # Intercept + linear effect - if(length(val_lin)>0){ - form_temp <- c(form_temp, - paste0(response,' ~ 0 + ', paste(val_int,collapse = ' + '), - '+', - paste(val_lin,collapse = ' + ')) - ) - } - # Intercept + rw1 effects (if existing) - if(length(val_rw1)>0){ - form_temp <- c(form_temp, - paste0(response,' ~ 0 + ', paste(val_int,collapse = ' + '), - '+', - paste(val_rw1,collapse = ' + ')) - ) - } - # Alternative formulation using quadratic - form_temp <- c(form_temp, - paste0(response,' ~ 0 + ', paste(val_int,collapse = ' + '), - '+', - paste0('I(',val_quad,'^2)',collapse = ' + ')) - ) - - # Intercept + spde - if(length(val_spde)>0){ - form_temp <- c(form_temp, - paste0(response,' ~ 0 + ', paste(val_int,collapse = ' + '), - '+', - paste(val_spde,collapse = ' + ')) - ) - } - # Intercept + linear + spde - if(length(val_spde)>0 && length(val_lin)>0){ - form_temp <- c(form_temp, - paste0(response,' ~ 0 + ', paste(val_int,collapse = ' + '), - '+', - paste(val_spde,collapse = ' + '),'+',paste(val_lin,collapse = ' + ')) - ) - form_temp <- c(form_temp, - paste0(response,' ~ 0 +', paste(val_int,collapse = ' + '), - '+', paste0(varnames, collapse = ' + '), - '+',paste(val_spde,collapse = ' + '))) - - } - # intercept + rw1 + spde - if(length(val_spde)>0 && length(val_rw1)>0){ - form_temp <- c(form_temp, - paste0(response,' ~ 0 + ', paste(val_int,collapse = ' + '), - '+', - paste(val_spde,collapse = ' + '),'+',paste(val_rw1,collapse = ' + ')) - ) - } - if(length(val_spde)>0){ - # Quad replacement - form_temp <- c(form_temp, - paste0(response,' ~ 0 + ', paste(val_int,collapse = ' + '), - '+', - paste(val_spde,collapse = ' + '),'+',paste0('I(',val_quad,'^2)',collapse = ' + ')) - ) - } - # intercept + 
linear + rw1 + spde - if(length(val_rw1)>0 && length(val_lin)>0 && length(val_spde)>0){ - form_temp <- c(form_temp, - paste0(response,' ~ 0 + ', paste(val_int,collapse = ' + '), - '+', - paste(val_lin,collapse = ' + '),'+',paste(val_rw1,collapse = ' + '),'+',paste(val_spde,collapse = ' + ')) - ) - - } - if(length(val_spde)>0){ - # Quad replacement - form_temp <- c(form_temp, - paste0(response,' ~ 0 + ', paste(val_int,collapse = ' + '), - '+', - paste(val_lin,collapse = ' + '),'+',paste0('I(',val_quad,'^2)',collapse = ' + '),'+',paste(val_spde,collapse = ' + ')) - ) - } - # intercept + linear + offset - if(length(val_lin)>0 && length(val_ofs)>0){ - form_temp <- c(form_temp, - paste0(response,' ~ 0 + ', paste(val_int,collapse = ' + '), - '+', - paste(val_lin,collapse = ' + '),'+',paste(val_ofs,collapse = ' + ')) - ) - } - # intercept + linear + rw1 + offset - if(length(val_rw1)>0 && length(val_lin)>0 && length(val_ofs)>0){ - form_temp <- c(form_temp, - paste0(response,' ~ 0 + ', paste(val_int,collapse = ' + '), - '+', - paste(val_lin,collapse = ' + '),'+',paste(val_rw1,collapse = ' + '),'+',paste(val_ofs,collapse = ' + ')) - ) - } - if(length(val_lin)>0 && length(val_ofs)>0){ - # Quad replacement - form_temp <- c(form_temp, - paste0(response,' ~ 0 + ', paste(val_int,collapse = ' + '), - '+', - paste(val_lin,collapse = ' + '),'+', - paste0('I(',val_quad,'^2)',collapse = ' + '),'+',paste(val_ofs,collapse = ' + ')) - ) - } - - # Other types of variable selection - } else if(tolower(type) == 'forward'){ - # Forward variable addition - # Note this ignores unique combinations - form_temp <- c() - for(i in 1:fl) { - new <- paste0(response, '~ 0 + ',paste(val_int,collapse = '+'),'+', - paste(varnames[1:i],collapse = ' + ') ) - form_temp <- c(form_temp, new) - } - - } else if(tolower(type) == 'all'){ - assertthat::assert_that('purrr' %in% loadedNamespaces()) - # Construct all possible unique combinations - varnames_comb <- 1:length(varnames) %>% - purrr::map(~ combn(varnames, .x) %>% apply(2, list) %>% unlist(recursive = F)) %>% - unlist(recursive = F) - - form_temp <- varnames_comb %>% purrr::map(~paste0(response, " ~ ", - paste(val_int,collapse = '+'),'+', - paste(.x, collapse = " + ")) ) - } - - return(form_temp) -} - -#' Outlier detection via reverse jackknife -#' -#' @description -#' Implemententation of a Reverse Jackknife procedure as described by Chapman (2005). -#' Can be used to identify outliers in environmental predictors or predictions. -#' @param vals A [`numeric`] vector from which outliers are to be identified and removed. -#' @param procedure [`character`] denoting what to do with outliers. -#' Options include: \code{'missing'} (Default) and \code{'remove'}, with the former replacing the outliers with \code{NA} and the latter removing them. -#' @references -#' * Chapman, A.D. (2005) Principles and Methods of Data Cleaning - Primary Species and Species- Occurrence Data, version 1.0. Report for the Global Biodiversity Information Facility, Copenhagen. 
-#' @source [`bioGeo`] package code served as inspiration -#' @keywords utils -#' @noRd -rm_outlier_revjack <- function(vals, procedure = "missing"){ - assertthat::assert_that( - is.numeric(vals), - length(vals)>0, - is.character(procedure) - ) - procedure <- match.arg(procedure, c("missing", "remove"), several.ok = FALSE) - - v2 <- vals # Make a copy - vals <- unique(vals) - lgh <- length(vals) - 1 - t1 <- (0.95 * sqrt(length(vals))) + 0.2 - x <- sort(vals) - y <- rep(0, lgh) - for (i in seq_len(lgh)) { - x1 <- x[i + 1] - if (x[i] < mean(vals, na.rm = TRUE)) { - y[i] <- (x1 - x[i]) * (mean(vals, na.rm = TRUE) - x[i]) - } else { - y[i] <- (x1 - x[i]) * (x1 - mean(vals, na.rm = TRUE)) - } - } - my <- mean(y, na.rm = TRUE) - z <- y / (sqrt(sum((y - my)^2, na.rm = TRUE) / lgh)) - out <- rep(0, length(v2)) - if (any(z > t1, na.rm = TRUE)) { - f <- which(z > t1) - vals <- x[f] - if (vals < median(x, na.rm = TRUE)) { - xa <- (v2 <= vals) * 1 - out <- out + xa - } - if (vals > median(x, na.rm = TRUE)) { - xb <- (v2 >= vals) * 1 - out <- out + xb - } - } else { - out <- out - } - # Which ones are outliers? - found <- which(out == 1) - if(length(found)>0) { - if(procedure == "missing") v2[found] <- NA else v2 <- v2[-found] - } - return(v2) -} -#' Filter a set of correlated predictors to fewer ones -#' -#' @param env A [`data.frame`] with extracted environmental covariates for a given species. -#' @param keep A [`vector`] with variables to keep regardless. -#' @param cutoff A [`numeric`] variable specifying the maximal correlation cutoff. -#' @param method Which method to use for constructing the correlation matrix (Options: \code{'pearson'}| \code{'spearman'}| \code{'kendal'}) -#' @concept Code inspired from the [`caret`] package -#' @keywords utils -#' @returns vector of variable names to exclude -find_correlated_predictors <- function( env, keep = NULL, cutoff = 0.7, method = 'pearson'){ - # Security checks - assertthat::assert_that(is.data.frame(env), - is.character(method), - is.numeric(cutoff), - is.null(keep) || is.vector(keep) - ) - keep <- keep[keep %in% names(env)] # Remove those not in the data.frame. 
For instance if a spatial effect is selected - if(!is.null(keep) || length(keep) == 0) x <- env %>% dplyr::select(-keep) else x <- env - - # Removing non-numeric columns - non.numeric.columns <- colnames(x)[!sapply(x, is.numeric)] - x <- x[, !(colnames(x) %in% non.numeric.columns)] - - # Get all variables that are singular or unique in value - singular_var <- which(round( apply(x, 2, var),4) == 0) - if(length(singular_var)>0) x <- x[,-singular_var] - - # Calculate correlation matrix - cm <- cor(x, method = method) - - # Copied from the \code{caret} package to avoid further dependencies - if (any(!stats::complete.cases(cm))) stop("The correlation matrix has some missing values.") - averageCorr <- colMeans(abs(cm)) - averageCorr <- as.numeric(as.factor(averageCorr)) - cm[lower.tri(cm, diag = TRUE)] <- NA - - # Determine combinations over cutoff - combsAboveCutoff <- which(abs(cm) > cutoff) - colsToCheck <- ceiling(combsAboveCutoff/nrow(cm)) - rowsToCheck <- combsAboveCutoff%%nrow(cm) - - # Exclude columns with variables over average correlation - colsToDiscard <- averageCorr[colsToCheck] > averageCorr[rowsToCheck] - rowsToDiscard <- !colsToDiscard - - # Get columns to discard - deletecol <- c(colsToCheck[colsToDiscard], rowsToCheck[rowsToDiscard]) - deletecol <- unique(deletecol) - - # Which variables to discard - o <- names(env)[deletecol] - if(length(singular_var)>0) o <- unique( c(o, names(singular_var) ) ) - o -} - -#' Apply the adaptive best subset selection framework on a set of predictors -#' -#' @description -#' This is a wrapper function to fit the adaptive subset selection procedure outlined -#' in Zhu et al. (2021) and Zhu et al. (2020). -#' @param env A [`data.frame`] with extracted environmental covariates for a given species. -#' @param observed A [`vector`] with the observed response variable. -#' @param family A [`character`] indicating the family the observational data originates from. -#' @param tune.type [`character`] indicating the type used for subset evaluation. -#' Options are \code{c("gic", "ebic", "bic", "aic", "cv")} as listed in [abess]. -#' @param lambda A [`numeric`] single lambda value for regularized best subset selection (Default: \code{0}). -#' @param weight Observation weights. When weight = \code{NULL}, we set weight = \code{1} for each observation as default. -#' @param keep A [`vector`] with variables to keep regardless (Default: \code{NULL}). -#' @references -#' * abess: A Fast Best Subset Selection Library in Python and R. Jin Zhu, Liyuan Hu, Junhao Huang, Kangkang Jiang, Yanhang Zhang, Shiyun Lin, Junxian Zhu, Xueqin Wang (2021). arXiv preprint arXiv:2110.09697. -#' * A polynomial algorithm for best-subset selection problem. Junxian Zhu, Canhong Wen, Jin Zhu, Heping Zhang, Xueqin Wang. Proceedings of the National Academy of Sciences Dec 2020, 117 (52) 33117-33123; doi: 10.1073/pnas.2014241117 -#' @keywords utils, internal -#' @returns vector of variable names to exclude -find_subset_of_predictors <- function( env, observed, family, tune.type = "cv", lambda = 0, - weight = NULL, keep = NULL){ - # Security checks - assertthat::assert_that(is.data.frame(env), - is.vector(observed), - is.numeric(lambda), - is.character(tune.type), - is.null(weight) || is.vector(weight) - ) - assertthat::assert_that( - length(observed) == nrow(env), msg = "Number of observation unequal to number of covariate rows." 
- ) - # Match family and type - family <- match.arg(family, c("gaussian", "binomial", "poisson", "cox", "mgaussian", "multinomial", - "gamma"), several.ok = FALSE) - tune.type <- match.arg(tune.type, c("gic", "ebic", "bic", "aic", "cv"), several.ok = FALSE) - - # Check that abess package is available - check_package("abess") - if(!isNamespaceLoaded("abess")) { attachNamespace("abess");requireNamespace('abess') } - - # Build model - abess_fit <- abess::abess(x = env, - y = observed, - family = family, - tune.type = tune.type, - weight = weight, - lambda = lambda, - always.include = keep, - nfolds = 100, # Increase from default 5 - num.threads = 0 - ) - - if(anyNA(coef(abess_fit)[,1]) ) { - # Refit with minimum support size - abess_fit <- abess::abess(x = env, - y = observed, - family = family, - lambda = lambda, - tune.type = tune.type, - weight = weight, - always.include = keep, - nfolds = 100, # Increase from default 5 - # Minimum support site of 10% of number of covariates - support.size = ceiling(ncol(env) * 0.1), - num.threads = 0 - ) - - } - # Get best vars - co <- coef(abess_fit, support.size = abess_fit[["best.size"]]) - co <- names( which(co[,1] != 0)) - co <- co[grep("Intercept", co, ignore.case = TRUE, invert = TRUE)] - # Make some checks on the list of reduced variables - if(length(co) <= 2) { - warning("Abess was likely to rigours. Likely to low signal-to-noise ratio.") - return(NULL) - } else { - co - } -} - -#' Aggregate count observations to a grid -#' -#' @description -#' This function aggregates provided point data to a reference grid, by, -#' depending on the type, either counting the number of observations per grid cell -#' or aggregating them via a sum. -#' @param df A [`sf`], [`data.frame`] or [`tibble`] object containing point data. -#' @param template A [`RasterLayer`] object that is aligned with the predictors. -#' @param field_occurrence A [`character`] name of the column containing the presence information (Default: \code{observed}). -#' @returns A [`sf`] object with the newly aggregated points. -#' @keywords internal -#' @noRd -aggregate_observations2grid <- function(df, template, field_occurrence = 'observed'){ - assertthat::assert_that( - is.data.frame(df) || inherits(df, 'sf') || tibble::is_tibble(df), - is.Raster(template), - is.character(field_occurrence), - assertthat::has_name(df, field_occurrence) - ) - # Try and guess the geometry - if(!inherits(df, 'sf')) df <- guess_sf(df) - assertthat::assert_that(inherits(df, 'sf'), msg = "Could not convert input to sf. 
Prepare data first.") - # Add coordinates if not present - if(!assertthat::has_name(df, 'x') && !assertthat::has_name(df, 'y')) { - df$x <- sf::st_coordinates(df[attr(df, "sf_column")])[,1] - df$y <- sf::st_coordinates(df[attr(df, "sf_column")])[,2] - } - - # First take presence observations and rasterize them to reduce them to a count per grid cell - if( max(df[[field_occurrence]],na.rm = TRUE) > 1){ - # Count the sum of them - pres <- raster::rasterize(df, field = field_occurrence, - template, fun = 'sum', background = 0) - - } else { - # Simply count them - if(inherits(df, 'sf')) df <- df %>% sf::st_drop_geometry() - pres <- raster::rasterize(df[,c("x","y")], - template, fun = 'count', background = 0) - } - assertthat::assert_that( - is.Raster(pres), is.finite(raster::cellStats(pres, "max")) - ) - if(inherits(df, 'sf')) df <- df %>% sf::st_drop_geometry() - # Get cell ids - ce <- raster::cellFromXY(pres, df[,c("x","y")]) - # Remove any NA if present - if(anyNA(ce)) ce <- subset(ce, stats::complete.cases(ce)) - # Get new presence data - obs <- cbind( - data.frame(observed = raster::values(pres)[ce], - raster::xyFromCell(pres, ce) # Center of cell - ) - ) %>% - # Unique to remove any duplicate values (otherwise double counted cells) - unique() - - # Convert to sf again - obs <- sf::st_as_sf(obs, coords = c("x", "y"), crs = sf::st_crs(df)) - obs$x <- sf::st_coordinates(obs[attr(obs, "sf_column")])[,1] - obs$y <- sf::st_coordinates(obs[attr(obs, "sf_column")])[,2] - - # Set CRS again - if(is.na(sf::st_crs(obs))){ - suppressWarnings( - obs <- sf::st_set_crs(obs, value = sf::st_crs(template)) - ) - } - return(obs) -} - -#' Get all occurrence point locations -#' -#' @description -#' This is a small helper function that simply goes over all biodiversity sets in -#' the model object. -#' **This function is intended to only run within ibis and with the model packages created by it.** -#' @param model A [`list`] object containing the biodiversity and predictor objects. -#' @param include_absences A [`logical`] of whether absences should be included (Default: \code{FALSE}). -#' @returns A [`sf`] object with the newly aggregated points. -#' @keywords internal -#' @noRd -collect_occurrencepoints <- function(model, include_absences = FALSE){ - assertthat::assert_that( - is.list(model), - assertthat::has_name(model, "id"), - assertthat::has_name(model, "biodiversity"), - is.logical(include_absences) - ) - - # Get the locations - locs <- do.call("rbind", - lapply(model$biodiversity, function(x){ - z <- x$observations - if(!include_absences) z <- subset(z, observed > 0) - o <- sf::st_coordinates( guess_sf( z )[,1:2]) - o <- as.matrix(o) - colnames(o) <- c("x", "y") - return(o) - } - ) - ) - assertthat::assert_that( - is.matrix(locs), nrow(locs)>1 - ) - return(locs) -} +#' Inverse of in call for convenience +#' Calculates the set of entries not present in the second vector +#' +#' @param a First [`vector`] object. +#' @param b Second [`vector`] object. +#' @keywords internal, utils +#' @noRd +`%notin%` = function(a, b){!(a %in% b)} + +#' Custom messaging function for scripts +#' +#' @description +#' This functions prints a message with a custom header and colour. +#' @param title The title in the log output +#' @param col A [`character`] indicating the text colour to be used. Supported are 'green' / 'yellow' / 'red' +#' @param ... 
Any additional outputs or words for display
+#' @examples
+#' myLog("[Setup]", "red", "Some error occurred during data preparation.")
+#' @keywords internal, utils
+#' @export
+myLog <- function(title = "[Processing]", col = 'green', ...) {
+  assertthat::assert_that(col %in% c('green','yellow','red'))
+  textwrap <- switch (col,
+                      'green' = text_green,
+                      'yellow' = text_yellow,
+                      'red' = text_red
+  )
+  message(textwrap(
+    paste0(title,' ', Sys.time(), " | ", ...)
+  )
+  )
+}
+
+#' Colour helpers for message logs
+#' @param text A [`character`].
+#' @keywords internal, utils
+#' @aliases text_red
+#' @noRd
+text_red <- function(text) { paste0('\033[31m',text,'\033[39m') }
+#' @inheritParams text_red
+#' @aliases text_yellow
+text_yellow <- function(text) { paste0('\033[33m',text,'\033[39m') }
+#' @inheritParams text_red
+#' @aliases text_green
+text_green <- function(text) { paste0('\033[32m',text,'\033[39m') }
+
+#' Calculate the mode
+#' @param x A [`vector`] of values or characters.
+#' @keywords utils
+#' @noRd
+mode <- function(x) {
+  ux <- unique(x)
+  ux[which.max(tabulate(match(x, ux)))]
+}
+#' Check whether a package is installed and can be loaded
+#'
+#' @param x The [character] name of a package from which a function is needed.
+#' @keywords internal, utils
+#' @noRd
+check_package <- function(x) {
+  assertthat::assert_that(is.character(x))
+  if (!requireNamespace(x, quietly = TRUE)) {
+    stop(paste0("Package \"",x,"\" needed for this function to work. Please install it."),
+         call. = FALSE)
+  }
+}
+
+#' Camel case conversion of a string
+#'
+#' @param x A [`vector`] or [`character`] object.
+#' @keywords internal, utils
+#' @noRd
+to_camelcase <- function(x){
+  assertthat::assert_that(is.character(x) || is.vector(x))
+  substr(x, 1, 1) <- toupper(
+    substr(x, 1, 1)
+  )
+  x
+}
+
+#' Atomic representation of a name
+#'
+#' Return a pretty character representation of an object with elements and
+#' names.
+#' @param x A [`vector`] object
+#' @param description An optional [`character`] description appended to the count.
+#' @return [`character`] object.
+#' @concept function taken from `prioritizr` package
+#' @keywords internal, utils
+#' @examples
+#' name_atomic(letters)
+#' name_atomic(letters, "characters")
+#' @noRd
+name_atomic <- function(x, description = "") {
+  n <- length(x)
+  if (nchar(description) > 0)
+    description <- paste0(" ", description)
+  if (length(x) <= 4) {
+    x <- x[seq_len(min(length(x), 4))]
+  } else {
+    x <- c(x[seq_len(min(length(x), 3))], "...")
+  }
+  paste0(paste(x, collapse = ", "), " (", n, description, ")")
+}
+
+#' Align text at newline characters
+#'
+#' Format text by adding a certain number of spaces after newline characters.
+#'
+#' @param x [`character`] text.
+#' @param n [`integer`] number of spaces.
+#' @return [`character`].
+#' @concept function taken from `prioritizr` package
+#'
+#' @examples
+#' # make some text
+#' original_text <- "animals: horse\npig\nbear"
+#'
+#' # print text
+#' message(original_text)
+#'
+#' # this looks really ugly so we will align it
+#' aligned_text <- align_text(original_text, 9)
+#'
+#' # print aligned text
+#' message(aligned_text)
+#'
+#' @keywords utils
+#' @noRd
+align_text <- function(x, n) {
+  assertthat::assert_that(assertthat::is.string(x), assertthat::is.count(n))
+  if (!grepl("\n", x))
+    return(x)
+  return(gsub("\n", paste0("\n", paste(rep(" ", n), collapse = "")), x,
+              fixed = TRUE))
+}
+
+#' Convert character to capital text
+#'
+#' @param x [`character`] text.
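+#' @returns A [`character`] with the first letter of each word capitalized.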
+#' @examples
+#' capitalize_text('presence')
+#' capitalize_text('ducks are the best birds')
+#'
+#' @keywords utils
+#' @noRd
+capitalize_text <- function(x) {
+  assertthat::assert_that(is.character(x))
+  s <- strsplit(x, " ")[[1]]
+  paste(toupper(substring(s, 1,1)), substring(s, 2),
+        sep="", collapse=" ")
+}
+
+#' Convert character to formula object
+#'
+#' @param formula A [`character`] to be coerced to a [`formula`], or \code{NULL}.
+#' @keywords utils
+#' @noRd
+to_formula <- function(formula){
+  # Convert to formula object
+  if(!is.null(formula)) {
+    formula = stats::as.formula(formula)
+  } else {
+    # Assign a new waiver object
+    formula = new_waiver()
+  }
+  return(formula)
+}
+
+#' Guess time to POSIXct
+#'
+#' @description
+#' This little wrapper converts a vector of time objects and ensures it is in POSIXct format.
+#' @param vec A [`vector`] with [`numeric`], [`character`] or [`POSIXct`] data
+#' @keywords utils
+#' @noRd
+to_POSIXct <- function(vec){
+  # Parse differently depending on the input class
+  if(inherits(vec, "POSIXct")){
+    out <- vec
+  } else if(inherits(vec, "units") || inherits(vec, "Date")){
+    check_package("units")
+    # Try and format directly to posixct
+    out <- as.POSIXct(vec)
+    assertthat::assert_that(any(!is.na.POSIXlt(out)))
+  } else if(inherits(vec, "numeric")){
+    if(all(nchar(vec)==4)){
+      # Assume that the numeric is a year
+      vec <- paste0(vec, "-01-01")
+      out <- as.POSIXct(vec)
+    }
+  } else if(inherits(vec, "character")){
+    # Try and convert to posix directly
+    out <- as.POSIXct(vec)
+    if(any(is.na.POSIXlt(out))){
+      # Situation not yet encountered. To be added when use cases are known.
+      message("Date formats probably need some more prior handling.")
+    }
+  }
+  return(out)
+}
+
+#' Hingeval transformation
+#' @param x A [`vector`] with numeric values.
+#' @param min [`numeric`] minimum value for the hinge transformation
+#' @param max [`numeric`] maximum value for the hinge transformation
+#' @keywords internal
+#' @noRd
+hingeval <- function (x, min, max) ifelse(is.na(x),NA, pmin(1, pmax(0, (x - min)/(max - min),na.rm = TRUE),na.rm = TRUE))
+
+#' Threshold transformation
+#' @param x A [`vector`] with numeric values.
+#' @param knot [`numeric`] threshold value as cutoff.
+#' @keywords internal
+#' @noRd
+thresholdval <- function(x, knot) {
+  ifelse(x >= knot, 1, 0)
+}
+
+#' Parallel computation of a function
+#'
+#' @description
+#' Some computations take a considerable amount of time to execute. This
+#' function provides a helper wrapper for running functions of the [`apply`]
+#' family on the supplied input, optionally in parallel.
+#' @details
+#' By default, the [parallel] package is used for parallel computation,
+#' however an option exists to use the [future] package instead.
+#' @param X A [`list`], [`data.frame`] or [`matrix`] object to be fed to a single core or parallel [apply] call.
+#' @param FUN A [`function`] passed on for computation.
+#' @param cores A [numeric] of the number of cores to use (Default: \code{1}).
+#' @param approach [`character`] for the parallelization approach taken (Options: \code{"parallel"} or \code{"future"}).
+#' @param export_packages A [`vector`] with packages to export for use on parallel nodes (Default: \code{NULL}).
+#' @param ... Further arguments passed on to \code{FUN}.
+#' @examples
+#' \dontrun{
+#' run_parallel(list(1:10, 11:20), sum, cores = 2)
+#' }
+#' @keywords utils
+#' @noRd
+run_parallel <- function (X, FUN, cores = 1, approach = "parallel", export_packages = NULL, ...) {
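+  # Note: non-list inputs (data.frame/matrix) are split into roughly equal
+  # row chunks (one per core) before dispatch and re-assembled with rbind
+  # at the end; list inputs are dispatched element-wise as-is.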
+  assertthat::assert_that(
+    is.list(X) || is.data.frame(X) || is.matrix(X),
+    is.function(FUN),
+    is.numeric(cores),
+    is.null(export_packages) || is.character(export_packages)
+  )
+  # Match approach
+  approach <- match.arg(approach, c("parallel", "future"), several.ok = FALSE)
+
+  # Collect dots
+  dots <- list(...)
+
+  if(!is.list(X)){
+    # Convert input object to a list of split parameters
+    n_vars <- nrow(X)
+    chunk_size <- ceiling(n_vars / cores)
+    n_chunks <- ceiling(n_vars / chunk_size)
+    chunk_list <- vector(length = n_chunks, mode = "list")
+
+    for (i in seq_len(n_chunks)) {
+      if ((chunk_size * (i - 1) + 1) <= n_vars) {
+        chunk <- (chunk_size * (i - 1) + 1):(min(c(chunk_size *
+                                                     i, n_vars)))
+        chunk_list[[i]] <- X[chunk, ]
+      }
+    }
+    assertthat::assert_that(sum(sapply(chunk_list, nrow)) == nrow(X))
+    X <- chunk_list;rm(chunk_list)
+    input_type = "data.frame" # Save to aggregate later again
+  } else { input_type = "list"}
+
+  # Process depending on cores
+  if (cores == 1) {
+    out <- lapply(X, FUN, ...)
+  } else {
+    if(approach == "parallel"){
+      # check_package('doParallel')
+      # require(foreach)
+      # isTRUE(Sys.info()[["sysname"]] == "Windows")
+      # On unix-alikes forking via mclapply can be used directly
+      if(!isTRUE(Sys.info()[["sysname"]] == "Windows") && is.list(X)) {
+        out <- parallel::mclapply(X = X, FUN = FUN, mc.cores = cores,
+                                  ...)
+      } else {
+        # On Windows (or for non-list input) use a PSOCK cluster instead
+        cl <- parallel::makePSOCKcluster(cores)
+        on.exit(parallel::stopCluster(cl))
+        if(!is.null(export_packages)){
+          # Send all specified packages to the cluster
+          for(val in export_packages){
+            parallel::clusterExport(cl, varlist = package_function_names(val),
+                                    envir = as.environment(asNamespace(val)))
+          }
+        }
+        out <- parallel::parLapply(cl = cl, X = X, fun = FUN, ...)
+      }
+      # out <- foreach::foreach(z = iterators::iter(X),
+      #                         .combine = ifelse(input_type!="list", "rbind", foreach:::defcombine),
+      #                         .inorder = FALSE,
+      #                         .multicombine = TRUE,
+      #                         .errorhandling = 'stop',
+      #                         .export = c("FUN"),
+      #                         .packages = export_packages,
+      #                         ...
+      # ) %dopar% { return( FUN(z, ...) ) }
+    } else {
+      # Check that future is loaded
+      check_package('future.apply')
+      # Check that plan for future has been set up!
+      assertthat::assert_that( getOption("ibis.use_future") == TRUE,
+                               msg = "Set up a future plan via [ibis_future] to use this approach.")
+      # future_lapply follows the plan set via future::plan(), so no cluster object is passed
+      out <- future.apply::future_lapply(X = X, FUN = FUN, ...)
+    }
+  }
+  # If input data was not a list, combine again
+  if(input_type != "list" && is.list(out)){
+    out <- do.call(rbind, out)
+  }
+  return( out )
+}
+
+#' Clamp a predictor matrix by given values
+#'
+#' @description
+#' To limit extreme extrapolation it is possible to \code{'clamp'} an existing projection to the range
+#' of predictor values observed during model training.
+#' This function takes an internal model matrix and restricts the values seen in the predictor matrix
+#' to those observed during training.
+#' @note This function is meant to be used within a certain [`engine`] or within [`project`].
+#' @param model A [`list`] with the input data used for inference. Created during model setup.
+#' @param pred An optional [`data.frame`] of the prediction container.
+#' @returns A [`data.frame`] with the clamped predictors.
+#' @keywords utils
+#' @keywords internal
+#' @references Phillips, S. J., Anderson, R. P., Dudík, M., Schapire, R. E., & Blair, M. E. (2017). Opening the black box: An open-source release of Maxent. Ecography. https://doi.org/10.1111/ecog.03049
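+#' @examples
+#' \dontrun{
+#' # A minimal sketch: assumes `model` is an internal ibis model list and
+#' # `newdata` a data.frame of predictors for projection (both hypothetical).
+#' newdata_clamped <- clamp_predictions(model, pred = newdata)
+#' }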
+clamp_predictions <- function(model, pred){
+  assertthat::assert_that(
+    is.list(model),
+    assertthat::has_name(model, "biodiversity"),
+    (is.data.frame(pred) || is.matrix(pred)) || missing(pred)
+  )
+
+  # For each biodiversity dataset, calculate the range of predictors observed
+  vars_clamp <- data.frame()
+  for(ds in model$biodiversity){
+    # Calculate range for each variable
+    rr <- apply(ds$predictors[,ds$predictors_names], 2, function(z) range(z, na.rm = TRUE)) |>
+      t() |> as.data.frame() |> tibble::rownames_to_column("variable")
+    names(rr) <- c("variable", "min", "max")
+    vars_clamp <- rbind(vars_clamp, rr)
+    rm(rr)
+  }
+  # Aggregate the ranges if a variable occurs in multiple datasets
+  if(anyDuplicated(vars_clamp$variable)){
+    o1 <- stats::aggregate(min ~ variable, data = vars_clamp,
+                           FUN = min)
+    o2 <- stats::aggregate(max ~ variable, data = vars_clamp,
+                           FUN = max)
+    vars_clamp <- merge(o1,o2)
+  }
+  # --- #
+  # If no prediction container is supplied, clamp the model predictors
+  if(missing(pred)) pred <- model$predictors
+
+  # Now clamp the prediction matrix with the clamped variables
+  for (v in intersect(vars_clamp$variable, names(pred))) {
+    pred[, v] <- pmin(
+      pmax(pred[, v], vars_clamp$min[vars_clamp$variable==v] ),
+      vars_clamp$max[vars_clamp$variable==v])
+  }
+
+  assertthat::assert_that( is.data.frame(pred) || is.matrix(pred),
+                           nrow(pred)>0)
+  return(pred)
+}
+
+#' Create formula matrix
+#'
+#' Function to create a list of formulas with all possible combinations of variables
+#' @param form An input [`formula`] object.
+#' @param response A [`character`] object giving the response. (Default: \code{NULL})
+#' @param type Currently implemented are \code{'inla'} (variable groups),
+#' \code{'all'} (all possible combinations) or \code{'forward'}.
+#' @returns A [`vector`] of [`character`] objects, each of which can be coerced to a [`formula`].
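+#' @details
+#' For example, with three predictors and \code{type = 'forward'}, three
+#' increasingly complex candidate formulas are returned, each adding one
+#' further variable to the intercept-only base.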
+#' @examples \dontrun{ +#' formula_combinations(form) +#' } +#' @keywords utils +#' @noRd +formula_combinations <- function(form, response = NULL, type= 'forward'){ + assertthat::assert_that(is.formula(form), + is.character(response) || is.null(response), + tolower(type) %in% c('inla','forward','all')) + # --- # + # Response + if(is.null(response)) response <- all.vars(form)[1] + # Formula terms + te <- attr(stats::terms.formula(form),'term.label') + # Varnames + varnames <- all.vars(form) + varnames <- varnames[varnames %notin% c('spde','spatial.field','observed','Intercept')] # Exclude things not necessarily needed in there + # Variable length + fl <- length(varnames) + # --- # + assertthat::assert_that(fl>0, !is.null(response)) + + if(tolower(type) == 'inla'){ + # INLA modelling groups + # Instead of selecting variables piece by piece, consider individual groups + form_temp <- c() + val_int <- grep(pattern = 'Intercept',x = te, value = T) + val_lin <- grep(pattern = 'linear',x = te, value = T) + val_rw1 <- grep(pattern = 'rw1',x = te,value = TRUE) + # Alternative quadratic variables in case rw1 fails + if(length(val_rw1)>0){ + val_quad <- all.vars(stats::as.formula(paste('observed ~ ', paste0(val_rw1,collapse = '+'))))[-1] + } else { val_quad <- all.vars(stats::as.formula(paste('observed ~ ', paste0(val_lin,collapse = '+'))))[-1] } + val_spde <- grep(pattern = 'spde',x = te,value = TRUE) + val_ofs <- grep(pattern = 'offset',x = te,value = TRUE) + + # Construct formulas --- + # Original form + form_temp <- c(form_temp, deparse1(form)) + + # Intercept only + form_temp <- c(form_temp, + paste0(response,' ~ 0 +', paste(val_int,collapse = ' + ') )) + + # Add all linear variables as base + form_temp <- c(form_temp, + paste0(response,' ~ 0 +', paste(val_int,collapse = ' + '), + '+', paste0(varnames, collapse = ' + ') )) + + # Intercept + linear effect + if(length(val_lin)>0){ + form_temp <- c(form_temp, + paste0(response,' ~ 0 + ', paste(val_int,collapse = ' + '), + '+', + paste(val_lin,collapse = ' + ')) + ) + } + # Intercept + rw1 effects (if existing) + if(length(val_rw1)>0){ + form_temp <- c(form_temp, + paste0(response,' ~ 0 + ', paste(val_int,collapse = ' + '), + '+', + paste(val_rw1,collapse = ' + ')) + ) + } + # Alternative formulation using quadratic + form_temp <- c(form_temp, + paste0(response,' ~ 0 + ', paste(val_int,collapse = ' + '), + '+', + paste0('I(',val_quad,'^2)',collapse = ' + ')) + ) + + # Intercept + spde + if(length(val_spde)>0){ + form_temp <- c(form_temp, + paste0(response,' ~ 0 + ', paste(val_int,collapse = ' + '), + '+', + paste(val_spde,collapse = ' + ')) + ) + } + # Intercept + linear + spde + if(length(val_spde)>0 && length(val_lin)>0){ + form_temp <- c(form_temp, + paste0(response,' ~ 0 + ', paste(val_int,collapse = ' + '), + '+', + paste(val_spde,collapse = ' + '),'+',paste(val_lin,collapse = ' + ')) + ) + form_temp <- c(form_temp, + paste0(response,' ~ 0 +', paste(val_int,collapse = ' + '), + '+', paste0(varnames, collapse = ' + '), + '+',paste(val_spde,collapse = ' + '))) + + } + # intercept + rw1 + spde + if(length(val_spde)>0 && length(val_rw1)>0){ + form_temp <- c(form_temp, + paste0(response,' ~ 0 + ', paste(val_int,collapse = ' + '), + '+', + paste(val_spde,collapse = ' + '),'+',paste(val_rw1,collapse = ' + ')) + ) + } + if(length(val_spde)>0){ + # Quad replacement + form_temp <- c(form_temp, + paste0(response,' ~ 0 + ', paste(val_int,collapse = ' + '), + '+', + paste(val_spde,collapse = ' + '),'+',paste0('I(',val_quad,'^2)',collapse = ' + ')) + ) + } + 
# intercept + linear + rw1 + spde
+    if(length(val_rw1)>0 && length(val_lin)>0 && length(val_spde)>0){
+      form_temp <- c(form_temp,
+                     paste0(response,' ~ 0 + ', paste(val_int,collapse = ' + '),
+                            '+',
+                            paste(val_lin,collapse = ' + '),'+',paste(val_rw1,collapse = ' + '),'+',paste(val_spde,collapse = ' + '))
+      )
+
+    }
+    if(length(val_spde)>0 && length(val_lin)>0){
+      # Quad replacement
+      form_temp <- c(form_temp,
+                     paste0(response,' ~ 0 + ', paste(val_int,collapse = ' + '),
+                            '+',
+                            paste(val_lin,collapse = ' + '),'+',paste0('I(',val_quad,'^2)',collapse = ' + '),'+',paste(val_spde,collapse = ' + '))
+      )
+    }
+    # intercept + linear + offset
+    if(length(val_lin)>0 && length(val_ofs)>0){
+      form_temp <- c(form_temp,
+                     paste0(response,' ~ 0 + ', paste(val_int,collapse = ' + '),
+                            '+',
+                            paste(val_lin,collapse = ' + '),'+',paste(val_ofs,collapse = ' + '))
+      )
+    }
+    # intercept + linear + rw1 + offset
+    if(length(val_rw1)>0 && length(val_lin)>0 && length(val_ofs)>0){
+      form_temp <- c(form_temp,
+                     paste0(response,' ~ 0 + ', paste(val_int,collapse = ' + '),
+                            '+',
+                            paste(val_lin,collapse = ' + '),'+',paste(val_rw1,collapse = ' + '),'+',paste(val_ofs,collapse = ' + '))
+      )
+    }
+    if(length(val_lin)>0 && length(val_ofs)>0){
+      # Quad replacement
+      form_temp <- c(form_temp,
+                     paste0(response,' ~ 0 + ', paste(val_int,collapse = ' + '),
+                            '+',
+                            paste(val_lin,collapse = ' + '),'+',
+                            paste0('I(',val_quad,'^2)',collapse = ' + '),'+',paste(val_ofs,collapse = ' + '))
+      )
+    }
+
+    # Other types of variable selection
+  } else if(tolower(type) == 'forward'){
+    # Forward variable addition
+    # Note this ignores unique combinations
+    # Intercept terms are needed in every candidate formula
+    val_int <- grep(pattern = 'Intercept', x = te, value = TRUE)
+    form_temp <- c()
+    for(i in 1:fl) {
+      new <- paste0(response, '~ 0 + ',paste(val_int,collapse = '+'),'+',
+                    paste(varnames[1:i],collapse = ' + ') )
+      form_temp <- c(form_temp, new)
+    }
+
+  } else if(tolower(type) == 'all'){
+    # Construct all possible unique combinations
+    # Intercept terms are needed in every candidate formula
+    val_int <- grep(pattern = 'Intercept', x = te, value = TRUE)
+    varnames_comb <- lapply(1:length(varnames), function(i){
+      utils::combn(varnames, i) |> apply(2, list) |> unlist(recursive = FALSE)
+    }) |> unlist(recursive = FALSE)
+
+    form_temp <- lapply(varnames_comb, function(i) {
+      paste0(response, " ~ ", paste(val_int,collapse = '+'),'+', paste(i, collapse = " + "))
+    })
+  }
+
+  return(form_temp)
+}
+
+#' Outlier detection via reverse jackknife
+#'
+#' @description
+#' Implementation of a reverse jackknife procedure as described by Chapman (2005).
+#' Can be used to identify outliers in environmental predictors or predictions.
+#' @param vals A [`numeric`] vector from which outliers are to be identified and removed.
+#' @param procedure [`character`] denoting what to do with outliers.
+#' Options include: \code{'missing'} (Default) and \code{'remove'}, with the former replacing the outliers with \code{NA} and the latter removing them.
+#' @references
+#' * Chapman, A.D. (2005) Principles and Methods of Data Cleaning - Primary Species and Species-Occurrence Data, version 1.0. Report for the Global Biodiversity Information Facility, Copenhagen. 
+#' @source [`bioGeo`] package code served as inspiration +#' @keywords utils +#' @noRd +rm_outlier_revjack <- function(vals, procedure = "missing"){ + assertthat::assert_that( + is.numeric(vals), + length(vals)>0, + is.character(procedure) + ) + procedure <- match.arg(procedure, c("missing", "remove"), several.ok = FALSE) + + v2 <- vals # Make a copy + vals <- unique(vals) + lgh <- length(vals) - 1 + t1 <- (0.95 * sqrt(length(vals))) + 0.2 + x <- sort(vals) + y <- rep(0, lgh) + for (i in seq_len(lgh)) { + x1 <- x[i + 1] + if (x[i] < mean(vals, na.rm = TRUE)) { + y[i] <- (x1 - x[i]) * (mean(vals, na.rm = TRUE) - x[i]) + } else { + y[i] <- (x1 - x[i]) * (x1 - mean(vals, na.rm = TRUE)) + } + } + my <- mean(y, na.rm = TRUE) + z <- y / (sqrt(sum((y - my)^2, na.rm = TRUE) / lgh)) + out <- rep(0, length(v2)) + if (any(z > t1, na.rm = TRUE)) { + f <- which(z > t1) + vals <- x[f] + if (vals < stats::median(x, na.rm = TRUE)) { + xa <- (v2 <= vals) * 1 + out <- out + xa + } + if (vals > stats::median(x, na.rm = TRUE)) { + xb <- (v2 >= vals) * 1 + out <- out + xb + } + } else { + out <- out + } + # Which ones are outliers? + found <- which(out == 1) + if(length(found)>0) { + if(procedure == "missing") v2[found] <- NA else v2 <- v2[-found] + } + return(v2) +} + +#' Aggregate count observations to a grid +#' +#' @description +#' This function aggregates provided point data to a reference grid, by, +#' depending on the type, either counting the number of observations per grid cell +#' or aggregating them via a sum. +#' @param df A [`sf`], [`data.frame`] or [`tibble`] object containing point data. +#' @param template A [`RasterLayer`] object that is aligned with the predictors. +#' @param field_occurrence A [`character`] name of the column containing the presence information (Default: \code{observed}). +#' @returns A [`sf`] object with the newly aggregated points. +#' @keywords internal +#' @noRd +aggregate_observations2grid <- function(df, template, field_occurrence = 'observed'){ + assertthat::assert_that( + is.data.frame(df) || inherits(df, 'sf') || tibble::is_tibble(df), + is.Raster(template), + is.character(field_occurrence), + assertthat::has_name(df, field_occurrence) + ) + # Try and guess the geometry + if(!inherits(df, 'sf')) df <- guess_sf(df) + assertthat::assert_that(inherits(df, 'sf'), msg = "Could not convert input to sf. 
Prepare data first.") + # Add coordinates if not present + if(!assertthat::has_name(df, 'x') && !assertthat::has_name(df, 'y')) { + df$x <- sf::st_coordinates(df[attr(df, "sf_column")])[,1] + df$y <- sf::st_coordinates(df[attr(df, "sf_column")])[,2] + } + + # First take presence observations and rasterize them to reduce them to a count per grid cell + if( max(df[[field_occurrence]],na.rm = TRUE) > 1){ + # Count the sum of them + pres <- raster::rasterize(df, field = field_occurrence, + template, fun = 'sum', background = 0) + + } else { + # Simply count them + if(inherits(df, 'sf')) df <- df |> sf::st_drop_geometry() + pres <- raster::rasterize(df[,c("x","y")], + template, fun = 'count', background = 0) + } + assertthat::assert_that( + is.Raster(pres), is.finite(raster::cellStats(pres, "max")) + ) + if(inherits(df, 'sf')) df <- df |> sf::st_drop_geometry() + # Get cell ids + ce <- raster::cellFromXY(pres, df[,c("x","y")]) + # Remove any NA if present + if(anyNA(ce)) ce <- subset(ce, stats::complete.cases(ce)) + # Get new presence data + obs <- cbind( + data.frame(observed = raster::values(pres)[ce], + raster::xyFromCell(pres, ce) # Center of cell + ) + ) |> + # Unique to remove any duplicate values (otherwise double counted cells) + unique() + + # Convert to sf again + obs <- sf::st_as_sf(obs, coords = c("x", "y"), crs = sf::st_crs(df)) + obs$x <- sf::st_coordinates(obs[attr(obs, "sf_column")])[,1] + obs$y <- sf::st_coordinates(obs[attr(obs, "sf_column")])[,2] + + # Set CRS again + if(is.na(sf::st_crs(obs))){ + suppressWarnings( + obs <- sf::st_set_crs(obs, value = sf::st_crs(template)) + ) + } + return(obs) +} + +#' Get all occurrence point locations +#' +#' @description +#' This is a small helper function that simply goes over all biodiversity sets in +#' the model object. +#' **This function is intended to only run within ibis and with the model packages created by it.** +#' @param model A [`list`] object containing the biodiversity and predictor objects. +#' @param include_absences A [`logical`] of whether absences should be included (Default: \code{FALSE}). +#' @returns A [`sf`] object with the newly aggregated points. +#' @keywords internal +#' @noRd +collect_occurrencepoints <- function(model, include_absences = FALSE){ + assertthat::assert_that( + is.list(model), + assertthat::has_name(model, "id"), + assertthat::has_name(model, "biodiversity"), + is.logical(include_absences) + ) + + # Get the locations + locs <- do.call("rbind", + lapply(model$biodiversity, function(x){ + z <- x$observations + if(!include_absences) z <- subset(z, observed > 0) + o <- sf::st_coordinates( guess_sf( z )[,1:2]) + o <- as.matrix(o) + colnames(o) <- c("x", "y") + return(o) + } + ) + ) + assertthat::assert_that( + is.matrix(locs), nrow(locs)>1 + ) + return(locs) +} diff --git a/R/validate.R b/R/validate.R index 9eb76621..83d8d286 100644 --- a/R/validate.R +++ b/R/validate.R @@ -1,498 +1,498 @@ -#' Validation of distribution object -#' -#' @description This function conducts a comprehensive model evaluation based on -#' either on the fitted point data or any supplied independent. -#' **Currently only supporting point datasets. For validation of integrated models more work is needed.** -#' @param mod A fitted [`BiodiversityDistribution`] object with set predictors. Alternatively one can also -#' provide directly a [`RasterLayer`], however in this case the `point` layer also needs to be provided. -#' @param method Should the validation be conducted on continuous metrics or thresholded? See Details. 
-#' @param layer In case multiple layers exist, which one to use? (Default: \code{'mean'}). -#' @param point A [`sf`] object with type `POINT` or `MULTIPOINT`. -#' @param point_column A [`character`] vector with the name of the column containing the independent observations. -#' (Default: \code{'observed'}). -#' @param ... Other parameters that are passed on. Currently unused. -#' @returns Return a tidy [`tibble`] with validation results. -#' @details The \code{validate} function does not work for all datasets equally. -#' @note If you use the Boyce Index, cite the original Hirzel et al. (2006) paper. -#' -#' @references -#' * Liu, C., White, M., Newell, G., 2013. Selecting thresholds for the prediction of species occurrence with presence-only data. J. Biogeogr. 40, 778–789. https://doi.org/10.1111/jbi.12058 -#' * Hirzel, A. H., Le Lay, G., Helfer, V., Randin, C., & Guisan, A. (2006). Evaluating the ability of habitat suitability models to predict species presences. Ecological modelling, 199(2), 142-152. -#' @examples -#' \dontrun{ -#' # Assuming that mod is a distribution object and has a thresholded layer -#' mod <- threshold(mod, method = "TSS") -#' validate(mod, method = "discrete") -#' } -#' @name validate -#' @aliases validate -#' @keywords train -#' @exportMethod validate -#' @export -NULL -methods::setGeneric("validate", - signature = methods::signature("mod"), - function(mod, method = 'continuous', layer = "mean", - point = NULL, point_column = 'observed', ...) standardGeneric("validate")) - -#' @name validate -#' @rdname validate -#' @usage \S4method{validate}{ANY, character, sf, character, character}(mod, method, point, layer, point_column) -methods::setMethod( - "validate", - methods::signature(mod = "ANY"), - function(mod, method = 'continuous', layer = "mean", - point = NULL, point_column = 'observed', ...){ - assertthat::assert_that( - inherits(mod, "DistributionModel"), - inherits(point, 'sf') || is.null(point), - is.null(point_column) || is.character(point_column), - is.character(layer), - is.character(method) - ) - assertthat::assert_that( "prediction" %in% mod$show_rasters(),msg = "No prediction of the fitted model found!" ) - # Check that independent data is provided and if so that the used column is there - if(!is.null(point)){ - assertthat::assert_that(is.character(point_column), - utils::hasName(point, point_column), - anyNA(point[[point_column]])==FALSE - ) - } - # Match method to be sure - method <- match.arg(method, c('continuous', 'discrete'), several.ok = FALSE) - - # Get settings from model object - settings <- mod$settings - - # Get prediction and threshold if available - prediction <- mod$get_data('prediction')[[layer]] - if( any(grep('threshold', mod$show_rasters())) ){ - tr_lyr <- grep('threshold', mod$show_rasters(),value = TRUE) - if(length(tr_lyr)>1) warning("There appear to be multiple thresholds. Using the first one.") - threshold <- mod$get_data(tr_lyr[1]) - # Get mean layer if there are multiple - if( grep(layer, names(threshold),value = TRUE ) != "") threshold <- threshold[[grep(layer, names(threshold),value = TRUE )]] - } else { threshold <- NULL } - - # Check that threshold and method match - if(is.null(threshold) && method == 'discrete'){ - if(getOption('ibis.setupmessages')) myLog('[Validation]','red','No threshold data found. 
Switching to continuous validation metrics.') - method <- 'continuous' - } - # If mode truncate was used, also switch to continuous data - if((attr(threshold,'format')!="binary") && method == "discrete"){ - if(attr(threshold,'format')!="binary"){ - if(getOption('ibis.setupmessages')) myLog('[Validation]','red','Only truncated threshold found. Switching to continuous validation metrics.') - method <- 'continuous' - } - } - - # Check whether limits were applied and if so, set background to 0 everywhere for validation - if(settings$get("has_limits")){ - temp <- mod$model$predictors_object$get_data()[[1]]; temp[!is.na(temp)] <- 0 - if(!is.null(threshold)){ - new <- sum(threshold, temp, na.rm = TRUE); new <- raster::mask(new, temp) - attr(new,'format') <- attr(threshold,'format') - if(attr(threshold,'format')=="binary") new <- raster::ratify(new) - threshold <- new - rm(new) - } - # Same for prediction layer, where missing data are set to 0 for validation - prediction <- sum(prediction, temp, na.rm = TRUE) - prediction <- raster::mask(prediction, temp) - rm(temp) - } - - # Get/check point data - if(!is.null(point)){ - if(is.factor(point[[point_column]])){ - point[[point_column]] <- as.numeric(as.character(point[[point_column]])) - } - assertthat::assert_that( - unique(sf::st_geometry_type(point)) %in% c('POINT', 'MULTIPOINT'), - # Check that the point data has presence-absence information - utils::hasName(point, point_column), - !is.na(sf::st_crs(point)$proj) - ) - # If sf is different, reproject to prediction - if(sf::st_crs(point)!= sf::st_crs(prediction)){ - point <- sf::st_transform(point, crs = sf::st_crs(prediction) ) - } - if(!utils::hasName(point, "name")) point$name <- "Validation data" # Assign a name for validation. Assuming only one dataset is present - if(!utils::hasName(point, "type")) point$type <- ifelse(length(unique(point[[point_column]]))>1, "poipa", "poipo") # Type depending on input - # Ensure comparable columns - point <- subset(point, select = c(point_column, "name", "type", attr(point, "sf_column") )) - } else { - # TODO: Think about how to do validation with non-point data - if(getOption('ibis.setupmessages')) myLog('[Validation]','red','Validating model with non-independent training data. 
Results can be misleading!') - # Get all point datasets and combine them - point <- do.call(sf:::rbind.sf, - lapply(mod$model$biodiversity, function(y){ - o <- guess_sf(y$observations) - o$name <- y$name; o$type <- y$type - subset(o, select = c(point_column, "name", "type", attr(o, "sf_column"))) - } ) - ) %>% tibble::remove_rownames() - if(is.factor(point[[point_column]])){ - point[[point_column]] <- as.numeric(as.character(point[[point_column]])) - } - } - assertthat::assert_that(nrow(point)>0, - utils::hasName(point, point_column)) - # --- # - # Do the extraction - df <- as.data.frame(point) - df$pred <- raster::extract(prediction, point) - if(!is.null(threshold)) df$pred_tr <- raster::extract(threshold, point) - # Remove any sfc column if present - if(!is.null(attr(df, "sf_column"))) df[[attr(df, "sf_column")]] <- NULL - # Remove any NAs - df <- subset(df, complete.cases(df)) - if(nrow(df) < 2) stop("Validation was not possible owing to missing data.") - # --- # - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Validation]','green','Calculating validation statistics') - - # Output container - results <- data.frame() - for(dataset in unique(df$name)){ - # Subset to name - df2 <- subset.data.frame(df, name == dataset) - - # Check that absence points are present, otherwise add some. - # Reason is that some engine such as inlabru don't save their integration points - if( !any(df2[[point_column]]==0) && method == "discrete"){ - if(getOption('ibis.setupmessages')) myLog('[Validation]','yellow','No absence data found for threshold. Generating random points.') - - # Use the pseudo-absence generation - o <- add_pseudoabsence(df = point, - field_occurrence = point_column, - template = threshold, - settings = pseudoabs_settings(background = threshold,nrpoints = nrow(df2)*2)) |> - subset(subset = observed == 0) - - abs <- list(); abs[[point_column]] <- o[[point_column]] - abs[["name"]] <- dataset; abs[["type"]] <- "poipo" - abs[["pred"]] <- raster::extract(prediction, o); abs[["pred_tr"]] <- raster::extract(threshold, o) - - df2 <- rbind(df2, as.data.frame(abs)) - } - # Validate the threshold - out <- try({.validatethreshold(df2 = df2, point_column = point_column, mod = mod, - name = dataset, method = method, id = as.character(mod$id)) - }) - if(inherits(out, "try-error")) return(NULL) - results <- rbind.data.frame(results, out) - } - # Return result - return(results) - } -) - -#' @name validate -#' @rdname validate -#' @usage \S4method{validate}{RasterLayer, character, sf, character}(mod, method, point, point_column) -methods::setMethod( - "validate", - methods::signature(mod = "RasterLayer"), - function(mod, method = 'continuous', layer = NULL, point = NULL, point_column = 'observed', ...){ - assertthat::assert_that( - is.Raster(mod), - inherits(point, 'sf'), - is.character(method), - is.character(point_column) - ) - method <- match.arg(method, c("discrete", "continuous"),several.ok = FALSE) - - # If mode truncate was used, also switch to continuous data - if( attr(threshold,'format')!="binary" && method == "discrete"){ - if(getOption('ibis.setupmessages')) myLog('[Validation]','red','Only truncated threshold found. 
Switching to continuous validation metrics.') - method <- 'continuous' - } - assertthat::assert_that(nrow(point)>0, - utils::hasName(point, point_column)) - point <- subset(point, select = point_column) - - # Correct point column in case larger 1 - # FIXME: Only reasonable for discrete validation - if(method == "discrete"){ - if(any(point[[point_column]] > 1)) point[[point_column]] <- ifelse(point[[point_column]]>=1, 1, 0) - } - - # --- # - df <- as.data.frame(point) - df$pred_tr <- get_rastervalue(point, mod)[[names(mod)]] - if(method == "continuous") df$pred <- get_rastervalue(point, mod)[[names(mod)]] - - # Remove any sfc column if present - if(!is.null(attr(df, "sf_column"))) df[[attr(df, "sf_column")]] <- NULL - # Remove any NAs - df <- subset(df, complete.cases(df)) - if(nrow(df) < 2) stop("Validation was not possible owing to missing data.") - # --- # - # Messenger - if(getOption('ibis.setupmessages')) myLog('[Validation]','green','Calculating validation statistics') - - if(!is.null(layer)) dataset <- layer else dataset <- "External" - - # Validate the threshold - out <- .validatethreshold(df, point_column = point_column, mod = NULL, - name = dataset, method = method, id = NA) - return(out) - } -) - -#' @noRd -#' @keywords internal -.validatethreshold <- function(df2, point_column, mod = NULL, name = NULL, method = 'fixed', id = NULL) { - - if(method == 'continuous'){ - # continuous evaluation - assertthat::assert_that(utils::hasName(df2, 'pred'), - utils::hasName(df2, point_column) - ) - #### Calculating Boyce index as in Hirzel et al. 2006 - # fit: A vector or Raster-Layer containing the predicted suitability values - # obs: A vector containing the predicted suitability values or xy-coordinates (if fit is a Raster-Layer) of the validation points (presence records) - # nclass : number of classes or vector with classes threshold. 
If nclass=0, Boyce index is calculated with a moving window (see next parameters) - # windows.w : width of the moving window (by default 1/10 of the suitability range) - # res : resolution of the moving window (by default 101 focals) - # PEplot : if True, plot the predicted to expected ratio along the suitability class - ecospat.boyce <- - function(fit, - obs, - nclass = 0, - window.w = "default", - res = 100, - PEplot = TRUE){ - boycei <- function(interval, obs, fit) { - fit.bin <- fit - obs.bin <- obs - fit.bin[fit[] >= interval[1] & fit[] <= interval[2]] <- "i" - fit.bin[fit.bin != "i"] <- 0 - obs.bin[obs[] >= interval[1] & obs[] <= interval[2]] <- "i" - obs.bin[obs.bin != "i"] <- 0 - pi <- length(which(obs.bin == "i")) / length(obs) - ei <- length(which(fit.bin == "i")) / length(fit.bin) - fi <- pi / ei - return(fi) - } - - if (window.w == "default") { - window.w <- (max(fit, na.rm = TRUE) - min(fit, na.rm = TRUE)) / 10 - } - - interval <- c(min(fit, na.rm = TRUE), max(fit, na.rm = TRUE)) - mini <- interval[1] - maxi <- interval[2] - - if (nclass == 0) { - vec.mov <- - seq( - from = mini, - to = maxi - window.w, - by = (maxi - mini - window.w) / res - ) - - vec.mov[res + 1] <- - vec.mov[res + 1] + 1 #Trick to avoid error with closed interval in R - - interval <- cbind(vec.mov, vec.mov + window.w) - } else if (length(nclass) > 1) { - vec.mov <- c(mini, nclass) - interval <- cbind(vec.mov, c(vec.mov[-1], maxi)) - } else if (nclass > 0 & length(nclass) < 2) { - vec.mov <- seq(from = mini, - to = maxi, - by = (maxi - mini) / nclass) - } - - f <- apply(interval, 1, boycei, obs, fit) - to.keep <- which(f != "NaN") # index to keep no NaN data - f <- f[to.keep] - - if (length(f) < 2) { - b <- NA #at least two points are necessary to draw a correlation - } else { - r <- c(1:length(f))[f != c(f[-1], FALSE)] #index to remove successive duplicates - b <- stats::cor(f[r], vec.mov[to.keep][r], method = "spearman") # calculation of the spearman correlation (i.e. Boyce index) after removing successive duplicated values - } - - HS <- apply(interval, 1, sum) / 2 # mean habitat suitability in the moving window - HS[length(HS)] <- HS[length(HS)] - 1 #Correction of the 'trick' to deal with closed interval - HS <- HS[to.keep] # exlude the NaN - - if (PEplot == TRUE) { - plot( - HS, - f, - xlab = "Habitat suitability", - ylab = "Predicted/Expected ratio", - col = "grey", - cex = 0.75 - ) - graphics::points(HS[r], f[r], pch = 19, cex = 0.75) - - } - - results <- list(F.ratio = f, - Spearman.cor = round(b, 3), - HS = HS) - return(results) - } - - # Function for Root-mean square error - RMSE <- function(pred, obs, na.rm = TRUE) { - sqrt(mean((pred - obs)^2, na.rm = na.rm)) - } - # Mean absolute error - MAE <- function(pred, obs, na.rm = TRUE) { - mean(abs(pred - obs), na.rm = na.rm) - } - # Function for log loss/cross-entropy loss. 
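The RMSE and MAE helpers defined just above are plain error metrics over paired predictions and observations. A toy illustration, assuming made-up vectors (the log-loss helper follows below):

# Illustrative only: toy observations and predictions
obs  <- c(0, 0, 1, 1, 2)
pred <- c(0.1, 0.4, 0.9, 1.2, 1.8)
sqrt(mean((pred - obs)^2))  # RMSE, as in the helper above
mean(abs(pred - obs))       # MAE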
- Poisson_LogLoss <- function(y_pred, y_true) { - eps <- 1e-15 - y_pred <- pmax(y_pred, eps) - Poisson_LogLoss <- mean(log(gamma(y_true + 1)) + y_pred - log(y_pred) * y_true) - return(Poisson_LogLoss) - } - # Normalized Gini Coefficient - NormalizedGini <- function(y_pred, y_true) { - SumGini <- function(y_pred, y_true) { - y_true_sort <- y_true[order(y_pred, decreasing = TRUE)] - y_random <- 1:length(y_pred) / length(y_pred) - y_Lorentz <- cumsum(y_true_sort) / sum(y_true_sort) - SumGini <- sum(y_Lorentz - y_random) - return(SumGini) - } - NormalizedGini <- SumGini(y_pred, y_true) / SumGini(y_true, y_true) - return(NormalizedGini) - } - # Create output container - out <- data.frame( - modelid = id, - name = name, - method = method, - metric = c('n','rmse', 'mae', - 'logloss','normgini', - 'cont.boyce'), - value = NA - ) - # - # - out$value[out$metric=='n'] <- nrow(df2) # Number of records - out$value[out$metric=='rmse'] <- RMSE(pred = df2$pred, obs = df2[[point_column]]) # RMSE - out$value[out$metric=='mae'] <- MAE(pred = df2$pred, obs = df2[[point_column]]) # Mean absolute error - out$value[out$metric=='normgini'] <- NormalizedGini(y_pred = df2$pred, y_true = df2[[point_column]]) - - if(!is.null(mod)){ - if( any( sapply(mod$model$biodiversity, function(x) x$family) == "binomial" ) ){ - LogLoss <- function(y_pred, y_true) { - LogLoss <- -mean(y_true * log(y_pred) + (1 - y_true) * log(1 - y_pred)) - return(LogLoss) - } - out$value[out$metric=='logloss'] <- LogLoss(y_pred = df2$pred, y_true = df2[[point_column]]) - } else { - out$value[out$metric=='logloss'] <- Poisson_LogLoss(y_pred = df2$pred, y_true = df2[[point_column]]) - } - } else { - # Assume Poisson distributed values, calculate log-loss - out$value[out$metric=='logloss'] <- Poisson_LogLoss(y_pred = df2$pred, y_true = df2[[point_column]]) - } - - - # Boyce index. Wrap in try since is known to crash - try({ - if("modEvA" %in% utils::installed.packages()[,1]){ - check_package("modEvA") - suppressWarnings( - boi <- modEvA::Boyce(obs = df2[[point_column]], pred = df2$pred, plot = FALSE) - ) - } else { - # Run boyce a few times as average sample ? 
- # obs <- df2[df2[[point_column]]>0,] - # abs <- df2[sample(which(df2[[point_column]]==0), size = nrow(obs)), ] - # test <- rbind(obs, abs) - boi <- ecospat.boyce(obs = df2[[point_column]], fit = df2$pred, nclass = 0, PEplot = FALSE) - boi$Boyce <- boi$Spearman.cor - } - }, silent = TRUE) - if(exists('boi')) out$value[out$metric=='cont.boyce'] <- boi$Boyce - - } else { - # discrete evaluation - assertthat::assert_that(utils::hasName(df2, 'pred_tr'), - length(unique(df2[[point_column]])) > 1, - msg = "It appears as either the observed data or the threshold does not allow discrete validation.") - # For discrete functions to work correctly, ensure that all values are 0/1 - df2[[point_column]] <- ifelse(df2[[point_column]] > 0, 1, 0 ) - # Build the confusion matrix - ta <- sum((df2["pred_tr"] == 0) & (df2[point_column] == 0)) # True absence - fp <- sum((df2["pred_tr"] == 1) & (df2[point_column] == 0)) # False presence - fa <- sum((df2["pred_tr"] == 0) & (df2[point_column] == 1)) # False absence - tp <- sum((df2["pred_tr"] == 1) & (df2[point_column] == 1)) # True presence - - # Binary brier Score - BS <- function(pred, obs, na.rm = TRUE) { - if(assertthat::see_if(length(unique(pred)) <= 2, - length(unique(obs)) <= 2)){ - mean( (pred - obs)^2, na.rm = na.rm) - } else return(NA) - } - - # Output data.frame - out <- data.frame( - modelid = id, - name = name, - method = method, - metric = c('n','auc','overall.accuracy', 'true.presence.ratio', - 'precision','sensitivity', 'specificity', - 'tss', 'f1', 'logloss', - 'expected.accuracy', 'kappa', 'brier.score'), - value = NA - ) - - # Accuracy indices - out$value[out$metric=='n'] <- N <- ta + fp + fa + tp # Total number of records - out$value[out$metric=='overall.accuracy'] <- OA <- (tp + ta) / N # Overall accuracy - out$value[out$metric=='true.presence.ratio'] <- FOM <- tp / (tp + fp + fa) # True presence classifications - out$value[out$metric=='precision'] <- precision <- tp / (tp + fp) # Precision - out$value[out$metric=='sensitivity'] <- Sensitivity <- tp / (tp + fa) # Sensitivity - out$value[out$metric=='specificity'] <- Specificity <- ta / (ta + fp) # Specificity - out$value[out$metric=='tss'] <- TSS <- Sensitivity + Specificity - 1 # True Skill statistic - out$value[out$metric=='f1'] <- 2 * (precision * Sensitivity) / (precision + Sensitivity) # F1 score - Prob_1and1 <- ((tp + fp) / N) * ((tp + fa) / N) # Probability presence - Prob_0and0 <- ((ta + fa) / N) * ((ta + fp) / N) # Probability absence - out$value[out$metric=='expected.accuracy'] <- Expected_accuracy <- Prob_1and1 + Prob_0and0 # Expected accuracy - out$value[out$metric=='kappa'] <- (OA - Expected_accuracy) / (1 - Expected_accuracy) - - if("modEvA" %in% installed.packages()[,1]){ - check_package("modEvA") - # Calculate AUC - out$value[out$metric=='auc'] <- modEvA::AUC(obs = df2[[point_column]], pred = df2[['pred_tr']], simplif = TRUE, plot = FALSE) - } - # Add brier score - out$value[out$metric=='brier.score'] <- BS(obs = df2[[point_column]], pred = df2[['pred_tr']]) - - # Evaluate Log loss / Cross-Entropy Loss for a predicted probability measure - # FIXME: Hacky. 
This likely won't work with specific formulations - if(!is.null(mod)){ - if( any( sapply(mod$model$biodiversity, function(x) x$family) == "binomial" ) ){ - LogLoss <- function(y_pred, y_true) { - LogLoss <- -mean(y_true * log(y_pred) + (1 - y_true) * log(1 - y_pred)) - return(LogLoss) - } - out$value[out$metric=='logloss'] <- LogLoss(y_pred = df2$pred, y_true = df2[[point_column]]) - } else { - # Function for log loss/cross-entropy loss. - Poisson_LogLoss <- function(y_pred, y_true) { - eps <- 1e-15 - y_pred <- pmax(y_pred, eps) - Poisson_LogLoss <- mean(log(gamma(y_true + 1)) + y_pred - log(y_pred) * y_true) - return(Poisson_LogLoss) - } - out$value[out$metric=='logloss'] <- Poisson_LogLoss(y_pred = df2$pred_tr, y_true = df2[[point_column]]) - } - } - } # End of discrete clause - return(out) -} +#' Validation of distribution object +#' +#' @description This function conducts a comprehensive model evaluation based on +#' either on the fitted point data or any supplied independent. +#' **Currently only supporting point datasets. For validation of integrated models more work is needed.** +#' @param mod A fitted [`BiodiversityDistribution`] object with set predictors. Alternatively one can also +#' provide directly a [`RasterLayer`], however in this case the `point` layer also needs to be provided. +#' @param method Should the validation be conducted on continuous metrics or thresholded? See Details. +#' @param layer In case multiple layers exist, which one to use? (Default: \code{'mean'}). +#' @param point A [`sf`] object with type `POINT` or `MULTIPOINT`. +#' @param point_column A [`character`] vector with the name of the column containing the independent observations. +#' (Default: \code{'observed'}). +#' @param ... Other parameters that are passed on. Currently unused. +#' @returns Return a tidy [`tibble`] with validation results. +#' @details The \code{validate} function does not work for all datasets equally. +#' @note If you use the Boyce Index, cite the original Hirzel et al. (2006) paper. +#' +#' @references +#' * Liu, C., White, M., Newell, G., 2013. Selecting thresholds for the prediction of species occurrence with presence-only data. J. Biogeogr. 40, 778–789. https://doi.org/10.1111/jbi.12058 +#' * Hirzel, A. H., Le Lay, G., Helfer, V., Randin, C., & Guisan, A. (2006). Evaluating the ability of habitat suitability models to predict species presences. Ecological modelling, 199(2), 142-152. +#' @examples +#' \dontrun{ +#' # Assuming that mod is a distribution object and has a thresholded layer +#' mod <- threshold(mod, method = "TSS") +#' validate(mod, method = "discrete") +#' } +#' @name validate +#' @aliases validate +#' @keywords train +#' @exportMethod validate +#' @export +NULL +methods::setGeneric("validate", + signature = methods::signature("mod"), + function(mod, method = 'continuous', layer = "mean", + point = NULL, point_column = 'observed', ...) 
standardGeneric("validate")) + +#' @name validate +#' @rdname validate +#' @usage \S4method{validate}{ANY, character, sf, character, character}(mod, method, point, layer, point_column) +methods::setMethod( + "validate", + methods::signature(mod = "ANY"), + function(mod, method = 'continuous', layer = "mean", + point = NULL, point_column = 'observed', ...){ + assertthat::assert_that( + inherits(mod, "DistributionModel"), + inherits(point, 'sf') || is.null(point), + is.null(point_column) || is.character(point_column), + is.character(layer), + is.character(method) + ) + assertthat::assert_that( "prediction" %in% mod$show_rasters(),msg = "No prediction of the fitted model found!" ) + # Check that independent data is provided and if so that the used column is there + if(!is.null(point)){ + assertthat::assert_that(is.character(point_column), + utils::hasName(point, point_column), + anyNA(point[[point_column]])==FALSE + ) + } + # Match method to be sure + method <- match.arg(method, c('continuous', 'discrete'), several.ok = FALSE) + + # Get settings from model object + settings <- mod$settings + + # Get prediction and threshold if available + prediction <- mod$get_data('prediction')[[layer]] + if( any(grep('threshold', mod$show_rasters())) ){ + tr_lyr <- grep('threshold', mod$show_rasters(),value = TRUE) + if(length(tr_lyr)>1) warning("There appear to be multiple thresholds. Using the first one.") + threshold <- mod$get_data(tr_lyr[1]) + # Get mean layer if there are multiple + if( grep(layer, names(threshold),value = TRUE ) != "") threshold <- threshold[[grep(layer, names(threshold),value = TRUE )]] + } else { threshold <- NULL } + + # Check that threshold and method match + if(is.null(threshold) && method == 'discrete'){ + if(getOption('ibis.setupmessages')) myLog('[Validation]','red','No threshold data found. Switching to continuous validation metrics.') + method <- 'continuous' + } + # If mode truncate was used, also switch to continuous data + if((attr(threshold,'format')!="binary") && method == "discrete"){ + if(attr(threshold,'format')!="binary"){ + if(getOption('ibis.setupmessages')) myLog('[Validation]','red','Only truncated threshold found. 
Switching to continuous validation metrics.') + method <- 'continuous' + } + } + + # Check whether limits were applied and if so, set background to 0 everywhere for validation + if(settings$get("has_limits")){ + temp <- mod$model$predictors_object$get_data()[[1]]; temp[!is.na(temp)] <- 0 + if(!is.null(threshold)){ + new <- sum(threshold, temp, na.rm = TRUE); new <- raster::mask(new, temp) + attr(new,'format') <- attr(threshold,'format') + if(attr(threshold,'format')=="binary") new <- raster::ratify(new) + threshold <- new + rm(new) + } + # Same for prediction layer, where missing data are set to 0 for validation + prediction <- sum(prediction, temp, na.rm = TRUE) + prediction <- raster::mask(prediction, temp) + rm(temp) + } + + # Get/check point data + if(!is.null(point)){ + if(is.factor(point[[point_column]])){ + point[[point_column]] <- as.numeric(as.character(point[[point_column]])) + } + assertthat::assert_that( + unique(sf::st_geometry_type(point)) %in% c('POINT', 'MULTIPOINT'), + # Check that the point data has presence-absence information + utils::hasName(point, point_column), + !is.na(sf::st_crs(point)$proj) + ) + # If sf is different, reproject to prediction + if(sf::st_crs(point)!= sf::st_crs(prediction)){ + point <- sf::st_transform(point, crs = sf::st_crs(prediction) ) + } + if(!utils::hasName(point, "name")) point$name <- "Validation data" # Assign a name for validation. Assuming only one dataset is present + if(!utils::hasName(point, "type")) point$type <- ifelse(length(unique(point[[point_column]]))>1, "poipa", "poipo") # Type depending on input + # Ensure comparable columns + point <- subset(point, select = c(point_column, "name", "type", attr(point, "sf_column") )) + } else { + # TODO: Think about how to do validation with non-point data + if(getOption('ibis.setupmessages')) myLog('[Validation]','red','Validating model with non-independent training data. Results can be misleading!') + # Get all point datasets and combine them + point <- do.call(sf:::rbind.sf, + lapply(mod$model$biodiversity, function(y){ + o <- guess_sf(y$observations) + o$name <- y$name; o$type <- y$type + subset(o, select = c(point_column, "name", "type", attr(o, "sf_column"))) + } ) + ) |> tibble::remove_rownames() + if(is.factor(point[[point_column]])){ + point[[point_column]] <- as.numeric(as.character(point[[point_column]])) + } + } + assertthat::assert_that(nrow(point)>0, + utils::hasName(point, point_column)) + # --- # + # Do the extraction + df <- as.data.frame(point) + df$pred <- raster::extract(prediction, point) + if(!is.null(threshold)) df$pred_tr <- raster::extract(threshold, point) + # Remove any sfc column if present + if(!is.null(attr(df, "sf_column"))) df[[attr(df, "sf_column")]] <- NULL + # Remove any NAs + df <- subset(df, stats::complete.cases(df)) + if(nrow(df) < 2) stop("Validation was not possible owing to missing data.") + # --- # + # Messenger + if(getOption('ibis.setupmessages')) myLog('[Validation]','green','Calculating validation statistics') + + # Output container + results <- data.frame() + for(dataset in unique(df$name)){ + # Subset to name + df2 <- subset.data.frame(df, name == dataset) + + # Check that absence points are present, otherwise add some. + # Reason is that some engine such as inlabru don't save their integration points + if( !any(df2[[point_column]]==0) && method == "discrete"){ + if(getOption('ibis.setupmessages')) myLog('[Validation]','yellow','No absence data found for threshold. 
Generating random points.')
+
+        # Use the pseudo-absence generation
+        o <- add_pseudoabsence(df = point,
+                               field_occurrence = point_column,
+                               template = threshold,
+                               settings = pseudoabs_settings(background = threshold,nrpoints = nrow(df2)*2)) |>
+          subset(subset = observed == 0)
+
+        abs <- list(); abs[[point_column]] <- o[[point_column]]
+        abs[["name"]] <- dataset; abs[["type"]] <- "poipo"
+        abs[["pred"]] <- raster::extract(prediction, o); abs[["pred_tr"]] <- raster::extract(threshold, o)
+
+        df2 <- rbind(df2, as.data.frame(abs))
+      }
+      # Validate the threshold
+      out <- try({.validatethreshold(df2 = df2, point_column = point_column, mod = mod,
+                                     name = dataset, method = method, id = as.character(mod$id))
+      })
+      if(inherits(out, "try-error")) return(NULL)
+      results <- rbind.data.frame(results, out)
+    }
+    # Return result
+    return(results)
+  }
+)
+
+#' @name validate
+#' @rdname validate
+#' @usage \S4method{validate}{RasterLayer, character, sf, character}(mod, method, point, point_column)
+methods::setMethod(
+  "validate",
+  methods::signature(mod = "RasterLayer"),
+  function(mod, method = 'continuous', layer = NULL, point = NULL, point_column = 'observed', ...){
+    assertthat::assert_that(
+      is.Raster(mod),
+      inherits(point, 'sf'),
+      is.character(method),
+      is.character(point_column)
+    )
+    method <- match.arg(method, c("discrete", "continuous"),several.ok = FALSE)
+
+    # If truncate mode was used, also switch to continuous data
+    if( !is.null(attr(mod, 'format')) && attr(mod, 'format')!="binary" && method == "discrete"){
+      if(getOption('ibis.setupmessages')) myLog('[Validation]','red','Only truncated threshold found. Switching to continuous validation metrics.')
+      method <- 'continuous'
+    }
+    assertthat::assert_that(nrow(point)>0,
+                            utils::hasName(point, point_column))
+    point <- subset(point, select = point_column)
+
+    # Correct point column in case values are larger than 1
+    # FIXME: Only reasonable for discrete validation
+    if(method == "discrete"){
+      if(any(point[[point_column]] > 1)) point[[point_column]] <- ifelse(point[[point_column]]>=1, 1, 0)
+    }
+
+    # --- #
+    df <- as.data.frame(point)
+    df$pred_tr <- get_rastervalue(point, mod)[[names(mod)]]
+    if(method == "continuous") df$pred <- get_rastervalue(point, mod)[[names(mod)]]
+
+    # Remove any sfc column if present
+    if(!is.null(attr(df, "sf_column"))) df[[attr(df, "sf_column")]] <- NULL
+    # Remove any NAs
+    df <- subset(df, stats::complete.cases(df))
+    if(nrow(df) < 2) stop("Validation was not possible owing to missing data.")
+    # --- #
+    # Messenger
+    if(getOption('ibis.setupmessages')) myLog('[Validation]','green','Calculating validation statistics')
+
+    if(!is.null(layer)) dataset <- layer else dataset <- "External"
+
+    # Validate the threshold
+    out <- .validatethreshold(df, point_column = point_column, mod = NULL,
+                              name = dataset, method = method, id = NA)
+    return(out)
+  }
+)
+
+#' @noRd
+#' @keywords internal
+.validatethreshold <- function(df2, point_column, mod = NULL, name = NULL, method = 'fixed', id = NULL) {
+
+  if(method == 'continuous'){
+    # continuous evaluation
+    assertthat::assert_that(utils::hasName(df2, 'pred'),
+                            utils::hasName(df2, point_column)
+    )
+    #### Calculating Boyce index as in Hirzel et al. 2006
+    # fit: A vector or Raster-Layer containing the predicted suitability values
+    # obs: A vector containing the predicted suitability values or xy-coordinates (if fit is a Raster-Layer) of the validation points (presence records)
+    # nclass : number of classes or vector with classes threshold.
If nclass=0, Boyce index is calculated with a moving window (see next parameters) + # windows.w : width of the moving window (by default 1/10 of the suitability range) + # res : resolution of the moving window (by default 101 focals) + # PEplot : if True, plot the predicted to expected ratio along the suitability class + ecospat.boyce <- + function(fit, + obs, + nclass = 0, + window.w = "default", + res = 100, + PEplot = TRUE){ + boycei <- function(interval, obs, fit) { + fit.bin <- fit + obs.bin <- obs + fit.bin[fit[] >= interval[1] & fit[] <= interval[2]] <- "i" + fit.bin[fit.bin != "i"] <- 0 + obs.bin[obs[] >= interval[1] & obs[] <= interval[2]] <- "i" + obs.bin[obs.bin != "i"] <- 0 + pi <- length(which(obs.bin == "i")) / length(obs) + ei <- length(which(fit.bin == "i")) / length(fit.bin) + fi <- pi / ei + return(fi) + } + + if (window.w == "default") { + window.w <- (max(fit, na.rm = TRUE) - min(fit, na.rm = TRUE)) / 10 + } + + interval <- c(min(fit, na.rm = TRUE), max(fit, na.rm = TRUE)) + mini <- interval[1] + maxi <- interval[2] + + if (nclass == 0) { + vec.mov <- + seq( + from = mini, + to = maxi - window.w, + by = (maxi - mini - window.w) / res + ) + + vec.mov[res + 1] <- + vec.mov[res + 1] + 1 #Trick to avoid error with closed interval in R + + interval <- cbind(vec.mov, vec.mov + window.w) + } else if (length(nclass) > 1) { + vec.mov <- c(mini, nclass) + interval <- cbind(vec.mov, c(vec.mov[-1], maxi)) + } else if (nclass > 0 & length(nclass) < 2) { + vec.mov <- seq(from = mini, + to = maxi, + by = (maxi - mini) / nclass) + } + + f <- apply(interval, 1, boycei, obs, fit) + to.keep <- which(f != "NaN") # index to keep no NaN data + f <- f[to.keep] + + if (length(f) < 2) { + b <- NA #at least two points are necessary to draw a correlation + } else { + r <- c(1:length(f))[f != c(f[-1], FALSE)] #index to remove successive duplicates + b <- stats::cor(f[r], vec.mov[to.keep][r], method = "spearman") # calculation of the spearman correlation (i.e. Boyce index) after removing successive duplicated values + } + + HS <- apply(interval, 1, sum) / 2 # mean habitat suitability in the moving window + HS[length(HS)] <- HS[length(HS)] - 1 #Correction of the 'trick' to deal with closed interval + HS <- HS[to.keep] # exlude the NaN + + if (PEplot == TRUE) { + plot( + HS, + f, + xlab = "Habitat suitability", + ylab = "Predicted/Expected ratio", + col = "grey", + cex = 0.75 + ) + graphics::points(HS[r], f[r], pch = 19, cex = 0.75) + + } + + results <- list(F.ratio = f, + Spearman.cor = round(b, 3), + HS = HS) + return(results) + } + + # Function for Root-mean square error + RMSE <- function(pred, obs, na.rm = TRUE) { + sqrt(mean((pred - obs)^2, na.rm = na.rm)) + } + # Mean absolute error + MAE <- function(pred, obs, na.rm = TRUE) { + mean(abs(pred - obs), na.rm = na.rm) + } + # Function for log loss/cross-entropy loss. 
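The Poisson_LogLoss() helper defined next is simply the negative mean Poisson log-likelihood (lgamma() is the numerically safer equivalent of log(gamma())). A quick sanity check against stats::dpois(), assuming toy vectors:

y_true <- c(0, 1, 3, 2)
y_pred <- c(0.5, 1.2, 2.5, 2.0)
# Same formula as in the helper below, written with lgamma()
manual    <- mean(lgamma(y_true + 1) + y_pred - log(y_pred) * y_true)
# Negative mean Poisson log-likelihood via stats::dpois()
via_dpois <- -mean(stats::dpois(y_true, lambda = y_pred, log = TRUE))
stopifnot(isTRUE(all.equal(manual, via_dpois)))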
+ Poisson_LogLoss <- function(y_pred, y_true) { + eps <- 1e-15 + y_pred <- pmax(y_pred, eps) + Poisson_LogLoss <- mean(log(gamma(y_true + 1)) + y_pred - log(y_pred) * y_true) + return(Poisson_LogLoss) + } + # Normalized Gini Coefficient + NormalizedGini <- function(y_pred, y_true) { + SumGini <- function(y_pred, y_true) { + y_true_sort <- y_true[order(y_pred, decreasing = TRUE)] + y_random <- 1:length(y_pred) / length(y_pred) + y_Lorentz <- cumsum(y_true_sort) / sum(y_true_sort) + SumGini <- sum(y_Lorentz - y_random) + return(SumGini) + } + NormalizedGini <- SumGini(y_pred, y_true) / SumGini(y_true, y_true) + return(NormalizedGini) + } + # Create output container + out <- data.frame( + modelid = id, + name = name, + method = method, + metric = c('n','rmse', 'mae', + 'logloss','normgini', + 'cont.boyce'), + value = NA + ) + # - # + out$value[out$metric=='n'] <- nrow(df2) # Number of records + out$value[out$metric=='rmse'] <- RMSE(pred = df2$pred, obs = df2[[point_column]]) # RMSE + out$value[out$metric=='mae'] <- MAE(pred = df2$pred, obs = df2[[point_column]]) # Mean absolute error + out$value[out$metric=='normgini'] <- NormalizedGini(y_pred = df2$pred, y_true = df2[[point_column]]) + + if(!is.null(mod)){ + if( any( sapply(mod$model$biodiversity, function(x) x$family) == "binomial" ) ){ + LogLoss <- function(y_pred, y_true) { + LogLoss <- -mean(y_true * log(y_pred) + (1 - y_true) * log(1 - y_pred)) + return(LogLoss) + } + out$value[out$metric=='logloss'] <- LogLoss(y_pred = df2$pred, y_true = df2[[point_column]]) + } else { + out$value[out$metric=='logloss'] <- Poisson_LogLoss(y_pred = df2$pred, y_true = df2[[point_column]]) + } + } else { + # Assume Poisson distributed values, calculate log-loss + out$value[out$metric=='logloss'] <- Poisson_LogLoss(y_pred = df2$pred, y_true = df2[[point_column]]) + } + + + # Boyce index. Wrap in try since is known to crash + try({ + if("modEvA" %in% utils::installed.packages()[,1]){ + check_package("modEvA") + suppressWarnings( + boi <- modEvA::Boyce(obs = df2[[point_column]], pred = df2$pred, plot = FALSE) + ) + } else { + # Run boyce a few times as average sample ? 
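To make the Boyce logic above concrete: the index rank-correlates the predicted-to-expected (P/E) ratio of presences across suitability classes with the class order. A condensed sketch using fixed bins instead of the moving window of ecospat.boyce(), assuming simulated data:

set.seed(42)
fit <- runif(1000)                   # suitability values across the background
obs <- sample(fit, 200, prob = fit)  # presences biased towards high suitability
breaks <- seq(0, 1, by = 0.1)
p_i <- table(cut(obs, breaks)) / length(obs)  # share of presences per bin
e_i <- table(cut(fit, breaks)) / length(fit)  # share of background per bin
pe  <- as.numeric(p_i / e_i)                  # predicted-to-expected ratio
stats::cor(pe, seq_along(pe), method = "spearman")  # close to 1 for a good model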
+ # obs <- df2[df2[[point_column]]>0,] + # abs <- df2[sample(which(df2[[point_column]]==0), size = nrow(obs)), ] + # test <- rbind(obs, abs) + boi <- ecospat.boyce(obs = df2[[point_column]], fit = df2$pred, nclass = 0, PEplot = FALSE) + boi$Boyce <- boi$Spearman.cor + } + }, silent = TRUE) + if(exists('boi')) out$value[out$metric=='cont.boyce'] <- boi$Boyce + + } else { + # discrete evaluation + assertthat::assert_that(utils::hasName(df2, 'pred_tr'), + length(unique(df2[[point_column]])) > 1, + msg = "It appears as either the observed data or the threshold does not allow discrete validation.") + # For discrete functions to work correctly, ensure that all values are 0/1 + df2[[point_column]] <- ifelse(df2[[point_column]] > 0, 1, 0 ) + # Build the confusion matrix + ta <- sum((df2["pred_tr"] == 0) & (df2[point_column] == 0)) # True absence + fp <- sum((df2["pred_tr"] == 1) & (df2[point_column] == 0)) # False presence + fa <- sum((df2["pred_tr"] == 0) & (df2[point_column] == 1)) # False absence + tp <- sum((df2["pred_tr"] == 1) & (df2[point_column] == 1)) # True presence + + # Binary brier Score + BS <- function(pred, obs, na.rm = TRUE) { + if(assertthat::see_if(length(unique(pred)) <= 2, + length(unique(obs)) <= 2)){ + mean( (pred - obs)^2, na.rm = na.rm) + } else return(NA) + } + + # Output data.frame + out <- data.frame( + modelid = id, + name = name, + method = method, + metric = c('n','auc','overall.accuracy', 'true.presence.ratio', + 'precision','sensitivity', 'specificity', + 'tss', 'f1', 'logloss', + 'expected.accuracy', 'kappa', 'brier.score'), + value = NA + ) + + # Accuracy indices + out$value[out$metric=='n'] <- N <- ta + fp + fa + tp # Total number of records + out$value[out$metric=='overall.accuracy'] <- OA <- (tp + ta) / N # Overall accuracy + out$value[out$metric=='true.presence.ratio'] <- FOM <- tp / (tp + fp + fa) # True presence classifications + out$value[out$metric=='precision'] <- precision <- tp / (tp + fp) # Precision + out$value[out$metric=='sensitivity'] <- Sensitivity <- tp / (tp + fa) # Sensitivity + out$value[out$metric=='specificity'] <- Specificity <- ta / (ta + fp) # Specificity + out$value[out$metric=='tss'] <- TSS <- Sensitivity + Specificity - 1 # True Skill statistic + out$value[out$metric=='f1'] <- 2 * (precision * Sensitivity) / (precision + Sensitivity) # F1 score + Prob_1and1 <- ((tp + fp) / N) * ((tp + fa) / N) # Probability presence + Prob_0and0 <- ((ta + fa) / N) * ((ta + fp) / N) # Probability absence + out$value[out$metric=='expected.accuracy'] <- Expected_accuracy <- Prob_1and1 + Prob_0and0 # Expected accuracy + out$value[out$metric=='kappa'] <- (OA - Expected_accuracy) / (1 - Expected_accuracy) + + if("modEvA" %in% utils::installed.packages()[,1]){ + check_package("modEvA") + # Calculate AUC + out$value[out$metric=='auc'] <- modEvA::AUC(obs = df2[[point_column]], pred = df2[['pred_tr']], simplif = TRUE, plot = FALSE) + } + # Add brier score + out$value[out$metric=='brier.score'] <- BS(obs = df2[[point_column]], pred = df2[['pred_tr']]) + + # Evaluate Log loss / Cross-Entropy Loss for a predicted probability measure + # FIXME: Hacky. 
This likely won't work with specific formulations + if(!is.null(mod)){ + if( any( sapply(mod$model$biodiversity, function(x) x$family) == "binomial" ) ){ + LogLoss <- function(y_pred, y_true) { + LogLoss <- -mean(y_true * log(y_pred) + (1 - y_true) * log(1 - y_pred)) + return(LogLoss) + } + out$value[out$metric=='logloss'] <- LogLoss(y_pred = df2$pred, y_true = df2[[point_column]]) + } else { + # Function for log loss/cross-entropy loss. + Poisson_LogLoss <- function(y_pred, y_true) { + eps <- 1e-15 + y_pred <- pmax(y_pred, eps) + Poisson_LogLoss <- mean(log(gamma(y_true + 1)) + y_pred - log(y_pred) * y_true) + return(Poisson_LogLoss) + } + out$value[out$metric=='logloss'] <- Poisson_LogLoss(y_pred = df2$pred_tr, y_true = df2[[point_column]]) + } + } + } # End of discrete clause + return(out) +} diff --git a/R/write_output.R b/R/write_output.R index 4612fb60..c2a57661 100644 --- a/R/write_output.R +++ b/R/write_output.R @@ -1,626 +1,628 @@ -#' Generic function to write spatial outputs -#' -#' @description -#' The `write_output` function is a generic wrapper to writing any output files (e.g. projections) created with -#' the [`ibis.iSDM-package`]. It is possible to write outputs of fitted [`DistributionModel`], -#' [`BiodiversityScenario`] or individual [`Raster`] or [`stars`] objects. In case a [`data.frame`] -#' is supplied, the output is written as csv file. -#' **For creating summaries of distribution and scenario parameters and performance, see `write_summary()`** -#' @note -#' By default output files will be overwritten if already existing! -#' @param mod Provided [`DistributionModel`], [`BiodiversityScenario`], [`Raster`] or [`stars`] object. -#' @param fname A [`character`] depicting an output filename. -#' @param dt A [`character`] for the output datatype. Following the [`raster::dataType()`] options (Default: \code{'FLT4S'}). -#' @param verbose [`logical`] indicating whether messages should be shown. Overwrites `getOption("ibis.setupmessages")` (Default: \code{TRUE}). -#' @param ... Any other arguements passed on the individual functions. -#' @returns No R-output is created. A file is written to the target direction. -#' @examples \dontrun{ -#' x <- distribution(background) %>% -#' add_biodiversity_poipo(virtual_points, field_occurrence = 'Observed', name = 'Virtual points') %>% -#' add_predictors(pred_current, transform = 'scale',derivates = 'none') %>% -#' engine_xgboost(nrounds = 2000) %>% train(varsel = FALSE, only_linear = TRUE) -#' write_output(x, "testmodel.tif") -#' } - -#' @name write_output -#' @aliases write_output -#' @keywords utils -#' @exportMethod write_output -#' @export -NULL -methods::setGeneric("write_output", - signature = methods::signature("mod"), - function(mod, fname, dt = "FLT4S", verbose = getOption("ibis.setupmessages"), ...) 
standardGeneric("write_output")) - -#' @name write_output -#' @rdname write_output -#' @usage \S4method{write_output}{ANY, character, character, logical}(mod, fname, dt, verbose) -methods::setMethod( - "write_output", - methods::signature("ANY"), - function(mod, fname, dt = "FLT4S", verbose = getOption("ibis.setupmessages"), ...){ - assertthat::assert_that( - !missing(mod), - is.character(fname), - is.character(dt), - is.logical(verbose) - ) - - if(verbose && getOption('ibis.setupmessages')) myLog('[Output]','green','Saving output(s)...') - - # This function will only capture the distribution model object and will save them separately - if(any(class(mod) %in% getOption("ibis.engines")) ){ - # FIXME: If errors occur, check harmonization of saving among engines. - mod$save(fname = fname) - } else if(is.Raster(mod)){ - if(raster::extension(fname) %in% c('.tif', '.TIF')) { - writeGeoTiff(file = mod, fname = fname, dt = dt) - } else if(raster::extension(fname) %in% c('.nc', '.NC', '.ncdf', '.NCDF')){ - writeNetCDF(file = mode, fname = fname, varName = names(mod), dt = dt) - } else { - stop("Output type could not be determined. Currently only geoTIFF and netCDF are supported.") - } - } else if(is.data.frame(mod)){ - utils::write.csv(x = mod,file = fname,...) - } else { - # Check that a save function exists for object - assertthat::assert_that("save" %in%names(mod), - msg = "No method to save the output could be found!") - # Try a generic save - mod$save(fname = fname) - } - invisible() - } -) - -#' @name write_output -#' @rdname write_output -#' @usage \S4method{write_output}{BiodiversityScenario, character, character, logical}(mod, fname, dt, verbose) -methods::setMethod( - "write_output", - methods::signature(mod = "BiodiversityScenario"), - function(mod, fname, dt = "FLT4S", verbose = getOption("ibis.setupmessages"), ...) { - assertthat::assert_that( - !missing(mod), - is.character(fname), - is.character(dt), - is.logical(verbose) - ) - # Get outputs - mod$save(fname = fname, type = tools::file_ext(fname), dt = dt) - invisible() - } -) - -#' @name write_output -#' @rdname write_output -#' @usage \S4method{write_output}{RasterLayer, character, character, logical}(mod, fname, dt, verbose) -methods::setMethod( - "write_output", - methods::signature(mod = "RasterLayer"), - function(mod, fname, dt = "FLT4S", verbose = getOption("ibis.setupmessages"), ...) { - assertthat::assert_that( - !missing(mod), - is.Raster(mod), - is.character(fname), - is.character(dt), - is.logical(verbose) - ) - - # Write output depending on type - if(raster::extension(fname) %in% c('.tif', '.TIF')) { - writeGeoTiff(file = mod, fname = fname, dt = dt) - } else if(raster::extension(fname) %in% c('.nc', '.NC', '.ncdf', '.NCDF')){ - writeNetCDF(file = mode, fname = fname, varName = names(mod), dt = dt) - } else { - stop("Output type could not be determined. Currently only geoTIFF and netCDF are supported.") - } - invisible() - } -) - -#' @name write_output -#' @rdname write_output -#' @usage \S4method{write_output}{RasterStack, character, character, logical}(mod, fname, dt, verbose) -methods::setMethod( - "write_output", - methods::signature(mod = "RasterStack"), - function(mod, fname, dt = "FLT4S", verbose = getOption("ibis.setupmessages"),...) 
{ - assertthat::assert_that( - !missing(mod), - is.Raster(mod), - is.character(fname), - is.character(dt), - is.logical(verbose) - ) - - # Write output depending on type - if(raster::extension(fname) %in% c('.tif', '.TIF')) { - writeGeoTiff(file = mod, fname = fname, dt = dt) - } else if(raster::extension(fname) %in% c('.nc', '.NC', '.ncdf', '.NCDF')){ - writeNetCDF(file = mode, fname = fname, varName = names(mod), dt = dt) - } else { - stop("Output type could not be determined. Currently only geoTIFF and netCDF are supported.") - } - invisible() - } -) - -#' @name write_output -#' @rdname write_output -#' @usage \S4method{write_output}{data.frame, character, character, logical}(mod, fname, dt, verbose) -methods::setMethod( - "write_output", - methods::signature(mod = "data.frame"), - function(mod, fname, dt = "FLT4S", verbose = getOption("ibis.setupmessages"),...) { - assertthat::assert_that( - !missing(mod), - is.data.frame(mod), - is.character(fname), - is.logical(verbose) - ) - - # data.frames will be written by default as csv files for consistency - fname <- paste0( tools::file_path_sans_ext(fname), ".csv") - utils::write.csv(x = mod, file = fname, ...) - invisible() - } -) - -#' @name write_output -#' @rdname write_output -#' @usage \S4method{write_output}{stars, character, character, logical}(mod, fname, dt, verbose) -methods::setMethod( - "write_output", - methods::signature(mod = "stars"), - function(mod, fname, dt = "FLT4S", verbose = getOption("ibis.setupmessages"),...) { - assertthat::assert_that( - !missing(mod), - # is.list(mod), - is.character(fname), - is.logical(verbose) - ) - # Check that it is a star object - assertthat::assert_that( - inherits(mod, "stars"), msg = "Supplied list object needs to be a stars object." - ) - - # Define filename - fname <- paste0( tools::file_path_sans_ext(fname), ".nc") - # TODO: Align with write NetCDF function further below - stars::write_stars( - obj = mod, - dsn = fname, - layer = names(mod), - ...) - invisible() - } -) - -#' Saves a raster file in Geotiff format -#' -#' @description Functions that acts as a wrapper to [raster::writeRaster]. -#' @param file A [`raster`] object to be saved. -#' @param fname A [`character`] stating the output destination. -#' @param dt The datatype to be written (Default: *Float64*). -#' @param varNA The nodata value to be used (Default: \code{-9999}). -#' @param ... Other options. -#' @keywords utils, internal -#' @noRd -writeGeoTiff <- function(file, fname, dt = "FLT4S", varNA = -9999, ...){ - assertthat::assert_that( - inherits(file,'Raster') || inherits(file, 'stars'), - is.character(fname), is.character(dt), - is.numeric(varNA) - ) - if(!assertthat::has_extension(fname,"tif")) fname <- paste0(fname,".tif") - - # Check if layer is factor and deratify if so (causes error otherwise) - if(any(is.factor(file))){ - file <- raster::deratify(file, complete = TRUE) - } - - # Save output - writeRaster(file, fname, - format='GTiff', - datatype = dt, - NAflag = varNA, - options=c("COMPRESS=DEFLATE","PREDICTOR=2","ZLEVEL=9"), - overwrite= TRUE, - ...) -} - -#' Save a raster stack to a netcdf file -#' -#' @param file A [`raster`] object to be saved. -#' @param fname A [`character`] stating the output destination. -#' @param varName Name for the NetCDF export variable. -#' @param varUnit Units for the NetCDF export variable. -#' @param varLong Long name for the NetCDF export variable. -#' @param dt The datatype to be written. Default is Float64 -#' @param varNA The nodata value to be used. Default: \code{-9999}. 
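For orientation amid the methods above: the output format follows the file extension. A usage sketch, assuming `fit` is a fitted DistributionModel and `ras` a RasterLayer (both hypothetical objects):

write_output(fit, "prediction.tif")             # GeoTIFF via writeGeoTiff()
write_output(ras, "prediction.nc")              # netCDF via writeNetCDF()
write_output(data.frame(a = 1:3), "table.csv")  # data.frames are written as csv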
-#' @param ... Other options. -#' @keywords utils, internal -#' @noRd -writeNetCDF <- function(file, fname, - varName, varUnit = NULL, - varLong = NULL, dt = "FLT4S", varNA = -9999, ...) { - assertthat::assert_that( - inherits(file,'Raster'), - is.character(fname), is.character(dt), - is.numeric(varNA) - ) - check_package('ncdf4') - if(!isNamespaceLoaded("ncdf4")) { attachNamespace("ncdf4");requireNamespace('ncdf4') } - if(!assertthat::has_extension(fname,"nc")) fname <- paste0(fname,".nc") - - # Output NetCDF file - raster::writeRaster(x = file, - filename = fname,format = "CDF", overwrite = TRUE, - varname = ifelse(is.null(varName),'Prediction',varName), - varunit = ifelse(is.null(varUnit),'',varUnit), - longname = ifelse(is.null(varLong),'',varLong), - xname = ifelse(isLonLat(ras), "Longitude","x"), - yname = ifelse(isLonLat(ras), "Latitude","y"), - zname = "Time", - zunit = "Years since 2000-01-01", # FIXME: Load and format date if provided - bylayer = FALSE, # Don't save separate layers - datatype = dt, NAflag = varNA, - ... - ) - - # Add common attributes - ncout <- ncdf4::nc_open(fname, write = TRUE) - - # add global attributes - ncdf4::ncatt_put(ncout, 0,"title","Biodiversity suitability projection created with ibis.iSDM") - - history <- paste(Sys.info()['user'], date(), sep=", ") - ncdf4::ncatt_put(ncout,0, "created", history) - ncdf4::ncatt_put(ncout,0, "Conventions", "CF=1.5") - - # close the file, writing data to disk - ncdf4::nc_close(ncout) - - invisible() -} - -# ------------------------- # -#### Write summary methods #### - -#' Generic function to write summary outputs from created models. -#' -#' @description -#' The [`write_summary`] function is a wrapper function to create summaries from fitted [`DistributionModel`] or -#' [`BiodiversityScenario`] objects. This function will extract parameters and statistics about the used data -#' from the input object and writes the output as either \code{'rds'} or \code{'rdata'} file. Alternative, more open file formats -#' are under consideration. -#' @note -#' No predictions or tabular data is saved through this function. -#' Use [`write_output()`] to save those. -#' @param mod Provided [`DistributionModel`] or [`BiodiversityScenario`] object. -#' @param fname A [`character`] depicting an output filename. -#' The suffix determines the file type of the output (Options: \code{'rds'}, \code{'rdata'}). -#' @param partial A [`logical`] value determining whether partial variable contributions should be calculated and added -#' to the model summary. **Note that this can be rather slow** (Default: \code{FALSE}). -#' @param verbose [`logical`] indicating whether messages should be shown. Overwrites `getOption("ibis.setupmessages")` (Default: \code{TRUE}). -#' @param ... Any other arguments passed on the individual functions. -#' @returns No R-output is created. A file is written to the target direction. 
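Returning briefly to the writeNetCDF() helper above: the global attribute stamping it performs works on any existing netCDF file. A minimal sketch, assuming ncdf4 is installed and a file out.nc exists (the filename is hypothetical):

nc <- ncdf4::nc_open("out.nc", write = TRUE)
ncdf4::ncatt_put(nc, 0, "created", paste(Sys.info()[["user"]], date(), sep = ", "))
ncdf4::nc_close(nc)  # closing writes the attribute to disk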
-#' @examples \dontrun{ -#' x <- distribution(background) %>% -#' add_biodiversity_poipo(virtual_points, field_occurrence = 'Observed', name = 'Virtual points') %>% -#' add_predictors(pred_current, transform = 'scale',derivates = 'none') %>% -#' engine_xgboost(nrounds = 2000) %>% train(varsel = FALSE, only_linear = TRUE) -#' write_summary(x, "testmodel.rds") -#' } -#' @keywords utils - -#' @name write_summary -#' @aliases write_summary -#' @exportMethod write_summary -#' @export -NULL -methods::setGeneric("write_summary", - signature = methods::signature("mod"), - function(mod, fname, partial = FALSE, verbose = getOption("ibis.setupmessages"),...) standardGeneric("write_summary")) - -#' @name write_summary -#' @rdname write_summary -#' @usage \S4method{write_summary}{ANY, character, logical, logical}(mod, fname, partial, verbose) -methods::setMethod( - "write_summary", - methods::signature(mod = "ANY"), - function(mod, fname, partial = FALSE, verbose = getOption("ibis.setupmessages"), ...) { - assertthat::assert_that( - !missing(mod), - is.character(fname), - is.logical(partial), - is.logical(verbose) - ) - assertthat::assert_that( - inherits(mod, "DistributionModel") || inherits(mod, "BiodiversityScenario"), - msg = "Only objects created through `train()` or `project()` are supported!" - ) - # Check writeable or not - assertthat::assert_that( - assertthat::is.writeable(dirname(fname)),msg = "Given input folder is not writeable!" - ) - - # Get file extension - ext <- tolower( tools::file_ext(fname) ) - if(ext == "") ext <- "rds" # Assign rds as default - ext <- match.arg(ext, choices = c("rds", "rdata"), several.ok = FALSE) - fname <- paste0(tools::file_path_sans_ext(fname), ".", ext) - if(file.exists(fname) && (verbose && getOption('ibis.setupmessages'))) myLog('[Output]','yellow','Overwriting existing file...') - assertthat::assert_that(assertthat::is.writeable(dirname(fname))) - # --- # - # Gather the statistics and parameters from the provided file - output <- list() - if(inherits(mod, "DistributionModel")){ - - # Summarize the model object - model <- mod$model - - # Model input summary in a tibble - output[["input"]][["extent"]] <- as.matrix( extent( model$background ) ) - output[["input"]][["predictors"]] <- model$predictors_types - if(!is.Waiver(model$offset)) output[["input"]][["offset"]] <- names(model$offset) else output[["input"]][["offset"]] <- NA - if(!is.Waiver(model$priors)){ - output[["input"]][["priors"]] <- model$priors$summary() - } else output[["input"]][["priors"]] <- NA - - # Go over biodiversity datasets - o <- data.frame() - for(i in 1:length(model$biodiversity)){ - o <- rbind(o, - data.frame(id = names(model$biodiversity)[i], - name = model$biodiversity[[i]]$name, - type = model$biodiversity[[i]]$type, - family = model$biodiversity[[i]]$family, - equation = deparse1(model$biodiversity[[i]]$equation), - obs_pres = sum( model$biodiversity[[i]]$observations$observed > 0 ), - obs_abs = sum( model$biodiversity[[i]]$observations$observed == 0 ), - n_predictors = length( model$biodiversity[[i]]$predictors_names ) - ) - ) - } - output[["input"]][["biodiversity"]] <- o - - # Model parameters in a tibble - output[["params"]][["id"]] <- as.character(model$id) - output[["params"]][["runname"]] <- as.character(model$runname) - output[["params"]][["algorithm"]] <- class(mod)[1] - output[["params"]][["equation"]] <- mod$get_equation() - # Collect settings and parameters if existing - if( !is.Waiver(mod$get_data("params")) ){ - output[["params"]][["params"]] <- 
mod$get_data("params") - } - if( "settings" %in% names(mod) ){ - output[["params"]][["settings"]] <- mod$settings$data - } - - # Model summary in a tibble and formula - output[["output"]][["summary"]] <- mod$summary() - if(!is.null(mod$get_data("prediction") )){ - output[["output"]][["resolution"]] <- raster::res( mod$get_data("prediction") ) - output[["output"]][["prediction"]] <- names( mod$get_data("prediction") ) - } else { - output[["output"]][["resolution"]] <- NA - output[["output"]][["prediction"]] <- NA - } - # Calculate partial estimates if set - if(partial){ - if(verbose && getOption('ibis.setupmessages')) myLog('[Export]','green',paste0('Calculating partial variable contributions...')) - message("Not yet added") # TODO: - output[["output"]][["partial"]] <- NA - } else { - output[["output"]][["partial"]] <- NA - } - - } else if(inherits(mod, "BiodiversityScenario")){ - # Summarize the model object - model <- mod$get_model() - - # Model input summary in a tibble - output[["input"]][["extent"]] <- as.matrix( raster::extent( model$model$background ) ) - output[["input"]][["predictors"]] <- mod$predictors$get_names() - output[["input"]][["timerange"]] <- mod$get_timeperiod() - output[["input"]][["predictor_time"]] <- mod$predictors$get_time() - # Collect settings and parameters if existing - if( !is.Waiver(mod$get_data("constraints")) ){ - output[["input"]][["constraints"]] <- mod$get_constraints() - } - - # Model parameters in a tibble - output[["params"]][["id"]] <- as.character(mod$modelid) - output[["params"]][["runname"]] <- as.character(model$model$runname) - output[["params"]][["algorithm"]] <- class(model)[1] - if( "settings" %in% names(mod) ){ - output[["params"]][["settings"]] <- model$settings$data - } - - # Model summary in a tibble and formula - output[["output"]][["summary"]] <- mod$summary(plot = FALSE,...) - if(!is.Waiver(mod$get_data() )){ - sc_dim <- stars::st_dimensions(mod$get_data()) - output[["output"]][["resolution"]] <- abs( c(x = sc_dim$x$delta, y = sc_dim$y$delta) ) - output[["output"]][["prediction"]] <- names(mod$get_data()) - } else { - output[["output"]][["resolution"]] <- NA - output[["output"]][["prediction"]] <- NA - } - } - assertthat::assert_that( - is.list(output), - length(output)>0 - ) - # --- # - # Write the output - if(ext == "rds"){ - saveRDS(output, fname) - } else if(ext == "rdata") { - save(output, file = fname) - } else { - message("No compatible file format found. No summary output file ignored created!") - } - rm(output) - invisible() - } -) - -# ------------------------- # -#### Save model for later use #### - -#' Save a model for later use -#' -#' @description -#' The `write_model` function (opposed to the `write_output`) is a generic wrapper to writing a -#' [`DistributionModel`] to disk. It is essentially a wrapper to [`saveRDS`]. -#' Models can be loaded again via the `load_model` function. -#' -#' @note -#' By default output files will be overwritten if already existing! -#' -#' @param mod Provided [`DistributionModel`] object. -#' @param fname A [`character`] depicting an output filename. -#' @param slim A [`logical`] option to whether unnecessary entries in the model object should be deleted. -#' This deletes for example predictions or any other non-model content from the object (Default: \code{FALSE}). -#' @param verbose [`logical`] indicating whether messages should be shown. Overwrites `getOption("ibis.setupmessages")` (Default: \code{TRUE}). -#' @returns No R-output is created. A file is written to the target direction. 
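Because write_summary() above serializes its collected list via saveRDS() (or save() for rdata), the summary can be read back directly. A sketch, assuming a summary was previously written to "testmodel.rds" (hypothetical filename):

s <- readRDS("testmodel.rds")
names(s)          # the three blocks assembled above: "input", "params", "output"
s$output$summary  # tidy model summary as stored by write_summary()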
-#' @examples \dontrun{ -#' x <- distribution(background) |> -#' add_biodiversity_poipo(virtual_points, field_occurrence = 'Observed', name = 'Virtual points') |> -#' add_predictors(pred_current, transform = 'scale',derivates = 'none') |> -#' engine_xgboost(nrounds = 2000) |> train(varsel = FALSE, only_linear = TRUE) -#' write_model(x, "testmodel.rds") -#' } - -#' @name write_model -#' @aliases write_model -#' @seealso load_model -#' @keywords utils -#' @exportMethod write_model -#' @export -NULL -methods::setGeneric("write_model", - signature = methods::signature("mod"), - function(mod, fname, slim = FALSE, verbose = getOption("ibis.setupmessages")) standardGeneric("write_model")) - -#' @name write_model -#' @rdname write_model -#' @usage \S4method{write_model}{ANY, character, logical, logical}(mod, fname, slim, verbose) -methods::setMethod( - "write_model", - methods::signature(mod = "ANY"), - function(mod, fname, slim = FALSE, verbose = getOption("ibis.setupmessages")) { - assertthat::assert_that( - !missing(mod), - is.character(fname), - is.logical(slim), - is.logical(verbose) - ) - # Check that provided model is correct - assertthat::assert_that(inherits(mod, "DistributionModel"), - !is.Waiver(mod$get_data("fit_best")) ) - # And model format - assertthat::assert_that( assertthat::has_extension(fname, "rds")) - - # If slim, remove some balast - if(slim){ - if(!is.Waiver(mod$get_data("prediction"))) mod$fits$prediction <- NULL - } - - # Save output - if(verbose && getOption('ibis.setupmessages')) myLog('[Export]','green',paste0('Writing model to file...')) - saveRDS(mod, fname) - invisible() - } -) - -#' Load a pre-computed model -#' -#' @description -#' The `load_model` function (opposed to the `write_model`) loads previous saved -#' [`DistributionModel`]. It is essentially a wrapper to [`readRDS`]. -#' -#' When models are loaded, they are briefly checked for their validity and presence of -#' necessary components. -#' -#' @param fname A [`character`] depicting an output filename. -#' @param verbose [`logical`] indicating whether messages should be shown. Overwrites `getOption("ibis.setupmessages")` (Default: \code{TRUE}). -#' @returns A [`DistributionModel`] object. 
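Taken together, write_model() and load_model() above form a simple persistence round trip. A usage sketch, assuming `fit` is a trained DistributionModel (hypothetical object):

write_model(fit, "fit.rds", slim = TRUE)  # slim = TRUE drops stored predictions
fit2 <- load_model("fit.rds")             # validity and engine packages are checked on load
summary(fit2)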
-#' @examples \dontrun{
-#' # Load model
-#' mod <- load_model("testmodel.rds")
-#'
-#' summary(mod)
-#' }
-
-#' @name load_model
-#' @aliases load_model
-#' @seealso write_model
-#' @keywords utils
-#' @exportMethod load_model
-#' @export
-NULL
-methods::setGeneric("load_model",
-                    signature = methods::signature("fname"),
-                    function(fname, verbose = getOption("ibis.setupmessages")) standardGeneric("load_model"))
-
-#' @name load_model
-#' @rdname load_model
-#' @usage \S4method{load_model}{character, logical}(fname, verbose)
-methods::setMethod(
-  "load_model",
-  methods::signature(fname = "character"),
-  function(fname, verbose = getOption("ibis.setupmessages")) {
-    assertthat::assert_that(
-      is.character(fname),
-      is.logical(verbose)
-    )
-    # Check that file exists and is an rds file
-    assertthat::assert_that(
-      assertthat::has_extension(fname, "rds"),
-      file.exists(fname)
-    )
-
-    # Get file size
-    fz <- (file.size(fname) * 0.000001) |> round(digits = 3)
-
-    if(verbose && getOption('ibis.setupmessages')) myLog('[Export]','green',paste0('Loading previously serialized model (size: ',fz,' MB)'))
-    # Load file
-    mod <- readRDS(fname)
-
-    # --- #
-    # Make some checks #
-    assertthat::assert_that(
-      inherits(mod, "DistributionModel"),
-      hasName(mod, "model"),
-      !is.Waiver(mod$get_data("fit_best"))
-    )
-    # Check that model type is known
-    assertthat::assert_that( any(sapply(class(mod), function(z) z %in% getOption("ibis.engines"))) )
-    # Depending on engine, check package and load them
-    if(inherits(mod, "GDB-Model")){
-      check_package("mboost"); require("mboost")
-    } else if(inherits(mod, "BART-Model")){
-      check_package("dbarts"); require("dbarts")
-    } else if(inherits(mod, "INLABRU-Model")){
-      check_package("INLA"); require("INLA")
-      check_package("inlabru"); require("inlabru")
-    } else if(inherits(mod, "BREG-Model")){
-      check_package("BoomSpikeSlab"); require("BoomSpikeSlab")
-    } else if(inherits(mod, "GLMNET-Model")){
-      check_package("glmnet"); require("glmnet")
-      check_package("glmnetUtils"); require("glmnetUtils")
-    } else if(inherits(mod, "STAN-Model")){
-      check_package("rstan"); require("rstan")
-      check_package("cmdstanr"); require("cmdstanr")
-    } else if(inherits(mod, "XGBOOST-Model")){
-      check_package("xgboost"); require("xgboost")
-    }
-    # --- #
-    # Return the model
-    return(mod)
-  }
-)
+#' Generic function to write spatial outputs
+#'
+#' @description
+#' The `write_output` function is a generic wrapper for writing any output files (e.g. projections) created with
+#' the [`ibis.iSDM-package`]. It is possible to write outputs of fitted [`DistributionModel`],
+#' [`BiodiversityScenario`] or individual [`Raster`] or [`stars`] objects. In case a [`data.frame`]
+#' is supplied, the output is written as a csv file.
+#' **For creating summaries of distribution and scenario parameters and performance, see `write_summary()`**
+#' @note
+#' By default output files will be overwritten if already existing!
+#' @param mod Provided [`DistributionModel`], [`BiodiversityScenario`], [`Raster`] or [`stars`] object.
+#' @param fname A [`character`] depicting an output filename.
+#' @param dt A [`character`] for the output datatype. Following the [`raster::dataType()`] options (Default: \code{'FLT4S'}).
+#' @param verbose [`logical`] indicating whether messages should be shown. Overwrites `getOption("ibis.setupmessages")` (Default: \code{TRUE}).
+#' @param ... Any other arguments passed on to the individual functions.
+#' @returns No R-output is created. A file is written to the target directory.
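A quick illustrative sketch of how this generic dispatches across the supported input types; `fit`, `my_raster` and `my_df` are hypothetical objects, and the calls simply follow the signature documented above:

write_output(fit, "prediction.tif")  # DistributionModel: delegates to the object's own save() method
write_output(my_raster, "layer.nc")  # Raster/stars objects: GeoTIFF or NetCDF, chosen by file extension
write_output(my_df, "table.csv")     # data.frame: always written as a csv file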
+#' @examples \dontrun{
+#' x <- distribution(background) |>
+#'   add_biodiversity_poipo(virtual_points, field_occurrence = 'Observed', name = 'Virtual points') |>
+#'   add_predictors(pred_current, transform = 'scale', derivates = 'none') |>
+#'   engine_xgboost(nrounds = 2000) |> train(optim_hyperparam = FALSE, only_linear = TRUE)
+#' write_output(x, "testmodel.tif")
+#' }
+
+#' @name write_output
+#' @aliases write_output
+#' @keywords utils
+#' @exportMethod write_output
+#' @export
+NULL
+methods::setGeneric("write_output",
+                    signature = methods::signature("mod"),
+                    function(mod, fname, dt = "FLT4S", verbose = getOption("ibis.setupmessages"), ...) standardGeneric("write_output"))
+
+#' @name write_output
+#' @rdname write_output
+#' @usage \S4method{write_output}{ANY, character, character, logical}(mod, fname, dt, verbose)
+methods::setMethod(
+  "write_output",
+  methods::signature("ANY"),
+  function(mod, fname, dt = "FLT4S", verbose = getOption("ibis.setupmessages"), ...){
+    assertthat::assert_that(
+      !missing(mod),
+      is.character(fname),
+      is.character(dt),
+      is.logical(verbose)
+    )
+
+    if(verbose && getOption('ibis.setupmessages')) myLog('[Output]','green','Saving output(s)...')
+
+    # This function will only capture the distribution model object and will save them separately
+    if(any(class(mod) %in% getOption("ibis.engines")) ){
+      # FIXME: If errors occur, check harmonization of saving among engines.
+      mod$save(fname = fname)
+    } else if(is.Raster(mod)){
+      if(raster::extension(fname) %in% c('.tif', '.TIF')) {
+        writeGeoTiff(file = mod, fname = fname, dt = dt)
+      } else if(raster::extension(fname) %in% c('.nc', '.NC', '.ncdf', '.NCDF')){
+        writeNetCDF(file = mod, fname = fname, varName = names(mod), dt = dt)
+      } else {
+        stop("Output type could not be determined. Currently only geoTIFF and netCDF are supported.")
+      }
+    } else if(is.data.frame(mod)){
+      utils::write.csv(x = mod, file = fname, ...)
+    } else {
+      # Check that a save function exists for object
+      assertthat::assert_that("save" %in% names(mod),
+                              msg = "No method to save the output could be found!")
+      # Try a generic save
+      mod$save(fname = fname)
+    }
+    invisible()
+  }
+)
+
+#' @name write_output
+#' @rdname write_output
+#' @usage \S4method{write_output}{BiodiversityScenario, character, character, logical}(mod, fname, dt, verbose)
+methods::setMethod(
+  "write_output",
+  methods::signature(mod = "BiodiversityScenario"),
+  function(mod, fname, dt = "FLT4S", verbose = getOption("ibis.setupmessages"), ...) {
+    assertthat::assert_that(
+      !missing(mod),
+      is.character(fname),
+      is.character(dt),
+      is.logical(verbose)
+    )
+    # Get outputs
+    mod$save(fname = fname, type = tools::file_ext(fname), dt = dt)
+    invisible()
+  }
+)
+
+#' @name write_output
+#' @rdname write_output
+#' @usage \S4method{write_output}{RasterLayer, character, character, logical}(mod, fname, dt, verbose)
+methods::setMethod(
+  "write_output",
+  methods::signature(mod = "RasterLayer"),
+  function(mod, fname, dt = "FLT4S", verbose = getOption("ibis.setupmessages"), ...) {
+    assertthat::assert_that(
+      !missing(mod),
+      is.Raster(mod),
+      is.character(fname),
+      is.character(dt),
+      is.logical(verbose)
+    )
+
+    # Write output depending on type
+    if(raster::extension(fname) %in% c('.tif', '.TIF')) {
+      writeGeoTiff(file = mod, fname = fname, dt = dt)
+    } else if(raster::extension(fname) %in% c('.nc', '.NC', '.ncdf', '.NCDF')){
+      writeNetCDF(file = mod, fname = fname, varName = names(mod), dt = dt)
+    } else {
+      stop("Output type could not be determined. Currently only geoTIFF and netCDF are supported.")
+    }
+    invisible()
+  }
+)
+
+#' @name write_output
+#' @rdname write_output
+#' @usage \S4method{write_output}{RasterStack, character, character, logical}(mod, fname, dt, verbose)
+methods::setMethod(
+  "write_output",
+  methods::signature(mod = "RasterStack"),
+  function(mod, fname, dt = "FLT4S", verbose = getOption("ibis.setupmessages"), ...) {
+    assertthat::assert_that(
+      !missing(mod),
+      is.Raster(mod),
+      is.character(fname),
+      is.character(dt),
+      is.logical(verbose)
+    )
+
+    # Write output depending on type
+    if(raster::extension(fname) %in% c('.tif', '.TIF')) {
+      writeGeoTiff(file = mod, fname = fname, dt = dt)
+    } else if(raster::extension(fname) %in% c('.nc', '.NC', '.ncdf', '.NCDF')){
+      writeNetCDF(file = mod, fname = fname, varName = names(mod), dt = dt)
+    } else {
+      stop("Output type could not be determined. Currently only geoTIFF and netCDF are supported.")
+    }
+    invisible()
+  }
+)
+
+#' @name write_output
+#' @rdname write_output
+#' @usage \S4method{write_output}{data.frame, character, character, logical}(mod, fname, dt, verbose)
+methods::setMethod(
+  "write_output",
+  methods::signature(mod = "data.frame"),
+  function(mod, fname, dt = "FLT4S", verbose = getOption("ibis.setupmessages"), ...) {
+    assertthat::assert_that(
+      !missing(mod),
+      is.data.frame(mod),
+      is.character(fname),
+      is.logical(verbose)
+    )
+
+    # data.frames will be written by default as csv files for consistency
+    fname <- paste0( tools::file_path_sans_ext(fname), ".csv")
+    utils::write.csv(x = mod, file = fname, ...)
+    invisible()
+  }
+)
+
+#' @name write_output
+#' @rdname write_output
+#' @usage \S4method{write_output}{stars, character, character, logical}(mod, fname, dt, verbose)
+methods::setMethod(
+  "write_output",
+  methods::signature(mod = "stars"),
+  function(mod, fname, dt = "FLT4S", verbose = getOption("ibis.setupmessages"), ...) {
+    assertthat::assert_that(
+      !missing(mod),
+      # is.list(mod),
+      is.character(fname),
+      is.logical(verbose)
+    )
+    # Check that it is a stars object
+    assertthat::assert_that(
+      inherits(mod, "stars"), msg = "Supplied object needs to be a stars object."
+    )
+
+    # Define filename
+    fname <- paste0( tools::file_path_sans_ext(fname), ".nc")
+    # TODO: Align with write NetCDF function further below
+    stars::write_stars(
+      obj = mod,
+      dsn = fname,
+      layer = names(mod),
+      ...)
+    invisible()
+  }
+)
+
+#' Saves a raster file in Geotiff format
+#'
+#' @description Function that acts as a wrapper to [raster::writeRaster].
+#' @param file A [`raster`] object to be saved.
+#' @param fname A [`character`] stating the output destination.
+#' @param dt The datatype to be written (Default: \code{'FLT4S'}).
+#' @param varNA The nodata value to be used (Default: \code{-9999}).
+#' @param ... Other options.
+#' @keywords utils, internal
+#' @noRd
+writeGeoTiff <- function(file, fname, dt = "FLT4S", varNA = -9999, ...){
+  assertthat::assert_that(
+    inherits(file,'Raster') || inherits(file, 'stars'),
+    is.character(fname), is.character(dt),
+    is.numeric(varNA)
+  )
+  if(!assertthat::has_extension(fname,"tif")) fname <- paste0(fname,".tif")
+
+  # Check if layer is factor and deratify if so (causes error otherwise)
+  if(any(is.factor(file))){
+    file <- raster::deratify(file, complete = TRUE)
+  }
+
+  # Save output
+  raster::writeRaster(file, fname,
+              format='GTiff',
+              datatype = dt,
+              NAflag = varNA,
+              options=c("COMPRESS=DEFLATE","PREDICTOR=2","ZLEVEL=9"),
+              overwrite= TRUE,
+              ...)
+}
+
+#' Save a raster stack to a netcdf file
+#'
+#' @param file A [`raster`] object to be saved.
+#' @param fname A [`character`] stating the output destination.
+#' @param varName Name for the NetCDF export variable.
+#' @param varUnit Units for the NetCDF export variable.
+#' @param varLong Long name for the NetCDF export variable.
+#' @param dt The datatype to be written (Default: \code{'FLT4S'}).
+#' @param varNA The nodata value to be used (Default: \code{-9999}).
+#' @param ... Other options.
+#' @keywords utils, internal
+#' @noRd
+writeNetCDF <- function(file, fname,
+                        varName, varUnit = NULL,
+                        varLong = NULL, dt = "FLT4S", varNA = -9999, ...) {
+  assertthat::assert_that(
+    inherits(file,'Raster'),
+    is.character(fname), is.character(dt),
+    is.numeric(varNA)
+  )
+  check_package('ncdf4')
+  if(!isNamespaceLoaded("ncdf4")) { attachNamespace("ncdf4"); requireNamespace('ncdf4') }
+  if(!assertthat::has_extension(fname,"nc")) fname <- paste0(fname,".nc")
+
+  # Output NetCDF file
+  raster::writeRaster(x = file,
+                      filename = fname, format = "CDF", overwrite = TRUE,
+                      varname = ifelse(is.null(varName),'Prediction',varName),
+                      varunit = ifelse(is.null(varUnit),'',varUnit),
+                      longname = ifelse(is.null(varLong),'',varLong),
+                      xname = ifelse(raster::isLonLat(file), "Longitude","x"),
+                      yname = ifelse(raster::isLonLat(file), "Latitude","y"),
+                      zname = "Time",
+                      zunit = "Years since 2000-01-01", # FIXME: Load and format date if provided
+                      bylayer = FALSE, # Don't save separate layers
+                      datatype = dt, NAflag = varNA,
+                      ...
+  )
+
+  # Add common attributes
+  ncout <- ncdf4::nc_open(fname, write = TRUE)
+
+  # add global attributes
+  ncdf4::ncatt_put(ncout, 0, "title", "Biodiversity suitability projection created with ibis.iSDM")
+
+  history <- paste(Sys.info()['user'], date(), sep=", ")
+  ncdf4::ncatt_put(ncout, 0, "created", history)
+  ncdf4::ncatt_put(ncout, 0, "Conventions", "CF-1.5")
+
+  # close the file, writing data to disk
+  ncdf4::nc_close(ncout)
+
+  invisible()
+}
+
+# ------------------------- #
+#### Write summary methods ####
+
+#' Generic function to write summary outputs from created models.
+#'
+#' @description
+#' The [`write_summary`] function is a wrapper function to create summaries from fitted [`DistributionModel`] or
+#' [`BiodiversityScenario`] objects. This function extracts parameters and statistics about the used data
+#' from the input object and writes the output as either an \code{'rds'} or \code{'rdata'} file. Alternative, more open file formats
+#' are under consideration.
+#' @note
+#' No predictions or tabular data are saved through this function.
+#' Use [`write_output()`] to save those.
+#' @param mod Provided [`DistributionModel`] or [`BiodiversityScenario`] object.
+#' @param fname A [`character`] depicting an output filename.
+#' The suffix determines the file type of the output (Options: \code{'rds'}, \code{'rdata'}).
+#' @param partial A [`logical`] value determining whether partial variable contributions should be calculated and added
+#' to the model summary. **Note that this can be rather slow** (Default: \code{FALSE}).
+#' @param verbose [`logical`] indicating whether messages should be shown. Overwrites `getOption("ibis.setupmessages")` (Default: \code{TRUE}).
+#' @param ... Any other arguments passed on to the individual functions.
+#' @returns No R-output is created. A file is written to the target directory.
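The summary serialized here is a plain nested list; a minimal sketch of reading it back in, assuming a file previously created via `write_summary(fit, "summary.rds")`:

out <- readRDS("summary.rds")
names(out)             # "input", "params", "output"
out$params$algorithm   # class of the fitted engine
out$output$resolution  # resolution of the spatial prediction, if one was made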
+#' @examples \dontrun{
+#' x <- distribution(background) |>
+#'   add_biodiversity_poipo(virtual_points, field_occurrence = 'Observed', name = 'Virtual points') |>
+#'   add_predictors(pred_current, transform = 'scale', derivates = 'none') |>
+#'   engine_xgboost(nrounds = 2000) |> train(optim_hyperparam = FALSE, only_linear = TRUE)
+#' write_summary(x, "testmodel.rds")
+#' }
+#' @keywords utils
+
+#' @name write_summary
+#' @aliases write_summary
+#' @exportMethod write_summary
+#' @export
+NULL
+methods::setGeneric("write_summary",
+                    signature = methods::signature("mod"),
+                    function(mod, fname, partial = FALSE, verbose = getOption("ibis.setupmessages"), ...) standardGeneric("write_summary"))
+
+#' @name write_summary
+#' @rdname write_summary
+#' @usage \S4method{write_summary}{ANY, character, logical, logical}(mod, fname, partial, verbose)
+methods::setMethod(
+  "write_summary",
+  methods::signature(mod = "ANY"),
+  function(mod, fname, partial = FALSE, verbose = getOption("ibis.setupmessages"), ...) {
+    assertthat::assert_that(
+      !missing(mod),
+      is.character(fname),
+      is.logical(partial),
+      is.logical(verbose)
+    )
+    assertthat::assert_that(
+      inherits(mod, "DistributionModel") || inherits(mod, "BiodiversityScenario"),
+      msg = "Only objects created through `train()` or `project()` are supported!"
+    )
+    # Check writeable or not
+    assertthat::assert_that(
+      assertthat::is.writeable(dirname(fname)), msg = "Given input folder is not writeable!"
+    )
+
+    # Get file extension
+    ext <- tolower( tools::file_ext(fname) )
+    if(ext == "") ext <- "rds" # Assign rds as default
+    ext <- match.arg(ext, choices = c("rds", "rdata"), several.ok = FALSE)
+    fname <- paste0(tools::file_path_sans_ext(fname), ".", ext)
+    if(file.exists(fname) && (verbose && getOption('ibis.setupmessages'))) myLog('[Output]','yellow','Overwriting existing file...')
+    assertthat::assert_that(assertthat::is.writeable(dirname(fname)))
+    # --- #
+    # Gather the statistics and parameters from the provided file
+    output <- list()
+    if(inherits(mod, "DistributionModel")){
+
+      # Summarize the model object
+      model <- mod$model
+
+      # Model input summary in a tibble
+      output[["input"]][["extent"]] <- as.matrix( raster::extent( model$background ) )
+      output[["input"]][["predictors"]] <- model$predictors_types
+      if(!is.Waiver(model$offset)) output[["input"]][["offset"]] <- names(model$offset) else output[["input"]][["offset"]] <- NA
+      if(!is.Waiver(model$priors)){
+        output[["input"]][["priors"]] <- model$priors$summary()
+      } else output[["input"]][["priors"]] <- NA
+
+      # Go over biodiversity datasets
+      o <- data.frame()
+      for(i in 1:length(model$biodiversity)){
+        o <- rbind(o,
+                   data.frame(id = names(model$biodiversity)[i],
+                              name = model$biodiversity[[i]]$name,
+                              type = model$biodiversity[[i]]$type,
+                              family = model$biodiversity[[i]]$family,
+                              equation = deparse1(model$biodiversity[[i]]$equation),
+                              obs_pres = sum( model$biodiversity[[i]]$observations$observed > 0 ),
+                              obs_abs = sum( model$biodiversity[[i]]$observations$observed == 0 ),
+                              n_predictors = length( model$biodiversity[[i]]$predictors_names )
+                   )
+        )
+      }
+      output[["input"]][["biodiversity"]] <- o
+
+      # Model parameters in a tibble
+      output[["params"]][["id"]] <- as.character(model$id)
+      output[["params"]][["runname"]] <- as.character(model$runname)
+      output[["params"]][["algorithm"]] <- class(mod)[1]
+      output[["params"]][["equation"]] <- mod$get_equation()
+      # Collect settings and parameters if existing
+      if( !is.Waiver(mod$get_data("params")) ){
+        output[["params"]][["params"]] <- mod$get_data("params")
+      }
+      if( "settings" %in% names(mod) ){
+        output[["params"]][["settings"]] <- mod$settings$data
+      }
+
+      # Model summary in a tibble and formula
+      output[["output"]][["summary"]] <- mod$summary()
+      if(!is.null(mod$get_data("prediction") )){
+        output[["output"]][["resolution"]] <- raster::res( mod$get_data("prediction") )
+        output[["output"]][["prediction"]] <- names( mod$get_data("prediction") )
+      } else {
+        output[["output"]][["resolution"]] <- NA
+        output[["output"]][["prediction"]] <- NA
+      }
+      # Calculate partial estimates if set
+      if(partial){
+        if(verbose && getOption('ibis.setupmessages')) myLog('[Export]','green',paste0('Calculating partial variable contributions...'))
+        message("Not yet added") # TODO:
+        output[["output"]][["partial"]] <- NA
+      } else {
+        output[["output"]][["partial"]] <- NA
+      }
+
+    } else if(inherits(mod, "BiodiversityScenario")){
+      # Summarize the model object
+      model <- mod$get_model()
+
+      # Model input summary in a tibble
+      output[["input"]][["extent"]] <- as.matrix( raster::extent( model$model$background ) )
+      output[["input"]][["predictors"]] <- mod$predictors$get_names()
+      output[["input"]][["timerange"]] <- mod$get_timeperiod()
+      output[["input"]][["predictor_time"]] <- mod$predictors$get_time()
+      # Collect settings and parameters if existing
+      if( !is.Waiver(mod$get_data("constraints")) ){
+        output[["input"]][["constraints"]] <- mod$get_constraints()
+      }
+
+      # Model parameters in a tibble
+      output[["params"]][["id"]] <- as.character(mod$modelid)
+      output[["params"]][["runname"]] <- as.character(model$model$runname)
+      output[["params"]][["algorithm"]] <- class(model)[1]
+      if( "settings" %in% names(mod) ){
+        output[["params"]][["settings"]] <- model$settings$data
+      }
+
+      # Model summary in a tibble and formula
+      output[["output"]][["summary"]] <- mod$summary(plot = FALSE, ...)
+      if(!is.Waiver(mod$get_data() )){
+        sc_dim <- stars::st_dimensions(mod$get_data())
+        output[["output"]][["resolution"]] <- abs( c(x = sc_dim$x$delta, y = sc_dim$y$delta) )
+        output[["output"]][["prediction"]] <- names(mod$get_data())
+      } else {
+        output[["output"]][["resolution"]] <- NA
+        output[["output"]][["prediction"]] <- NA
+      }
+    }
+    assertthat::assert_that(
+      is.list(output),
+      length(output) > 0
+    )
+    # --- #
+    # Write the output
+    if(ext == "rds"){
+      saveRDS(output, fname)
+    } else if(ext == "rdata") {
+      save(output, file = fname)
+    } else {
+      message("No compatible file format found. No summary output file created!")
+    }
+    rm(output)
+    invisible()
+  }
+)
+
+# ------------------------- #
+#### Save model for later use ####
+
+#' Save a model for later use
+#'
+#' @description
+#' The `write_model` function (as opposed to `write_output`) is a generic wrapper for writing a
+#' [`DistributionModel`] to disk. It is essentially a wrapper to [`saveRDS`].
+#' Models can be loaded again via the `load_model` function.
+#'
+#' @note
+#' By default output files will be overwritten if already existing!
+#'
+#' @param mod Provided [`DistributionModel`] object.
+#' @param fname A [`character`] depicting an output filename.
+#' @param slim A [`logical`] option determining whether unnecessary entries in the model object should be deleted.
+#' This deletes, for example, predictions or any other non-model content from the object (Default: \code{FALSE}).
+#' @param verbose [`logical`] indicating whether messages should be shown. Overwrites `getOption("ibis.setupmessages")` (Default: \code{TRUE}).
+#' @returns No R-output is created. A file is written to the target directory.
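A minimal save/load round trip with `write_model` and its counterpart `load_model` (defined further below), assuming `fit` is a trained DistributionModel:

write_model(fit, "fit.rds", slim = TRUE)  # slim = TRUE drops the stored prediction to shrink the file
fit2 <- load_model("fit.rds")             # validity and engine packages are re-checked on load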
+#' @examples \dontrun{
+#' x <- distribution(background) |>
+#'   add_biodiversity_poipo(virtual_points, field_occurrence = 'Observed', name = 'Virtual points') |>
+#'   add_predictors(pred_current, transform = 'scale', derivates = 'none') |>
+#'   engine_xgboost(nrounds = 2000) |> train(optim_hyperparam = FALSE, only_linear = TRUE)
+#' write_model(x, "testmodel.rds")
+#' }
+
+#' @name write_model
+#' @aliases write_model
+#' @seealso load_model
+#' @keywords utils
+#' @exportMethod write_model
+#' @export
+NULL
+methods::setGeneric("write_model",
+                    signature = methods::signature("mod"),
+                    function(mod, fname, slim = FALSE, verbose = getOption("ibis.setupmessages")) standardGeneric("write_model"))
+
+#' @name write_model
+#' @rdname write_model
+#' @usage \S4method{write_model}{ANY, character, logical, logical}(mod, fname, slim, verbose)
+methods::setMethod(
+  "write_model",
+  methods::signature(mod = "ANY"),
+  function(mod, fname, slim = FALSE, verbose = getOption("ibis.setupmessages")) {
+    assertthat::assert_that(
+      !missing(mod),
+      is.character(fname),
+      is.logical(slim),
+      is.logical(verbose)
+    )
+    # Check that provided model is correct
+    assertthat::assert_that(inherits(mod, "DistributionModel"),
+                            !is.Waiver(mod$get_data("fit_best")) )
+    # And model format
+    assertthat::assert_that( assertthat::has_extension(fname, "rds"))
+
+    # If slim, remove some ballast
+    if(slim){
+      if(!is.Waiver(mod$get_data("prediction"))) mod$fits$prediction <- NULL
+    }
+
+    # Save output
+    if(verbose && getOption('ibis.setupmessages')) myLog('[Export]','green',paste0('Writing model to file...'))
+    saveRDS(mod, fname)
+    invisible()
+  }
+)
+
+#' Load a pre-computed model
+#'
+#' @description
+#' The `load_model` function (as opposed to `write_model`) loads a previously saved
+#' [`DistributionModel`]. It is essentially a wrapper to [`readRDS`].
+#'
+#' When models are loaded, they are briefly checked for their validity and presence of
+#' necessary components.
+#'
+#' @param fname A [`character`] depicting an output filename.
+#' @param verbose [`logical`] indicating whether messages should be shown. Overwrites `getOption("ibis.setupmessages")` (Default: \code{TRUE}).
+#' @returns A [`DistributionModel`] object.
+#' @examples \dontrun{ +#' # Load model +#' mod <- load_model("testmodel.rds") +#' +#' summary(mod) +#' } + +#' @name load_model +#' @aliases load_model +#' @seealso write_model +#' @keywords utils +#' @exportMethod load_model +#' @export +NULL +methods::setGeneric("load_model", + signature = methods::signature("fname"), + function(fname, verbose = getOption("ibis.setupmessages")) standardGeneric("load_model")) + +#' @name load_model +#' @rdname load_model +#' @usage \S4method{load_model}{character, logical}(fname, verbose) +methods::setMethod( + "load_model", + methods::signature(fname = "character"), + function(fname, verbose = getOption("ibis.setupmessages")) { + assertthat::assert_that( + is.character(fname), + is.logical(verbose) + ) + # Check that file exists and is an rds file + assertthat::assert_that( + assertthat::has_extension(fname, "rds"), + file.exists(fname) + ) + + # Get file size + fz <- (file.size(fname) * 0.000001) |> round(digits = 3) + + if(verbose && getOption('ibis.setupmessages')) myLog('[Export]','green',paste0('Loading previously serialized model (size: ',fz,' MB)')) + # Load file + mod <- readRDS(fname) + + # --- # + # Make some checks # + assertthat::assert_that( + inherits(mod, "DistributionModel"), + utils::hasName(mod, "model"), + !is.Waiver(mod$get_data("fit_best")) + ) + # Check that model type is known + assertthat::assert_that( any(sapply(class(mod), function(z) z %in% getOption("ibis.engines"))) ) + # Depending on engine, check package and load them + + if(inherits(mod, "GDB-Model")){ + check_package("mboost") + } else if(inherits(mod, "BART-Model")){ + check_package("dbarts") + } else if(inherits(mod, "INLABRU-Model")){ + check_package("INLA") + check_package("inlabru") + } else if(inherits(mod, "BREG-Model")){ + check_package("BoomSpikeSlab") + } else if(inherits(mod, "GLMNET-Model")){ + check_package("glmnet") + check_package("glmnetUtils") + } else if(inherits(mod, "STAN-Model")){ + check_package("rstan") + check_package("cmdstanr") + } else if(inherits(mod, "XGBOOST-Model")){ + check_package("xgboost") + } + + # --- # + # Return the model + return(mod) + } +) diff --git a/README.Rmd b/README.Rmd index 4c19966d..63552836 100644 --- a/README.Rmd +++ b/README.Rmd @@ -48,13 +48,13 @@ See relevant [reference site](https://iiasa.github.io/ibis.iSDM/) and [articles] Note that the package is in active development and parameters of some functions might change. **Citation:** - -Jung, Martin. 2023. “An Integrated Species Distribution Modelling Framework for Heterogeneous Biodiversity Data.” OSF Preprints. February 19. [DOI](https://osf.io/vusz8) - + +Jung, Martin. 2023. “An Integrated Species Distribution Modelling Framework for Heterogeneous Biodiversity Data.” Ecological Informatics, 102127, [DOI](https://doi.org/10.1016/j.ecoinf.2023.102127) + ## Acknowledgement IIASA -**ibis.iSDM** is developed by the Biodiversity, Ecology and Conservation group at the International Institute for Applied Systems Analysis (IIASA), Austria. +**ibis.iSDM** is developed and maintained by the Biodiversity, Ecology and Conservation group at the International Institute for Applied Systems Analysis (IIASA), Austria. diff --git a/README.md b/README.md index 53086c2d..7bcdbe27 100644 --- a/README.md +++ b/README.md @@ -49,16 +49,15 @@ Note that the package is in active development and parameters of some functions might change. **Citation:** - + Jung, Martin. 2023. “An Integrated Species Distribution Modelling -Framework for Heterogeneous Biodiversity Data.” OSF Preprints. 
February -19. [DOI](https://osf.io/vusz8) - +Framework for Heterogeneous Biodiversity Data.” Ecological Informatics, +102127, [DOI](https://doi.org/10.1016/j.ecoinf.2023.102127) ## Acknowledgement IIASA -**ibis.iSDM** is developed by the Biodiversity, Ecology and Conservation -group at the International Institute for Applied Systems Analysis -(IIASA), Austria. +**ibis.iSDM** is developed and maintained by the Biodiversity, Ecology +and Conservation group at the International Institute for Applied +Systems Analysis (IIASA), Austria. diff --git a/_pkgdown.yml b/_pkgdown.yml index 60f4231d..a74e4066 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -117,7 +117,6 @@ reference: of them are for internal use, but can be of use if input needs to be reformatted. contents: - explode_factorized_raster - - find_correlated_predictors - posterior_predict_stanfit - alignRasters - emptyraster @@ -125,6 +124,7 @@ reference: - get_rastervalue - predictor_transform - predictor_derivate + - predictor_filter - run_stan - wrap_stanmodel - starts_with("stancode") diff --git a/man/add_biodiversity_poipa.Rd b/man/add_biodiversity_poipa.Rd index 1e02257f..ae3c7cb7 100644 --- a/man/add_biodiversity_poipa.Rd +++ b/man/add_biodiversity_poipa.Rd @@ -67,8 +67,7 @@ unless the specific \link{engine} does not support generalised linear regression \examples{ \dontrun{ # Define model -x <- distribution(background) \%>\% - add_biodiversity_poipa(virtual_species) +x <- distribution(background) |> add_biodiversity_poipa(virtual_species) x } } diff --git a/man/add_biodiversity_poipo.Rd b/man/add_biodiversity_poipo.Rd index cfa3c85e..1891f3e5 100644 --- a/man/add_biodiversity_poipo.Rd +++ b/man/add_biodiversity_poipo.Rd @@ -71,7 +71,7 @@ regression. See Renner et al. 2015 for an overview. # Load virtual species virtual_species <- sf::st_read("inst/extdata/input_data.gpkg", "points") # Define model -x <- distribution(background) \%>\% +x <- distribution(background) |> add_biodiversity_poipo(virtual_species) x } diff --git a/man/add_biodiversity_polpa.Rd b/man/add_biodiversity_polpa.Rd index 32f91623..65d2b852 100644 --- a/man/add_biodiversity_polpa.Rd +++ b/man/add_biodiversity_polpa.Rd @@ -83,7 +83,7 @@ For an integration of range data as predictor or offset, see \code{\link[=add_pr } \examples{ \dontrun{ - x <- distribution(background) \%>\% + x <- distribution(background) |> add_biodiversity_polpa(protectedArea) } } diff --git a/man/add_biodiversity_polpo.Rd b/man/add_biodiversity_polpo.Rd index b0885241..6a20aa60 100644 --- a/man/add_biodiversity_polpo.Rd +++ b/man/add_biodiversity_polpo.Rd @@ -83,7 +83,7 @@ For an integration of range data as predictor or offset, see \code{\link[=add_pr } \examples{ \dontrun{ - x <- distribution(mod) \%>\% + x <- distribution(mod) |> add_biodiversity_polpo(protectedArea) } } diff --git a/man/add_constrain_MigClim.Rd b/man/add_constrain_MigClim.Rd index 876d2711..beaa3707 100644 --- a/man/add_constrain_MigClim.Rd +++ b/man/add_constrain_MigClim.Rd @@ -61,8 +61,10 @@ The barrier parameter is defined through \link{add_barrier}. } \references{ \itemize{ -\item Engler R., Hordijk W. and Guisan A. The MIGCLIM R package – seamless integration of dispersal constraints into projections of species distribution models. Ecography, -\item Robin Engler, Wim Hordijk and Loic Pellissier (2013). MigClim: Implementing dispersal into species distribution models. R package version 1.6. +\item Engler R., Hordijk W. and Guisan A. 
The MIGCLIM R package – seamless integration of +dispersal constraints into projections of species distribution models. Ecography, +\item Robin Engler, Wim Hordijk and Loic Pellissier (2013). MigClim: Implementing dispersal +into species distribution models. R package version 1.6. } } \seealso{ diff --git a/man/add_control_bias.Rd b/man/add_control_bias.Rd index 8fc626fa..76152ea3 100644 --- a/man/add_control_bias.Rd +++ b/man/add_control_bias.Rd @@ -49,8 +49,8 @@ method to \code{"offset"} will automatically point to this option. } \examples{ \dontrun{ - x <- distribution(background) \%>\% - add_predictors(covariates) \%>\% + x <- distribution(background) |> + add_predictors(covariates) |> add_control_bias(biasvariable, bias_value = NULL) } } diff --git a/man/add_latent_spatial.Rd b/man/add_latent_spatial.Rd index aae56872..1f3fd8c5 100644 --- a/man/add_latent_spatial.Rd +++ b/man/add_latent_spatial.Rd @@ -63,7 +63,7 @@ Available for all Engines. } \examples{ \dontrun{ - distribution(background) \%>\% add_latent_spatial(method = "poly") + distribution(background) |> add_latent_spatial(method = "poly") } } \references{ diff --git a/man/add_log.Rd b/man/add_log.Rd index afd58065..2bb59a04 100644 --- a/man/add_log.Rd +++ b/man/add_log.Rd @@ -23,7 +23,7 @@ console outputs, prints and messages. } \examples{ \dontrun{ - x <- distribution(background) \%>\% + x <- distribution(background) |> add_log() x } diff --git a/man/add_offset.Rd b/man/add_offset.Rd index 72589c77..f8ad3e99 100644 --- a/man/add_offset.Rd +++ b/man/add_offset.Rd @@ -45,8 +45,8 @@ Offsets specified for non-supported engines are ignored during the estimation } \examples{ \dontrun{ - x <- distribution(background) \%>\% - add_predictors(covariates) \%>\% + x <- distribution(background) |> + add_predictors(covariates) |> add_offset(nicheEstimate) } } diff --git a/man/add_offset_bias.Rd b/man/add_offset_bias.Rd index 9e9ec892..5f64bb59 100644 --- a/man/add_offset_bias.Rd +++ b/man/add_offset_bias.Rd @@ -44,8 +44,8 @@ function \code{\link[=add_offset_range]{add_offset_range()}} or the \code{bossMa } \examples{ \dontrun{ - x <- distribution(background) \%>\% - add_predictors(covariates) \%>\% + x <- distribution(background) |> + add_predictors(covariates) |> add_offset_bias(samplingBias) } } diff --git a/man/add_predictors.Rd b/man/add_predictors.Rd index 48e0a225..fff91025 100644 --- a/man/add_predictors.Rd +++ b/man/add_predictors.Rd @@ -112,7 +112,7 @@ will return an error message if these are used. } \examples{ \dontrun{ - obj <- distribution(background) \%>\% + obj <- distribution(background) |> add_predictors(covariates, transform = 'scale') obj } diff --git a/man/add_predictors_globiom.Rd b/man/add_predictors_globiom.Rd index a5bee547..15df76d0 100644 --- a/man/add_predictors_globiom.Rd +++ b/man/add_predictors_globiom.Rd @@ -65,7 +65,7 @@ called directly via \code{formatGLOBIOM()}. } \examples{ \dontrun{ - obj <- distribution(background) \%>\% + obj <- distribution(background) |> add_predictors_globiom(fname = "", transform = 'none') obj } diff --git a/man/add_pseudoabsence.Rd b/man/add_pseudoabsence.Rd index 5e81ce29..f95cc7a6 100644 --- a/man/add_pseudoabsence.Rd +++ b/man/add_pseudoabsence.Rd @@ -49,8 +49,11 @@ and the coordinate columns (which will be created if not already present). } \references{ \itemize{ -\item Stolar, J., & Nielsen, S. E. (2015). Accounting for spatially biased sampling effort in presence‐only species distribution modelling. Diversity and Distributions, 21(5), 595-608. 
-\item Bird, T.J., Bates, A.E., Lefcheck, J.S., Hill, N.A., Thomson, R.J., Edgar, G.J., Stuart-Smith, R.D., Wotherspoon, S., Krkosek, M., Stuart-Smith, J.F. and Pecl, G.T., 2014. Statistical solutions for error and bias in global citizen science datasets. Biological Conservation, 173, pp.144-154. +\item Stolar, J., & Nielsen, S. E. (2015). Accounting for spatially biased sampling effort in +presence‐only species distribution modelling. Diversity and Distributions, 21(5), 595-608. +\item Bird, T.J., Bates, A.E., Lefcheck, J.S., Hill, N.A., Thomson, R.J., Edgar, G.J., Stuart-Smith, R.D., +Wotherspoon, S., Krkosek, M., Stuart-Smith, J.F. and Pecl, G.T., 2014. Statistical solutions +for error and bias in global citizen science datasets. Biological Conservation, 173, pp.144-154. } } \keyword{train} diff --git a/man/coef.Rd b/man/coef.Rd index 99201c1b..797b0a2f 100644 --- a/man/coef.Rd +++ b/man/coef.Rd @@ -5,12 +5,12 @@ \alias{coef.DistributionModel} \title{Obtains the coefficients of a trained model} \usage{ -\method{coef}{DistributionModel}(x, ...) +\method{coef}{DistributionModel}(object, ...) } \arguments{ -\item{...}{not used.} - \item{object}{Any prepared object.} + +\item{...}{not used.} } \description{ Similar as \code{\link{summary}}, this helper function obtains the coefficients from diff --git a/man/effects.Rd b/man/effects.Rd index f09bc0d6..58d21e89 100644 --- a/man/effects.Rd +++ b/man/effects.Rd @@ -5,10 +5,12 @@ \alias{effects.DistributionModel} \title{Plot effects of trained model} \usage{ -\method{effects}{DistributionModel}(x) +\method{effects}{DistributionModel}(object, ...) } \arguments{ -\item{x}{Any fitted \link{distribution} object.} +\item{object}{Any fitted \link{distribution} object.} + +\item{...}{Not used.} } \value{ None. diff --git a/man/engine_bart.Rd b/man/engine_bart.Rd index 567396d7..04f5679c 100644 --- a/man/engine_bart.Rd +++ b/man/engine_bart.Rd @@ -9,9 +9,9 @@ engine_bart(x, iter = 1000, nburn = 250, chains = 4, type = "response", ...) \arguments{ \item{x}{\code{\link[=distribution]{distribution()}} (i.e. \code{\linkS4class{BiodiversityDistribution}}) object.} -\item{iter}{A \code{\link{numeric}} estimate of the number of trees to be used in the sum-of-trees formulation.} +\item{iter}{A \code{\link{numeric}} estimate of the number of trees to be used in the sum-of-trees formulation (Default: \code{1000}).} -\item{nburn}{A \code{\link{numeric}} estimate of the burn in samples.} +\item{nburn}{A \code{\link{numeric}} estimate of the burn in samples (Default: \code{250}).} \item{chains}{A number of the number of chains to be used (Default: \code{4}).} @@ -19,6 +19,9 @@ engine_bart(x, iter = 1000, nburn = 250, chains = 4, type = "response", ...) \item{...}{Other options.} } +\value{ +An \link{engine}. +} \description{ The Bayesian regression approach to a sum of complementary trees is to shrink the said fit of each tree through a regularization prior. BART models provide diff --git a/man/engine_breg.Rd b/man/engine_breg.Rd index 835d4b06..24d7750d 100644 --- a/man/engine_breg.Rd +++ b/man/engine_breg.Rd @@ -23,6 +23,9 @@ engine_breg( \item{...}{Other none specified parameters passed on to the model.} } +\value{ +An \link{engine}. +} \description{ Efficient MCMC algorithm for linear regression models that makes use of 'spike-and-slab' priors for some modest regularization on the amount of posterior @@ -35,6 +38,12 @@ not available. This engines allows the estimation of linear and non-linear effects via the \code{"only_linear"} option specified in \link{train}. 
 }
+\examples{
+\dontrun{
+# Add BREG as an engine
+x <- distribution(background) |> engine_breg(iter = 1000)
+}
+}
 \references{
 \itemize{
 \item Nguyen, K., Le, T., Nguyen, V., Nguyen, T., & Phung, D. (2016, November). Multiple kernel learning with data augmentation. In Asian Conference on Machine Learning (pp. 49-64). PMLR.
diff --git a/man/engine_gdb.Rd b/man/engine_gdb.Rd
index a1615347..3b29f08b 100644
--- a/man/engine_gdb.Rd
+++ b/man/engine_gdb.Rd
@@ -27,6 +27,9 @@ Available options are \code{'inbag'}, \code{'oobag'} and \code{'none'}. (Default
 
 \item{...}{Other variables or control parameters}
 }
+\value{
+An \link{engine}.
+}
 \description{
 Gradient descent boosting is an efficient way to optimize any loss function
 of a generalized linear or additive model (such as the GAMs available through the \link{mgcv} R-package).
@@ -45,6 +48,12 @@ particularly useful for spatial projections. Such as for instance the ability to
 spatial baselearners via \link{add_latent} or the specification of monotonically
 constrained priors via \link{GDBPrior}.
 }
+\examples{
+\dontrun{
+# Add GDB as an engine
+x <- distribution(background) |> engine_gdb(iter = 1000)
+}
+}
 \references{
 \itemize{
 \item Hofner, B., Mayr, A., Robinzonov, N., & Schmid, M. (2014). Model-based boosting in R: a hands-on tutorial using the R package mboost. Computational statistics, 29(1-2), 3-35.
diff --git a/man/engine_glmnet.Rd b/man/engine_glmnet.Rd
index 5bb912e3..1c502696 100644
--- a/man/engine_glmnet.Rd
+++ b/man/engine_glmnet.Rd
@@ -28,6 +28,9 @@ determined deterministically (Default: \code{NULL}).}
 
 \item{...}{Other parameters passed on to glmnet.}
 }
+\value{
+An \link{engine}.
+}
 \description{
 This engine allows the estimation of linear coefficients using either ridge, lasso or elastic net regressions techniques.
 Backbone of this engine is the \pkg{glmnet} R-package which is commonly used in SDMs,
@@ -52,6 +55,12 @@ of the two.
 The optimal lambda parameter can be determined via cross-validation. For this option
 set \code{"varsel"} in \code{train()} to \code{"reg"}.
 }
+\examples{
+\dontrun{
+# Add GLMNET as an engine
+x <- distribution(background) |> engine_glmnet(alpha = 0)
+}
+}
 \references{
 \itemize{
 \item Jerome Friedman, Trevor Hastie, Robert Tibshirani (2010). Regularization Paths for Generalized Linear Models via Coordinate Descent. Journal of Statistical Software, 33(1), 1-22. URL https://www.jstatsoft.org/v33/i01/.
diff --git a/man/engine_inla.Rd b/man/engine_inla.Rd
index 08023019..04065df8 100644
--- a/man/engine_inla.Rd
+++ b/man/engine_inla.Rd
@@ -68,6 +68,9 @@ Either summarizing the linear \code{"predictor"} or \code{"response"} (Default:
 
 \item{...}{Other options.}
 }
+\value{
+An \link{engine}.
+}
 \description{
 Allows a full Bayesian analysis of linear and additive models using Integrated Nested Laplace approximation.
 Engine has been largely superceded by the \link{engine_bru} package and users are advised to us this one,
@@ -92,6 +95,12 @@ created. Priors can be set via \link{INLAPrior}.
 }
+\examples{
+\dontrun{
+# Add INLA as an engine (with a custom mesh)
+x <- distribution(background) |> engine_inla(mesh = my_mesh)
+}
+}
 \references{
 \itemize{
 \item Havard Rue, Sara Martino, and Nicholas Chopin (2009), Approximate Bayesian Inference for Latent Gaussian Models Using Integrated Nested Laplace Approximations (with discussion), Journal of the Royal Statistical Society B, 71, 319-392.
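The \examples blocks added to the engine man pages all follow the same pattern; an end-to-end sketch combining one of them with training (hypothetical `background` and `virtual_points` objects, as in the other examples):

x <- distribution(background) |>
  add_biodiversity_poipo(virtual_points, field_occurrence = "Observed") |>
  engine_inlabru()  # any of the engines documented here can be swapped in
fit <- train(x)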
diff --git a/man/engine_inlabru.Rd b/man/engine_inlabru.Rd index a6d0021f..d960a6d5 100644 --- a/man/engine_inlabru.Rd +++ b/man/engine_inlabru.Rd @@ -51,6 +51,9 @@ Default is an educated guess (Default: \code{NULL}).} \item{...}{Other variables} } +\value{ +An \link{engine}. +} \description{ Model components are specified with general inputs and mapping methods to the latent variables, and the predictors are specified via general R expressions, @@ -80,6 +83,12 @@ created. Priors can be set via \link{INLAPrior}. } +\examples{ +\dontrun{ +# Add inlabru as an engine +x <- distribution(background) |> engine_inlabru() +} +} \references{ \itemize{ \item Bachl, F. E., Lindgren, F., Borchers, D. L., & Illian, J. B. (2019). inlabru: an R package for Bayesian spatial modelling from ecological survey data. Methods in Ecology and Evolution, 10(6), 760-766. diff --git a/man/engine_stan.Rd b/man/engine_stan.Rd index cf60432e..d2bf6052 100644 --- a/man/engine_stan.Rd +++ b/man/engine_stan.Rd @@ -43,6 +43,9 @@ See \code{\link{cmdstanr}} package for more details. (Default: \code{"sampling"} \item{...}{Other variables} } +\value{ +An \link{engine}. +} \description{ Stan is probabilistic programming language that can be used to specify most types of statistical linear and non-linear regression models. @@ -59,6 +62,12 @@ approximate inference forms through penalized maximum likelihood estimation (see \note{ The function \code{obj$stancode()} can be used to print out the stancode of the model. } +\examples{ +\dontrun{ +# Add Stan as an engine +x <- distribution(background) |> engine_stan(iter = 1000) +} +} \references{ \itemize{ \item Jonah Gabry and Rok Češnovar (2021). cmdstanr: R Interface to 'CmdStan'. https://mc-stan.org/cmdstanr, https://discourse.mc-stan.org. diff --git a/man/engine_xgboost.Rd b/man/engine_xgboost.Rd index 465b3be4..58f28b9b 100644 --- a/man/engine_xgboost.Rd +++ b/man/engine_xgboost.Rd @@ -50,6 +50,9 @@ tresting dataset (Default: \code{0.75}).} \item{...}{Other none specified parameters.} } +\value{ +An \link{engine}. +} \description{ Allows to estimate eXtreme gradient descent boosting for tree-based or linear boosting regressions. The XGBoost engine is a flexible, yet powerful engine with many customization options, @@ -67,6 +70,12 @@ ibis this is possible via \code{\link{XGBPrior}}. However constraints are availa \note{ \emph{'Machine learning is statistics minus any checking of models and assumptions‘} ~ Brian D. Ripley, useR! 
2004, Vienna } +\examples{ +\dontrun{ +# Add xgboost as an engine +x <- distribution(background) |> engine_xgboost(iter = 4000) +} +} \references{ \itemize{ \item Tianqi Chen and Carlos Guestrin, "XGBoost: A Scalable Tree Boosting System", 22nd SIGKDD Conference on Knowledge Discovery and Data Mining, 2016, https://arxiv.org/abs/1603.02754 diff --git a/man/find_correlated_predictors.Rd b/man/find_correlated_predictors.Rd deleted file mode 100644 index f137707d..00000000 --- a/man/find_correlated_predictors.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{find_correlated_predictors} -\alias{find_correlated_predictors} -\title{Filter a set of correlated predictors to fewer ones} -\usage{ -find_correlated_predictors(env, keep = NULL, cutoff = 0.7, method = "pearson") -} -\arguments{ -\item{env}{A \code{\link{data.frame}} with extracted environmental covariates for a given species.} - -\item{keep}{A \code{\link{vector}} with variables to keep regardless.} - -\item{cutoff}{A \code{\link{numeric}} variable specifying the maximal correlation cutoff.} - -\item{method}{Which method to use for constructing the correlation matrix (Options: \code{'pearson'}| \code{'spearman'}| \code{'kendal'})} -} -\value{ -vector of variable names to exclude -} -\description{ -Filter a set of correlated predictors to fewer ones -} -\concept{Code inspired from the [`caret`] package} -\keyword{utils} diff --git a/man/get_priors.Rd b/man/get_priors.Rd index 342e43c9..8c36fdce 100644 --- a/man/get_priors.Rd +++ b/man/get_priors.Rd @@ -27,10 +27,10 @@ Not all engines support priors in similar ways. See the vignettes and help pages } \examples{ \dontrun{ - mod <- distribution(background) \%>\% - add_predictors(covariates) \%>\% - add_biodiversity_poipo(points) \%>\% - engine_inlabru() \%>\% + mod <- distribution(background) |> + add_predictors(covariates) |> + add_biodiversity_poipo(points) |> + engine_inlabru() |> train() get_priors(mod, target_engine = "BART") } diff --git a/man/ibis.iSDM-package.Rd b/man/ibis.iSDM-package.Rd new file mode 100644 index 00000000..39731000 --- /dev/null +++ b/man/ibis.iSDM-package.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ibis.iSDM-package.R +\docType{package} +\name{ibis.iSDM-package} +\alias{ibis.iSDM} +\alias{ibis.iSDM-package} +\title{ibis.iSDM: Modelling Framework for Integrated Biodiversity Distribution Scenarios} +\description{ +Integrated framework of modelling the distribution of species and ecosystems in a suitability framing. This package allows the estimation of integrated species distribution models (iSDM) based on several sources of evidence and provided presence-only and presence-absence datasets. It makes heavy use of point-process models for estimating habitat suitability and allows to include spatial latent effects and priors in the estimation. To do so 'ibis.iSDM' supports a number of engines for Bayesian and more non-parametric machine learning estimation. Further, the 'ibis.iSDM' is specifically customized to support spatial-temporal projections of habitat suitability into the future. 
+}
+\seealso{
+Useful links:
+\itemize{
+  \item \url{https://iiasa.github.io/ibis.iSDM/}
+  \item Report bugs at \url{https://github.com/iiasa/ibis.iSDM/issues}
+}
+
+}
+\author{
+\strong{Maintainer}: Martin Jung \email{jung@iiasa.ac.at} (\href{https://orcid.org/0000-0002-7569-1390}{ORCID}) [copyright holder]
+
+Other contributors:
+\itemize{
+  \item Maximilian H.K. Hesselbarth \email{hesselbarth@iiasa.ac.at} (\href{https://orcid.org/0000-0003-1125-9918}{ORCID}) [contributor]
+}
+
+}
+\keyword{internal}
diff --git a/man/pipe.Rd b/man/pipe.Rd
deleted file mode 100644
index 7e632fd6..00000000
--- a/man/pipe.Rd
+++ /dev/null
@@ -1,35 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/misc.R
-\name{\%>\%}
-\alias{\%>\%}
-\alias{pipe}
-\title{Pipe operator}
-\arguments{
-\item{lhs, }{rhs An object and a function.}
-}
-\value{
-An object.
-}
-\description{
-This package uses the pipe operator (\verb{\\\%>\\\%}) to express nested code
-as a series of imperative procedures.
-}
-\examples{
-# set seed for reproducibility
-set.seed(500)
-
-# generate 100 random numbers and calculate the mean
-mean(runif(100))
-
-# reset the seed
-set.seed(500)
-
-# repeat the previous procedure but use the pipe operator instead of nesting
-# function calls inside each other.
-runif(100) \%>\% mean()
-
-}
-\seealso{
-\code{\link[magrittr:pipe]{magrittr::\%>\%()}}, \code{\link[=tee]{tee()}}.
-}
-\keyword{internal}
diff --git a/man/predictor_filter.Rd b/man/predictor_filter.Rd
new file mode 100644
index 00000000..3c0aa4b0
--- /dev/null
+++ b/man/predictor_filter.Rd
@@ -0,0 +1,53 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils-predictors.R
+\name{predictor_filter}
+\alias{predictor_filter}
+\title{Filter a set of correlated predictors to fewer ones}
+\usage{
+predictor_filter(env, keep = NULL, method = "pearson", ...)
+}
+\arguments{
+\item{env}{A \code{\link{data.frame}} or \code{\link{matrix}} with extracted environmental covariates for a given species.}
+
+\item{keep}{A \code{\link{vector}} with variables to keep regardless. These are usually variables for which prior
+information is known.}
+
+\item{method}{Which method to use for constructing the correlation matrix (Options: \code{'pearson'} (Default),
+\code{'spearman'}, \code{'kendall'}, \code{'abess'}, or \code{'boruta'}).}
+
+\item{...}{Other options for a specific method}
+
+\item{obs}{A \code{\link{vector}} with observational records to use for determining variable importance. Can be \code{NULL}.}
+}
+\value{
+A \code{\link{character}} \code{\link{vector}} of variable names to be excluded.
+If the function fails for some reason, \code{NULL} is returned.
+}
+\description{
+This function helps to remove highly correlated variables from a set of predictors. It supports multiple options,
+some of which require both environmental predictors and observations, while others need only predictors.
+
+Some of the options require different packages to be pre-installed, such as \link{ranger} or \link{Boruta}.
+}
+\details{
+Available options are:
+\itemize{
+\item \code{"none"} No prior variable removal is performed (Default).
+\item \code{"pearson"}, \code{"spearman"} or \code{"kendall"} Makes use of pairwise comparisons to identify and
+remove highly collinear predictors (Pearson's \code{r >= 0.7}).
+\item \code{"abess"} A-priori adaptive best subset selection of covariates via the \link{abess} package (see References).
Note that this
+effectively fits a separate generalized linear model to reduce the number of covariates.
+\item \code{"boruta"} Uses the \link{Boruta} package to identify non-informative features.
+}
+}
+\note{
+Using this function on predictors effectively means that a separate model is fitted on the data
+with all the assumptions that come with it (e.g. linearity, appropriateness of response, normality, etc).
+}
+\examples{
+\dontrun{
+ # Remove highly correlated predictors
+ env <- predictor_filter( env, method = "pearson")
+}
+}
+\keyword{utils}
diff --git a/man/find_subset_of_predictors.Rd b/man/predictors_filter_abess.Rd
similarity index 69%
rename from man/find_subset_of_predictors.Rd
rename to man/predictors_filter_abess.Rd
index a1744b7d..b265d81c 100644
--- a/man/find_subset_of_predictors.Rd
+++ b/man/predictors_filter_abess.Rd
@@ -1,23 +1,26 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/utils.R
-\name{find_subset_of_predictors}
-\alias{find_subset_of_predictors}
+% Please edit documentation in R/utils-predictors.R
+\name{predictors_filter_abess}
+\alias{predictors_filter_abess}
 \title{Apply the adaptive best subset selection framework on a set of predictors}
 \usage{
-find_subset_of_predictors(
+predictors_filter_abess(
   env,
   observed,
+  method,
   family,
   tune.type = "cv",
   lambda = 0,
   weight = NULL,
-  keep = NULL
+  keep = NULL,
+  ...
 )
 }
 \arguments{
-\item{env}{A \code{\link{data.frame}} with extracted environmental covariates for a given species.}
+\item{env}{A \code{\link{data.frame}} or \code{\link{matrix}} with extracted environmental covariates for a given species.}
 
-\item{observed}{A \code{\link{vector}} with the observed response variable.}
+\item{method}{Which method to use for constructing the correlation matrix (Options: \code{'pearson'} (Default),
+\code{'spearman'}, \code{'kendall'}, \code{'abess'}, or \code{'boruta'}).}
 
 \item{family}{A \code{\link{character}} indicating the family the observational data originates from.}
 
@@ -28,10 +31,13 @@ Options are \code{c("gic", "ebic", "bic", "aic", "cv")} as listed in \link{abess
 
 \item{weight}{Observation weights. When weight = \code{NULL}, we set weight = \code{1} for each observation as default.}
 
-\item{keep}{A \code{\link{vector}} with variables to keep regardless (Default: \code{NULL}).}
+\item{keep}{A \code{\link{vector}} with variables to keep regardless. These are usually variables for which prior
+information is known.}
+
+\item{...}{Other options for a specific method}
 }
 \value{
-vector of variable names to exclude
+A \code{\link{vector}} of variable names to exclude
 }
 \description{
 This is a wrapper function to fit the adaptive subset selection procedure outlined
diff --git a/man/predictors_filter_boruta.Rd b/man/predictors_filter_boruta.Rd
new file mode 100644
index 00000000..352eb03f
--- /dev/null
+++ b/man/predictors_filter_boruta.Rd
@@ -0,0 +1,49 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils-predictors.R
+\name{predictors_filter_boruta}
+\alias{predictors_filter_boruta}
+\title{All relevant feature selection using Boruta}
+\usage{
+predictors_filter_boruta(
+  env,
+  observed,
+  method,
+  keep = NULL,
+  iter = 100,
+  verbose = getOption("ibis.setupmessages"),
+  ...
+)
+}
+\arguments{
+\item{env}{A \code{\link{data.frame}} or \code{\link{matrix}} with extracted environmental covariates for a given species.}
+
+\item{method}{Which method to use for constructing the correlation matrix (Options: \code{'pearson'} (Default),
+\code{'spearman'}, \code{'kendall'}, \code{'abess'}, or \code{'boruta'}).}
+
+\item{keep}{A \code{\link{vector}} with variables to keep regardless. These are usually variables for which prior
+information is known.}
+
+\item{iter}{A \code{\link{numeric}} giving the maximal number of runs (Default: \code{100}). Increase this value if too many tentative attributes are left.}
+
+\item{verbose}{\code{\link{logical}} whether to be chatty.}
+
+\item{...}{Other options for a specific method}
+}
+\value{
+A \code{\link{vector}} of variable names to exclude.
+}
+\description{
+This function uses the \link{Boruta} package to identify predictor variables with little information content. It iteratively
+compares importances of attributes with importances of shadow attributes, created by shuffling original ones.
+Attributes that have significantly worse importance than shadow ones are consecutively dropped.
+}
+\note{
+This method depends on the \link{ranger} package to iteratively fit randomForest models.
+}
+\references{
+\itemize{
+\item Miron B. Kursa, Witold R. Rudnicki (2010). Feature Selection with the Boruta Package. Journal of Statistical Software, 36(11), 1-13. URL https://doi.org/10.18637/jss.v036.i11.
+}
+}
+\keyword{internal}
+\keyword{utils,}
diff --git a/man/predictors_filter_collinearity.Rd b/man/predictors_filter_collinearity.Rd
new file mode 100644
index 00000000..e6da2a7a
--- /dev/null
+++ b/man/predictors_filter_collinearity.Rd
@@ -0,0 +1,36 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils-predictors.R
+\name{predictors_filter_collinearity}
+\alias{predictors_filter_collinearity}
+\title{Identify collinear predictors}
+\usage{
+predictors_filter_collinearity(
+  env,
+  keep = NULL,
+  cutoff = getOption("ibis.corPred"),
+  method = "pearson",
+  ...
+)
+}
+\arguments{
+\item{env}{A \code{\link{data.frame}} or \code{\link{matrix}} with extracted environmental covariates for a given species.}
+
+\item{keep}{A \code{\link{vector}} with variables to keep regardless. These are usually variables for which prior
+information is known.}
+
+\item{cutoff}{A \code{\link{numeric}} variable specifying the maximal correlation cutoff.}
+
+\item{method}{Which method to use for constructing the correlation matrix (Options: \code{'pearson'} (Default),
+\code{'spearman'}, \code{'kendall'}, \code{'abess'}, or \code{'boruta'}).}
+
+\item{...}{Other options for a specific method}
+}
+\value{
+A \code{\link{vector}} of variable names to exclude
+}
+\description{
+Identify collinear predictors
+}
+\concept{Code inspired by the [`caret`] package}
+\keyword{internal}
+\keyword{utils,}
diff --git a/man/pseudoabs_settings.Rd b/man/pseudoabs_settings.Rd
index e8691994..5ce508e5 100644
--- a/man/pseudoabs_settings.Rd
+++ b/man/pseudoabs_settings.Rd
@@ -66,7 +66,8 @@ the target areas for grid cells with non-zero values.
 }
 \examples{
 \dontrun{
-# This setting generates 10000 pseudo-absence points outside the minimum convex polygon of presence points
+# This setting generates 10000 pseudo-absence points outside the minimum convex polygon
+# of presence points
ass1 <- pseudoabs_settings(nrpoints = 10000, method = 'mcp', inside = FALSE)
 
 # This setting would match the number of presence-absence points directly.
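Such a settings object is then handed to add_pseudoabsence, as in the man page example continued below; a compact sketch, assuming `virtual_points` carries an 'Observed' column and that the settings object is passed via a `settings` argument:

ass1 <- pseudoabs_settings(nrpoints = 10000, method = "mcp", inside = FALSE)
abs_pts <- add_pseudoabsence(virtual_points, field_occurrence = "Observed", settings = ass1)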
@@ -80,8 +81,10 @@ all_my_points <- add_pseudoabsence(df = virtual_points, field_occurrence = 'Obse } \references{ \itemize{ -\item Renner IW, Elith J, Baddeley A, Fithian W, Hastie T, Phillips SJ, Popovic G, Warton DI. 2015. Point process models for presence-only analysis. Methods in Ecology and Evolution 6:366–379. DOI: 10.1111/2041-210X.12352. -\item Renner, I. W., & Warton, D. I. (2013). Equivalence of MAXENT and Poisson point process models for species distribution modeling in ecology. Biometrics, 69(1), 274-281. +\item Renner IW, Elith J, Baddeley A, Fithian W, Hastie T, Phillips SJ, Popovic G, Warton DI. 2015. Point process +models for presence-only analysis. Methods in Ecology and Evolution 6:366–379. DOI: 10.1111/2041-210X.12352. +\item Renner, I. W., & Warton, D. I. (2013). Equivalence of MAXENT and Poisson point +process models for species distribution modeling in ecology. Biometrics, 69(1), 274-281. } } \keyword{train} diff --git a/man/rm_predictors.Rd b/man/rm_predictors.Rd index 23f28a72..34297263 100644 --- a/man/rm_predictors.Rd +++ b/man/rm_predictors.Rd @@ -20,8 +20,8 @@ See Examples. } \examples{ \dontrun{ -distribution(background) \%>\% - add_predictors(my_covariates) \%>\% +distribution(background) |> + add_predictors(my_covariates) |> rm_predictors(names = "Urban") } } diff --git a/man/sel_predictors.Rd b/man/sel_predictors.Rd index 46be7278..f5aa49d1 100644 --- a/man/sel_predictors.Rd +++ b/man/sel_predictors.Rd @@ -20,8 +20,8 @@ See Examples. } \examples{ \dontrun{ -distribution(background) \%>\% - add_predictors(my_covariates) \%>\% +distribution(background) |> + add_predictors(my_covariates) |> sel_predictors(names = c("Forest", "Elevation")) } } diff --git a/man/similarity.Rd b/man/similarity.Rd index 736caf75..6da6a773 100644 --- a/man/similarity.Rd +++ b/man/similarity.Rd @@ -72,8 +72,11 @@ plot( } \references{ \itemize{ -\item Elith, J., Kearney, M., and Phillips, S. (2010) "The art of modelling range-shifting species" https://doi.org/10.1111/j.2041-210X.2010.00036.x \emph{Methods in Ecology and Evolution}, 1: 330-342 -\item Mesgaran, M.B., Cousens, R.D. and Webber, B.L. (2014) "Here be dragons: a tool for quantifying novelty due to covariate range and correlation change when projecting species distribution models" https://doi.org/10.1111/ddi.12209 \emph{Diversity and Distributions}, 20: 1147-1159. +\item Elith, J., Kearney, M., and Phillips, S. (2010) "The art of modelling range-shifting +species" https://doi.org/10.1111/j.2041-210X.2010.00036.x \emph{Methods in Ecology and Evolution}, 1: 330-342 +\item Mesgaran, M.B., Cousens, R.D. and Webber, B.L. (2014) "Here be dragons: a tool +for quantifying novelty due to covariate range and correlation change when projecting +species distribution models" https://doi.org/10.1111/ddi.12209 \emph{Diversity and Distributions}, 20: 1147-1159. } } \seealso{ diff --git a/man/summary.Rd b/man/summary.Rd index 48e9bb9e..d06fa438 100644 --- a/man/summary.Rd +++ b/man/summary.Rd @@ -10,20 +10,20 @@ \alias{summary.Settings} \title{Summarises a trained model or predictor object} \usage{ -\method{summary}{distribution}(x, ...) +\method{summary}{distribution}(object, ...) -\method{summary}{DistributionModel}(x, ...) +\method{summary}{DistributionModel}(object, ...) -\method{summary}{PredictorDataset}(x, ...) +\method{summary}{PredictorDataset}(object, ...) -\method{summary}{BiodiversityScenario}(x, ...) +\method{summary}{BiodiversityScenario}(object, ...) -\method{summary}{PriorList}(x, ...) +\method{summary}{PriorList}(object, ...) 
-\method{summary}{Settings}(x, ...)
+\method{summary}{Settings}(object, ...)
 }
 \arguments{
-\item{x}{Any prepared object.}
+\item{object}{Any prepared object.}
 
 \item{...}{not used.}
 }
@@ -38,11 +38,11 @@ When unsure, it is usually a good strategy to run \link{summary} on any object.
 \examples{
 \dontrun{
 # Example with a trained model
-x <- distribution(background) \%>\%
+x <- distribution(background) |>
  # Presence-absence data
- add_biodiversity_poipa(surveydata) \%>\%
+ add_biodiversity_poipa(surveydata) |>
  # Add predictors and scale them
- add_predictors(env = predictors) \%>\%
+ add_predictors(env = predictors) |>
  # Use glmnet and lasso regression for estimation
  engine_glmnet(alpha = 1)
 # Train the model
diff --git a/man/train.Rd b/man/train.Rd
index 714d9856..b430b4a6 100644
--- a/man/train.Rd
+++ b/man/train.Rd
@@ -9,8 +9,8 @@
 train(
   x,
   runname,
-  rm_corPred = FALSE,
-  varsel = "none",
+  filter_predictors = "none",
+  optim_hyperparam = FALSE,
   inference_only = FALSE,
   only_linear = TRUE,
   method_integration = "predictor",
@@ -27,20 +27,21 @@ train(
 \item{runname}{A \code{\link{character}} name of the trained run.}
 
-\item{rm_corPred}{Remove highly correlated predictors (Default: \code{FALSE}). This option
-removes - based on pairwise comparisons - those covariates that are highly collinear (Pearson's \code{r >= 0.7}).}
-
-\item{varsel}{Perform a variable selection on the set of predictors either prior to building the model
-or via variable selection / regularization of the model. Available options are:
+\item{filter_predictors}{A \code{\link{character}} defining whether and how predictors are to be filtered
+prior to any model estimation.
+Available options are:
 \itemize{
-\item \code{\link{none}} for no or default priors and no extensive hyperparameter search.
-\item \code{\link{reg}} Model selection either through DIC or regularization / hyperparameter tuning depending on the
-engine (Default).
-\item \code{\link{abess}} A-priori adaptive best subset selection of covariates via the \link{abess} package (see References).
-Note that this effectively fits a separate generalized linear model to reduce the number of covariates.
-Can be helpful for engines that don't directly support efficient variable regularization and when \code{N>100}.
+\item \code{"none"} No prior variable removal is performed (Default).
+\item \code{"pearson"}, \code{"spearman"} or \code{"kendall"} Makes use of pairwise comparisons to identify and
+remove highly collinear predictors (correlation \code{r >= 0.7}).
+\item \code{"abess"} A-priori adaptive best subset selection of covariates via the \link{abess} package (see References). Note that this
+effectively fits a separate generalized linear model to reduce the number of covariates.
+\item \code{"boruta"} Uses the \link{Boruta} package to identify non-informative features.
 }}
 
+\item{optim_hyperparam}{Perform a search for optimal model hyperparameters prior
+to building the final model (Default: \code{FALSE}).}
+
 \item{inference_only}{By default the \link{engine} is used to create
 a spatial prediction of the suitability surface, which can take time. If only
 inferences of the strength of relationship between covariates and observations are required, this parameter
@@ -55,6 +56,7 @@ How non-linearity is captured depends on the used \link{engine}.}
 than one \code{\linkS4class{BiodiversityDataset}} object is provided in \code{x}.
 Particular relevant for engines that do not support the integration of more than one dataset.
 Integration methods are generally sensitive to the order in which they have been added to the \code{\link{BiodiversityDistribution}} object.
+
 Available options are:
 \itemize{
 \item \code{"predictor"} The predicted output of the first (or previously fitted) models are
@@ -101,20 +103,22 @@ The resulting object contains both a \code{\link{fit_best}} object of the estima
 a \link{RasterLayer} object named \code{\link{prediction}} that contains the spatial prediction of the model.
 These objects can be requested via \code{object$get_data("fit_best")}.
 
-Available options in this function include:
+Other parameters in this function:
 \itemize{
-\item \code{"rm_corPred"} Setting this to \code{TRUE} removes highly correlated variables for the observation
-prior to fitting.
-\item \code{"varsel"} This option allows to make use of hyper-parameter search for several models (\code{"reg"}) or
-alternatively of variable selection methods to further reduce model complexity. Generally substantially increases
-runtime. The option makes use of the \code{"abess"} approach (Zhu et al. 2020) to identify and remove the least-important
-variables.
+\item \code{"filter_predictors"} The parameter can be set to various options to remove highly correlated variables or those
+with little additional information gain prior to any estimation. Available options are \code{"none"} (Default), \code{"pearson"} for
+applying a \code{0.7} correlation cutoff, \code{"abess"} for the regularization framework by Zhu et al. (2020), or \code{"RF"} or \code{"randomforest"}
+for removing the least important variables according to a randomForest model. \strong{Note}: This filtering is only applied to
+predictors for which no prior has been provided (e.g. potentially non-informative ones).
+\item \code{"optim_hyperparam"} This option enables a hyper-parameter search for several models, which can improve
+prediction accuracy, although at a substantial increase in computational cost.
+\item \code{"method_integration"} Only relevant if more than one \code{\link{BiodiversityDataset}} is supplied and when
 the engine does not support joint integration of likelihoods.
 See also Miller et al. (2019) in the references for more details on different
 types of integration. Of course, if users want more control about this aspect, another option
 is to fit separate models and make use of the \link{add_offset}, \link{add_offset_range} and \link{ensemble}
 functionalities.
-\item \code{"clamp"} Clamps the projection predictors to the range of values observed during model training.
+\item \code{"clamp"} Boolean parameter enabling clamping of the projection predictors to the range of values observed
+during model training.
 }
 }
 \note{
@@ -125,15 +129,15 @@ be obtained from the same data and parameters and not all necessarily make sense
 \examples{
 \dontrun{
 # Fit a linear penalized logistic regression model via stan
-  x <- distribution(background) \%>\%
+  x <- distribution(background) |>
     # Presence-absence data
-    add_biodiversity_poipa(surveydata) \%>\%
+    add_biodiversity_poipa(surveydata) |>
     # Add predictors and scale them
-    add_predictors(env = predictors, transform = "scale", derivates = "none") \%>\%
-    # Use stan for estimation
+    add_predictors(env = predictors, transform = "scale", derivates = "none") |>
+    # Use Stan for estimation
     engine_stan(chains = 2, iter = 1000, warmup = 500)
   # Train the model
-  mod <- train(x, only_linear = TRUE, varsel = 'none')
+  mod <- train(x, only_linear = TRUE, filter_predictors = 'pearson')
   mod
 }
 }
diff --git a/man/write_output.Rd b/man/write_output.Rd
index ad7f7ab9..47798641 100644
--- a/man/write_output.Rd
+++ b/man/write_output.Rd
@@ -42,10 +42,10 @@ By default output files will be overwritten if already existing!
 }
 \examples{
 \dontrun{
-x <- distribution(background) \%>\%
-  add_biodiversity_poipo(virtual_points, field_occurrence = 'Observed', name = 'Virtual points') \%>\%
-  add_predictors(pred_current, transform = 'scale',derivates = 'none') \%>\%
-  engine_xgboost(nrounds = 2000) \%>\% train(varsel = FALSE, only_linear = TRUE)
+x <- distribution(background) |>
+  add_biodiversity_poipo(virtual_points, field_occurrence = 'Observed', name = 'Virtual points') |>
+  add_predictors(pred_current, transform = 'scale',derivates = 'none') |>
+  engine_xgboost(nrounds = 2000) |> train(filter_predictors = 'none', only_linear = TRUE)
 write_output(x, "testmodel.tif")
}
}
diff --git a/man/write_summary.Rd b/man/write_summary.Rd
index a77f289d..b7625261 100644
--- a/man/write_summary.Rd
+++ b/man/write_summary.Rd
@@ -34,10 +34,10 @@ Use \code{\link[=write_output]{write_output()}} to save those.
}
\examples{
\dontrun{
-x <- distribution(background) \%>\%
-  add_biodiversity_poipo(virtual_points, field_occurrence = 'Observed', name = 'Virtual points') \%>\%
-  add_predictors(pred_current, transform = 'scale',derivates = 'none') \%>\%
-  engine_xgboost(nrounds = 2000) \%>\% train(varsel = FALSE, only_linear = TRUE)
+x <- distribution(background) |>
+  add_biodiversity_poipo(virtual_points, field_occurrence = 'Observed', name = 'Virtual points') |>
+  add_predictors(pred_current, transform = 'scale',derivates = 'none') |>
+  engine_xgboost(nrounds = 2000) |> train(filter_predictors = 'none', only_linear = TRUE)
 write_summary(x, "testmodel.rds")
}
}
diff --git a/tests/testthat/test_BiodiversityDistribution.R b/tests/testthat/test_BiodiversityDistribution.R
index 3feb0ece..6ff82b25 100644
--- a/tests/testthat/test_BiodiversityDistribution.R
+++ b/tests/testthat/test_BiodiversityDistribution.R
@@ -9,10 +9,10 @@ test_that('Setting up a distribution model',{
   skip_if(condition = tryCatch(expr = cmdstanr::cmdstan_path(), error = function(e) return(TRUE)), message = "No cmdstan path")
 
-  suppressWarnings( library(raster) )
-  suppressWarnings( library(sf) )
-  suppressWarnings( library(rgeos) )
-  suppressWarnings( library(igraph) )
+  suppressWarnings( requireNamespace("raster", quietly = TRUE) )
+  suppressWarnings( requireNamespace("sf", quietly = TRUE) )
+  suppressWarnings( requireNamespace("rgeos", quietly = TRUE) )
+  suppressWarnings( requireNamespace("igraph", quietly = TRUE) )
 
   options("ibis.setupmessages" = FALSE)
   # Background Raster
@@ -39,7 +39,7 @@ test_that('Setting up a distribution model',{
   expect_vector(names(x))
 
   # Now add one variable
-  x <- x %>% add_biodiversity_poipo(virtual_points,field_occurrence = 'Observed',name = 'Virtual points')
+  x <- x |> add_biodiversity_poipo(virtual_points,field_occurrence = 'Observed',name = 'Virtual points')
   expect_message(x$biodiversity,NA)
   expect_equal(x$biodiversity$length(),1)
   expect_equal(x$biodiversity$get_equations()[[1]],'')
@@ -47,7 +47,7 @@ test_that('Setting up a distribution model',{
   expect_error(train(x)) # Try to solve without solver
 
   # And a range off
-  invisible( suppressWarnings( suppressMessages(x <- x %>% add_offset_range(virtual_range))) )
+  invisible( suppressWarnings( suppressMessages(x <- x |> add_offset_range(virtual_range))) )
   expect_equal(x$get_offset(),'range_distance')
   expect_s4_class(x$offset,'Raster')
@@ -56,25 +56,25 @@ test_that('Setting up a distribution model',{
   expect_true(is.Waiver( x$get_offset() ) )
 
   # Add Predictors
-  x <- x %>% add_predictors(predictors)
+  x <- x |> add_predictors(predictors)
   expect_s3_class(x$predictors, "PredictorDataset")
   expect_equal(x$predictors$length(),14)
   expect_true(is.vector(x$get_predictor_names()))
 
   # Try removing one
-  x <- x %>% rm_predictors('bio01_mean_50km')
+  x <- x |> rm_predictors('bio01_mean_50km')
   expect_equal(x$predictors$length(),13)
   expect_error( rm_predictors(x,'bio20_mean_50km') )
 
   # Finally select all predictors with CLC3
   n <- grep('CLC',x$get_predictor_names(),value = TRUE)
-  x <- x %>% sel_predictors(n)
+  x <- x |> sel_predictors(n)
   expect_equal(x$predictors$length(),5)
   expect_equal(x$get_predictor_names(), n)
 
   # Add brick object make derivatives
   pb <- raster::brick(predictors)
-  x <- distribution(background) %>% add_predictors(pb$aspect_mean_50km, derivates = 'quadratic')
+  x <- distribution(background) |> add_predictors(pb$aspect_mean_50km, derivates = 'quadratic')
   testthat::expect_equal(x$predictors$length(),2)
-  x <- distribution(background) %>% add_predictors(pb, derivates = 
c('quadratic','hinge')) + x <- distribution(background) |> add_predictors(pb, derivates = c('quadratic','hinge')) testthat::expect_equal(x$predictors$length(),84) # Interactions @@ -83,7 +83,7 @@ test_that('Setting up a distribution model',{ testthat::expect_s3_class(y, "BiodiversityDistribution") rm(y) - x <- x %>% engine_inla() + x <- x |> engine_inla() # Mesh is not created yet expect_s3_class(x$engine$get_data("mesh"),'Waiver') expect_equal(x$engine$name,'') @@ -93,7 +93,7 @@ test_that('Setting up a distribution model',{ expect_null(x$get_limits()) # Add latent effect and see whether the attributes is changed - y <- x %>% add_latent_spatial(method = "spde") + y <- x |> add_latent_spatial(method = "spde") expect_vector( attr(y$get_latent(),'method'),'spde') # ---- # @@ -103,19 +103,19 @@ test_that('Setting up a distribution model',{ x <- distribution(background,limits = zones) expect_s3_class(x$get_limits(), "sf") - y <- x %>% engine_bart() + y <- x |> engine_bart() expect_equal( y$get_engine(), "") - y <- x %>% engine_breg() + y <- x |> engine_breg() expect_equal( y$get_engine(), "") - y <- x %>% engine_gdb() + y <- x |> engine_gdb() expect_equal( y$get_engine(), "") - y <- x %>% engine_inla() + y <- x |> engine_inla() expect_equal( y$get_engine(), "") - y <- x %>% engine_inlabru() + y <- x |> engine_inlabru() expect_equal( y$get_engine(), "") - y <- x %>% engine_stan() + y <- x |> engine_stan() expect_equal( y$get_engine(), "") - y <- x %>% engine_xgboost() + y <- x |> engine_xgboost() expect_equal( y$get_engine(), "") # Normal x should still be none diff --git a/tests/testthat/test_Scenarios.R b/tests/testthat/test_Scenarios.R index 0b0de196..f89c6969 100644 --- a/tests/testthat/test_Scenarios.R +++ b/tests/testthat/test_Scenarios.R @@ -7,9 +7,8 @@ test_that('Scenarios and constraints', { skip_on_travis() skip_on_cran() - suppressWarnings( require('glmnet') ) - suppressWarnings( require('igraph') ) - suppressWarnings( library("ibis.iSDM") ) + suppressWarnings( requireNamespace('glmnet', quietly = TRUE) ) + suppressWarnings( requireNamespace('igraph', quietly = TRUE) ) options("ibis.setupmessages" = FALSE) # Be less chatty options("ibis.seed" = 1234) @@ -32,7 +31,7 @@ test_that('Scenarios and constraints', { for(i in ll) pred_current <- raster::addLayer(pred_current, raster::raster(i,layer = 1) ) # Load the same files future ones suppressWarnings( - pred_future <- stars::read_stars(ll) %>% stars:::slice.stars('Time', seq(1, 86, by = 10)) + pred_future <- stars::read_stars(ll) |> stars:::slice.stars('Time', seq(1, 86, by = 10)) ) sf::st_crs(pred_future) <- sf::st_crs(4326) @@ -43,8 +42,8 @@ test_that('Scenarios and constraints', { # --------------- # # Fit a model and add a threshold to it - fit <- distribution(background) %>% - add_biodiversity_poipa(virtual_points, field_occurrence = 'Observed', name = 'Virtual points') %>% + fit <- distribution(background) |> + add_biodiversity_poipa(virtual_points, field_occurrence = 'Observed', name = 'Virtual points') |> add_predictors(pred_current) |> engine_glmnet(alpha = 0) |> train("test", inference_only = FALSE, verbose = FALSE) |> @@ -104,8 +103,10 @@ test_that('Scenarios and constraints', { # Make a first projection mod <- sc |> add_predictors(pred_future) |> project() - expect_s3_class(summary(mod), "data.frame") - suppressWarnings( expect_s3_class(mod$calc_scenarios_slope(), "stars") ) + suppressWarnings( expect_s3_class(summary(mod), "data.frame") ) + invisible( + suppressWarnings( expect_s3_class(mod$calc_scenarios_slope(), "stars") ) 
+ ) # These will throw errors as we haven't added thresholds expect_error(mod$plot_relative_change()) diff --git a/tests/testthat/test_functions.R b/tests/testthat/test_functions.R index 47a14275..c0e499a4 100644 --- a/tests/testthat/test_functions.R +++ b/tests/testthat/test_functions.R @@ -3,7 +3,10 @@ test_that('Custom functions - Test gridded transformations and ensembles', { skip_on_travis() skip_on_cran() - library(raster) + + suppressWarnings( + requireNamespace("raster", quietly = TRUE) + ) # --- # # Manipulating raster files # diff --git a/tests/testthat/test_modelFits.R b/tests/testthat/test_modelFits.R index 01606a54..a737a36f 100644 --- a/tests/testthat/test_modelFits.R +++ b/tests/testthat/test_modelFits.R @@ -1,77 +1,77 @@ -# Train a full distribution model with INLA -test_that('Add further tests for model fits', { - - skip_if_not_installed('glmnet') - skip_if_not_installed('pdp') - skip_on_travis() - skip_on_cran() - - # Load data - # Background Raster - background <- raster::raster(system.file('extdata/europegrid_50km.tif', package='ibis.iSDM',mustWork = TRUE)) - # Get test species - virtual_points <- sf::st_read(system.file('extdata/input_data.gpkg', package='ibis.iSDM',mustWork = TRUE),'points',quiet = TRUE) - virtual_range <- sf::st_read(system.file('extdata/input_data.gpkg', package='ibis.iSDM',mustWork = TRUE),'range',quiet = TRUE) - # Get list of test predictors - ll <- list.files(system.file('extdata/predictors/',package = 'ibis.iSDM',mustWork = TRUE),full.names = T) - # Load them as rasters - predictors <- raster::stack(ll);names(predictors) <- tools::file_path_sans_ext(basename(ll)) - - # Add pseudo absence - abs <- pseudoabs_settings(nrpoints = 0,min_ratio = 1,method = "mcp") - suppressMessages( - virtual_points <- add_pseudoabsence(virtual_points,template = background, field_occurrence = "Observed", settings = abs) - ) - - # Create testing and training data - ind <- sample(1:nrow(virtual_points), 70) - train_data <- virtual_points[-ind,] - test_data <- virtual_points[ind,] - - # Now set them one up step by step - x <- distribution(background) %>% - add_biodiversity_poipa(train_data, field_occurrence = 'Observed', name = 'Virtual points') %>% - add_predictors(predictors, transform = 'none',derivates = 'none') %>% - engine_glmnet() - - # Train the model - suppressWarnings( - mod <- train(x, "test", inference_only = FALSE, only_linear = TRUE, varsel = "none", verbose = FALSE) - ) - expect_s4_class(mod$get_data(), "RasterLayer") - - # Threshold with independent data - mod <- threshold(mod,method = "perc",format = "bin") - expect_gt(mod$get_thresholdvalue(),0) - expect_length(mod$show_rasters(), 2) - - # Summarize model - expect_s3_class( summary(mod), "data.frame" ) - expect_s3_class( coef(mod), "data.frame" ) - - # Validate - val <- validate(mod, method = "cont") - expect_s3_class(val, "data.frame") - # Validate discrete - val <- validate(mod, method = "disc") - expect_s3_class(val, "data.frame") - - # Validate with withold data - val <- validate(mod, method = "disc", point = test_data,point_column = "Observed") - expect_s3_class(val, "data.frame") - - # ----------- # - # Partial stuff - pp <- partial(mod,x.var = "bio19_mean_50km",plot = FALSE) - expect_s3_class(pp, "data.frame") - - # Spartial - pp <- spartial(mod,x.var = "bio19_mean_50km",plot = FALSE) - expect_s4_class(pp, "RasterLayer") - - - # ----------- # - # Write model outputs - # expect_snapshot_file(write_summary(mod, "test.rds"), "test.rds") - -}) +# Train a full distribution model with INLA 
+test_that('Add further tests for model fits', {
+
+  skip_if_not_installed('glmnet')
+  skip_if_not_installed('pdp')
+  skip_on_travis()
+  skip_on_cran()
+
+  # Load data
+  # Background Raster
+  background <- raster::raster(system.file('extdata/europegrid_50km.tif', package='ibis.iSDM',mustWork = TRUE))
+  # Get test species
+  virtual_points <- sf::st_read(system.file('extdata/input_data.gpkg', package='ibis.iSDM',mustWork = TRUE),'points',quiet = TRUE)
+  virtual_range <- sf::st_read(system.file('extdata/input_data.gpkg', package='ibis.iSDM',mustWork = TRUE),'range',quiet = TRUE)
+  # Get list of test predictors
+  ll <- list.files(system.file('extdata/predictors/',package = 'ibis.iSDM',mustWork = TRUE),full.names = T)
+  # Load them as rasters
+  predictors <- raster::stack(ll);names(predictors) <- tools::file_path_sans_ext(basename(ll))
+
+  # Add pseudo absence
+  abs <- pseudoabs_settings(nrpoints = 0,min_ratio = 1,method = "mcp")
+  suppressMessages(
+    virtual_points <- add_pseudoabsence(virtual_points,template = background, field_occurrence = "Observed", settings = abs)
+  )
+
+  # Create testing and training data
+  ind <- sample(1:nrow(virtual_points), 70)
+  train_data <- virtual_points[-ind,]
+  test_data <- virtual_points[ind,]
+
+  # Now set the model up step by step
+  x <- distribution(background) |>
+    add_biodiversity_poipa(train_data, field_occurrence = 'Observed', name = 'Virtual points') |>
+    add_predictors(predictors, transform = 'none',derivates = 'none') |>
+    engine_glmnet()
+
+  # Train the model
+  suppressWarnings(
+    mod <- train(x, "test", inference_only = FALSE, only_linear = TRUE, filter_predictors = "none", verbose = FALSE)
+  )
+  expect_s4_class(mod$get_data(), "RasterLayer")
+
+  # Threshold with independent data
+  mod <- threshold(mod,method = "perc",format = "bin")
+  expect_gt(mod$get_thresholdvalue(),0)
+  expect_length(mod$show_rasters(), 2)
+
+  # Summarize model
+  expect_s3_class( summary(mod), "data.frame" )
+  expect_s3_class( coef(mod), "data.frame" )
+
+  # Validate
+  val <- validate(mod, method = "cont")
+  expect_s3_class(val, "data.frame")
+  # Validate discrete
+  val <- validate(mod, method = "disc")
+  expect_s3_class(val, "data.frame")
+
+  # Validate with withheld data
+  val <- validate(mod, method = "disc", point = test_data,point_column = "Observed")
+  expect_s3_class(val, "data.frame")
+
+  # ----------- #
+  # Partial stuff
+  pp <- partial(mod,x.var = "bio19_mean_50km",plot = FALSE)
+  expect_s3_class(pp, "data.frame")
+
+  # Spartial
+  pp <- spartial(mod,x.var = "bio19_mean_50km",plot = FALSE)
+  expect_s4_class(pp, "RasterLayer")
+
+
+  # ----------- #
+  # Write model outputs
+  # expect_snapshot_file(write_summary(mod, "test.rds"), "test.rds")
+
+})
diff --git a/tests/testthat/test_objectinheritance.R b/tests/testthat/test_objectinheritance.R
index 325137a4..a5d642a5 100644
--- a/tests/testthat/test_objectinheritance.R
+++ b/tests/testthat/test_objectinheritance.R
@@ -10,8 +10,8 @@ test_that('Check that distribution objects are properly inherited', {
           message = "No cmdstan path")
 
   # Load packages
-  require(raster)
-  require(sf)
+  suppressWarnings( requireNamespace("raster", quietly = TRUE) )
+  suppressWarnings( requireNamespace("sf", quietly = TRUE) )
 
   options("ibis.setupmessages" = FALSE)
 
@@ -30,18 +30,18 @@ test_that('Check that distribution objects are properly inherited', {
 
   # Biodiversity
   expect_equal(x$biodiversity$length(),0)
-  x %>% add_biodiversity_poipo(virtual_points, field_occurrence = 'Observed', name = 'Virtual points')
+  x |> add_biodiversity_poipo(virtual_points, 
field_occurrence = 'Observed', name = 'Virtual points') expect_equal(x$biodiversity$length(),0) # Multiple - x %>% add_biodiversity_poipo(virtual_points, field_occurrence = 'Observed', name = 'Virtual points') %>% + x |> add_biodiversity_poipo(virtual_points, field_occurrence = 'Observed', name = 'Virtual points') |> add_biodiversity_polpo(virtual_range, field_occurrence = 'Observed', name = 'Virtual points') expect_equal(x$biodiversity$length(),0) - x %>% add_biodiversity_poipo(virtual_points, field_occurrence = 'Observed', name = 'Virtual points') %>% + x |> add_biodiversity_poipo(virtual_points, field_occurrence = 'Observed', name = 'Virtual points') |> add_biodiversity_polpo(virtual_range, field_occurrence = 'Observed', name = 'Virtual points') expect_equal(x$biodiversity$length(),0) # Offsets - suppressWarnings( x %>% add_offset_range(virtual_range) ) + suppressWarnings( x |> add_offset_range(virtual_range) ) expect_s3_class(x$offset, "Waiver") # -- # @@ -69,29 +69,29 @@ test_that('Check that distribution objects are properly inherited', { # -- # # Latent effect check - x %>% add_latent_spatial(method = "spde",priors = NULL) + x |> add_latent_spatial(method = "spde",priors = NULL) expect_true(is.Waiver(x$latentfactors)) # Engine - x %>% engine_gdb(boosting_iterations = 500) + x |> engine_gdb(boosting_iterations = 500) expect_true(is.Waiver(x$engine)) - x %>% engine_stan() + x |> engine_stan() expect_true(is.Waiver(x$engine)) # Priors - x %>% add_predictors(predictors, transform = 'none',derivates = 'none',priors = priors(INLAPrior(names(predictors)[1],'normal'))) + x |> add_predictors(predictors, transform = 'none',derivates = 'none',priors = priors(INLAPrior(names(predictors)[1],'normal'))) expect_true(is.Waiver(x$priors)) - x %>% add_latent_spatial(method = "spde", priors = priors(INLAPrior('spde','prior.range'))) + x |> add_latent_spatial(method = "spde", priors = priors(INLAPrior('spde','prior.range'))) expect_true(is.Waiver(x$priors)) # Two different priors - x %>% - add_predictors(predictors, transform = 'none',derivates = 'none',priors = priors(INLAPrior(names(predictors)[1],'normal'))) %>% + x |> + add_predictors(predictors, transform = 'none',derivates = 'none',priors = priors(INLAPrior(names(predictors)[1],'normal'))) |> add_latent_spatial(method = "spde", priors = priors(INLAPrior('spde','prior.range'))) expect_true(is.Waiver(x$priors)) # Check variable removal - xx <- x %>% add_predictors(predictors) - xx %>% rm_predictors("hmi_mean_50km") + xx <- x |> add_predictors(predictors) + xx |> rm_predictors("hmi_mean_50km") expect_length(xx$get_predictor_names(), 14) # --- # diff --git a/tests/testthat/test_priors.R b/tests/testthat/test_priors.R index a05e406f..e150005b 100644 --- a/tests/testthat/test_priors.R +++ b/tests/testthat/test_priors.R @@ -171,7 +171,8 @@ test_that('Add and modify priors to existing object', { skip_if_not_installed('INLA') skip_on_cran() skip_on_ci() - library(raster) + + suppressWarnings( requireNamespace("raster", quietly = TRUE) ) options("ibis.setupmessages" = FALSE) background <- raster::raster(system.file('extdata/europegrid_50km.tif', package='ibis.iSDM',mustWork = TRUE)) @@ -187,10 +188,10 @@ test_that('Add and modify priors to existing object', { # Define a model invisible( - x <- distribution(background) %>% - add_biodiversity_poipo(virtual_points, field_occurrence = 'Observed', name = 'Virtual points') %>% + x <- distribution(background) |> + add_biodiversity_poipo(virtual_points, field_occurrence = 'Observed', name = 'Virtual points') |> 
add_predictors(predictors[[c('slope_mean_50km','bio01_mean_50km','CLC3_132_mean_50km')]], - transform = 'none',derivates = 'none') %>% + transform = 'none',derivates = 'none') |> engine_inla( max.edge = c(.5, 3), offset = c(0.5, 1), @@ -202,15 +203,15 @@ test_that('Add and modify priors to existing object', { expect_s3_class(x$get_priors(),'Waiver') # Add priors to it - x <- x %>% add_priors(priors = pp) + x <- x |> add_priors(priors = pp) expect_s3_class( x$get_priors(), 'PriorList') expect_vector(x$get_prior_variables(), "CLC3_132_mean_50km" ) # Remove priors from it - invisible( x %>% rm_priors() ) + invisible( x |> rm_priors() ) expect_s3_class( x$get_priors(), 'PriorList') - x <- x %>% rm_priors() + x <- x |> rm_priors() expect_s3_class(x$priors,'Waiver') expect_s3_class(x$get_priors(),'Waiver') @@ -225,14 +226,14 @@ test_that('Add and modify priors to existing object', { expect_vector(pp$get('CLC3_132_mean_50km'), c(0, Inf)) # With BREG engine - suppressWarnings( suppressMessages(x <- x %>% engine_breg()) ) + suppressWarnings( suppressMessages(x <- x |> engine_breg()) ) p1 <- BREGPrior("test", hyper = 0.5, ip = 1) p2 <- BREGPrior("test", hyper = 5, ip = .2) expect_equal(priors(p1,p2)$length(),1) - x <- x %>% add_priors(priors(p1,p2)) + x <- x |> add_priors(priors(p1,p2)) expect_equal(x$get_prior_variables() |> as.character(), "test") - x <- x %>% add_priors(priors( + x <- x |> add_priors(priors( BREGPrior("test2", hyper = 5, ip = .2) )) expect_true("test2" %in% x$get_prior_variables() ) diff --git a/tests/testthat/test_rangesOffsets.R b/tests/testthat/test_rangesOffsets.R index b944dba4..980b21d3 100644 --- a/tests/testthat/test_rangesOffsets.R +++ b/tests/testthat/test_rangesOffsets.R @@ -5,7 +5,7 @@ test_that('Load ranges and add them to distribution object', { skip_if_not_installed('INLA') skip_if_not_installed('igraph') - requireNamespace("igraph") + suppressWarnings( requireNamespace("igraph", quietly = TRUE) ) options("ibis.setupmessages" = FALSE) @@ -26,7 +26,7 @@ test_that('Load ranges and add them to distribution object', { # This will raise a warning since projection is different suppressMessages( expect_warning( - x %>% add_predictor_range(virtual_range, method = 'distance') ) + x |> add_predictor_range(virtual_range, method = 'distance') ) ) # Try and add a range as raster @@ -35,12 +35,12 @@ test_that('Load ranges and add them to distribution object', { expect_s4_class(virtual_range_ras,'Raster') # Add the rasterized range - y <- x %>% add_predictor_range(virtual_range_ras) + y <- x |> add_predictor_range(virtual_range_ras) expect_vector(y$get_predictor_names(),'precomputed_range') # Artificially aggregate the range virtual_range_ras <- raster::aggregate(virtual_range_ras, 5) - expect_s3_class( x %>% add_predictor_range(virtual_range_ras),class = "BiodiversityDistribution" ) + expect_s3_class( x |> add_predictor_range(virtual_range_ras),class = "BiodiversityDistribution" ) # Add bias variable y <- x |> add_control_bias(layer = predictors$hmi_mean_50km) diff --git a/tests/testthat/test_trainINLA.R b/tests/testthat/test_trainINLA.R index 033ec92f..3bd08b61 100644 --- a/tests/testthat/test_trainINLA.R +++ b/tests/testthat/test_trainINLA.R @@ -6,12 +6,15 @@ test_that('Check that INLA works', { skip_if_not_installed('INLA') suppressWarnings( - suppressPackageStartupMessages( library(INLA) ) + suppressPackageStartupMessages( requireNamespace("INLA") ) + ) + suppressWarnings( + suppressPackageStartupMessages( attachNamespace("INLA") ) ) options("ibis.setupmessages" = FALSE) # 
Use test data that comes with INLA
-  data(Epil)
+  data("Epil", package = "INLA")
   observed <- Epil[1:30, 'y']
   Epil <- rbind(Epil, Epil[1:30, ])
@@ -19,7 +22,7 @@ test_that('Check that INLA works', {
 
   # Set up formula and train
   formula = y ~ Trt + Age + V4 + f(Ind, model="iid") + f(rand,model="iid")
-  result = inla(formula, family="poisson", data = Epil, control.predictor = list(compute = TRUE, link = 1))
+  result = INLA::inla(formula, family="poisson", data = Epil, control.predictor = list(compute = TRUE, link = 1))
 
   expect_type(result,'list')
   expect_null(result$waic)
@@ -47,9 +50,9 @@ test_that('Train a distribution model with INLA', {
   predictors <- raster::stack(ll);names(predictors) <- tools::file_path_sans_ext(basename(ll))
 
   # Now set them one up step by step
-  x <- distribution(background) %>%
-    add_biodiversity_poipo(virtual_points, field_occurrence = 'Observed', name = 'Virtual points') %>%
-    add_predictors(predictors[[c('slope_mean_50km','bio01_mean_50km','CLC3_132_mean_50km')]], transform = 'none',derivates = 'none') %>%
+  x <- distribution(background) |>
+    add_biodiversity_poipo(virtual_points, field_occurrence = 'Observed', name = 'Virtual points') |>
+    add_predictors(predictors[[c('slope_mean_50km','bio01_mean_50km','CLC3_132_mean_50km')]], transform = 'none',derivates = 'none') |>
     engine_inla(
       max.edge = c(.5, 3),
       offset = c(0.5, 1),
diff --git a/vignettes/articles/01_train_simple_model.Rmd b/vignettes/articles/01_train_simple_model.Rmd
index c456741b..dd7bc49a 100644
--- a/vignettes/articles/01_train_simple_model.Rmd
+++ b/vignettes/articles/01_train_simple_model.Rmd
@@ -124,16 +124,16 @@ print(mod)
 
 The `print` call at the end now shows some summary statistics contained in this object, such as the extent of the modelling background and the projection used, the number of biodiversity datasets added and statistics on the predictors, eventual priors and which engine is being used.
 
-Of course all of these steps can also be done in "pipe" using the ` %>% ` or `|>` syntax.
+Of course, all of these steps can also be done in a "pipe" using the `|>` syntax.
 
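+The `|>` operator is the pipe built into base R itself (available from R 4.1.0 onwards) and, unlike magrittr's ` %>% `, requires no additional package. For simple calls such as the ones used here the two pipes behave the same; as a minimal sketch (not evaluated):
+
+```{r, eval=FALSE}
+# The native pipe passes the left-hand side on as the first argument, ...
+distribution(background) |>
+  add_biodiversity_poipo(virtual_species, field_occurrence = "Observed")
+# ... which is equivalent to the nested call:
+add_biodiversity_poipo(distribution(background), virtual_species, field_occurrence = "Observed")
+```
+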
```{r, eval=TRUE, warning=FALSE} print("Create model") -mod <- distribution(background) %>% +mod <- distribution(background) |> add_biodiversity_poipo(virtual_species, name = "Virtual test species", - field_occurrence = "Observed") %>% - add_predictors(env = predictors, transform = "scale", derivates = "none") %>% + field_occurrence = "Observed") |> + add_predictors(env = predictors, transform = "scale", derivates = "none") |> engine_inlabru() ``` @@ -263,7 +263,7 @@ predictors <- subset(predictors, c("bio01_mean_50km","bio03_mean_50km","bio19_me predictors$koeppen_50km <- raster::ratify(predictors$koeppen_50km) # Create a distribution modelling pipeline -x <- distribution(background) %>% +x <- distribution(background) |> add_biodiversity_poipo(virtual_species, field_occurrence = 'Observed', name = 'Virtual points') |> add_predictors(predictors, transform = 'scale', derivates = "none") |> engine_xgboost(iter = 8000) @@ -272,7 +272,7 @@ x <- distribution(background) %>% mod_null <- train(x, runname = 'Normal PPM projection', only_linear = TRUE, verbose = FALSE) # And with an added constrain # Calculated as nearest neighbour distance (NND) between all input points -mod_dist <- train(x %>% add_latent_spatial(method = "nnd"), +mod_dist <- train(x |> add_latent_spatial(method = "nnd"), runname = 'PPM with NND constrain', only_linear = TRUE, verbose = FALSE) # Compare both @@ -289,11 +289,11 @@ This approach can be particular suitable for current and future projections at l ```{r Prediction limits, fig.width = w, fig.height = h} # Create again a distribution object, but this time with limits (use the Köppen-geiger layer from above) # The zones layer must be a factor layer (e.g. is.factor(layer) ) -x <- distribution(background, limits = predictors$koeppen_50km) %>% +x <- distribution(background, limits = predictors$koeppen_50km) |> add_biodiversity_poipo(virtual_species, field_occurrence = 'Observed', name = 'Virtual points') |> add_predictors(predictors, transform = 'scale', derivates = "none") |> # Since we are adding the koeppen layer as zonal layer, we disgard it from the predictors - rm_predictors("koeppen_50km") %>% + rm_predictors("koeppen_50km") |> engine_xgboost(iter = 3000, learning_rate = 0.01) # Spatially limited prediction diff --git a/vignettes/articles/02_integrate_data.Rmd b/vignettes/articles/02_integrate_data.Rmd index 350cdbfd..11d1a46d 100644 --- a/vignettes/articles/02_integrate_data.Rmd +++ b/vignettes/articles/02_integrate_data.Rmd @@ -93,8 +93,8 @@ We can define a generic model to use for any of the sections below. 
```{r Create basemodel, eval = TRUE} # First define a generic model and engine using the available predictors -basemodel <- distribution(background) %>% - add_predictors(env = predictors, transform = "scale", derivates = "none") %>% +basemodel <- distribution(background) |> + add_predictors(env = predictors, transform = "scale", derivates = "none") |> engine_inlabru() ``` @@ -105,7 +105,7 @@ The most simple way of integrating prior observations into species distribution ```{r Add range as predictor, eval = TRUE} # Here we simply add the range as simple binary predictor -mod1 <- basemodel %>% +mod1 <- basemodel |> add_predictor_range(virtual_range, method = "distance") # We can see that the range has been added to the predictors object @@ -119,11 +119,11 @@ For instance suppose a species is known to occur between 300 and 800m above sea ```{r add elevation as predictor, eval=TRUE} # Specification -basemodel <- distribution(background) %>% - add_predictors(env = predictors, transform = "scale", derivates = "none") %>% +basemodel <- distribution(background) |> + add_predictors(env = predictors, transform = "scale", derivates = "none") |> engine_inlabru() -mod1 <- basemodel %>% +mod1 <- basemodel |> add_predictor_elevationpref(layer = predictors$elevation_mean_50km, lower = 300, upper = 800) @@ -142,11 +142,11 @@ Offsets can be specified as addition or as nuisance for a model, for instance ei ```{r Train model with range offset} # Specification -mod1 <- distribution(background) %>% - add_predictors(env = predictors, transform = "scale", derivates = "none") %>% - add_biodiversity_poipo(virtual_species,field_occurrence = "Observed") %>% - add_offset_range(virtual_range, distance_max = 5e5) %>% - engine_glmnet() %>% +mod1 <- distribution(background) |> + add_predictors(env = predictors, transform = "scale", derivates = "none") |> + add_biodiversity_poipo(virtual_species,field_occurrence = "Observed") |> + add_offset_range(virtual_range, distance_max = 5e5) |> + engine_glmnet() |> # Train train(runname = "Prediction with range offset",only_linear = TRUE) @@ -178,9 +178,9 @@ Prior specifications are specific to each engine and more information can be fou ```{r Specification of priors, eval = TRUE, fig.width = w, fig.height = h} # Set a clean base model with biodiversity data -x <- distribution(background) %>% - add_predictors(env = predictors, transform = "scale", derivates = "none") %>% - add_biodiversity_poipo(virtual_species, field_occurrence = "Observed") %>% +x <- distribution(background) |> + add_predictors(env = predictors, transform = "scale", derivates = "none") |> + add_biodiversity_poipo(virtual_species, field_occurrence = "Observed") |> engine_inlabru() # Make a first model @@ -198,7 +198,7 @@ pp <- priors(p) pp$varnames() # Priors can then be added via -mod2 <- train(x %>% add_priors(pp), only_linear = TRUE) +mod2 <- train(x |> add_priors(pp), only_linear = TRUE) # Or alternatively directly as parameter via add_predictors, # e.g. 
add_predictors(env = predictors, priors = pp)
 
@@ -219,19 +219,19 @@ Another very straight forward way for model-based integration is to simply fit t
 
 ```{r Model ensembles}
 # Create and fit two models
-mod1 <- distribution(background) %>%
-  add_predictors(env = predictors, transform = "scale", derivates = "none") %>%
-  engine_glmnet() %>%
+mod1 <- distribution(background) |>
+  add_predictors(env = predictors, transform = "scale", derivates = "none") |>
+  engine_glmnet() |>
   # Add dataset 1
-  add_biodiversity_poipo(poipo = virtual_species, name = "Dataset1",field_occurrence = "Observed") %>%
+  add_biodiversity_poipo(poipo = virtual_species, name = "Dataset1",field_occurrence = "Observed") |>
   train(runname = "Test1", only_linear = TRUE)
 
-mod2 <- distribution(background) %>%
-  add_predictors(env = predictors, transform = "scale", derivates = "none") %>%
-  engine_xgboost(iter = 5000) %>%
+mod2 <- distribution(background) |>
+  add_predictors(env = predictors, transform = "scale", derivates = "none") |>
+  engine_xgboost(iter = 5000) |>
   # Add dataset 2, Here we simple simulate presence-only points from a range
   add_biodiversity_polpo(virtual_range, name = "Dataset2",field_occurrence = "Observed",
-                         simulate = TRUE,simulate_points = 300) %>%
+                         simulate = TRUE,simulate_points = 300) |>
   train(runname = "Test1", only_linear = FALSE)
 
 # Show outputs of each model individually and combined
@@ -268,19 +268,21 @@ By default all engines that do not support any joint estimation (see below) will
 
 * "interaction": In the case of two datasets of the same type it also is possible to make use of factor interactions. In this case the prediction is made based on the first reference level (e.g. the first added dataset) with the others being "partialed" out during prediction. This method only works if one fits a model with multiple datasets on the same response (e.g. Bernoulli distributed). Can be used in projections (`scenario()`).
 
+* "weights": This type of integration works only for two biodiversity datasets of the same type. Here the datasets are combined into one, but the observations are weighted via a `weights` parameter in each add_biodiversity call. This can, for example, be used to give one dataset an arbitrary (or expert-defined) higher weight than another; see the sketch below.
+
 All of these can be specified as parameter in `train()`. Note that for any of these methods (like "predictor" & "offset"), models are trained in the sequence at which datasets are added!
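+
+As a rough sketch of the "weights" option (the objects `dataset1` and `dataset2` below are placeholders for any two biodiversity datasets of the same type):
+
+```{r, eval=FALSE}
+# Combine two presence-absence datasets into one model, giving the first
+# (e.g. more trusted) dataset twice the weight of the second during estimation.
+mod_w <- distribution(background) |>
+  add_predictors(env = predictors, transform = "scale", derivates = "none") |>
+  add_biodiversity_poipa(dataset1, field_occurrence = "Observed", weights = 2) |>
+  add_biodiversity_poipa(dataset2, field_occurrence = "Observed", weights = 1) |>
+  engine_glmnet() |>
+  train(runname = "Weighted integration", method_integration = "weights")
+```
+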
```{r Combined integration} # Specification -mod1 <- distribution(background) %>% - add_predictors(env = predictors, transform = "scale", derivates = "none") %>% +mod1 <- distribution(background) |> + add_predictors(env = predictors, transform = "scale", derivates = "none") |> # A presence only dataset - add_biodiversity_poipo(virtual_species,field_occurrence = "Observed") %>% + add_biodiversity_poipo(virtual_species,field_occurrence = "Observed") |> # A Presence absence dataset - add_biodiversity_poipa(virtual_pseudoabs,field_occurrence = "Observed") %>% - engine_xgboost() %>% + add_biodiversity_poipa(virtual_pseudoabs,field_occurrence = "Observed") |> + engine_xgboost() |> # Train train(runname = "Combined prediction",only_linear = TRUE, method_integration = "predictor") @@ -324,15 +326,15 @@ See the [Engine comparison](04_engine_comparison.html) for an overview on which ```{r, warning=FALSE, message=FALSE} # Define a model -mod1 <- distribution(background) %>% - add_predictors(env = predictors, transform = "scale", derivates = "none") %>% +mod1 <- distribution(background) |> + add_predictors(env = predictors, transform = "scale", derivates = "none") |> # A presence only dataset - add_biodiversity_poipo(virtual_species,field_occurrence = "Observed") %>% + add_biodiversity_poipo(virtual_species,field_occurrence = "Observed") |> # A Presence absence dataset - add_biodiversity_poipa(virtual_pseudoabs,field_occurrence = "Observed") %>% + add_biodiversity_poipa(virtual_pseudoabs,field_occurrence = "Observed") |> # Use inlabru for estimation and default parameters. # INLA requires the specification of a mesh which in this example is generated from the data. - engine_inlabru() %>% + engine_inlabru() |> # Train train(runname = "Combined prediction", only_linear = TRUE, method_integration = "predictor") diff --git a/vignettes/articles/03_biodiversity_projections.Rmd b/vignettes/articles/03_biodiversity_projections.Rmd index 1c4c0775..db556074 100644 --- a/vignettes/articles/03_biodiversity_projections.Rmd +++ b/vignettes/articles/03_biodiversity_projections.Rmd @@ -76,7 +76,7 @@ projection(pred_current) <- projection(background) # Same projection as backgrou # Get future predictors # These we will load in using the stars package and also ignoring the first time step -pred_future <- stars::read_stars(ll) %>% stars:::slice.stars('Time', 2:86) +pred_future <- stars::read_stars(ll) |> stars:::slice.stars('Time', 2:86) st_crs(pred_future) <- st_crs(4326) # Set projection # Rename future predictors to those of current names(pred_future) <- names(pred_current) @@ -93,10 +93,10 @@ For guidance on how distribution models are trained, see other vignettes ([1](01 ```{r Train a basic model, eval = TRUE} # Train model adding the data loaded above -x <- distribution(background) %>% - add_biodiversity_poipo(virtual_points, field_occurrence = 'Observed', name = 'Virtual points') %>% +x <- distribution(background) |> + add_biodiversity_poipo(virtual_points, field_occurrence = 'Observed', name = 'Virtual points') |> # Note that we scale the predictors here - add_predictors(pred_current, transform = 'scale',derivates = 'none') %>% + add_predictors(pred_current, transform = 'scale',derivates = 'none') |> engine_glmnet(alpha = 0) # Train the model @@ -107,9 +107,9 @@ modf <- threshold(modf, method = 'percentile', value = 0.05) # -- # # Now lets create a scenarios object via scenarios -sc <- scenario(modf) %>% +sc <- scenario(modf) |> # Apply the same variable transformations as above. 
- add_predictors(pred_future, transform = 'scale') %>% + add_predictors(pred_future, transform = 'scale') |> # Calculate thresholds at each time step. The threshold estimate is taken from the model object. threshold() @@ -187,15 +187,15 @@ Lastly there are also options to *stabilize* suitability projections via the `pr ```{r Add constraints and reproject, eval = TRUE, fig.width = w, fig.height = h} # Adding a simple negative exponential kernel to constrain the predictions -sc.fit2 <- sc %>% - add_constraint(method = "sdd_nex", value = 1e5) %>% +sc.fit2 <- sc |> + add_constraint(method = "sdd_nex", value = 1e5) |> # Directly fit the object project(stabilize = F) # Also fit one projection a nichelimit has been added -sc.fit3 <- sc %>% - add_constraint(method = "sdd_nex", value = 1e5) %>% - add_constraint_adaptability(method = "nichelimit") %>% +sc.fit3 <- sc |> + add_constraint(method = "sdd_nex", value = 1e5) |> + add_constraint_adaptability(method = "nichelimit") |> # Directly fit the object project(stabilize = F) @@ -215,7 +215,7 @@ sc.fit3$plot(which = 40) # With dispersal limit and nichelimitation (within a st o1 <- sc.fit1$summary() o2 <- sc.fit2$summary() o3 <- sc.fit3$summary() -arlim <- c(min(o1$area_km2, o2$area_km2, o3$area_km2), +arlim <- c(min(o1$area_km2, o2$area_km2, o3$area_km2)-5000, max(o1$area_km2, o2$area_km2, o3$area_km2)) plot(area_km2~band, data = o1, type = 'n', diff --git a/vignettes/articles/06_frequently-asked-questions.Rmd b/vignettes/articles/06_frequently-asked-questions.Rmd index 9fed900f..71d61159 100644 --- a/vignettes/articles/06_frequently-asked-questions.Rmd +++ b/vignettes/articles/06_frequently-asked-questions.Rmd @@ -80,9 +80,9 @@ in. ```{r, echo=TRUE,eval=FALSE} # Where zone is a provided raster -mod <- distribution(background, limits = zone) %>% - add_biodiversity_poipo(species_data) %>% - engine_gdb() %>% +mod <- distribution(background, limits = zone) |> + add_biodiversity_poipo(species_data) |> + engine_gdb() |> train() plot(mod) ``` @@ -115,9 +115,9 @@ p <- list( pp <- priors(pp) # And can now added to the model -mod <- distribution(background, limits = zone) %>% - add_biodiversity_poipo(species_data) %>% - add_predictors(covariates) %>% +mod <- distribution(background, limits = zone) |> + add_biodiversity_poipo(species_data) |> + add_predictors(covariates) |> add_priors(priors = pp) engine_inlabru() @@ -147,7 +147,7 @@ Alternatively one could think of specifying specific pseudo-absence sampling inf ss <- pseudoabs_settings(background = NULL, bias = bias_layer) # Assuming background and point data exists -x <- distribution(background) %>% +x <- distribution(background) |> add_biodiversity_poipo(points, pseudoabsence_settings = ss) ``` @@ -236,10 +236,37 @@ Users are advised to use [`engine_inlabru`] by **default** in most cases and onl When creating predictive models such as SDMs it is often a concern to not predict to a variable range outside the environmental conditions for which a model was trained. The `ibis.iSDM` package supports variable 'clamping' of its predictions similar as the popular Maxent model, however for each [`engine`]. Clamping can be enabled by setting the parameter `clamp` in [`train`] to `TRUE`. This restricts any spatial (or spatial-temporal) projections to the combined `range` of predictor variables observed for each of the training localities. -Similar functionalities are also available separately during scenario projections by setting adaptability constraints (see [`add_constraint_adaptability`]). 
+Similar functionalities are also available separately during scenario projections by setting adaptability constraints (see [`add_constraint_adaptability`] or [`add_constraint_boundary`]). +
+I am using (too) many predictors in my model. What options do I have?
+
+
+Having too many predictors in an SDM can cause substantial over-parametrization and subsequently
+overfitting (i.e. the model reproduces the data it was trained with rather than generalizing to unknown areas).
+
+It is recommended to (a) use an engine with very strong regularization, such as for example [`engine_glmnet`] or [`engine_gdb`],
+(b) train a model with caution and a minimum number of observations (as an arbitrary rule of thumb, at least 10 observations for each additional predictor included), or (c) make use of pre-estimation removal of predictors, for example through variable importance criteria or collinearity. See the code below for an example.
+
+```{r, echo=TRUE,eval=FALSE}
+# Prior to model fitting, remove highly collinear predictors through a Pearson correlation assessment
+mod <- distribution(background) |>
+  add_biodiversity_poipo(species_data) |>
+  engine_glmnet() |>
+  train(filter_predictors = "pearson")
+
+# Alternatively use a RandomForest estimator to remove the least important variables
+mod <- distribution(background) |>
+  add_biodiversity_poipo(species_data) |>
+  engine_glmnet() |>
+  train(filter_predictors = "RF")
+```
+
+
Where can I find predictions created by `train` ? @@ -250,9 +277,9 @@ In addition, for Bayesian Engines other bands quantifying the posterior predicti **Example:** ```{r, echo=TRUE,eval=FALSE} -mod <- distribution(background) %>% - add_biodiversity_poipo(species_data) %>% - engine_inlabru() %>% +mod <- distribution(background) |> + add_biodiversity_poipo(species_data) |> + engine_inlabru() |> train() # To plot plot(mod, "mean") diff --git a/vignettes/articles/contributing.Rmd b/vignettes/articles/contributing.Rmd index 0b352435..8e684ba9 100644 --- a/vignettes/articles/contributing.Rmd +++ b/vignettes/articles/contributing.Rmd @@ -25,9 +25,10 @@ For the latter, please get in touch with the package author or one of the mainta - Don’t repeat yourself. Create new functions and if necessary classes. Equally try to reuse common names from R, e.g. *plot*, *summary* -- Avoid using additional package dependencies where possible +- Please run code *checks* and *tests* regularly. +- Avoid using additional package dependencies where possible. - Comment your code!! -- Use assertions to verify inputs to functions +- Use assertions to verify inputs to functions. - If bored, please write unit tests and ensure they all evaluate (CRTL+SHIFT+T)!
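+
+To illustrate the last point, a minimal `testthat` unit test might look like the sketch below (the test name is illustrative; the data and expectation follow the existing tests under `tests/testthat/`):
+
+```{r, eval=FALSE}
+library(testthat)
+
+test_that("distribution() sets up a BiodiversityDistribution object", {
+  # Small background raster shipped with the package
+  background <- raster::raster(system.file("extdata/europegrid_50km.tif",
+                                           package = "ibis.iSDM", mustWork = TRUE))
+  x <- distribution(background)
+  expect_s3_class(x, "BiodiversityDistribution")
+})
+```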