diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5d2857f --- /dev/null +++ b/.gitignore @@ -0,0 +1,55 @@ +# History files +.Rhistory +.Rapp.history + +# Session Data files +.RData +.RDataTmp + +# User-specific files +.Ruserdata + +# Example code in package build process +*-Ex.R + +# Output files from R CMD build +/*.tar.gz + +# Output files from R CMD check +/*.Rcheck/ + +# RStudio files +.Rproj.user/ + +# produced vignettes +vignettes/*.html +vignettes/*.pdf + +# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 +.httr-oauth + +# knitr and R markdown default cache directories +*_cache/ +/cache/ + +# Temporary files created by R markdown +*.utf8.md +*.knit.md + +# R Environment Variables +.Renviron + +# pkgdown site +docs/ + +# translation temp files +po/*~ + +# RStudio Connect folder +rsconnect/ + +#data +data/ + +# Environmental files +.nc \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..a18ab27 --- /dev/null +++ b/README.md @@ -0,0 +1,10 @@ +# Habitat suitability model + +## About the model/project +Write a summary of the project and what the model does. + +## How to use +Technical manual how to install & run the project. + +## License +Mention developers, reusability, contact information. \ No newline at end of file diff --git a/demo/app/getData.R b/demo/app/getData.R new file mode 100644 index 0000000..e69de29 diff --git a/demo/app/shinyApp.Rmd b/demo/app/shinyApp.Rmd new file mode 100644 index 0000000..0ead0a6 --- /dev/null +++ b/demo/app/shinyApp.Rmd @@ -0,0 +1,90 @@ +--- +title: "Tidymodels ensemble approach" +author: "Jo-Hannes Nowé" +output: +html_document: + toc=TRUE +--- + +# Loading the required packages + +```{r} +library(shiny) +library(leaflet) +library(raster) + +# Define UI +ui <- fluidPage( + tags$head( + tags$style(HTML(" + body { + background-color: #333333; /* dark grey */ + color: white; /* text color */ + } + .text-box { + background-color: black; + color: white; + padding: 10px; + border-radius: 5px; + white-space: pre-line; /* Preserve line breaks */ + } + ")) + ), + titlePanel("Species Habitat Suitability Maps"), + fluidRow( + column( + width = 5, + mainPanel( + h3("Introduction"), + p("This application displays habitat suitability maps for selected species from 2020 to 2100."), + p("A red to orange to yellow to green to blue to dark blue colour scale is used to indicate habitat suitability:"), + tags$ul( + tags$li("red = low suitability"), + tags$li("dark blue = high suitability"), + tags$li("all suitability is between 0 and 1") + ), + p("A species, variable, and year can be selected using the options below."), + selectInput("species", "Select Species:", choices = c("Harbour porpoise")), + sliderInput("month", "Select Month:", min = 1, max = 12, value = 1, step = 1) + ) + ), + column( + width = 7, + leafletOutput("map", width = "100%", height = "800px"), + verbatimTextOutput("additional_text") + ) + ) +) + +# Define server logic +server <- function(input, output) { + output$map <- renderLeaflet({ + # Load TIFF file based on selected species, variable, and year + species <- input$species + month <- input$month + file_path <- paste0("PredictionRasters/Month", month, ".tif") + suitability_raster <- raster(file_path) + # Calculate the extent of the study area + raster_ext <- extent(suitability_raster) + leaflet() %>% + setView(lng = mean(c(raster_ext@xmin, raster_ext@xmax)), + lat = mean(c(raster_ext@ymin, raster_ext@ymax)), zoom = 5.1) %>% + 
addProviderTiles("Esri.WorldImagery") %>% + addRasterImage(suitability_raster, opacity = 0.8) %>% + addLegend("bottomright", title = "Habitat Suitability", + colors = c("red", "orange", "yellow", "green", "blue", "darkblue"), + labels = c("Low", "", "", "", "", "High"), + opacity = 0.8) + }) + output$additional_text <- renderText({ + "We employed a mechanistic niche modelling approach that mathematically describes each species' specific ecological niche based on their responses to temperature and salinity. This approach, utilizing fuzzy logic principles, provided a more nuanced understanding than traditional methods. Climate prediction data from Bio-ORACLE (www.bio-oracle.org) was incorporated, focusing on sea surface temperature and salinity. Baseline data from 2010 established the foundation for the current scenario, while future projections spanned from 2020 to 2090, covering each decade under six Shared Socioeconomic Pathways (SSPs) – including the most extreme scenario, SSP585. + + Team + Our team comprises scientists and data managers from Flanders Marine Institute and Gent University: Rutendo Musimwa, Ward Standaert, Martha Stevens, Salvador Jesus Fernandez Bejarano, Carlota Muñiz, Elisabeth Debusschere, Steven Pint and Gert Everaert" + }) +} + +# Run the application +shinyApp(ui = ui, server = server) +``` + diff --git a/demo/config.R b/demo/config.R new file mode 100644 index 0000000..68e9971 --- /dev/null +++ b/demo/config.R @@ -0,0 +1,21 @@ +# script that reads yaml configuration and builds on the information provided. +# source this script and use settings to get config parameters. + +library(yaml) + +get_config = function(config){ + #' function that gets the configuration settings from a yaml file + #' default config.yml is the one located in main dir. + #' + if (missing(config)){ + config = "config.yml" + } + ## --------------------------------------------------------------------------- + # READ CONFIG FILE + settings= yaml.load_file("config.yml") + return(settings) +} + +settings = get_config() + + \ No newline at end of file diff --git a/demo/config.yml b/demo/config.yml new file mode 100644 index 0000000..b6d9835 --- /dev/null +++ b/demo/config.yml @@ -0,0 +1,5 @@ +edito: + url: something +data: + start_date: 2000 + end_date: 2020 diff --git a/demo/install_dependencies.R b/demo/install_dependencies.R new file mode 100644 index 0000000..a176b0f --- /dev/null +++ b/demo/install_dependencies.R @@ -0,0 +1,25 @@ +# use this script to install a number of dependencies. 
+ +packages = x <- scan("requirements.txt", what="", sep="\n") + +packagecheck <- match( packages, utils::installed.packages()[,1] ) +packagestoinstall <- packages[ is.na( packagecheck ) ] + +if( length( packagestoinstall ) > 0L ) { + utils::install.packages( packagestoinstall, + repos = "http://cran.csiro.au" + ) +} else { + print( "All requested packages already installed" ) +} + +for( package in packages ) { + suppressPackageStartupMessages( + library( package, character.only = TRUE, quietly = TRUE ) + ) +} + +# install.packages(setdiff(packages, rownames(installed.packages()))) +# lapply(packages, require, character.only = TRUE) + +rm(list = ls.str()) diff --git a/demo/requiremets.txt b/demo/requiremets.txt new file mode 100644 index 0000000..e69de29 diff --git a/model_training/0_1Preparation.Rmd b/model_training/0_1Preparation.Rmd new file mode 100644 index 0000000..cd264b7 --- /dev/null +++ b/model_training/0_1Preparation.Rmd @@ -0,0 +1,42 @@ +--- +title: "Folder structure preparation" +author: "Jo-Hannes Nowé" +date: "05-04-2024" +output: + pdf_document:default +--- + +A script to prepare the folder structure and download all the used packages. + +```{r folder-structure, EVAL=FALSE} +############# Define different folders +downloaddir <- "data/raw_data" +datadir <- "data/derived_data" +mapsdir <- "product/maps" +rasterdir <- "product/species_rasters" +plotsdir <- "product/species_plots" +envdir <-"data/raw_data/environmental_layers" +occdir <-"data/raw_data/occurrences" +spatdir <- "data/raw_data/spatial_layers" +folderstruc <- c(downloaddir, + datadir, + mapsdir, + rasterdir, + plotsdir, + envdir, + occdir, + spatdir) + +############# Check for their existence, create if missing +for(i in 1:length(folderstruc)){ + if(!dir.exists(folderstruc[i])){ + # If not, create the folder + dir.create(folderstruc[i],recursive = TRUE) + cat("Folder created:", folderstruc[i], "\n") +} else { + cat("Folder already exists:", folderstruc[i], "\n") +} +} +``` + + diff --git a/model_training/1_1DownloadPresenceData.Rmd b/model_training/1_1DownloadPresenceData.Rmd new file mode 100644 index 0000000..bb67858 --- /dev/null +++ b/model_training/1_1DownloadPresenceData.Rmd @@ -0,0 +1,188 @@ +--- +title: "Downloading the occurrence data from eurOBIS" +author: "Jo-Hannes Nowé" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +# Loading the required packages +```{r packages} +library(eurobis) +library(maptools) +library(dplyr) +library(sf) +library(maps) +library(ggplot2) +library(lubridate) +library(CoordinateCleaner) +library(arulesViz) +library(tidyverse) +library(worrms) +#library("devtools") +#devtools::install_github("vlizBE/imis") +require(imis) +``` + + +# Defining the study area +The proposed study area concerns OSPAR regions II, III +More information about these regions can be found on: https://www.ospar.org/convention/the-north-east-atlantic. +There are observations in OSPAR regions I and IV, however when working at a monthly +resolution, the spatial extent of the points is rather limited and these two OSPAR regions +lack points in multiple year_months. 
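+
+The chunk below saves the OSPAR shapefiles into `spatdir`, which is defined in
+0_1Preparation.Rmd. If that script has not been run in the current session, a
+minimal sketch to recreate the folder (using the same path) is:
+
+```{r spatdir-setup, eval=FALSE}
+# Only needed when 0_1Preparation.Rmd was not run in this session
+spatdir <- "data/raw_data/spatial_layers"
+if (!dir.exists(spatdir)) dir.create(spatdir, recursive = TRUE)
+```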
+ +```{r shapefiles} +#Download and unzip the OSPAR shapefiles +#Found on https://odims.ospar.org/en/submissions/ospar_regions_2017_01_002/ +url <- "https://odims.ospar.org/public/submissions/ospar_regions/regions/2017-01/002/ospar_regions_2017_01_002-gis.zip" +download.file(url,paste0(spatdir,"/OSPAR_REGIONS.zip"),mode="wb") +unzip(zipfile=paste0(spatdir,"/OSPAR_REGIONS.zip"),exdir=spatdir) + +#Visualize the different regions +#Need to load arulesviz package, otherwise error +OSPAR<- st_read(paste0(spatdir,"/ospar_regions_2017_01_002.shp")) +ggplot(OSPAR)+ + geom_sf() +``` + + +As said before we only retain region II and III in order to not interpolate +our predictions to the other areas based on the few occurrences. + + +```{r OSPAR} +#Only keeping region II and III from the shapefile. +OSPAR <- OSPAR[OSPAR$Region=="II"|OSPAR$Region=='III',] +#Only keeping the region and geometry info +OSPAR<- OSPAR %>% dplyr::select(Region) +#Not necessary anymore +OSPAR <- st_make_valid(OSPAR) + +#Bring together the different OSPAR regions into one area +study_area <- st_union(OSPAR) +plot(study_area) +``` + +# Download Occurrence data from eurOBIS + +```{r download-eurobis} +#Does not work on this version, but on the R3.6.3 version +#scientificname <- "Phocoena phocoena" +#aphiaid <- eurobis_name2id(scientificname) + +#In this version (R4.2.2) need to directly specify the aphiaid +aphiaid <- 137117 + +#Does not work when knitting, but works when running seperately +mydata.eurobis<- eurobis_occurrences_full(aphiaid=aphiaid) +#save(mydata.eurobis, file=paste0("data/raw_data/mydata.eurobis.RData")) +#load(paste0(occDir,"data/raw_data/mydata.eurobis.RData")) + +ggplot(study_area)+ + geom_sf()+ + geom_point(data=mydata.eurobis,x=mydata.eurobis$decimallongitude,y=mydata.eurobis$decimallatitude) +``` +Before filtering the columns of interest and doing some feature engineering we create the +metadatalist, giving information about the different datasets used for the presence data. 
+ +```{r metadata-function} +# function to read dataset characteristics + +fdr2<-function(dasid){ + datasetrecords <- datasets(dasid) + dascitations <- getdascitations(datasetrecords) + if(nrow(dascitations)==0)dascitations<-tibble(dasid=as.character(dasid),title="",citation="") + if(nrow(dascitations)==1) if(is.na(dascitations$citation)) dascitations$citation<-"" + daskeywords <- getdaskeywords(datasetrecords) + if(nrow(daskeywords)==0)daskeywords<-tibble(dasid=as.character(dasid),title="",keyword="") + if(nrow(daskeywords)==1) if(is.na(daskeywords$keyword))daskeywords$keyword<-"" + dascontacts <- getdascontacts(datasetrecords) + if(nrow(dascontacts)==0)dascontacts<-tibble(dasid=as.character(dasid),title="",contact="") + if(nrow(dascontacts)==1) if(is.na(dascontacts$contact))dascontacts$contact<-"" + dastheme <- getdasthemes(datasetrecords) + if(nrow(dastheme)==0)dastheme<-tibble(dasid=as.character(dasid),title="",theme="") + if(nrow(dastheme)==1) if(is.na(dastheme$theme))dastheme$theme<-"" + dastheme2 <- aggregate(theme ~ dasid, data = dastheme, paste, + collapse = " , ") + daskeywords2 <- aggregate(keyword ~ dasid, data = daskeywords, + paste, collapse = " , ") + dascontacts2 <- aggregate(contact ~ dasid, data = dascontacts, + paste, collapse = " , ") + output <- dascitations %>% left_join(dascontacts2, by = "dasid") %>% + left_join(dastheme2, by = "dasid") %>% left_join(daskeywords2, + by = "dasid") + return(output) +} + +``` + +```{r create-metadata} +datasetidsoi <- mydata.eurobis %>% distinct(datasetid) %>% + mutate(datasetid = as.numeric(str_extract(datasetid, "\\d+"))) +#==== retrieve data by dataset ============== +all_info <- data.frame() +for (i in datasetidsoi$datasetid){ + dataset_info <- fdr2(i) + all_info <- rbind(all_info, dataset_info) +} +names(all_info)[1]<-"datasetid" +write.csv(all_info,file=file.path(datadir,"allDatasets.csv"),row.names = F) +alldataset <- read.csv(file.path(datadir,"allDatasets.csv")) +``` + + +```{r pre-processing} +# Select columns of interest +mydata.eurobis <- mydata.eurobis %>% + dplyr::select(scientificnameaccepted,decimallongitude,decimallatitude,datecollected)%>% + filter(!is.na(datecollected))%>% + # Give date format to eventDate and fill out month and year columns and assign 1 to occurrenceStatus + dplyr::mutate(occurrenceStatus = 1,day = day(datecollected),month = month(datecollected),year=year(datecollected)) + +#check which occurrence points fall within the study area +within_area <- st_contains(study_area,mydata.eurobis)[[1]] + +#Only retain points inside the study area +mydata.eurobis <- mydata.eurobis[within_area,] + +#Check if it worked as intended +ggplot(data=mydata.eurobis,x=decimallongitude,y=decimallatitude)+ + geom_sf(data=study_area)+ + geom_sf() + +mydata.eurobis <- mydata.eurobis%>% + dplyr::filter(year>=1999 & year<=2022)%>% + arrange(datecollected)%>% + mutate(year_month=paste(year,month,sep='-'))%>% + mutate(year_month=factor(year_month,levels=unique(year_month),ordered=TRUE)) +``` + + +Observations already go through a couple of data quality controls before entering +the (eur)OBIS datasets. The biggest remaining issue are duplicates. These can be +removed by using the **Coordinatecleaner** package. We consider distinct observations +with the same longitude, latitude, species and date as duplicates. Only the first +observation of each duplicate is kept in the dataset. 
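+
+As a small optional check (a sketch that duplicates the call from the next chunk),
+the number of removed records can be computed by comparing row counts:
+
+```{r duplicate-count, eval=FALSE}
+# Optional: how many records does the duplicate filter drop?
+n_before <- nrow(mydata.eurobis)
+cleaned  <- cc_dupl(mydata.eurobis, lon = "decimallongitude", lat = "decimallatitude",
+                    value = "clean", species = "scientificnameaccepted",
+                    additions = "datecollected")
+n_before - nrow(cleaned)
+```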
+ + +```{r remove-duplicates} +#Remove duplicates +mydata.eurobis <- cc_dupl(mydata.eurobis, lon = "decimallongitude", lat = "decimallatitude", + value = "clean",species="scientificnameaccepted", additions= + "datecollected") +``` +# First exploratory analysis of the observations. +## Checking the temporal coverage. + +# Checking spatial coverage and bias + + +```{r} +mydata.eurobis +save(mydata.eurobis, file = file.path(datadir,"presence.RData")) +save(study_area, file = file.path(datadir,"study_area.RData")) +``` + diff --git a/model_training/1_2ExplorationPresenceData.Rmd b/model_training/1_2ExplorationPresenceData.Rmd new file mode 100644 index 0000000..6c74ff5 --- /dev/null +++ b/model_training/1_2ExplorationPresenceData.Rmd @@ -0,0 +1,48 @@ +--- +title: "Downloading the occurrence data" +author: "Jo-Hannes Nowé" +output: + pdf_document: default + html_document: default +--- + +THIS SCRIPTS NEEDS TO BE UPDATED + +# Loading the required packages + +```{r} +library(eurobis) +# #library(maptools) + library(sf) +# library(maps) + library(ggplot2) +# library(lubridate) +library(CoordinateCleaner) +# library(arulesViz) +library(tidyverse) +# library(worrms) +``` + + +# Checking the presence data temporal and spatial scale + +```{r} +ggplot(study_area)+ + geom_sf()+ + geom_point(data=mydata_eurobis,x=mydata_eurobis$decimallongitude,y=mydata_eurobis$decimallatitude) +``` + + +```{r, fig.width=8, fig.height=12} + +ggplot(study_area)+ + geom_sf()+ + geom_point(data=mydata_eurobis,x=mydata_eurobis$decimallongitude,y=mydata_eurobis$decimallatitude)+ + theme(plot.title = element_text(size=20), axis.title= element_text(size = 16), + text = element_text(size = 16))+ + facet_wrap(~month, nrow= 4)+ + theme_void()+ + theme(strip.background=element_blank(), #remove strip background + strip.text= element_text(size=12)) +``` + diff --git a/model_training/1_3AbsenceCreation.Rmd b/model_training/1_3AbsenceCreation.Rmd new file mode 100644 index 0000000..5969d22 --- /dev/null +++ b/model_training/1_3AbsenceCreation.Rmd @@ -0,0 +1,122 @@ +--- +title: "Downloading the occurrence data" +author: "Jo-Hannes Nowé" +output: + pdf_document: default + html_document: default +--- + + + +# Loading the required packages +```{r} +library(dplyr) +library(sf) +library(ggplot2) +library(lubridate) +library(stringr) +library(ows4R) +library(readr) +library(CoordinateCleaner) +``` + +```{r} +alldataset <- read.csv(file.path(datadir,"allDatasets.csv")) +list_dasid <- alldataset$datasetid + +wfs <- WFSClient$ + new("https://geo.vliz.be/geoserver/Dataportal/wfs", "2.0.0", logger = "INFO")$ + getCapabilities()$ + findFeatureTypeByName("Dataportal:eurobis-obisenv_full") +dir.create(file.path(occdir,"target_group")) +for(datasetid in list_dasid){ +params <- paste0("where%3Adatasetid+IN+%28", + datasetid, + "%29") + +feature_pagination <- wfs$getFeatures(viewParams = params, paging = TRUE, paging_length = 50000) +filename = paste0("dataset", datasetid,".csv") +#Save the download to a csv file with the dataset and region name +write_delim(feature_pagination, file.path(occdir,"target_group", filename), delim = ",") +} + + +#Still need to load the last 2 but will already try +target_files <- list.files(file.path(occdir,"target_group/")) +dataframes_target <- list() + +for(file in target_files) { + # Read the CSV file into a dataframe + df <- read.csv(file.path(occdir,"target_group/",file)) + + #Append the dataframe to the list + dataframes_target <- c(dataframes_target, list(df)) +} +target_background <- 
do.call(rbind,dataframes_target) +target_background <- data.frame(target_background)%>% + filter(class=="Mammalia")%>% + filter(scientificnameaccepted!="Phocoena phocoena") + +load(file.path(datadir,"study_area.RData")) +ggplot(data=study_area)+ + geom_sf()+ + geom_point(data=target_background,x=target_background$decimallongitude,y=target_background$decimallatitude) +``` +Need to further clean up the target_background dataframe. + + +```{r} +absence <- target_background%>% + dplyr::mutate(occurrenceStatus = 0,day = day(datecollected),month = month(datecollected),year=year(datecollected))%>% + filter(!is.na(datecollected))%>% + filter(year>1999)%>% + filter(year<2020)%>% + arrange(datecollected)%>% + mutate(year_month=paste(year,month,sep='-'))%>% + mutate(year_month=factor(year_month,levels=unique(year_month),ordered=TRUE))%>% + mutate(copydecimallongitude = decimallongitude, copydecimallatitude=decimallatitude)%>% + dplyr::select(scientificnameaccepted,decimallongitude,decimallatitude,datecollected, + occurrenceStatus,day,month,year,year_month,copydecimallongitude,copydecimallatitude) +absence <- st_as_sf(absence,coords = c("copydecimallongitude", "copydecimallatitude"), + crs = st_crs(study_area)) +within_abs <- st_contains(study_area,absence)[[1]] +absence <- absence[within_abs,] +absence <- cc_dupl(absence, lon = "decimallongitude", lat = "decimallatitude", + value = "clean",species="scientificnameaccepted", additions= + "datecollected") +``` + + + + +```{r, fig.width=8, fig.height=12} +puntdata <- absence%>% + filter(year>1999)%>% + filter(year<2006)%>% + mutate(year=factor(year),month=factor(month)) + +for (jaar in levels(puntdata$year)){ + plotdata <- puntdata %>% + filter(year==jaar)#%>% + # mutate(month=factor(month,levels=new_order)) + +plot<-ggplot(data=study_area) + + geom_sf()+ + theme_minimal()+ + geom_point(data=plotdata, aes(x=decimallongitude,y=decimallatitude,colour="red"),show.legend=FALSE) + + theme(plot.title = element_text(size=20), axis.title= element_text(size = 16), + text = element_text(size = 16))+ + labs(title = paste("target group records",jaar))+ + facet_wrap(~month, nrow= 4)+ + theme_void()+ + theme(strip.background=element_blank(), #remove strip background + strip.text= element_text(size=12)) +print(plot) +} + +``` + +```{r} +save(absence, file=file.path(datadir,"absence.RData")) +``` + diff --git a/model_training/2_PAExploration.Rmd b/model_training/2_PAExploration.Rmd new file mode 100644 index 0000000..d30ac98 --- /dev/null +++ b/model_training/2_PAExploration.Rmd @@ -0,0 +1,35 @@ +--- +title: "Exploration of the joined PA data" +author: "Jo-Hannes Nowé" +output: + pdf_document: default + html_document: default +--- + +THIS SCRIPT NEEDS TO BE UPDATED + +# Loading the required packages +```{r} +library(dplyr) +library(sf) +library(ggplot2) +library(lubridate) +library(stringr) +library(ows4R) +library(readr) +library(CoordinateCleaner) +``` + +```{r} +load('data/derived_data/absence.RData') +load('data/derived_data/presence.RData') +PA <- rbind(mydata.eurobis,absence) +save(PA,file=paste0("data/derived_data/PA.RData")) +``` + +Now we also want to investigate some summary characteristics of this joined dataframe. 
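+
+Besides `summary()`, a quick tally of presences versus (pseudo-)absences per year
+helps to spot sampling imbalance; a minimal sketch:
+
+```{r pa-counts, eval=FALSE}
+# Presence (1) vs. pseudo-absence (0) records per year
+table(PA$occurrenceStatus, PA$year)
+```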
+ +```{r} +summary(PA) +``` + diff --git a/model_training/3_1CopernicusMarine.Rmd b/model_training/3_1CopernicusMarine.Rmd new file mode 100644 index 0000000..0ebca1b --- /dev/null +++ b/model_training/3_1CopernicusMarine.Rmd @@ -0,0 +1,76 @@ +--- +title: "Download Copernicus Marine data" +author: "Jo-Hannes Nowé" +output: + pdf_document: default + html_document: default +--- + + +# Set-up of the python environment +```{r} +library(reticulate) #reticulate package allows for python usage in R +virtualenv_create("mbo-proj",force=FALSE) #create a virtual environment to install packages in +py_install("copernicusmarine",envname = "mbo-proj") #the copernicusmarine package allows CMEMS downloads +virtualenv_list() #Check the list of available environments +use_virtualenv("mbo-proj") #Load the environment +py_available() #Check if python is available for use +py_config() #Check if your using the environment +py_list_packages() #Check if your package was installed correctly +??virtualenv_create +``` +More information on the reticulate package on: https://rstudio.github.io/cheatsheets/reticulate.pdf + +Overview of the functions can be found at: https://help.marine.copernicus.eu/en/collections/9080063-copernicus-marine-toolbox + +How to configure the credentials can be found at: https://help.marine.copernicus.eu/en/articles/8185007-copernicus-marine-toolbox-credentials-configuration +Works in the terminal. Needs to be done only once. + + + +```{python} +import copernicusmarine as cm + +#?cm.subset +cm.subset( + dataset_id="cmems_mod_glo_phy_my_0.083deg_P1M-m", + dataset_version="202311", + variables=["so", "thetao"], + minimum_longitude=-45, + maximum_longitude=52, + minimum_latitude=35, + maximum_latitude=90, + start_datetime="1999-01-01T00:00:00", + end_datetime="2021-06-01T00:00:00", + minimum_depth=0.49402499198913574, + maximum_depth=0.49402499198913574, + netcdf_compression_enabled=True, + output_directory= "data/raw_data/", + output_filename="tempsal.nc", + overwrite_output_data = False) + + +``` + +```{python} +cm.subset( + dataset_id="cmems_mod_glo_bgc_my_0.25_P1M-m", + dataset_version="202112", + variables=["nppv"], + minimum_longitude=-45, + maximum_longitude=52, + minimum_latitude=35, + maximum_latitude=90, + start_datetime="1999-01-01T00:00:00", + end_datetime="2021-06-01T00:00:00", + minimum_depth=0.5057600140571594, + maximum_depth=0.5057600140571594, + netcdf_compression_enabled=True, + output_directory= "data/raw_data/", + output_filename="npp.nc", + overwrite_output_data = False) + +``` + + + diff --git a/model_training/3_1CopernicusMarine.py b/model_training/3_1CopernicusMarine.py new file mode 100644 index 0000000..42e7aa1 --- /dev/null +++ b/model_training/3_1CopernicusMarine.py @@ -0,0 +1,34 @@ +import copernicusmarine as cm + +cm.login +username = "" +password = "" + +#For temperature want to use GLOBAL_MULTIYEAR_PHY_001_030 +#For Net primary production want to use: GLOBAL_MULTIYEAR_BGC_001_033 + +cm.subset( + dataset_id="cmems_mod_glo_phy_my_0.083deg_P1M-m", + dataset_version="202311", + variables=["so", "thetao"], + minimum_longitude=-12.01498103062979, + maximum_longitude=13.645282778488214, + minimum_latitude=47.183697679290205, + maximum_latitude=61.77604047194526, + start_datetime="1999-01-01T00:00:00", + end_datetime="2021-06-01T00:00:00", + minimum_depth=0.49402499198913574, + maximum_depth=0.49402499198913574, +) + +# cm.subset( +# dataset_id="cmems_mod_glo_bgc_my_0.083deg-lmtl_PT1D-i", +# dataset_version="202211", +# variables=["npp"], +# 
minimum_longitude=-12.01498103062979, +# maximum_longitude=13.645282778488214, +# minimum_latitude=47.183697679290205, +# maximum_latitude=61.77604047194526, +# start_datetime="1999-01-01T00:00:00", +# end_datetime="2022-12-31T00:00:00", +# ) diff --git a/model_training/3_2CleanExtractEnvLayer.Rmd b/model_training/3_2CleanExtractEnvLayer.Rmd new file mode 100644 index 0000000..002a5d4 --- /dev/null +++ b/model_training/3_2CleanExtractEnvLayer.Rmd @@ -0,0 +1,183 @@ +--- +title: "Extract the environmental data associated to the species distribution data" +author: "Jo-Hannes Nowé" +output: +html_document: + toc=TRUE +--- + +# Loading the required packages +```{r} +library(terra) +``` +```{r} +#Read in the PA data +load(file.path(datadir,'PA.RData')) + +#Load in environmental layers +#To get it into a SpatRaster object +tempsal <- terra::rast('data/raw_data/cmems_mod_glo_phy_my_0.083deg_P1M-m_so-thetao_12.00W-13.58E_47.25N-61.75N_0.49m_1999-01-01-2021-06-01.nc') + +#if the object is a spatraster you need to give a list of the length of the layers describing where you want each layer to go +splitlist <- c(rep(1,270),rep(2,270)) +tempsal_dataset <- split(tempsal,splitlist) +sal <- tempsal_dataset[[1]] +sal +temp <- tempsal_dataset[[2]] +temp +``` + + +```{r} +#This code is written to give out year_month combinations for the entirety of the environmental layers as we want to work on monthly resolution +# Generate year and month sequences +years <- 1999:2022 +months <- 1:12 + +# Create empty vector to store year_month values +year_month_factor <- character(length(years) * length(months)) + +# Populate year_month vector +index <- 1 +for (year in years) { + for (month in months) { + year_month_factor[index] <- paste0(year, "_", sprintf("%02d", month)) + index <- index + 1 + } +} + +# Convert year_month to ordered factor +year_month_factor <- factor(year_month_factor, levels = unique(year_month_factor), ordered = TRUE) +PA <- PA%>% + mutate(year_month=paste(year, sprintf("%02d", month), sep = "_"))%>% + mutate(year_month=factor(year_month, levels = levels(year_month_factor))) +``` + + + + +```{r} +#temp has 270 layers= 22*12months + 6 months (1999-01-01 till 2021-06-01) +PA <- PA%>%filter(datecollected<"2021-07-01") +``` + +```{r} +#NPP +npp <- terra::rast("data/raw_data/cmems_mod_glo_bgc_my_0.083deg-lmtl_PT1D-i_npp_12.00W-13.58E_47.25N-61.75N_1999-01-01-2022-12-31.nc") +npp +tijd <- time(npp) +ym_tijd <- format(tijd, "%Y_%m") +ym_tijd <- factor(ym_tijd, levels = unique(ym_tijd), ordered = TRUE) +splitlijst <- as.numeric(ym_tijd) +npp_dataset <- split(npp,splitlijst) +npp_dataset +mean_npp <- list() +for(i in 1:length(npp_dataset)){ + mean_npp[[i]] <- app(npp_dataset[[i]],fun=mean) +} +time(mean_npp) +mean_npp <-rast(mean_npp) +``` + + +#Now for the bathymetry layer + +Manually create the folder "bathy_sliced" first, tried to include it in the code but for some reason couldn't create it. 
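+
+If the `ifelse(..., dir.create(...), ...)` call in the chunk below does not create
+the folder on your system, it can also be created explicitly beforehand; a minimal
+sketch using the same path:
+
+```{r create-bathy-dir, eval=FALSE}
+# Create the output folder for the bathymetry slices if it is missing
+dir.create(file.path(envdir, "bathy_sliced"), recursive = TRUE, showWarnings = FALSE)
+```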
+ +```{r} +## general data handling +library(XML) +library(dplyr) +library(tidyr) +library(reshape2) +library(downloader) + +library(raster) +library(sp) +library(mapdata) +library(maptools) +library(ncdf4) +library(tiff) + +ifelse(!dir.exists(file.path(envdir, "bathy_sliced")), dir.create(file.path(envdir, "bathy_sliced")), FALSE) +xmin<- -12.0 +xmax <- 13.6 +ymin <- 47.2 +ymax <- 61.8 +stepsize <- 0.8 +number_of_slices <- (xmax-xmin)/stepsize +for(i in 1:number_of_slices){ +beginslice <- xmin + (i-1)*stepsize +endslice <- xmin + i*stepsize +#We get an error because the file is too big, will split up in horizontal slices, and then merge together later +con <-paste0("https://ows.emodnet-bathymetry.eu/wcs?service=wcs&version=1.0.0&request=getcoverage&coverage=emodnet:mean&crs=EPSG:4326&BBOX=",beginslice,",47.2,",endslice,",61.8&format=image/tiff&interpolation=nearest&resx=0.08333333&resy=0.08333333") +nomfich <- file.path(envdir,paste0("bathy_sliced/","slice_",i, "_img.tiff")) +download(con, nomfich, quiet = TRUE, mode = "wb") + +} + + +#merge them together +bathyrasters <- list() +for(file in list.files(file.path(envdir,"bathy_sliced"))){ + bathyrasters[[file]] <- rast(file.path(envdir,paste0("bathy_sliced/",file))) +} +#Put the different spatrasters together in a spatrastercollection +bathy_coll <- sprc(bathyrasters) + +#Merge the spatrastercollection +bathy <- merge(bathy_coll) +bathy[bathy>0] <- NA +``` + +# Extraction of values related to the occurrence points + +Now that we have the environmental layers loaded as spatial rasters, we need to extract the right values. +It is easiest to download them first for all the months and then just select the right monthly value for each point. + +```{r} +#For temperature +rastertemp <- terra::extract(x=temp, y=PA[,c("decimallongitude","decimallatitude")], method="bilinear", na.rm=TRUE,df=T,ID=FALSE) +#takes 20seconds +resulttemp <- data.frame(temp = rastertemp[cbind(1:nrow(PA),as.numeric(PA$year_month))]) + +#For NPP +rasternpp <- terra::extract(x=mean_npp, y=PA[,c("decimallongitude","decimallatitude")], method="bilinear", na.rm=TRUE,df=T,ID=FALSE) +#takes 20seconds +resultnpp <- data.frame(npp = rasternpp[cbind(1:nrow(PA),as.numeric(PA$year_month))]) + + +#For salinity +rastersal <- terra::extract(x=sal, y=PA[,c("decimallongitude","decimallatitude")], method="bilinear", na.rm=TRUE,df=T,ID=FALSE) +#takes 20seconds + +resultsal <- data.frame(sal = rastersal[cbind(1:nrow(PA),as.numeric(PA$year_month))]) + +#For bathymetry +resultbathy <- terra::extract(x=bathy, y=PA[,c("decimallongitude","decimallatitude")], method="bilinear", na.rm=TRUE,df=T,ID=FALSE) +#don't need to choose the right month, because bathy info is the same over the different months, only have one value + +``` + +```{r} +ggplot()+ + geom_spatraster(data=bathy) + +mean_npp_test <- mean_npp[[1]] + +ggplot()+ + geom_spatraster(data=mean_npp[[1]]) + +ggplot()+ + geom_spatraster(data=temp[[1]]) + +ggplot()+ + geom_spatraster(data=sal[[1]]) +``` + +```{r} +PA_env <- cbind(PA,resulttemp,resultsal,resultnpp,resultbathy) +save(PA_env,file=file.path("data","PA_env.RData")) +``` + + diff --git a/model_training/4_1EnsembleApproach.Rmd b/model_training/4_1EnsembleApproach.Rmd new file mode 100644 index 0000000..a08a5ef --- /dev/null +++ b/model_training/4_1EnsembleApproach.Rmd @@ -0,0 +1,344 @@ +--- +title: "Tidymodels ensemble approach" +author: "Jo-Hannes Nowé" +output: +html_document: + toc=TRUE +--- + +# Loading the required packages +```{r} +library(tidymodels) 
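+# tidymodels is a meta-package: loading it attaches the core packages used below
+# (recipes, rsample, parsnip, tune, yardstick, workflows, ...), so the separate
+# library(workflows) call further down is optional.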
+#devtools::install_github("jiho/autoplot") +library(autoplot) +library(stacks) #for ensembling the individual models together +library(workflows) +library(ranger) #for the randomforest model +library(mgcv) #for the GAM model +library(xgboost) #for the xgboost model +library(earth) #for the MARS model +``` + +# Load data + +We start with presence-(pseudo)absence data that is coupled to certain environmental layers. +This is the necessary starting point of this script, from here on we then build the +different individual models and combine them in an ensemble. (Possible to run the different +models in parallel?). + +```{r} +# Load presence-absence related to environmental data +PAdata <- file.path(datadir,"PA_env.RData") + +load(PAdata) +#Turn our occurrence into a factor so that there are two known levels, either +#present or absent. +PA_env$occurrenceStatus<-factor(PA_env$occurrenceStatus) +#Remove all these columns because the data we want to predict on also only has the environmental values, +#model will only accept input to predict on with the same columns as the data the model was trained on. +PA_env <- PA_env%>%select(-c(scientificnameaccepted,decimallongitude,decimallatitude,datecollected,day,month,year,year_month,geometry))%>% + drop_na()%>% + mutate(bathy=slice_1_img)%>% + select(-slice_1_img) + +#Why make this into a tibble? +data <- as_tibble(PA_env)%>% + dplyr::select(c(-ID,-scientificname)) +data +``` + +# Data splitting +Need to split into a test and training set. Training set is used for training and validation. +The test set is kept separate and only used for performance evaluation. +We use a 80/20 distribution of train-test. +Out of this 80% training set, we use 10% each fold for validation. + +Still need to implement cross-validation + +```{r} +set.seed(222) +#Put 4/5 in the training set +data_split<- initial_split(data, prop=4/5) + +#Create data frames for the two sets: +train_data <- training(data_split) +test_data <- testing(data_split) + +#Create folds in the training data so we can do our validation. +#Out of the 80% reserved for training data, use 10% each time for validation +#This is why we choose the number of folds v=8 +folds <- vfold_cv(train_data, v=8) +folds +``` + + +# Pre-process your data with recipes +Recipes can be used to define the formula, the dependent variable, predictors, others. +The roles of the recipe can also be updated when necessary. This recipe is then +provided to the workflow, giving information about the necessary relations to calculate. + +Variables are the original columns in the data. These can then be used to define a +formula. Roles define how variables will be used in the model. Examples are predictor, +response, and case weight. Terms are columns in a design matrix, synonymous with +features in machine learning. Variables that have predictor roles would automatically be +main effect terms. + +```{r} +#The general form is: recipe(formula, data) + +#The formula is only used to declare the variables, their roles and nothing else. +#All the rest can be added later. + +#Doesn't matter the data here is train_data. Just used to catalog the names of the +#variables and their types. +#The formula here states that occurrenceStatus is modelled in relation to all the other columns +Occurrence_rec <- + recipe(occurrenceStatus ~., data=train_data) + +#Can also do some pre-processing of the variables with this recipe, but which should you do? +# step_* functions +#Can also provide some checks here. 
These check_* functions conduct some sort of +#data validation, if no issue is found, they return the data as it is, otherwise they show an error. +``` + + +The dot indicates to use all other variables as predictors +A recipe is associated with the data set used to create the model. We can add roles to the recipe, using +update_role() lets the recipe know that two of the variables have the role "Location" +so it tells the recipe to keep these two variables but not use them as either outcomes +or predictors. Keeping some variables without using them in the model can be useful +for determining if certain values were wrongly predicted. + +# Fit a model with a recipe, using a model workflow +# Making an ensemble model with stacks + +In stacks, you need to save the assessment of predictions and workflow in your +tune_grid(), tune_bayes() or fit_resamples() objects by setting control arguments +save_pred=TRUE and save_workflow = TRUE. + +Each model definition must share the same rsample rset object. + +```{r} +#We already generated our our rsample rset objects +data_split +train_data +test_data + +#We also already made some folds +folds + +#And a recipe +Occurrence_rec + +#Define a certain metric? +#metric <- metric_set(rmse) +``` +Above we talked about saving the information in the resulting objects. This can also +be done by using control_stack_*() functions. + +```{r} +#Because we use tune_grid() +ctrl_grid <- control_stack_grid() +ctrl_res <- control_stack_resamples() +``` + +The random forest model is a model that does not require a lot of tuning, which is +why we just give fixed values. +```{r} + +#set the number of trees to 500 (also the default), use it for classification into +#presence-absence, and the algorithm to ranger (also the default) +rf_mod <- rand_forest(trees=500,mode="classification",engine="ranger") + +#After your model is defined, create a worfklow object and add your model and recipe +rf_wf <- + workflow() %>% + add_model(rf_mod) %>% + add_recipe(Occurrence_rec) + +#The workflow can then be fit or tuned, in this case it's fit but with resamples +#so we have to provide the folds +#Runs +- 2min +rf_fit <- + fit_resamples( + rf_wf, + resamples = folds, + control = ctrl_res + ) +#With resamples as we need to provide this type to the stack command later +rf_fit +``` + +```{r} +rf_fit +#Predict based on the test set, as noted need to make folds of this +#predict and ppass the type parameter to get out a probability. 
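+#NOTE: rf_fit was created with fit_resamples() above, which keeps resampling
+#metrics/predictions but has no predict() method, so the predict() call below
+#will generally not run as written; the workflow would typically be refit on the
+#full training set first, e.g. fit(rf_wf, train_data) or via last_fit(rf_wf, data_split).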
+rf_pred <- predict(rf_fit,test_data)%>% + bind_cols(predict(rf_fit,test_data, type="prob")) %>% + #To be able to compare the real value to the predicted one + bind_cols(test_data %>% + dplyr::select(occurrenceStatus)) #need to specify due to conflict by loading tidymodels +rf_pred +#Just one of the possible evaluation metrics, need to decide which ones to use +rf_pred %>% + roc_auc(truth = occurrenceStatus, .pred_0) + +rf_pred %>% + accuracy(truth= occurrenceStatus, .pred_class) + +``` + +# Tuning model parameters +```{r} +#More information found on https://parsnip.tidymodels.org/reference/details_gen_additive_mod_mgcv.html +gam_mod <- gen_additive_mod(adjust_deg_free = tune()) %>% + set_engine("mgcv") %>% + set_mode("classification") +gam_wf <- + workflow()%>% + add_recipe(Occurrence_rec)%>% + add_model(gam_mod, formula = occurrenceStatus ~ s(bathy)+ s(temp)+ s(sal) + s(npp)) +gam_grid <- grid_regular(adjust_deg_free(), + levels=4) +gam_fit <- + gam_wf%>% + tune_grid( + resamples = folds, + grid=gam_grid, + control=ctrl_grid + ) +#started at 16:30 - 16:43 +# gam_res <-gam_fit%>% +# collect_metrics() +# gam_best <- gam_fit%>% +# select_best("accuracy") +# gam_final_wf <- gam_wf%>% +# finalize_workflow(gam_best) +# #Need to also give in the control here, otherwise doesn't work +# gam_fit <- gam_final_wf%>% +# last_fit(data_split, +# control=ctrl_grid) +``` + +```{r} +#Tune the xgboost model, see: https://parsnip.tidymodels.org/reference/details_boost_tree_xgboost.html +xgb_mod <- boost_tree( + trees = tune(), + tree_depth = tune(), + learn_rate = tune() +) %>% + set_engine("xgboost") %>% + set_mode("classification") +xgb_wf <- workflow() %>% + add_model(xgb_mod) %>% + add_recipe(Occurrence_rec) +#grid_regular chooses sensible hyperparameter values to try, but want to choose ourself +#Can give th elower and upper bound limit we want to use +xgb_grid <- parameters(trees(), + tree_depth(c(3,20)), + learn_rate(c(-4,0))) +xgb_grid <- grid_regular(xgb_grid, + levels=c(trees=5,tree_depth=3,learn_rate=3)) +``` + +```{r} +xgb_fit <- + xgb_wf %>% + tune_grid( + resamples = folds, + grid = xgb_grid, + control=ctrl_grid) +#Runs for +- 40min +xgb_fit +``` + +```{r} +#devtools::install_github("tidymodels/tune") +#Need to update to this version of the tune package, otherwise it fails +mars_mod <- + mars(prod_degree = tune(), prune_method = tune()) %>% + # This model can be used for classification or regression, so set mode + set_mode("classification") %>% + set_engine("earth")#,glm=list(family=binomial)) +mars_mod +mars_grid <- grid_regular(prod_degree(),prune_method(), + levels=c(prod_degree=2,prune_method=2)) +mars_grid +mars_wf <- + workflow()%>% + add_model(mars_mod)%>% + add_recipe(Occurrence_rec) +mars_fit <- + mars_wf%>% + tune_grid( + resamples = folds, + grid=mars_grid, + control=ctrl_grid + ) + +mars_fit +``` + +Now specify a random forest model that we are not optimizing. + + +Now we have XX candidate models (1 RF, xx GAM, xx XGB and xx MARS). +We want to create a data stack: tibbles containing an assessment set predictions for +each candidate ensemble member. It work a bit like the basic ggplot() constructor. +The function creates a basic structure that the object will be built on top of. 
+ +```{r} +stack_data <- + stacks() %>% + add_candidates(rf_fit) %>% + add_candidates(mars_fit)%>% + add_candidates(gam_fit)%>% + add_candidates(xgb_fit) + +stack_data +``` + +The first column gives the first response value and the remaining columns give the +assessment set predictions for each ensemble member. In regression there is 1 column +per ensemble member. In classification settings, as many columns as levels of the outcome variable +per candidate ensemble member. + +```{r} +stack_mod <- + stack_data %>% + blend_predictions() +#16h28-16h29 +#Problem that there are much more member models for xgboost? +#However there was only 1 random forest and seemed to be good. +stack_mod +``` +Blend predictions performs regularization to figure out how we can combine the outputs +from the stack members to come up with the final solution. Regularization needed +as the ensemble members are highly correlated. Candidates with non-zero stacking +coefficients become members. +We can tune the penalty parameter of this blend_predictions as well, see: +https://stacks.tidymodels.org/articles/basics.html + +```{r} +#Now fit the whole model on the training set +stack_fit <- + stack_mod %>% + fit_members() +#16h30 - 16h33 +stack_fit +``` + + +```{r} +#Saving the model +saveRDS(stack_fit,file.path(datadir,"stacked_model.rds")) + +#testing if the saving went as planned +#stack_fit <- readRDS("stacked_model.rds") +``` + + + + + + diff --git a/model_training/4_2ModelEvaluation.Rmd b/model_training/4_2ModelEvaluation.Rmd new file mode 100644 index 0000000..d852663 --- /dev/null +++ b/model_training/4_2ModelEvaluation.Rmd @@ -0,0 +1,181 @@ +--- +title: "4_2 Model Evaluation Report" +author: "Jo-Hannes Nowé, VLIZ" +date: "May 30, 2024" +output: html_document +--- + +THIS SCRIPT NEEDS TO BE UPDATED! +See https://support.bccvl.org.au/support/solutions/articles/6000127046-sdm-interpretation-of-model-outputs +For a good overview of what to provide to the user. + +# Loading the required packages + +```{r load-packages, warning=FALSE} +library(tidymodels) +#devtools::install_github("jiho/autoplot") +library(autoplot) +library(stacks) #for ensembling the individual models together +library(workflows) +library(ranger) #for the randomforest model +library(mgcv) #for the GAM model +library(xgboost) #for the xgboost model +library(earth) #for the MARS model +``` + +```{r load-model} +model <- readRDS(file.path(datadir,'stacked_model.rds')) +``` + +```{r model-inspection} +#Can use these to give some information of the performance of the individual models before they were stacked +#These can be used to provide some training statistics +#To provide good test statistics, need to at minimum use the test data and better would be to have an independent test set. +model$model_metrics +model$equations +``` +```{r} +# Load presence-absence related to environmental data +PAdata <- file.path(datadir,"PA_env.RData") + +load(PAdata) +#Turn our occurrence into a factor so that there are two known levels, either +#present or absent. +PA_env$occurrenceStatus<-factor(PA_env$occurrenceStatus) +#Remove all these columns because the data we want to predict on also only has the environmental values, +#model will only accept input to predict on with the same columns as the data the model was trained on. +PA_env <- PA_env%>%select(-c(scientificnameaccepted,decimallongitude,decimallatitude,datecollected,day,month,year,year_month,geometry))%>% + drop_na()%>% + mutate(bathy=slice_1_img)%>% + select(-slice_1_img) + +#Why make this into a tibble? 
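+#(A tibble is not strictly required here; it mainly prints more compactly and
+#works smoothly with the dplyr/tidymodels verbs used below.)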
+data <- as_tibble(PA_env) +data +``` +```{r} +set.seed(222) +#Put 4/5 in the training set +data_split<- initial_split(data, prop=4/5) + +#Create data frames for the two sets: +train_data <- training(data_split) +test_data <- testing(data_split) + +#Create folds in the training data so we can do our validation. +#Out of the 80% reserved for training data, use 10% each time for validation +#This is why we choose the number of folds v=8 +folds <- vfold_cv(train_data, v=8) +folds +``` + +```{r} +library(caret) +test <- stats::predict(model,test_data,type="prob") +test +member_test <- stats::predict(model,test_data,type="prob",members=TRUE) +member_test + +#Might be interesting to also extract the best performing model of the types of models that didn't get included +#Just to plot their response curves and compare how they look like. +``` +```{r response-curves} +#Sample random data or use the model data? +#Sample random values for our environmental variables +# df <- data.frame(temp = rnorm(5000, 10, 4), +# sal = rnorm(5000, 32, 3), +# npp = rnorm(5000, 1000, 300), +# bathy=-rlnorm(5000,3.7,1)) + + + +#All the possible values of the model +df <- data%>%dplyr::select(-occurrenceStatus) + +head(df) + +#Make a temporary dataframe with a column for each variable +df_temp <- data.frame(matrix(ncol = ncol(df), nrow = 100)) +head(df_temp) +colnames(df_temp) <- colnames(df) +#Fill each column of this dataframe with the mean value of each variable +df_temp[] <- rep(apply(df, 2, mean, na.rm = T), each = 100) +df_temp +#In each iteration we keep all the variables at the mean value except for one +#This one variable we give 100 different values between the min and max value we used for the model +#This way we can see how the response changes for different values of this variable, when the others are kept at an average level +for (i in 1:ncol(df)) { + df_temp_proc <- df_temp + df_temp_proc[,i] <- seq(min(df[,i]), max(df[,i]), length.out = 100) + pred <- stats::predict(model, df_temp_proc,type="prob")[,2] + if (i == 1) { + pred_all <- data.frame(original = df_temp_proc[,i],prediction = pred,variable = colnames(df)[i]) } + else { + pred_all <- rbind(pred_all, data.frame(original = df_temp_proc[,i],prediction = pred,variable = colnames(df)[i])) + } +} +rc <- pred_all +head(rc) +ggplot(rc)+ + geom_point(aes(x = original, y = .pred_1))+ + facet_wrap(~variable, scales = "free") + +ggplot(rc)+ + geom_smooth(aes(x = original, y = .pred_1))+ + facet_wrap(~variable, scales = "free") +``` + + + + +```{r response-curves-members} +#Sample random data or use the model data? 
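+#Here the observed predictor values are used (df below); the commented-out
+#rnorm() block would instead simulate values around plausible means.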
+#Sample random values for our environmental variables +# df <- data.frame(temp = rnorm(5000, 10, 4), +# sal = rnorm(5000, 32, 3), +# npp = rnorm(5000, 1000, 300), +# bathy=-rlnorm(5000,3.7,1)) + + + +#All the possible values of the model +df <- data%>%dplyr::select(-occurrenceStatus) + +head(df) + +#Make a temporary dataframe with a column for each variable +df_temp <- data.frame(matrix(ncol = ncol(df), nrow = 100)) +head(df_temp) +colnames(df_temp) <- colnames(df) +#Fill each column of this dataframe with the mean value of each variable +df_temp[] <- rep(apply(df, 2, mean, na.rm = T), each = 100) +df_temp +#In each iteration we keep all the variables at the mean value except for one +#This one variable we give 100 different values between the min and max value we used for the model +#This way we can see how the response changes for different values of this variable, when the others are kept at an average level +for (i in 1:ncol(df)) { + df_temp_proc <- df_temp + #we change one of the columns to show the whole range of values, rest remain the mean + df_temp_proc[,i] <- seq(min(df[,i]), max(df[,i]), length.out = 100) + #for each member and the stack we make a prediction on this data + pred <- stats::predict(model, df_temp_proc,type="prob",members=TRUE)%>% + dplyr::select(starts_with(".pred_1")) + if (i == 1) { + pred_all <- data.frame(original = df_temp_proc[,i],prediction = pred,variable = colnames(df)[i]) } + else { + pred_all <- rbind(pred_all, data.frame(original = df_temp_proc[,i],prediction = pred,variable = colnames(df)[i])) + } + +} +rc <- pred_all%>% + dplyr::mutate(ID=seq(1,nrow(pred_all)))%>% + pivot_longer(cols=starts_with("prediction..pred_1"),names_to="model",values_to="response") +head(rc) +ggplot(rc)+ + geom_smooth(aes(x = original, y = response,colour=model),se=FALSE)+ + facet_wrap(~variable, scales = "free") + +``` + + + diff --git a/model_training/5_MappingPredictions.Rmd b/model_training/5_MappingPredictions.Rmd new file mode 100644 index 0000000..430d792 --- /dev/null +++ b/model_training/5_MappingPredictions.Rmd @@ -0,0 +1,189 @@ +--- +title: "Tidymodels ensemble approach" +author: "Jo-Hannes Nowé" +output: +html_document: + toc=TRUE +--- + +# Loading the required packages +```{r} +library(tidymodels) +#devtools::install_github("jiho/autoplot") +library(autoplot) +library(stacks) #for ensembling the individual models together +library(workflows) +library(ranger) #for the randomforest model +library(mgcv) #for the GAM model +library(xgboost) #for the xgboost model +library(earth) #for the MARS model +``` + +```{r} +model <- readRDS(file.path(datadir,"stacked_model.rds")) +``` + +```{r} +PAdata <- file.path("data","PA_env.RData") +load(PAdata) +#Turn our occurrence into a factor so that there are two known levels, either +#present or absent. +data$occurrenceStatus<-factor(data$occurrenceStatus) + +#Wrm maken we dit weer in een tibble? 
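+#("Wrm maken we dit weer in een tibble?" = "Why do we turn this into a tibble
+#again?" -- as before, mainly for compact printing and dplyr compatibility.)
+#NOTE: PA_env.RData contains an object named PA_env in the other scripts, so
+#'data' presumably needs to be created from it first (e.g. data <- PA_env), as
+#in 4_1EnsembleApproach.Rmd.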
+data <- as_tibble(data)%>% + dplyr::select(c(-ID,-scientificname)) +data +``` + +```{r} +set.seed(222) +#Put 4/5 in the training set +data_split<- initial_split(data, prop=4/5) + +#Create data frames for the two sets: +train_data <- training(data_split) +test_data <- testing(data_split) +``` + + + + +```{r} +model #this is the model +library(caret) #without this it might give an error trying to use a predict function from another package +library(stacks) +stack_fit <- + model %>% + fit_members() + +prediction <- predict(model,test_data)%>% + bind_cols(predict(model,test_data, type="prob")) %>% + bind_cols(test_data %>% + dplyr::select(occurrenceStatus)) +prediction + +``` + + +# Prepare environmental data +```{r} +#Make a custom function that can be used with the terra::predict function +predprob <- function(...) predict(...,type="prob")$.pred_1 +#Try out how to make this one work as well +predmem <- function(...) predict(..., type="prob",members=TRUE) +``` + +#For monthly maps using the same rasters as it was build on + +```{r} +#Load the different layers (see environmental clean script) +temp +sal +mean_npp +bathy + +#Different layers, crop them to the same extent of bathy +temp <- crop(temp,bathy) #layer1: 1999-01-01 -> layer270: 2021-06-01 +sal <- crop(sal,bathy)#layer1: 1999-01-01 -> layer270: 2021-06-01 +mean_npp <- crop(mean_npp,bathy)#layer1: 1999-01-01 -> layer288: 2022-12-31 +ext(bathy) <- ext(temp) #because there where small differences and then we can't combine them +bathy <- resample(bathy, temp) #because after the change in extent we couldn't combine them without resampling bathy as its resolution changed +monthly_prediction <- list() +#Loop over every monthy layer (Takes +-1 min for 12 months) +for(i in 1:270){ + #Get out the right monthly layer and combine them together in a new spatraster that contains one layer for each predictor + monthly_info <- c(temp[[i]],sal[[i]],mean_npp[[i]],bathy) + #Delete the NA values + mask <- !is.na(monthly_info) + monthly_info <- terra::mask(monthly_info,mask) + #To make sure the names of the layers correpond to the names the model was trained on + names(monthly_info) <- c( + "temp", + "sal","npp","bathy") + monthly_prediction[[i]] <- predict(monthly_info,model,fun=predprob,na.rm=TRUE) +} +#To turn it into a raster layer +monthly_prediction <- rast(monthly_prediction) + +ggplot()+ + geom_spatraster(data=monthly_prediction[[240]])+ + scale_fill_viridis_c()#tidyterra package makes this possible + +#Write the monthly predictions into tif files for use in Rutendo's shiny app +for(i in 1:nlyr(monthly_prediction)){ +filepath <- file.path(datadir,"species_rasters","monthly",paste0("Month",i,".tif")) + writeRaster(monthly_prediction[[i]],filepath,overwrite=TRUE) +} + + + +testmonthlyraster <- raster(file.path(datadir,"species_rasters","monthly","Month3.tif")) +plot(testmonthlyraster) +``` +```{r} +for(i in 1:12){ + ggplot()+ + geom_spatraster(data=mean_prediction[[1]])+ + scale_fill_viridis_c(limits = c(0, 1), name = "Habitat suitability")+ + ggtitle(paste0("Month ",i)) +} + +ggplot()+ + geom_spatraster(data=mean_prediction[[12]])+ + scale_fill_viridis_c(limits = c(0, 1), name = "Habitat suitability")+ + ggtitle(paste0("Month ",i)) +``` + +```{r} +#monthly_means +monthly_seq <- rep(1:12,length.out=nlyr(monthly_prediction)) +monthly_dataset <- split(monthly_prediction,monthly_seq) +monthly_dataset +mean_prediction<-list() +for(i in 1:length(monthly_dataset)){ + mean_prediction[[i]] <- app(monthly_dataset[[i]],fun=mean)} + +for(i in 
1:length(mean_prediction)){
+  filepath <- file.path(datadir,"species_rasters","monthly_mean",paste0("Monthly_mean",i,".tif"))
+  writeRaster(mean_prediction[[i]],filepath,overwrite=TRUE)
+}
+```
+
+
+```{r}
+#Plotting an animated visual: gganimate does not yet handle raster data well (and
+#gifski could not be installed here), so we save one static plot per month and
+#stitch the frames into a GIF with magick instead.
+library(gganimate)
+
+# Create separate plots
+plotlist <- list()
+for(i in 1:12){
+  plotlist[[paste0("Plot",i)]] <- ggplot()+
+    geom_spatraster(data=mean_prediction[[i]])+
+    scale_fill_viridis_c(limits = c(0, 1), name = "Habitat suitability")+
+    ggtitle(paste0("Month ",i))
+}
+
+# Store the different frames in a folder
+for (i in seq_along(plotlist)) {
+  filename <- file.path(datadir,"plots","frames",paste0("plot", i, ".png"))
+  png(filename)
+  print(plotlist[[i]])
+  dev.off()
+}
+
+
+library(magick)
+# Read the PNG frames back in
+image <- list()
+for(i in 1:12){
+  image[[i]] <- image_read(file.path(datadir,"plots","frames",paste0("plot", i, ".png")))
+}
+
+
+# Create the GIF
+gif <- image_join(image)
+animated_gif <- image_animate(gif,fps=2)
+image_write(animated_gif,file.path(datadir,"plots","monthly_pred.gif"))
+```
+