Modify implementation of summarise_pr in pf_simplify() to account for…

… the number of samples of each cell.
edwardlavender · Sep 13, 2021 · 1fdc415 · 1fdc415
1 parent 6db7b7d
commit 1fdc415
Show file tree

Hide file tree

Showing 7 changed files with 20 additions and 18 deletions.
diff --git a/R/pf_analyse_either.R b/R/pf_analyse_either.R
@@ -34,7 +34,7 @@
 #' # ... using pf_simplify() with return = "archive"
 #' dat_dcpf_histories_connected <-
 #'   pf_simplify(dat_dcpf_histories,
-#'              summarise_pr = max,
+#'              summarise_pr = TRUE,
 #'              return = "archive")
 #' ## Path based implementation
 #' # The example data 'dat_dcpf_paths' contains a sample of paths reconstructed
@@ -87,11 +87,11 @@ pf_plot_map <- function(xpf,
       if(any(duplicated(elm$id_current))) {
         if(xpf$method != "pf_simplify"){
           warning(paste0("xpf$history[[", t, "]] contains duplicate cells. ",
-                         "Implementing pf_simplify() with 'summarise_pr' and return = 'archive' specified first is advised."),
+                         "Implementing pf_simplify() with 'summarise_pr = TRUE' and return = 'archive' specified first is advised."),
                   immediate. = TRUE, call. = FALSE)
         } else {
           warning(paste0("xpf$history[[", t, "]] contains duplicate cells. ",
-                         "Did you implement pf_simplify() without specifying 'summarise_pr'?"),
+                         "Did you implement pf_simplify() without 'summarise_pr = TRUE'?"),
                   immediate. = TRUE, call. = FALSE)
         }
       }
@@ -187,7 +187,7 @@ pf_plot_map <- function(xpf,
 #' #### Example (1): Implement function using default options
 #' ## Implementation based on particles
 #' pp <- par(mfrow = c(1, 2))
-#' pf_kud(pf_simplify(dat_dcpf_histories, summarise_pr = max, return = "archive"),
+#' pf_kud(pf_simplify(dat_dcpf_histories, summarise_pr = TRUE, return = "archive"),
 #'        bathy = bathy, sample_size = 100,
 #'        estimate_ud = kud_around_coastline, grid = grid)
 #' ## Implementation based on paths

diff --git a/R/pf_classes.R b/R/pf_classes.R
@@ -22,7 +22,7 @@
 #'   }
 #' }
 #'
-#' \item{method}{A character that defines whether or not \code{history} was derived directly from \code{\link[flapper]{pf}} (\code{method = "pf"}), in which case \code{history} contains all of the particles sampled at each time step, or via \code{\link[flapper]{pf}} plus \code{\link[flapper]{pf_simplify}} with \code{return = "archive"} (\code{method = "pf_simplify"}), in which case \code{history} contains the subset of particles at each time step that were re-sampled at the next time step; for particles that were sampled multiple times on a given time step, this only contains the most probable sample (see \code{\link[flapper]{pf_simplify}}).}
+#' \item{method}{A character that defines whether or not \code{history} was derived directly from \code{\link[flapper]{pf}} (\code{method = "pf"}), in which case \code{history} contains all of the particles sampled at each time step, or via \code{\link[flapper]{pf}} plus \code{\link[flapper]{pf_simplify}} with \code{return = "archive"} (\code{method = "pf_simplify"}), in which case \code{history} contains the subset of particles at each time step that were re-sampled at the next time step; for implementations with \code{summarise_pr = TRUE}, for particles that were sampled multiple times on a given time step, this contains only one sample, with an adjusted probability score that accounts for the number of times that the location was sampled (see \code{\link[flapper]{pf_simplify}}).}
 #'
 #' \item{args}{A named list that records the function arguments passed to \code{\link[flapper]{pf}}. This is as inputted to \code{\link[flapper]{pf}}, but with the `calc_distance_graph' elements added if unsupplied and applicable.}
 #'

diff --git a/R/pf_simplify.R b/R/pf_simplify.R
@@ -10,7 +10,7 @@
 #' @param calc_distance_graph (optional) If \code{calc_distance = "lcp"}, \code{calc_distance_graph} is a graph object that defines the distances between connected cells in \code{bathy}. If unsupplied, this is taken from \code{archive$args$calc_distance_graph}, if available, or computed via \code{\link[flapper]{lcp_graph_surface}}.
 #' @param cl,varlist,use_all_cores Parallelisation options for the first stage of the algorithm, which identifies connected cell pairs, associated distances and movement probabilities. The first parallelisation option is to parallelise the algorithm over time steps via \code{cl}. This is a cluster object created by \code{\link[parallel]{makeCluster}} or an integer defining the number of child processes (ignored on Windows) (see \code{\link[pbapply]{pblapply}}). If \code{cl} is supplied, \code{varlist} may be required. This is a character vector of object names to export (see \code{\link[parallel]{clusterExport}}). Exported objects must be located in the global environment. The second parallelisation option is to parallelise shortest distance calculations within time steps via a logical input (\code{TRUE}) to \code{use_all_cores} that is passed to \code{\link[cppRouting]{get_distance_matrix}}. This option is only implemented for \code{calc_distance = "lcp"}.
 #' @param return A character (\code{return = "path"} or \code{return = "archive"}) that defines the type of object that is returned (see Details).
-#' @param summarise_pr (optional) For \code{return = "archive"}, \code{summarise_pf} is a function that summarises the probabilities of duplicate cell records for each time step (e.g., \code{\link[base]{mean}} or \code{\link[base]{max}}). If supplied, only one record of each sampled cell is returned per time step, with the associated probability calculated from \code{summarise_pf}. This option is useful for deriving maps of the `probability of use' across an area based on particle histories because it ensures that `probability of use' scores depend on the number of time steps during which an individual could have occupied a location, rather than the total number of samples of that location (see \code{\link[flapper]{pf_plot_map}}).
+#' @param summarise_pr (optional) For \code{return = "archive"}, \code{summarise_pr} is a logical input that defines whether or not to summarise the probabilities of duplicate cell records for each time step. If \code{TRUE}, only one record of each sampled cell is returned per time step, with the associated probability calculated as the sum of the normalised probabilities of all samples for that cell. This option is useful for deriving maps of the `probability of use' across an area based on particle histories because it ensures that `probability of use' scores depend on the number of time steps during which an individual could have occupied a location, rather than the total number of samples of that location (see \code{\link[flapper]{pf_plot_map}}).
 #' @param max_n_copies (optional) For \code{return = "path"}, \code{max_n_copies} is an integer that specifies the maximum number of copies of a sampled cell that are retained at each time stamp. Each copy represents a different route to that cell. By default, all copies (i.e. routes to that cell are retained) via \code{max_n_copies = NULL}. However, in cases where there are a large number of paths through a landscape, the function can run into vector memory limitations during path assembly, so \code{max_n_copies} may need to be set. In this case, at each time step, if there are more than \code{max_n_copies} paths to a given cell, then a subset of these (\code{max_n_copies}) are sampled, according to the \code{sample_method} argument.
 #' @param sample_method (optional) For \code{return = "path"}, if \code{max_n_copies} is supplied, \code{sample_method} is a character that defines the sampling method. Currently supported options are: \code{"random"}, which implements random sampling; \code{"weighted"}, which implements weighted sampling, with random samples taken according to their probability at the current time step; and \code{"max"}, which selects for the top \code{max_n_copies} most likely copies of a given cell according to the probability associated with movement into that cell from the previous location.
 #' @param max_n_paths (optional) For \code{return = "path"}, \code{max_n_paths} is an integer that specifies the maximum number of paths to be reconstructed. During path assembly, following the implementation of \code{max_n_copies} (if provided), \code{max_n_paths} are selected at random at each time step. This option is provided to improve the speed of path assembly in situations with large numbers of paths.
@@ -19,7 +19,7 @@
 #'
 #' @details The implementation of this function depends on how \code{\link[flapper]{pf}} has been implemented and the \code{return} argument. Under the default options in \code{\link[flapper]{pf}}, the fast Euclidean distances method is used to sample sequential particle positions, in which case the history of each particle through the landscape is not retained and has to be assembled afterwards. In this case, \code{\link[flapper]{pf_simplify}} calculates the distances between all combinations of cells at each time step, using either a Euclidean distances or shortest distances algorithm according to the input to \code{calc_distance}. Distances are converted to probabilities using the `intrinsic' probabilities associated with each location and the movement models retained in \code{archive} from the call to \code{\link[flapper]{pf}} to identify possible movement paths between cells at each time step. If the fast Euclidean distances method has not been used, then pairwise cell movements are retained by \code{\link[flapper]{pf}}. In this case, the function simply recalculates distances between sequential cell pairs and the associated cell probabilities, which are then processed according to the \code{return} argument.
 #'
-#' Following the identification of pairwise cell movements, if \code{return = "archive"}, the function selects all of the unique cells at each time step that were connected to cells at the next time step. (For cells that were selected multiple times at a given time step, due to sampling with replacement in \code{\link[flapper]{pf}}, if \code{summarise_pr} is supplied, only one sample (e.g., the probable sample) is retained: in maps of the `probability of use' across an area (see \code{\link[flapper]{pf_plot_map}}), this ensures that cell scores depend on the number of time steps when the individual could have occupied a given cell, rather than the total number of samples of a location.) Otherwise, if \code{return = "path"}, pairwise cell movements are assembled into complete movement paths.
+#' Following the identification of pairwise cell movements, if \code{return = "archive"}, the function selects all of the unique cells at each time step that were connected to cells at the next time step. (For cells that were selected multiple times at a given time step, due to sampling with replacement in \code{\link[flapper]{pf}}, if \code{summarise_pr = TRUE}, only one sample is retained: in maps of the `probability of use' across an area (see \code{\link[flapper]{pf_plot_map}}), this ensures that cell scores depend on the number of time steps when the individual could have occupied a given cell, rather than the total number of samples of a location.) Otherwise, if \code{return = "path"}, pairwise cell movements are assembled into complete movement paths.
 #'
 #' @return If \code{return = "archive"}, the function returns a \code{\link[flapper]{pf_archive-class}} object, as inputted, but in which only the most likely record of each cell that was connected to cells at the next time step is retained and with the \code{method = "pf_simplify"} flag. If \code{return = "path"}, the function returns a \code{\link[flapper]{pf_path-class}} object, which is a dataframe that defines the movement paths.
 #'
@@ -159,7 +159,7 @@
 #' # Now, there is only one (the most likely) record of sampled cells
 #' # ... at each time step.
 #' out_dcpf_6c <- pf_simplify(out_dcpf_6b,
-#'                            summarise_pr = max,
+#'                            summarise_pr = TRUE,
 #'                            return = "archive")
 #' head(out_dcpf_6c$history[[1]])
 #' table(duplicated(out_dcpf_6c$history[[1]]$id_current))
@@ -213,7 +213,7 @@ pf_simplify <- function(archive,
                         calc_distance_graph = NULL,
                         cl = NULL, varlist = NULL, use_all_cores = FALSE,
                         return = c("path", "archive"),
-                        summarise_pr = NULL,
+                        summarise_pr = FALSE,
                         max_n_copies = NULL,
                         sample_method = c("random", "weighted", "max"),
                         max_n_paths = NULL,
@@ -496,10 +496,12 @@ pf_simplify <- function(archive,
         history_for_t %>%
         dplyr::group_by(.data$id_current) %>%
         dplyr::arrange(.data$id_current, dplyr::desc(.data$pr_current))
-      if(!is.null(summarise_pr)){
+      if(summarise_pr){
+        denom <- sum(history_for_t$pr_current)
         history_for_t <-
           history_for_t %>%
-          dplyr::mutate(pr_current = summarise_pr(.data$pr_current)) %>%
+          dplyr::mutate(pr_current = .data$pr_current/denom) %>%
+          dplyr::mutate(pr_current = sum(.data$pr_current)) %>%
           dplyr::slice(1L)
       }
       history_for_t <-

diff --git a/man/pf_archive-class.Rd b/man/pf_archive-class.Rd
diff --git a/man/pf_kud.Rd b/man/pf_kud.Rd
diff --git a/man/pf_plot_map.Rd b/man/pf_plot_map.Rd
-Original file line number
+Diff line change
@@ Expand Up / @@ -22,7 +22,7 @@ @@
     #'   }
     #' }
     #'
-    #' \item{method}{A character that defines whether or not \code{history} was derived directly from \code{\link[flapper]{pf}} (\code{method = "pf"}), in which case \code{history} contains all of the particles sampled at each time step, or via \code{\link[flapper]{pf}} plus \code{\link[flapper]{pf_simplify}} with \code{return = "archive"} (\code{method = "pf_simplify"}), in which case \code{history} contains the subset of particles at each time step that were re-sampled at the next time step; for particles that were sampled multiple times on a given time step, this only contains the most probable sample (see \code{\link[flapper]{pf_simplify}}).}
+    #' \item{method}{A character that defines whether or not \code{history} was derived directly from \code{\link[flapper]{pf}} (\code{method = "pf"}), in which case \code{history} contains all of the particles sampled at each time step, or via \code{\link[flapper]{pf}} plus \code{\link[flapper]{pf_simplify}} with \code{return = "archive"} (\code{method = "pf_simplify"}), in which case \code{history} contains the subset of particles at each time step that were re-sampled at the next time step; for implementations with \code{summarise_pr = TRUE}, for particles that were sampled multiple times on a given time step, this contains only one sample, with an adjusted probability score that accounts for the number of times that the location was sampled (see \code{\link[flapper]{pf_simplify}}).}
     #'
     #' \item{args}{A named list that records the function arguments passed to \code{\link[flapper]{pf}}. This is as inputted to \code{\link[flapper]{pf}}, but with the `calc_distance_graph' elements added if unsupplied and applicable.}
     #'
@@ Expand Down @@