From 27645f456f84f0961dbe26cbb255dabb3c615d7e Mon Sep 17 00:00:00 2001
From: Altan Orhon <altan@ontocollage.net>
Date: Tue, 28 Jan 2025 15:50:31 -0800
Subject: [PATCH] Use collapse::pivot in resample.R functions

---
 DESCRIPTION  |   3 +-
 R/resample.R | 232 +++++++++++++++++++++++++++++++++++++--------------
 R/utils.R    |  73 ++++++++++++++++
 3 files changed, 243 insertions(+), 65 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index f85ed81..5ce4f11 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -62,7 +62,8 @@ Suggests:
     rmarkdown,
     testthat (>= 3.0.0),
     patchwork,
-    progress
+    progress,
+    collapse
 Config/testthat/edition: 3
 URL: https://github.com/tractometry/tractable
 BugReports: https://github.com/tractometry/tractable/issues
diff --git a/R/resample.R b/R/resample.R
index 152d708..8b0f01b 100644
--- a/R/resample.R
+++ b/R/resample.R
@@ -1,11 +1,12 @@
+
 #' Shuffle an AFQ dataframe
 #'
 #' @description
-#' This function shuffles participants' demographic information (i.e., age, 
-#' group, sex), thereby destroying correlations between participants' tract 
+#' This function shuffles participants' demographic information (i.e., age,
+#' group, sex), thereby destroying correlations between participants' tract
 #' profiles and phenotypic data.
 #'
-#' @param df              The input dataframe. 
+#' @param df              The input dataframe.
 #' @param target          The column name that encodes the metric to model.
 #' @param shuffle_cols    Column names that should be shuffled.
 #' @param node_col        The column name that encodes tract node positions.
@@ -16,8 +17,8 @@
 #'                        Default: "tractID"
 #' @param participant_col The column name that encodes participant ID.
 #'                        Default: "subjectID".
-#' @param sample_uniform  Boolean flag. If TRUE, shuffling should sample 
-#'                        uniformly from the unique values in the columns. If 
+#' @param sample_uniform  Boolean flag. If TRUE, shuffling should sample
+#'                        uniformly from the unique values in the columns. If
 #'                        FALSE, shuffling will shuffle without replacement.
 #'
 #' @return A shuffled AFQ dataframe
@@ -31,14 +32,14 @@ shuffle_df <- function(
   df,
   target,
   shuffle_cols    = NULL,
-  node_col        = "nodeID", 
+  node_col        = "nodeID",
   node_group      = NULL,
-  tract_col       = "tractID", 
+  tract_col       = "tractID",
   participant_col = "subjectID",
   sample_uniform  = FALSE
 ) {
-  # argument input control 
-  stopifnot("`df` must be a class data.frame or tibble" = 
+  # argument input control
+  stopifnot("`df` must be a class data.frame or tibble" =
     any(class(df) %in% c("data.frame", "tbl_df")))
   stopifnot("`target` must be a character" = is.character(target))
   if (!is.null(shuffle_cols)) {
@@ -52,11 +53,18 @@ shuffle_df <- function(
   stopifnot("`participant_col` must be a character" = is.character(participant_col))
   stopifnot("`sample_uniform` must be a logical" = is.logical(sample_uniform))
 
-  # pivot data frame to one row per participant
-  df_wide <- tidyr::pivot_wider(
-    data        = df,
-    names_from  = tidyselect::all_of(node_col),
-    values_from = tidyselect::all_of(target)
+  # Get option whether to use package `collapse`:
+  use_collapse <- as.logical(getOption("tractable.use_collapse", default = FALSE))
+  if(use_collapse) use_collapse <- rlang::is_installed("collapse")
+  # Get option to control pivot row ordering for `tidyr::pivot_wider` (mostly for testing against `collapse`):
+  pivot_names_vary <- match.arg(
+    getOption("tractable.pivot_names_vary", default = "fastest"),
+    c("fastest", "slowest")
+  )
+  # Get option to control pivot row ordering for `tidyr::pivot_longer` (mostly for testing against `collapse`):
+  pivot_cols_vary <- match.arg(
+    getOption("tractable.pivot_cols_vary", default = "fastest"),
+    c("fastest", "slowest")
   )
 
   # if not given, determine shuffle columns
@@ -66,30 +74,72 @@ shuffle_df <- function(
     shuffle_cols <- original_colnames[!original_colnames %in% static_cols]
   }
 
-  # shuffle participants' shuffle_cols and the grouping variable
-  for (svar in unique(c(shuffle_cols, node_group))) {
-    x <- df_wide[[svar]] # current values to shuffle
-    if (sample_uniform) {
-      # sample uniformly from the unique values (with replacement)
-      df_wide[[svar]] <- sample(unique(x), length(x), replace = TRUE)
+  # pivot data frame to one row per participant
+  if (!use_collapse) {
+    df_wide <- tidyr::pivot_wider(
+      data        = df,
+      names_from  = tidyselect::all_of(node_col),
+      values_from = tidyselect::all_of(target)
+    )
+
+    # shuffle participants' shuffle_cols and the grouping variable
+    for (svar in unique(c(shuffle_cols, node_group))) {
+      x <- df_wide[[svar]] # current values to shuffle
+      if (sample_uniform) {
+        # sample uniformly from the unique values (with replacement)
+        df_wide[[svar]] <- sample(unique(x), length(x), replace = TRUE)
+      } else {
+        # sample and shuffle the existing values
+        df_wide[[svar]] <- sample(x, length(x))
+      }
+    }
+
+    # return to long format (one row per node)
+    df_shuffled <- tidyr::pivot_longer(
+      data      = df_wide,
+      cols      = tidyselect::all_of(as.character(unique(df[[node_col]]))),
+      names_to  = node_col,
+      values_to = target
+    ) %>%
+      dplyr::select(tidyselect::all_of(original_colnames))
+
+    # format column class to match original
+    for (var in original_colnames) {
+      class(df_shuffled[[var]]) <- class(df[[var]])
+    }
+
+    # set names attribute to match orignal:
+    names(df_shuffled[[node_col]]) <- names(df[[node_col]])
+  } else {
+    df_wide <- collapse::pivot(df,
+      how = "wider",
+      names = node_col,
+      values = target
+    )
+
+    # set function to sample depending on whether to do it uniformly or not:
+    f <- if (sample_uniform) {
+      \(x) {
+        sample(collapse::funique(x), length(x), replace = TRUE)
+      }
     } else {
-      # sample and shuffle the existing values
-      df_wide[[svar]] <- sample(x, length(x))
+      \(x) {
+        sample(x, length(x))
+      }
     }
-  }
 
-  # return to long format (one row per node)
-  df_shuffled <- tidyr::pivot_longer(
-    data      = df_wide,
-    cols      = tidyselect::all_of(as.character(unique(df[[node_col]]))),
-    names_to  = node_col, 
-    values_to = target
-  ) %>% 
-    dplyr::select(tidyselect::all_of(original_colnames)) 
-
-  # format column class to match original
-  for (var in original_colnames) {
-    class(df_shuffled[[var]]) <- class(df[[var]])
+    # shuffle participants' shuffle_cols and the grouping variable:
+    collapse::settransformv(df_wide, vars = unique(c(shuffle_cols, node_group)), FUN = f)
+    # return to long format (one row per node)
+    df_shuffled <- collapse::pivot(data = df_wide, values = as.character(collapse::funique(df[[node_col]]))) |> collapse::colorderv(original_colnames)
+
+    # Cast `node_col` back from factor to original type if it wasn't a factor:
+    if (!is.factor(df[[node_col]])) {
+      df_shuffled[[node_col]] <- .cast_as_thing.factor(df_shuffled[[node_col]], df[[node_col]], .use_collapse = TRUE)
+    }
+    # Copy original attributes back to `node_col`:
+    collapse::copyMostAttrib(df_shuffled[[node_col]], df[[node_col]])
+    names(df_shuffled[[node_col]]) <- names(df[[node_col]])
   }
 
   return(df_shuffled)
@@ -99,11 +149,11 @@ shuffle_df <- function(
 #' Bootstrap an AFQ dataframe
 #'
 #' @description
-#' This function bootstrap samples an AFQ dataframe by participant. That is, it 
-#' first pivots to wide format with one row per participant, bootstrap samples, 
+#' This function bootstrap samples an AFQ dataframe by participant. That is, it
+#' first pivots to wide format with one row per participant, bootstrap samples,
 #' and finally pivots back to long format.
 #'
-#' @param df              The input dataframe. 
+#' @param df              The input dataframe.
 #' @param target          The column name that encodes the metric to model.
 #' @param node_col        The column name that encodes tract node positions.
 #'                        Default: "nodeID"
@@ -111,7 +161,7 @@ shuffle_df <- function(
 #'                        Default: NULL.
 #' @param participant_col The column name that encodes participant ID.
 #'                        Default: "subjectID".
-#' 
+#'
 #' @return A shuffled AFQ dataframe
 #' @export
 #'
@@ -122,12 +172,12 @@ shuffle_df <- function(
 bootstrap_df <- function(
   df,
   target,
-  node_col        = "nodeID", 
+  node_col        = "nodeID",
   node_group      = "group",
   participant_col = "subjectID"
 ) {
-  # argument input control 
-  stopifnot("`df` must be a class data.frame or tibble" = 
+  # argument input control
+  stopifnot("`df` must be a class data.frame or tibble" =
     any(class(df) %in% c("data.frame", "tbl_df")))
   stopifnot("`target` must be a character" = is.character(target))
   stopifnot("`node_col` must be a character" = is.character(node_col))
@@ -136,32 +186,86 @@ bootstrap_df <- function(
   }
   stopifnot("`participant_col` must be a character" = is.character(participant_col))
 
-  # pivot data frame to one row per participant
-  df_wide <- tidyr::pivot_wider(
-    data        = df,
-    names_from  = tidyselect::all_of(node_col), 
-    values_from = tidyselect::all_of(target)
-  ) %>% 
-    dplyr::slice_sample(prop = 1, replace = TRUE)
+  # Get option whether to use package `collapse`:
+  use_collapse <- as.logical(getOption("tractable.use_collapse", default = FALSE))
+  if(use_collapse) use_collapse <- rlang::is_installed("collapse")
+
+  # Get option to control pivot row ordering for `tidyr::pivot_wider` (mostly for testing against `collapse`):
+  pivot_names_vary <- match.arg(
+    getOption("tractable.pivot_names_vary", default = "fastest"),
+    c("fastest", "slowest")
+  )
+  # Get option to control pivot row ordering for `tidyr::pivot_longer` (mostly for testing against `collapse`):
+  pivot_cols_vary <- match.arg(
+    getOption("tractable.pivot_cols_vary", default = "fastest"),
+    c("fastest", "slowest")
+  )
 
   # determine columns not used for pivoting
   original_colnames <- colnames(df)
   static_cols <- original_colnames[!original_colnames %in% c(node_col, target)]
 
-  # return to long format (one row per node)
-  df_bootstrap <- tidyr::pivot_longer(
-    data      = df_wide,
-    cols      = -tidyselect::all_of(static_cols),
-    names_to  = node_col,
-    values_to = target
-  ) %>% 
-    dplyr::select(tidyselect::all_of(original_colnames)) 
-
-  # format column class to match original
-  for (var in original_colnames) {
-    class(df_bootstrap[[var]]) <- class(df[[var]])
-  }
+  if (!use_collapse) {
+    # pivot data frame to one row per participant
+    df_wide <- tidyr::pivot_wider(
+      data = df,
+      names_from = tidyselect::all_of(node_col),
+      values_from = tidyselect::all_of(target),
+      names_vary = pivot_names_vary
+    ) %>%
+      dplyr::slice_sample(prop = 1, replace = TRUE)
+
+    # return to long format (one row per node)
+    df_bootstrap <- tidyr::pivot_longer(
+      data      = df_wide,
+      cols      = -tidyselect::all_of(static_cols),
+      names_to  = node_col,
+      values_to = target,
+      cols_vary =  pivot_cols_vary
+    ) %>%
+      dplyr::select(tidyselect::all_of(original_colnames))
 
+    # format column class to match original
+    for (var in original_colnames) {
+      class(df_bootstrap[[var]]) <- class(df[[var]])
+    }
+  } else {
+    # pivot data frame to one row per participant
+    df_wide <- collapse::pivot(df,
+      how = "wider",
+      names = node_col,
+      values = target
+    )
+
+    # subset df_wide randomly:
+    df_wide <- collapse::ss(
+      df_wide,
+      i = if (inherits(df, c("GRP_df", "grouped_df"))) {
+        unlist(
+          lapply(collapse::GRPN(df_wide, expand = FALSE), sample.int, replace = TRUE),
+          recursive = FALSE,
+          use.names = FALSE
+        )
+      } else {
+        sample.int(nrow(df_wide), replace = TRUE)
+      }
+    )
+
+    # return to long format (one row per node):
+    df_bootstrap <- collapse::pivot(
+      data = df_wide,
+      values = setdiff(colnames(df_wide), static_cols),
+      names = list(variable = node_col, value = target)
+    ) |> collapse::colorderv(original_colnames)
+
+    # Cast `node_col` back from factor to original type if it wasn't a factor:
+    if (!is.factor(df[[node_col]])) {
+      df_bootstrap[[node_col]] <- .cast_as_thing.factor(df_bootstrap[[node_col]], df[[node_col]], .use_collapse = TRUE)
+    }
+    # Copy original attributes back to `node_col`:
+    collapse::copyMostAttrib(df_bootstrap[[node_col]], df[[node_col]])
+    names(df_bootstrap[[node_col]]) <- names(df[[node_col]])
+  }
   return(df_bootstrap)
 }
 
@@ -221,7 +325,7 @@ bootstrap_df <- function(
 #   n_samples,
 #   target,
 #   tract,
-#   node_col = "nodeID", 
+#   node_col = "nodeID",
 #   node_group = "group",
 #   participant_col = "subjectID",
 #   sample_uniform = FALSE,
@@ -253,7 +357,7 @@ bootstrap_df <- function(
 #         df              = df,
 #         target          = target,
 #         shuffle_cols    = covariates,
-#         node_col        = node_col, 
+#         node_col        = node_col,
 #         node_group      = node_group,
 #         participant_col = participant_col,
 #         sample_uniform  = sample_uniform
diff --git a/R/utils.R b/R/utils.R
index ffb3368..4013966 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -6,3 +6,76 @@
   if (length(func_kwargs) == 0) { return(NULL) }
   return(func_kwargs)
 }
+
+.collap_slice_resample <- function(df){
+  if(inherits(df, c("GRP_df", "grouped_df"))) {
+    indices <- unlist(lapply(collapse::GRPN(df, expand=FALSE), sample.int, replace = TRUE), recursive = FALSE, use.names = FALSE)
+  } else {
+    indices <- sample.int(nrow(df), replace = TRUE)
+  }
+  collapse::ss(df, i = indices)
+}
+
+
+.cast_as_thing.factor <- function(from, to=NULL, toClass=NULL, conv_fun_namer = function(cls) Filter(\(x)nchar(x)>3,paste0(c("as.", "as_"),Filter(\(x)nchar(x)>0,c(cls[[1]],paste0(toupper(substr(cls[[1]],1,1)),substring(cls[[1]],2)))))), ..., .use_collapse=FALSE) {
+  stopifnot(inherits(from, "factor"))
+  if (is.null(from)) {
+    return(from)
+  }
+  if (rlang::obj_address(from) == rlang::obj_address(to)) {
+    return(from)
+  }
+  if (!is.null(to)) {
+    if (all(class(from) == class(to))) {
+      return(from)
+    }
+  }
+  res <- NULL
+  cls <- if (is.null(to)) toClass else class(to)
+  if (inherits(from, cls[[1]])) {
+    return(from)
+  }
+  if (!is.null(res <- tryCatch(methods::as(from, cls[[1]]), error = \(e)NULL))) {
+    return(res)
+  }
+  pkg_ns <- tryCatch(asNamespace(attr(methods::getClass(cls), "package", exact = TRUE)), error = \(...)NULL)
+  conv_fn_names <- conv_fun_namer(cls[[1]])
+  for (conv_fn_name in conv_fn_names) {
+    conv_fn <- get0(conv_fn_name)
+    if (is.null(conv_fn) && !is.null(pkg_ns)) conv_fn <- get0(conv_fn_name, envir = pkg_ns)
+    if (!is.null(conv_fn)) break
+  }
+
+  if (is.null(conv_fn)) {
+    for (conv_fn_name in conv_fn_names) {
+      found <- utils::getAnywhere(conv_fn_name)
+      if (length(found$objs) > 0) {
+        conv_fn <- conv_fns$objs[[1]]
+        break
+      }
+    }
+  }
+  if (!is.null(conv_fn)) res <- conv_fn(from)
+
+  if (!is.null(to)) {
+    if (!.use_collapse) {
+      unique_vals <- unique(to)
+      names(unique_vals) <- as.character(unique_vals)
+      stopifnot(!anyNA(names(unique_vals)))
+      res <- unique_vals[match(as.character(from), table = names(unique_vals))]
+    } else {
+      unique_vals <- collapse::funique(to)
+      names(unique_vals) <- as.character(unique_vals)
+      stopifnot(length(collapse::whichNA(names(unique_vals))) == 0)
+      res <- unique_vals[collapse::fmatch(collapse::as_character_factor(from), table = names(unique_vals))]
+    }
+  }
+  stopifnot(!is.null(res))
+  dn <- attr(res, "dimnames", exact = TRUE)
+  d <- attr(res, "dim", exact = TRUE)
+  attributes(res) <- attributes(to)
+  attr(res, "dimnames") <- dn
+  attr(res, "dimnames") <- d
+  res
+}
+