From 27645f456f84f0961dbe26cbb255dabb3c615d7e Mon Sep 17 00:00:00 2001 From: Altan Orhon Date: Tue, 28 Jan 2025 15:50:31 -0800 Subject: [PATCH] Use collapse::pivot in resample.R functions --- DESCRIPTION | 3 +- R/resample.R | 232 +++++++++++++++++++++++++++++++++++++-------------- R/utils.R | 73 ++++++++++++++++ 3 files changed, 243 insertions(+), 65 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index f85ed81..5ce4f11 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -62,7 +62,8 @@ Suggests: rmarkdown, testthat (>= 3.0.0), patchwork, - progress + progress, + collapse Config/testthat/edition: 3 URL: https://github.com/tractometry/tractable BugReports: https://github.com/tractometry/tractable/issues diff --git a/R/resample.R b/R/resample.R index 152d708..8b0f01b 100644 --- a/R/resample.R +++ b/R/resample.R @@ -1,11 +1,12 @@ + #' Shuffle an AFQ dataframe #' #' @description -#' This function shuffles participants' demographic information (i.e., age, -#' group, sex), thereby destroying correlations between participants' tract +#' This function shuffles participants' demographic information (i.e., age, +#' group, sex), thereby destroying correlations between participants' tract #' profiles and phenotypic data. #' -#' @param df The input dataframe. +#' @param df The input dataframe. #' @param target The column name that encodes the metric to model. #' @param shuffle_cols Column names that should be shuffled. #' @param node_col The column name that encodes tract node positions. @@ -16,8 +17,8 @@ #' Default: "tractID" #' @param participant_col The column name that encodes participant ID. #' Default: "subjectID". -#' @param sample_uniform Boolean flag. If TRUE, shuffling should sample -#' uniformly from the unique values in the columns. If +#' @param sample_uniform Boolean flag. If TRUE, shuffling should sample +#' uniformly from the unique values in the columns. If #' FALSE, shuffling will shuffle without replacement. #' #' @return A shuffled AFQ dataframe @@ -31,14 +32,14 @@ shuffle_df <- function( df, target, shuffle_cols = NULL, - node_col = "nodeID", + node_col = "nodeID", node_group = NULL, - tract_col = "tractID", + tract_col = "tractID", participant_col = "subjectID", sample_uniform = FALSE ) { - # argument input control - stopifnot("`df` must be a class data.frame or tibble" = + # argument input control + stopifnot("`df` must be a class data.frame or tibble" = any(class(df) %in% c("data.frame", "tbl_df"))) stopifnot("`target` must be a character" = is.character(target)) if (!is.null(shuffle_cols)) { @@ -52,11 +53,18 @@ shuffle_df <- function( stopifnot("`participant_col` must be a character" = is.character(participant_col)) stopifnot("`sample_uniform` must be a logical" = is.logical(sample_uniform)) - # pivot data frame to one row per participant - df_wide <- tidyr::pivot_wider( - data = df, - names_from = tidyselect::all_of(node_col), - values_from = tidyselect::all_of(target) + # Get option whether to use package `collapse`: + use_collapse <- as.logical(getOption("tractable.use_collapse", default = FALSE)) + if(use_collapse) use_collapse <- rlang::is_installed("collapse") + # Get option to control pivot row ordering for `tidyr::pivot_wider` (mostly for testing against `collapse`): + pivot_names_vary <- match.arg( + getOption("tractable.pivot_names_vary", default = "fastest"), + c("fastest", "slowest") + ) + # Get option to control pivot row ordering for `tidyr::pivot_longer` (mostly for testing against `collapse`): + pivot_cols_vary <- match.arg( + getOption("tractable.pivot_cols_vary", default = "fastest"), + c("fastest", "slowest") ) # if not given, determine shuffle columns @@ -66,30 +74,72 @@ shuffle_df <- function( shuffle_cols <- original_colnames[!original_colnames %in% static_cols] } - # shuffle participants' shuffle_cols and the grouping variable - for (svar in unique(c(shuffle_cols, node_group))) { - x <- df_wide[[svar]] # current values to shuffle - if (sample_uniform) { - # sample uniformly from the unique values (with replacement) - df_wide[[svar]] <- sample(unique(x), length(x), replace = TRUE) + # pivot data frame to one row per participant + if (!use_collapse) { + df_wide <- tidyr::pivot_wider( + data = df, + names_from = tidyselect::all_of(node_col), + values_from = tidyselect::all_of(target) + ) + + # shuffle participants' shuffle_cols and the grouping variable + for (svar in unique(c(shuffle_cols, node_group))) { + x <- df_wide[[svar]] # current values to shuffle + if (sample_uniform) { + # sample uniformly from the unique values (with replacement) + df_wide[[svar]] <- sample(unique(x), length(x), replace = TRUE) + } else { + # sample and shuffle the existing values + df_wide[[svar]] <- sample(x, length(x)) + } + } + + # return to long format (one row per node) + df_shuffled <- tidyr::pivot_longer( + data = df_wide, + cols = tidyselect::all_of(as.character(unique(df[[node_col]]))), + names_to = node_col, + values_to = target + ) %>% + dplyr::select(tidyselect::all_of(original_colnames)) + + # format column class to match original + for (var in original_colnames) { + class(df_shuffled[[var]]) <- class(df[[var]]) + } + + # set names attribute to match orignal: + names(df_shuffled[[node_col]]) <- names(df[[node_col]]) + } else { + df_wide <- collapse::pivot(df, + how = "wider", + names = node_col, + values = target + ) + + # set function to sample depending on whether to do it uniformly or not: + f <- if (sample_uniform) { + \(x) { + sample(collapse::funique(x), length(x), replace = TRUE) + } } else { - # sample and shuffle the existing values - df_wide[[svar]] <- sample(x, length(x)) + \(x) { + sample(x, length(x)) + } } - } - # return to long format (one row per node) - df_shuffled <- tidyr::pivot_longer( - data = df_wide, - cols = tidyselect::all_of(as.character(unique(df[[node_col]]))), - names_to = node_col, - values_to = target - ) %>% - dplyr::select(tidyselect::all_of(original_colnames)) - - # format column class to match original - for (var in original_colnames) { - class(df_shuffled[[var]]) <- class(df[[var]]) + # shuffle participants' shuffle_cols and the grouping variable: + collapse::settransformv(df_wide, vars = unique(c(shuffle_cols, node_group)), FUN = f) + # return to long format (one row per node) + df_shuffled <- collapse::pivot(data = df_wide, values = as.character(collapse::funique(df[[node_col]]))) |> collapse::colorderv(original_colnames) + + # Cast `node_col` back from factor to original type if it wasn't a factor: + if (!is.factor(df[[node_col]])) { + df_shuffled[[node_col]] <- .cast_as_thing.factor(df_shuffled[[node_col]], df[[node_col]], .use_collapse = TRUE) + } + # Copy original attributes back to `node_col`: + collapse::copyMostAttrib(df_shuffled[[node_col]], df[[node_col]]) + names(df_shuffled[[node_col]]) <- names(df[[node_col]]) } return(df_shuffled) @@ -99,11 +149,11 @@ shuffle_df <- function( #' Bootstrap an AFQ dataframe #' #' @description -#' This function bootstrap samples an AFQ dataframe by participant. That is, it -#' first pivots to wide format with one row per participant, bootstrap samples, +#' This function bootstrap samples an AFQ dataframe by participant. That is, it +#' first pivots to wide format with one row per participant, bootstrap samples, #' and finally pivots back to long format. #' -#' @param df The input dataframe. +#' @param df The input dataframe. #' @param target The column name that encodes the metric to model. #' @param node_col The column name that encodes tract node positions. #' Default: "nodeID" @@ -111,7 +161,7 @@ shuffle_df <- function( #' Default: NULL. #' @param participant_col The column name that encodes participant ID. #' Default: "subjectID". -#' +#' #' @return A shuffled AFQ dataframe #' @export #' @@ -122,12 +172,12 @@ shuffle_df <- function( bootstrap_df <- function( df, target, - node_col = "nodeID", + node_col = "nodeID", node_group = "group", participant_col = "subjectID" ) { - # argument input control - stopifnot("`df` must be a class data.frame or tibble" = + # argument input control + stopifnot("`df` must be a class data.frame or tibble" = any(class(df) %in% c("data.frame", "tbl_df"))) stopifnot("`target` must be a character" = is.character(target)) stopifnot("`node_col` must be a character" = is.character(node_col)) @@ -136,32 +186,86 @@ bootstrap_df <- function( } stopifnot("`participant_col` must be a character" = is.character(participant_col)) - # pivot data frame to one row per participant - df_wide <- tidyr::pivot_wider( - data = df, - names_from = tidyselect::all_of(node_col), - values_from = tidyselect::all_of(target) - ) %>% - dplyr::slice_sample(prop = 1, replace = TRUE) + # Get option whether to use package `collapse`: + use_collapse <- as.logical(getOption("tractable.use_collapse", default = FALSE)) + if(use_collapse) use_collapse <- rlang::is_installed("collapse") + + # Get option to control pivot row ordering for `tidyr::pivot_wider` (mostly for testing against `collapse`): + pivot_names_vary <- match.arg( + getOption("tractable.pivot_names_vary", default = "fastest"), + c("fastest", "slowest") + ) + # Get option to control pivot row ordering for `tidyr::pivot_longer` (mostly for testing against `collapse`): + pivot_cols_vary <- match.arg( + getOption("tractable.pivot_cols_vary", default = "fastest"), + c("fastest", "slowest") + ) # determine columns not used for pivoting original_colnames <- colnames(df) static_cols <- original_colnames[!original_colnames %in% c(node_col, target)] - # return to long format (one row per node) - df_bootstrap <- tidyr::pivot_longer( - data = df_wide, - cols = -tidyselect::all_of(static_cols), - names_to = node_col, - values_to = target - ) %>% - dplyr::select(tidyselect::all_of(original_colnames)) - - # format column class to match original - for (var in original_colnames) { - class(df_bootstrap[[var]]) <- class(df[[var]]) - } + if (!use_collapse) { + # pivot data frame to one row per participant + df_wide <- tidyr::pivot_wider( + data = df, + names_from = tidyselect::all_of(node_col), + values_from = tidyselect::all_of(target), + names_vary = pivot_names_vary + ) %>% + dplyr::slice_sample(prop = 1, replace = TRUE) + + # return to long format (one row per node) + df_bootstrap <- tidyr::pivot_longer( + data = df_wide, + cols = -tidyselect::all_of(static_cols), + names_to = node_col, + values_to = target, + cols_vary = pivot_cols_vary + ) %>% + dplyr::select(tidyselect::all_of(original_colnames)) + # format column class to match original + for (var in original_colnames) { + class(df_bootstrap[[var]]) <- class(df[[var]]) + } + } else { + # pivot data frame to one row per participant + df_wide <- collapse::pivot(df, + how = "wider", + names = node_col, + values = target + ) + + # subset df_wide randomly: + df_wide <- collapse::ss( + df_wide, + i = if (inherits(df, c("GRP_df", "grouped_df"))) { + unlist( + lapply(collapse::GRPN(df_wide, expand = FALSE), sample.int, replace = TRUE), + recursive = FALSE, + use.names = FALSE + ) + } else { + sample.int(nrow(df_wide), replace = TRUE) + } + ) + + # return to long format (one row per node): + df_bootstrap <- collapse::pivot( + data = df_wide, + values = setdiff(colnames(df_wide), static_cols), + names = list(variable = node_col, value = target) + ) |> collapse::colorderv(original_colnames) + + # Cast `node_col` back from factor to original type if it wasn't a factor: + if (!is.factor(df[[node_col]])) { + df_bootstrap[[node_col]] <- .cast_as_thing.factor(df_bootstrap[[node_col]], df[[node_col]], .use_collapse = TRUE) + } + # Copy original attributes back to `node_col`: + collapse::copyMostAttrib(df_bootstrap[[node_col]], df[[node_col]]) + names(df_bootstrap[[node_col]]) <- names(df[[node_col]]) + } return(df_bootstrap) } @@ -221,7 +325,7 @@ bootstrap_df <- function( # n_samples, # target, # tract, -# node_col = "nodeID", +# node_col = "nodeID", # node_group = "group", # participant_col = "subjectID", # sample_uniform = FALSE, @@ -253,7 +357,7 @@ bootstrap_df <- function( # df = df, # target = target, # shuffle_cols = covariates, -# node_col = node_col, +# node_col = node_col, # node_group = node_group, # participant_col = participant_col, # sample_uniform = sample_uniform diff --git a/R/utils.R b/R/utils.R index ffb3368..4013966 100644 --- a/R/utils.R +++ b/R/utils.R @@ -6,3 +6,76 @@ if (length(func_kwargs) == 0) { return(NULL) } return(func_kwargs) } + +.collap_slice_resample <- function(df){ + if(inherits(df, c("GRP_df", "grouped_df"))) { + indices <- unlist(lapply(collapse::GRPN(df, expand=FALSE), sample.int, replace = TRUE), recursive = FALSE, use.names = FALSE) + } else { + indices <- sample.int(nrow(df), replace = TRUE) + } + collapse::ss(df, i = indices) +} + + +.cast_as_thing.factor <- function(from, to=NULL, toClass=NULL, conv_fun_namer = function(cls) Filter(\(x)nchar(x)>3,paste0(c("as.", "as_"),Filter(\(x)nchar(x)>0,c(cls[[1]],paste0(toupper(substr(cls[[1]],1,1)),substring(cls[[1]],2)))))), ..., .use_collapse=FALSE) { + stopifnot(inherits(from, "factor")) + if (is.null(from)) { + return(from) + } + if (rlang::obj_address(from) == rlang::obj_address(to)) { + return(from) + } + if (!is.null(to)) { + if (all(class(from) == class(to))) { + return(from) + } + } + res <- NULL + cls <- if (is.null(to)) toClass else class(to) + if (inherits(from, cls[[1]])) { + return(from) + } + if (!is.null(res <- tryCatch(methods::as(from, cls[[1]]), error = \(e)NULL))) { + return(res) + } + pkg_ns <- tryCatch(asNamespace(attr(methods::getClass(cls), "package", exact = TRUE)), error = \(...)NULL) + conv_fn_names <- conv_fun_namer(cls[[1]]) + for (conv_fn_name in conv_fn_names) { + conv_fn <- get0(conv_fn_name) + if (is.null(conv_fn) && !is.null(pkg_ns)) conv_fn <- get0(conv_fn_name, envir = pkg_ns) + if (!is.null(conv_fn)) break + } + + if (is.null(conv_fn)) { + for (conv_fn_name in conv_fn_names) { + found <- utils::getAnywhere(conv_fn_name) + if (length(found$objs) > 0) { + conv_fn <- conv_fns$objs[[1]] + break + } + } + } + if (!is.null(conv_fn)) res <- conv_fn(from) + + if (!is.null(to)) { + if (!.use_collapse) { + unique_vals <- unique(to) + names(unique_vals) <- as.character(unique_vals) + stopifnot(!anyNA(names(unique_vals))) + res <- unique_vals[match(as.character(from), table = names(unique_vals))] + } else { + unique_vals <- collapse::funique(to) + names(unique_vals) <- as.character(unique_vals) + stopifnot(length(collapse::whichNA(names(unique_vals))) == 0) + res <- unique_vals[collapse::fmatch(collapse::as_character_factor(from), table = names(unique_vals))] + } + } + stopifnot(!is.null(res)) + dn <- attr(res, "dimnames", exact = TRUE) + d <- attr(res, "dim", exact = TRUE) + attributes(res) <- attributes(to) + attr(res, "dimnames") <- dn + attr(res, "dimnames") <- d + res +} +