From 3836124cab958c84380f284945b5ce04b01ac964 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Fri, 17 Jan 2025 10:01:35 -0800 Subject: [PATCH] document should_use_sparsity() --- R/sparsevctrs.R | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/R/sparsevctrs.R b/R/sparsevctrs.R index 6eb9384..d75b5c1 100644 --- a/R/sparsevctrs.R +++ b/R/sparsevctrs.R @@ -34,8 +34,27 @@ allow_sparse <- function(x) { all(res$allow_sparse_x[res$engine == x$engine]) } -should_use_sparsity <- function(sparsity, model, n_rows) { - if (is.null(model) || model == "ranger") { +# This function was created from the output of a mars model fit on the +# simulation data generated in `analysis/time_analysis.R` +# https://github.com/tidymodels/benchmark-sparsity-threshold +# +# The model was extracted using {tidypredict} and hand-tuned for speed. +# +# The model was fit on `sparsity`, `engine` and `n_rows` and the outcome was +# `log_fold` which is defined as +# `log(time to fit with dense data / time to fit with sparse data)`. +# Meaning that values above 0 reflect longer fit times for dense data, +# hence we want to use sparse data. +# +# At this time the only engines that support sparse data are glmnet, LiblineaR, +# ranger, and xgboost. Which is why they are the only ones listed here. +# This is fine as this code will only run if `allow_sparse()` returns `TRUE`, +# which only happens for these engines. +# +# Ranger is hard-coded to always return "no" since it appears to use the same +# algorithm for sparse and dense data, resulting in identical times. 
+should_use_sparsity <- function(sparsity, engine, n_rows) { + if (is.null(engine) || engine == "ranger") { return("no") } @@ -53,7 +72,7 @@ should_use_sparsity <- function(sparsity, model, n_rows) { ifelse(n_rows < 8000, 8000 - n_rows, 0) * -0.000798307404212627 - if (model == "xgboost") { + if (engine == "xgboost") { log_fold <- log_fold + ifelse(sparsity < 0.984615384615385, 0.984615384615385 - sparsity, 0) * 0.113098025073806 + @@ -64,7 +83,7 @@ should_use_sparsity <- function(sparsity, model, n_rows) { 0.913457808326756 } - if (model == "LiblineaR") { + if (engine == "LiblineaR") { log_fold <- log_fold + ifelse(sparsity > 0.836601307189543, sparsity - 0.836601307189543, 0) * -5.39592564852111