From 3836124cab958c84380f284945b5ce04b01ac964 Mon Sep 17 00:00:00 2001
From: Emil Hvitfeldt <emilhhvitfeldt@gmail.com>
Date: Fri, 17 Jan 2025 10:01:35 -0800
Subject: [PATCH] document should_use_sparsity()

---
 R/sparsevctrs.R | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/R/sparsevctrs.R b/R/sparsevctrs.R
index 6eb9384..d75b5c1 100644
--- a/R/sparsevctrs.R
+++ b/R/sparsevctrs.R
@@ -34,8 +34,27 @@ allow_sparse <- function(x) {
   all(res$allow_sparse_x[res$engine == x$engine])
 }
 
-should_use_sparsity <- function(sparsity, model, n_rows) {
-  if (is.null(model) || model == "ranger") {
+# This function was created from the output of a mars model fit on the
+# simulation data generated in `analysis/time_analysis.R`
+# https://github.com/tidymodels/benchmark-sparsity-threshold
+#
+# The model was extracted using {tidypredict} and hand-tuned for speed.
+#
+# The model was fit on `sparsity`, `engine` and `n_rows` and the outcome was 
+# `log_fold` which is defined as 
+# `log(time to fit with dense data / time to fit with sparse data)`.
+# Values above 0 therefore reflect longer fit times for dense data,
+# in which case we want to use sparse data.
+#
+# At this time the only engines that support sparse data are glmnet, LiblineaR,
+# ranger, and xgboost, which is why they are the only ones listed here.
+# This is fine as this code will only run if `allow_sparse()` returns `TRUE`,
+# which only happens for these engines.
+# 
+# Ranger is hard-coded to always return "no" since it appears to use the same
+# algorithm for sparse and dense data, resulting in identical times.
+should_use_sparsity <- function(sparsity, engine, n_rows) {
+  if (is.null(engine) || engine == "ranger") {
     return("no")
   }
 
@@ -53,7 +72,7 @@ should_use_sparsity <- function(sparsity, model, n_rows) {
       ifelse(n_rows < 8000, 8000 - n_rows, 0) *
       -0.000798307404212627
 
-  if (model == "xgboost") {
+  if (engine == "xgboost") {
     log_fold <- log_fold +
       ifelse(sparsity < 0.984615384615385, 0.984615384615385 - sparsity, 0) *
         0.113098025073806 +
@@ -64,7 +83,7 @@ should_use_sparsity <- function(sparsity, model, n_rows) {
       0.913457808326756
   }
 
-  if (model == "LiblineaR") {
+  if (engine == "LiblineaR") {
     log_fold <- log_fold +
       ifelse(sparsity > 0.836601307189543, sparsity - 0.836601307189543, 0) *
         -5.39592564852111