diff --git a/.gitignore b/.gitignore
index 0f6767c2a..dec5256d0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,7 +14,8 @@ inst/doc
 /doc/
 /Meta/
 CRAN-SUBMISSION
+benchmarks/data
 paper/data
 .idea/
 .vsc/
-paper/data
\ No newline at end of file
+paper/data
diff --git a/benchmarks/rf_use_case/Rplots.pdf b/benchmarks/rf_use_case/Rplots.pdf
new file mode 100644
index 000000000..243ae234c
Binary files /dev/null and b/benchmarks/rf_use_case/Rplots.pdf differ
diff --git a/benchmarks/rf_use_case/get_data.R b/benchmarks/rf_use_case/get_data.R
new file mode 100644
index 000000000..5d39c67c1
--- /dev/null
+++ b/benchmarks/rf_use_case/get_data.R
@@ -0,0 +1,30 @@
+library(here)
+library(mlr3oml)
+library(tidytable)
+
+# OpenML collection 99 (presumably the OpenML-CC18 suite, as the variable name suggests)
+cc18_collection = ocl(99)
+
+# restrict to binary classification datasets without missing values
+cc18_simple = list_oml_data(data_id = cc18_collection$data_ids,
+  number_classes = 2,
+  number_missing_values = 0)
+
+cc18_small = cc18_simple |>
+  filter(NumberOfSymbolicFeatures == 1) |> # the target class is a symbolic feature
+  select(data_id, name, NumberOfFeatures, NumberOfInstances) |>
+  filter(name %in% c("qsar-biodeg", "madelon", "kc1", "blood-transfusion-service-center", "climate-model-simulation-crashes"))
+
+data_dir = here("benchmarks", "data")
+# `data_dir` is already absolute, so extend it with file.path() instead of here()
+cache_dir = file.path(data_dir, "oml")
+# recursive + showWarnings = FALSE: works on a fresh checkout and on re-runs
+dir.create(data_dir, recursive = TRUE, showWarnings = FALSE)
+
+options(mlr3oml.cache = cache_dir)
+# OML objects are lazy; accessing `$data` forces the download into the cache
+mlr3misc::pwalk(cc18_small, function(data_id, name, NumberOfFeatures, NumberOfInstances) odt(data_id)$data)
+
+collections_dir = file.path(cache_dir, "collections")
+dir.create(collections_dir, recursive = TRUE, showWarnings = FALSE)
+fwrite(cc18_small, file.path(collections_dir, "cc18_small.csv"))
diff --git a/benchmarks/rf_use_case/install_packages.R b/benchmarks/rf_use_case/install_packages.R
new file mode 100644
index 000000000..9262c2a77
--- /dev/null
+++ b/benchmarks/rf_use_case/install_packages.R
@@ -0,0 +1,15 @@
+devtools::install_github("mlr-org/mlr3torch")
+devtools::install_github("mlr-org/mlr3tuning@fix/int-tune-trafo")
+
+# CRAN packages used by the scripts in this directory
+packages = c("here", "mlr3oml", "tidytable", "mlr3", "mlr3learners", "mlr3tuning", "mlr3mbo", "bbotk", "bench", "data.table",
+  # loaded by run_benchmark.R
+  "mlr3verse", "mlr3batchmark",
+  # needed for classif.ranger / regr.ranger and for encapsulate("callr")
+  "ranger", "callr")
+
+# Install packages not yet installed
+is_installed = packages %in% rownames(installed.packages())
+if (!all(is_installed)) {
+  install.packages(packages[!is_installed], repos = "https://ftp.fau.de/cran/")
+}
diff --git a/benchmarks/rf_use_case/run_benchmark.R b/benchmarks/rf_use_case/run_benchmark.R
new file mode 100644
index 000000000..cc2c71cd2
--- /dev/null
+++ b/benchmarks/rf_use_case/run_benchmark.R
@@ -0,0 +1,73 @@
+library(here)
+library(mlr3verse)
+library(mlr3oml)
+library(mlr3torch)
+library(mlr3batchmark)
+library(mlr3mbo)
+library(mlr3tuning)
+
+# same cache location as get_data.R and single_task.R
+options(mlr3oml.cache = here("benchmarks", "data", "oml"))
+
+# OpenML data ids; presumably the datasets selected in get_data.R (verify)
+ids = c(1067, 1464, 1485, 1494, 40994)
+task_list = lapply(ids, function(id) tsk("oml", data_id = id))
+
+mlp = lrn("classif.mlp",
+  activation = nn_relu,
+  n_layers = to_tune(lower = 1, upper = 10),
+  neurons = to_tune(p_int(lower = 10, upper = 1000)),
+  batch_size = to_tune(c(64, 128, 256)),
+  p = to_tune(0.1, 0.9),
+  epochs = to_tune(lower = 1, upper = 1000L, internal = TRUE),
+  validate = "test",
+  measures_valid = msr("classif.logloss"),
+  patience = 10,
+  device = "auto",
+  predict_type = "prob"
+)
+
+# encapsulate the MLP with callr and fall back to a featureless learner on
+# failure; the fallback uses the same predict type as the MLP
+mlp$encapsulate("callr", lrn("classif.featureless", predict_type = "prob"))
+
+# surrogate for MBO: impute, turn character columns into factors, then fit regr.ranger
+surrogate = srlrn(as_learner(po("imputesample", affect_columns = selector_type("logical")) %>>%
+  po("imputeoor", multiplier = 3, affect_columns = selector_type(c("integer", "numeric", "character", "factor", "ordered"))) %>>%
+  po("colapply", applicator = as.factor, affect_columns = selector_type("character")) %>>%
+  lrn("regr.ranger")), catch_errors = TRUE)
+
+# define an AutoTuner that wraps the classif.mlp
+at = auto_tuner(
+  learner = mlp,
+  tuner = tnr("mbo", surrogate = surrogate),
+  resampling = rsmp("cv", folds = 5),
+  measure = msr("internal_valid_score", minimize = TRUE),
+  term_evals = 1
+)
+
+lrn_rf = lrn("classif.ranger")
+
+# full design: every task, 3-fold CV
+design = benchmark_grid(
+  task_list,
+  learners = list(at, lrn_rf),
+  resampling = rsmp("cv", folds = 3)
+)
+
+# small holdout run on the first task only, executed locally below
+design1 = benchmark_grid(
+  task_list[[1]],
+  learners = list(at, lrn_rf),
+  resampling = rsmp("holdout")
+)
+
+benchmark(design1)
+
+reg = makeExperimentRegistry(
+  file.dir = here("benchmarks", "rf_use_case", "reg"),
+  # these are loaded on each node, so every entry must be an installed package name
+  packages = c("mlr3verse", "mlr3oml", "mlr3torch", "mlr3batchmark")
+)
+
+batchmark(design, reg = reg)
diff --git a/benchmarks/rf_use_case/single_task.R b/benchmarks/rf_use_case/single_task.R
new file mode 100644
index 000000000..5dd830924
--- /dev/null
+++ b/benchmarks/rf_use_case/single_task.R
@@ -0,0 +1,97 @@
+library(mlr3)
+library(mlr3learners)
+library(mlr3oml)
+library(mlr3torch)
+library(mlr3tuning)
+library(mlr3mbo)
+library(bbotk)
+
+library(bench)
+library(data.table)
+library(here)
+
+options(mlr3oml.cache = here("benchmarks", "data", "oml"))
+
+# define the tasks
+# the cache option is already an absolute path, so extend it with file.path()
+cc18_small = fread(file.path(getOption("mlr3oml.cache"), "collections", "cc18_small.csv"))
+
+task_list = mlr3misc::pmap(cc18_small, function(data_id, name, NumberOfFeatures, NumberOfInstances) tsk("oml", data_id = data_id))
+
+task_list
+
+# define the learners
+# neurons = function(n_layers, latent_dim) {
+#   rep(latent_dim, n_layers)
+# }
+
+# n_layers_values <- 1:5
+# latent_dim_values <- seq(10, 200, by = 20)
+# neurons_search_space <- mapply(
+#   neurons,
+#   expand.grid(n_layers = n_layers_values, latent_dim = latent_dim_values)$n_layers,
+#   expand.grid(n_layers = n_layers_values, latent_dim = latent_dim_values)$latent_dim,
+#   SIMPLIFY = FALSE
+# )
+
+mlp = lrn("classif.mlp",
+  activation = nn_relu,
+  # tune depth and width jointly; the trafo expands them into the `neurons` vector
+  neurons = to_tune(ps(
+    n_layers = p_int(lower = 1, upper = 10), latent = p_int(10, 500),
+    .extra_trafo = function(x, param_set) {
+      list(neurons = rep(x$latent, x$n_layers))
+    })
+  ),
+  # neurons = to_tune(neurons_search_space),
+  batch_size = to_tune(c(64, 128, 256)),
+  p = to_tune(0.1, 0.7),
+  epochs = to_tune(upper = 1000L, internal = TRUE),
+  validate = "test",
+  measures_valid = msr("classif.acc"),
+  patience = 10,
+  device = "cpu"
+)
+
+mlp$encapsulate("callr", lrn("classif.featureless"))
+
+# define an AutoTuner that wraps the classif.mlp
+at = auto_tuner(
+  learner = mlp,
+  tuner = tnr("mbo"),
+  resampling = rsmp("cv", folds = 5),
+  measure = msr("classif.acc"),
+  term_evals = 10
+)
+
+future::plan("multisession", workers = 8)
+
+lrn_rf = lrn("classif.ranger")
+
+options(mlr3.exec_random = FALSE)
+
+design = benchmark_grid(
+  task_list[[1]],
+  learners = list(at, lrn_rf),
+  resampling = rsmp("cv", folds = 3)
+)
+design = design[order(mlr3misc::ids(learner)), ]
+
+time = bench::system_time(
+  bmr <- benchmark(design)
+)
+
+# write to the directory view_results.R reads from; create it on first use
+results_dir = here("benchmarks", "rf_use_case", "results")
+dir.create(results_dir, recursive = TRUE, showWarnings = FALSE)
+
+# keep the complete benchmark result; its R6 list columns cannot go through fwrite()
+saveRDS(bmr, file.path(results_dir, "bmr.rds"))
+
+# aggregated classification error per task/learner, plain columns only
+bmr_ce = bmr$aggregate(msr("classif.ce"))[, c("task_id", "learner_id", "resampling_id", "iters", "classif.ce")]
+fwrite(bmr_ce, file.path(results_dir, "bmr_ce.csv"))
+
+# bench_time is a named vector; fwrite() needs a table
+time_dt = data.table(process = as.numeric(time[["process"]]), real = as.numeric(time[["real"]]))
+fwrite(time_dt, file.path(results_dir, "time.csv"))
diff --git a/benchmarks/rf_use_case/view_results.R b/benchmarks/rf_use_case/view_results.R
new file mode 100644
index 000000000..8b3da5f4b
--- /dev/null
+++ b/benchmarks/rf_use_case/view_results.R
@@ -0,0 +1,12 @@
+library(data.table)
+library(mlr3)
+
+library(here)
+
+bmr_ce = fread(here("benchmarks", "rf_use_case", "results", "bmr_ce.csv"))
+
+bmr_ce
+
+time = fread(here("benchmarks", "rf_use_case", "results", "time.csv"))
+
+time