mlr-org · be-marc · Feb 11, 2025 · Dec 4, 2024 · Dec 4, 2024 · Dec 4, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -28,7 +28,9 @@ Authors@R:
            comment = c(ORCID = "0000-0002-8115-0400")),
     person("Sebastian", "Fischer", , "[email protected]", role = "aut",
            comment = c(ORCID = "0000-0002-9609-3197")),
-    person("Lona", "Koers", , "[email protected]", role = "ctb")
+    person("Lona", "Koers", , "[email protected]", role = "ctb"),
+    person("John", "Zobolas", , "[email protected]", role = "ctb",
+           comment = c(ORCID = "0000-0002-3609-8674"))
   )
 Description: Efficient, object-oriented programming on the
     building blocks of machine learning. Provides 'R6' objects for tasks,

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,7 @@
 # mlr3 (development version)
 
+* feat: add a new `"offset"` column role to `Task` and a matching `"offset"` property to `Learner`.
+  A warning is raised if a learner that does not support offsets is trained on a task that has an offset column.
 * fix: the `$predict_newdata()` method of `Learner` now automatically conducts type conversions (#685)
 * BREAKING_CHANGE: Predicting on a `task` with the wrong column information is now an error and not a warning.
 * Column names with UTF-8 characters are now allowed by default.

diff --git a/R/Task.R b/R/Task.R
@@ -496,7 +496,7 @@ Task = R6Class("Task",
       }
 
       # columns with these roles must be present in data
-      mandatory_roles = c("target", "feature", "weight", "group", "stratum", "order")
+      mandatory_roles = c("target", "feature", "weight", "group", "stratum", "order", "offset")
       mandatory_cols = unlist(private$.col_roles[mandatory_roles], use.names = FALSE)
       missing_cols = setdiff(mandatory_cols, data$colnames)
       if (length(missing_cols)) {
@@ -896,6 +896,7 @@ Task = R6Class("Task",
     #' * `"strata"`: The task is resampled using one or more stratification variables (role `"stratum"`).
     #' * `"groups"`: The task comes with grouping/blocking information (role `"group"`).
     #' * `"weights"`: The task comes with observation weights (role `"weight"`).
+    #' * `"offset"`: The task includes one or more offset columns specifying fixed adjustments for model training and possibly for prediction (role `"offset"`).
     #' * `"ordered"`: The task has columns which define the row order (role `"order"`).
     #'
     #' Note that above listed properties are calculated from the `$col_roles` and may not be set explicitly.
@@ -907,6 +908,7 @@ Task = R6Class("Task",
           if (length(col_roles$group)) "groups" else NULL,
           if (length(col_roles$stratum)) "strata" else NULL,
           if (length(col_roles$weight)) "weights" else NULL,
+          if (length(col_roles$offset)) "offset" else NULL,
           if (length(col_roles$order)) "ordered" else NULL
         )
       } else {
@@ -951,6 +953,11 @@ Task = R6Class("Task",
     #'   Not more than a single column can be associated with this role.
     #' * `"stratum"`: Stratification variables. Multiple discrete columns may have this role.
     #' * `"weight"`: Observation weights. Not more than one numeric column may have this role.
+    #' * `"offset"`: Numeric columns used to specify fixed adjustments for model training.
+    #'   Some models use offsets to simply shift predictions, while others incorporate them to boost predictions from a baseline model.
+    #'   For learners supporting offsets in multiclass settings, an offset column must be provided for each target class.
+    #'   These columns must follow the naming convention `"offset_{target_class_name}"`.
+    #'   For an example of a learner that supports offsets, see `LearnerClassifXgboost` of \CRANpkg{mlr3learners}.
     #'
     #' `col_roles` is a named list whose elements are named by column role and each element is a `character()` vector of column names.
     #' To alter the roles, just modify the list, e.g. with \R's set functions ([intersect()], [setdiff()], [union()], \ldots).
@@ -1084,6 +1091,32 @@ Task = R6Class("Task",
       setnames(data, c("row_id", "weight"))[]
     },
 
+    #' @field offset ([data.table::data.table()])\cr
+    #' Offset values of the task, or `NULL` if no column has the role `"offset"`.
+    #'
+    #' If at least one column has the role `"offset"`, a table restricted to the rows with role `"use"` is returned.
+    #' It has two or more columns:
+    #'
+    #' * `row_id` (`integer()`), taken from the primary key of the backend, and
+    #' * offset variable(s) (`numeric()`).
+    #'
+    #' Regression and binary classification tasks have a single offset column.
+    #' Multiclass tasks may have several offset columns, one for each target class.
+    #' A single offset column is always named `offset`; multiple offset columns keep their original names.
+    offset = function(rhs) {
+      assert_has_backend(self)
+      assert_ro_binding(rhs)
+
+      cols = private$.col_roles$offset
+      if (!length(cols)) {
+        return(NULL)
+      }
+
+      tab = self$backend$data(private$.row_roles$use, c(self$backend$primary_key, cols))
+      # a lone offset column gets the generic name, several columns keep theirs
+      new_names = if (length(cols) == 1L) "offset" else cols
+      setnames(tab, c("row_id", new_names))[]
+    },
 
     #' @field labels (named `character()`)\cr
     #'   Retrieve `labels` (prettier formated names) from columns.
@@ -1250,6 +1283,17 @@ task_check_col_roles.Task = function(task, new_roles, ...) {
     }
   }
 
+  # check offset: columns must be numeric/integer and must not contain missing values
+  offset_cols = new_roles[["offset"]]
+  if (length(offset_cols)) {
+    if (any(fget(task$col_info, offset_cols, "type", key = "id") %nin% c("numeric", "integer"))) {
+      stopf("Offset column(s) %s must be a numeric or integer column", paste0("'", offset_cols, "'", collapse = ","))
+    }
+
+    # count missing values once, and only when there are offset columns to inspect
+    n_missing = task$missings(cols = offset_cols)
+    if (any(n_missing > 0)) {
+      stopf("Offset column(s) %s contain missing values", paste0("'", names(n_missing)[n_missing > 0], "'", collapse = ","))
+    }
+  }
+
   return(new_roles)
 }
 
@@ -1266,16 +1310,25 @@ task_check_col_roles.TaskClassif = function(task, new_roles, ...) {
     stopf("Target column(s) %s must be a factor or ordered factor", paste0("'", new_roles[["target"]], "'", collapse = ","))
   }
 
+  # binary tasks accept at most one offset column
+  if (length(new_roles[["offset"]]) > 1L && length(task$class_names) == 2L) {
+    stop("There may only be up to one column with role 'offset' for binary classification tasks")
+  }
+
+  # with several offset columns, each one must be named "offset_{class name}"
+  if (length(new_roles[["offset"]]) > 1L) {
+    expected_names = paste0("offset_", task$class_names)
+    # use an assertion rather than checkmate's expect_*() helper: the latter is a
+    # unit-test expectation and needs the testthat package to be available at runtime
+    assert_subset(new_roles[["offset"]], expected_names, .var.name = "col_roles")
+  }
+
   NextMethod()
 }
 
 #' @rdname task_check_col_roles
 #' @export
 task_check_col_roles.TaskRegr = function(task, new_roles, ...) {
-
-  # check target
-  if (length(new_roles[["target"]]) > 1L) {
-    stopf("There may only be up to one column with role 'target'")
+  for (role in c("target", "offset")) {
+    if (length(new_roles[[role]]) > 1L) {
+      stopf("There may only be up to one column with role '%s'", role)
+    }
   }
 
   if (length(new_roles[["target"]]) && any(fget(task$col_info, new_roles[["target"]], "type", key = "id") %nin% c("numeric", "integer"))) {

diff --git a/R/assertions.R b/R/assertions.R
@@ -147,6 +147,11 @@ assert_task_learner = function(task, learner, param_values = NULL, cols = NULL)
     }
   }
 
+  if ("offset" %in% task$properties && "offset" %nin% learner$properties) {
+    warningf("Task '%s' has offset, but learner '%s' does not support this, so it will be ignored",
+             task$id, learner$id)
+  }
+
   tmp = mlr_reflections$task_mandatory_properties[[task$task_type]]
   if (length(tmp)) {
     tmp = setdiff(intersect(task$properties, tmp), learner$properties)

diff --git a/R/mlr_reflections.R b/R/mlr_reflections.R
@@ -94,14 +94,14 @@ local({
     "use"
   )
 
-  tmp = c("feature", "target", "name", "order", "stratum", "group", "weight")
+  tmp = c("feature", "target", "name", "order", "stratum", "group", "weight", "offset")
   mlr_reflections$task_col_roles = list(
     regr = tmp,
     classif = tmp,
     unsupervised = c("feature", "name", "order")
   )
 
-  tmp = c("strata", "groups", "weights")
+  tmp = c("strata", "groups", "weights", "offset")
   mlr_reflections$task_properties = list(
     classif = c(tmp, "twoclass", "multiclass"),
     regr = tmp,
@@ -114,11 +114,11 @@ local({
 
   mlr_reflections$task_print_col_roles = list(
     before = character(),
-    after = c("Order by" = "order", "Strata" = "stratum", "Groups" = "group", "Weights" = "weight")
+    after = c("Order by" = "order", "Strata" = "stratum", "Groups" = "group", "Weights" = "weight", "Offset" = "offset")
   )
 
   ### Learner
-  tmp = c("featureless", "missings", "weights", "importance", "selected_features", "oob_error", "hotstart_forward", "hotstart_backward", "validation", "internal_tuning", "marshal")
+  tmp = c("featureless", "missings", "weights", "importance", "selected_features", "oob_error", "hotstart_forward", "hotstart_backward", "validation", "internal_tuning", "marshal", "offset")
   mlr_reflections$learner_properties = list(
     classif = c(tmp, "twoclass", "multiclass"),
     regr = tmp

diff --git a/inst/testthat/helper_autotest.R b/inst/testthat/helper_autotest.R
@@ -74,11 +74,30 @@ generate_generic_tasks = function(learner, proto) {
   # task with weights
   if ("weights" %in% learner$properties) {
     tmp = proto$clone(deep = TRUE)$cbind(data.frame(weights = runif(n)))
-    tmp$col_roles$weight = "weights"
-    tmp$col_roles$features = setdiff(tmp$col_roles$features, "weights")
+    tmp$set_col_roles(cols = "weights", roles = "weight")
     tasks$weights = tmp
   }
 
+  # task with offset
+  if ("offset" %in% learner$properties) {
+    # inspect `proto` here: in the code above `tmp` is only assigned inside the
+    # "weights" branch, so it is not a reliable reference for learners without weights
+    if ("multiclass" %in% proto$properties) {
+      # one offset column per class, named "offset_{class name}"
+      offset_cols = paste0("offset_", proto$class_names)
+      offset_data = as.data.frame(
+        mlr3misc::set_names(
+          lapply(offset_cols, function(col) runif(n)),
+          offset_cols
+        ),
+        # keep the column names verbatim, even if a class name is not a syntactic R name
+        check.names = FALSE
+      )
+      tmp = proto$clone(deep = TRUE)$cbind(offset_data)
+      tmp$set_col_roles(cols = offset_cols, roles = "offset")
+    } else {
+      # regression and binary classification: a single offset column
+      tmp = proto$clone(deep = TRUE)$cbind(data.frame(offset = runif(n)))
+      tmp$set_col_roles(cols = "offset", roles = "offset")
+    }
+    tasks$offset = tmp
+  }
+
   # task with non-ascii feature names
   if (p > 0L) {
     sel = proto$feature_types[list(learner$feature_types), "id", on = "type", with = FALSE, nomatch = NULL][[1L]]

diff --git a/man-roxygen/param_learner_properties.R b/man-roxygen/param_learner_properties.R
@@ -4,6 +4,7 @@
 #'   The following properties are currently standardized and understood by learners in \CRANpkg{mlr3}:
 #'   * `"missings"`: The learner can handle missing values in the data.
 #'   * `"weights"`: The learner supports observation weights.
+#'   * `"offset"`: The learner can incorporate offset values to adjust predictions.
 #'   * `"importance"`: The learner supports extraction of importance scores, i.e. comes with an `$importance()` extractor function (see section on optional extractors in [Learner]).
 #'   * `"selected_features"`: The learner supports extraction of the set of selected features, i.e. comes with a `$selected_features()` extractor function (see section on optional extractors in [Learner]).
 #'   * `"oob_error"`: The learner supports extraction of estimated out of bag error, i.e. comes with a `oob_error()` extractor function (see section on optional extractors in [Learner]).

diff --git a/man/Learner.Rd b/man/Learner.Rd
diff --git a/man/LearnerClassif.Rd b/man/LearnerClassif.Rd
diff --git a/man/LearnerRegr.Rd b/man/LearnerRegr.Rd
diff --git a/man/Task.Rd b/man/Task.Rd
diff --git a/man/mlr3-package.Rd b/man/mlr3-package.Rd
diff --git a/tests/testthat/test_Task.R b/tests/testthat/test_Task.R
@@ -248,15 +248,18 @@ test_that("stratify works", {
 })
 
 test_that("groups/weights work", {
-  b = as_data_backend(data.table(x = runif(20), y = runif(20), w = runif(20), g = sample(letters[1:2], 20, replace = TRUE)))
+  b = as_data_backend(data.table(x = runif(20), y = runif(20), w = runif(20),
+                                 o = runif(20), g = sample(letters[1:2], 20, replace = TRUE)))
   task = TaskRegr$new("test", b, target = "y")
   task$set_row_roles(16:20, character())
 
   expect_false("groups" %chin% task$properties)
   expect_false("weights" %chin% task$properties)
+  expect_false("offset" %chin% task$properties)
   expect_null(task$groups)
   expect_null(task$weights)
 
+  # weight
   task$col_roles$weight = "w"
   expect_subset("weights", task$properties)
   expect_data_table(task$weights, ncols = 2, nrows = 15)
@@ -265,6 +268,7 @@ test_that("groups/weights work", {
   task$col_roles$weight = character()
   expect_true("weights" %nin% task$properties)
 
+  # group
   task$col_roles$group = "g"
   expect_subset("groups", task$properties)
   expect_data_table(task$groups, ncols = 2, nrows = 15)
@@ -726,3 +730,4 @@ test_that("warn when internal valid task has 0 obs", {
   task = tsk("iris")
   expect_warning({task$internal_valid_task = 151}, "has 0 observations")
 })
+
diff --git a/tests/testthat/test_TaskClassif.R b/tests/testthat/test_TaskClassif.R
@@ -112,3 +112,53 @@ test_that("target is encoded as factor (#629)", {
   dt$target = ordered(dt$target)
   TaskClassif$new(id = "XX", backend = dt, target = "target")
 })
+
+test_that("offset column role works with binary tasks", {
+  pima = tsk("pima")
+  # no column has the offset role yet
+  expect_null(pima$offset)
+
+  # a single offset column is exposed as (row_id, offset)
+  pima$set_col_roles("age", "offset")
+  expect_subset("offset", pima$properties)
+  offset_tab = pima$offset
+  expect_data_table(offset_tab, nrows = pima$nrow, ncols = 2)
+  expect_subset(c("row_id", "offset"), names(offset_tab))
+
+  # more than one offset column is rejected
+  expect_error(
+    {
+      pima$col_roles$offset = c("glucose", "diabetes")
+    },
+    "There may only be up to one column with role"
+  )
+
+  # offset columns must not contain missing values
+  expect_error(
+    {
+      pima$col_roles$offset = c("glucose")
+    },
+    "contain missing values"
+  )
+
+  # a learner without the offset property warns when trained on a task with offset
+  expect_warning(lrn("classif.rpart")$train(pima), "has offset")
+})
+
+test_that("offset column role works with multiclass tasks", {
+  # a single offset column is exposed as (row_id, offset)
+  penguins = tsk("penguins")
+  expect_null(penguins$offset)
+  penguins$set_col_roles("year", "offset")
+  expect_subset("offset", penguins$properties)
+  offset_tab = penguins$offset
+  expect_data_table(offset_tab, nrows = penguins$nrow, ncols = 2)
+  expect_subset(c("row_id", "offset"), names(offset_tab))
+
+  # offset columns must not contain missing values
+  expect_error(
+    {
+      penguins$col_roles$offset = "bill_length"
+    },
+    "contain missing values"
+  )
+
+  # several offset columns must follow the "offset_{class name}" naming scheme
+  wine = tsk("wine")
+  expect_error(
+    {
+      wine$col_roles$offset = c("alcohol", "ash")
+    },
+    "Must be a subset of"
+  )
+
+  # correctly named offset columns are accepted and keep their names
+  wine_data = tsk("wine")$data()
+  set(wine_data, j = "offset_1", value = runif(nrow(wine_data)))
+  set(wine_data, j = "offset_2", value = runif(nrow(wine_data)))
+  wine = as_task_classif(wine_data, target = "type")
+  wine$set_col_roles(c("offset_1", "offset_2"), "offset")
+
+  expect_subset("offset", wine$properties)
+  offset_tab = wine$offset
+  expect_data_table(offset_tab, nrows = wine$nrow, ncols = 3)
+  expect_subset(c("row_id", "offset_1", "offset_2"), names(offset_tab))
+})