Merge pull request #2794 from mlr-org/fix2771__factor_na
pat-s authored Oct 28, 2020
2 parents 4cb6ab9 + bfc2cf9 commit 7d2138f
Showing 6 changed files with 45 additions and 7 deletions.
4 changes: 3 additions & 1 deletion NEWS.md
@@ -1,6 +1,8 @@
# mlr 2.18.0.9000

- Internal changes only.
- Warning if `fix.factors.prediction = TRUE` causes the generation of NAs for new factor levels in prediction.
- Clear error message if the prediction of a wrapped learner does not have the same length as `newdata`.
- Internal changes.


# mlr 2.18.0
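For context, a minimal sketch of how the new warning from `fix.factors.prediction = TRUE` surfaces for a user (illustrative only, not part of this PR; assumes `classif.rpart` is available):

```r
library(mlr)

# train on data where feature "a" only has the levels "a" and "b"
d = data.frame(
  a = factor(c("a", "b", "a", "b")),
  trg = factor(c("x", "y", "x", "y"))
)
task = makeClassifTask(data = d, target = "trg")
lrn = makeLearner("classif.rpart", fix.factors.prediction = TRUE)
model = train(lrn, task)

# newdata contains the unseen level "c"; re-leveling to the training levels
# turns it into NA, which now triggers the warning added in this PR
predict(model, newdata = data.frame(a = factor(c("a", "c"))))
```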
6 changes: 6 additions & 0 deletions R/predict.R
@@ -144,6 +144,12 @@ predict.WrappedModel = function(object, task, newdata, subset = NULL, ...) {
dump = addClasses(get("last.dump", envir = .GlobalEnv), "mlr.dump")
}
}
# did the prediction fail otherwise?
np = nrow(p)
if (is.null(np)) np = length(p)
if (np != nrow(newdata)) {
stopf("predictLearner for %s has returned %i predictions instead of %i!", learner$id, np, nrow(newdata))
}
}
if (missing(task)) {
ids = NULL
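The `nrow()`/`length()` fallback in this check exists because `predictLearner()` can return either a matrix/data.frame (e.g. posterior probabilities) or a plain vector/factor of labels; a small base-R illustration of the distinction (not part of the diff):

```r
# matrix or data.frame predictions (e.g. class probabilities): nrow() works
p_prob = matrix(runif(6), nrow = 3)
nrow(p_prob)     # 3

# plain label predictions: nrow() is NULL, so fall back to length()
p_label = factor(c("a", "b", "a"))
nrow(p_label)    # NULL
length(p_label)  # 3

np = nrow(p_label)
if (is.null(np)) np = length(p_label)
np               # 3 -> compared against nrow(newdata)
```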
9 changes: 7 additions & 2 deletions R/predictLearner.R
@@ -55,8 +55,13 @@ predictLearner2 = function(.learner, .model, .newdata, ...) {
ns = intersect(colnames(.newdata), ns)
fls = fls[ns]
if (length(ns) > 0L) {
.newdata[ns] = mapply(factor, x = .newdata[ns],
levels = fls, SIMPLIFY = FALSE)
safe_factor = function(x, levels) {
if (length(setdiff(levels(x), levels)) > 0) {
warning("fix.factors.prediction = TRUE produced NAs because of new factor levels in prediction data.")
}
factor(x, levels)
}
.newdata[ns] = mapply(safe_factor, x = .newdata[ns], levels = fls, SIMPLIFY = FALSE)
}
}
p = predictLearner(.learner, .model, .newdata, ...)
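The new `safe_factor()` helper warns before `factor(x, levels)` silently turns unseen levels into `NA`; a standalone base-R illustration of the behavior it guards against (not part of the diff):

```r
train_levels = c("a", "b")
x_new = factor(c("a", "b", "c"))      # "c" never occurred during training

# re-leveling to the training levels silently maps "c" to NA
factor(x_new, levels = train_levels)
#> [1] a    b    <NA>

# this is the condition safe_factor() detects and warns about
setdiff(levels(x_new), train_levels)
#> [1] "c"
```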
2 changes: 1 addition & 1 deletion man/getFeatureImportance.Rd

Some generated files are not rendered by default.

13 changes: 10 additions & 3 deletions tests/testthat/test_base_resample.R
@@ -186,7 +186,8 @@ test_that("resample printer respects show.info", {
})

test_that("resample drops unseen factors in predict data set", {
data = data.frame(a = c("a", "b", "a", "b", "a", "c"),
data = data.frame(
a = c("a", "b", "a", "b", "a", "c"),
b = c(1, 1, 2, 2, 2, 1),
trg = c("a", "b", "a", "b", "a", "b"),
stringsAsFactors = TRUE)
@@ -202,6 +203,12 @@ test_that("resample drops unseen factors in predict data set", {

lrn = makeLearner("classif.logreg", fix.factors.prediction = TRUE)
model = train(lrn, subsetTask(task, 1:4))
predict(model, subsetTask(task, 5:6))
resample(lrn, task, resinst)
expect_warning(predict(model, subsetTask(task, 5:6)), "produced NAs because of new factor levels")
expect_warning(resample(lrn, task, resinst), "produced NAs because of new factor levels")

# do it manually
train_task = makeClassifTask("unseen.factors", data[1:4, ], "trg", fixup = "quiet") # quiet because
# we get a dropped-factors warning (which we want here)
model = train(lrn, train_task)
expect_warning(predict(model, newdata = data[5:6,]), "produced NAs because of new factor levels")
})
18 changes: 18 additions & 0 deletions tests/testthat/test_classif_ksvm.R
@@ -48,3 +48,21 @@ test_that("classif_ksvm", {
testCV("classif.ksvm", multiclass.df, multiclass.target, tune.train = tt, tune.predict = tp,
parset = list(kernel = "polydot", degree = 3, offset = 2, scale = 1.5))
})

test_that("classif_ksvm produces error for new factor levels on predict", {
# https://github.com/mlr-org/mlr/issues/2771
train_data = data.frame(
A = sample(c("A","B"), 10, TRUE),
B = factor(sample(c("A", "B"), 10, replace = T))
)
test_data = data.frame(
A = sample(c("A","B"), 10, TRUE),
B = factor(sample(c("A", "B","C"), 10, replace = T))
)
lrn = makeLearner("classif.ksvm", fix.factors.prediction = TRUE)
train_task = makeClassifTask(data = train_data, target = "A")
model = train(lrn, train_task)
expect_warning({
expect_error(predict(model, newdata = test_data), "has returned .+ instead of 10")
}, "produced NAs because of new factor levels")
})
