From f20eba5ca2dbb892a4dee6dac0af40f423071cca Mon Sep 17 00:00:00 2001 From: Jakob Richter Date: Mon, 26 Oct 2020 13:08:05 +0100 Subject: [PATCH 1/4] add error and warning on new factor levels --- R/predict.R | 6 ++++++ R/predictLearner.R | 9 +++++++-- tests/testthat/test_base_resample.R | 10 +++++++++- tests/testthat/test_classif_ksvm.R | 18 ++++++++++++++++++ 4 files changed, 40 insertions(+), 3 deletions(-) diff --git a/R/predict.R b/R/predict.R index 3a0de7270c..5da64383da 100644 --- a/R/predict.R +++ b/R/predict.R @@ -144,6 +144,12 @@ predict.WrappedModel = function(object, task, newdata, subset = NULL, ...) { dump = addClasses(get("last.dump", envir = .GlobalEnv), "mlr.dump") } } + # did the prediction fail otherwise? + np = nrow(p) + if (is.null(np)) np = length(p) + if (np != nrow(newdata)) { + stopf("predictLearner for %s has returned %i predictions instead of %i!", learner$id, np, nrow(newdata)) + } } if (missing(task)) { ids = NULL diff --git a/R/predictLearner.R b/R/predictLearner.R index 1f326dcd55..3e8d4f1825 100644 --- a/R/predictLearner.R +++ b/R/predictLearner.R @@ -55,8 +55,13 @@ predictLearner2 = function(.learner, .model, .newdata, ...) { ns = intersect(colnames(.newdata), ns) fls = fls[ns] if (length(ns) > 0L) { - .newdata[ns] = mapply(factor, x = .newdata[ns], - levels = fls, SIMPLIFY = FALSE) + safe_factor = function(x, levels) { + if (length(setdiff(levels(x), levels)) > 0) { + warning("fix.factors.prediction = TRUE produced NAs because of new factor levels in prediction data.") + } + factor(x, levels) + } + .newdata[ns] = mapply(safe_factor, x = .newdata[ns], levels = fls, SIMPLIFY = FALSE) } } p = predictLearner(.learner, .model, .newdata, ...) diff --git a/tests/testthat/test_base_resample.R b/tests/testthat/test_base_resample.R index bd63c22a75..73870636ea 100644 --- a/tests/testthat/test_base_resample.R +++ b/tests/testthat/test_base_resample.R @@ -186,7 +186,8 @@ test_that("resample printer respects show.info", { }) test_that("resample drops unseen factors in predict data set", { - data = data.frame(a = c("a", "b", "a", "b", "a", "c"), + data = data.frame( + a = c("a", "b", "a", "b", "a", "c"), b = c(1, 1, 2, 2, 2, 1), trg = c("a", "b", "a", "b", "a", "b"), stringsAsFactors = TRUE) @@ -204,4 +205,11 @@ test_that("resample drops unseen factors in predict data set", { model = train(lrn, subsetTask(task, 1:4)) predict(model, subsetTask(task, 5:6)) resample(lrn, task, resinst) + + # do it manually + train_task = makeClassifTask("unseen.factors", data[1:4,], "trg", fixup = "quiet") # quiet becasue + # we get dropped factors warning (which we want here) + model = train(lrn, train_task) + predict(model, newdata = data[5:6,]) + }) diff --git a/tests/testthat/test_classif_ksvm.R b/tests/testthat/test_classif_ksvm.R index 9a3b8953c1..7375816e8a 100644 --- a/tests/testthat/test_classif_ksvm.R +++ b/tests/testthat/test_classif_ksvm.R @@ -48,3 +48,21 @@ test_that("classif_ksvm", { testCV("classif.ksvm", multiclass.df, multiclass.target, tune.train = tt, tune.predict = tp, parset = list(kernel = "polydot", degree = 3, offset = 2, scale = 1.5)) }) + +test_that("classif_ksvm produces error for new factor levels on predict" { + # https://github.com/mlr-org/mlr/issues/2771 + train_data = data.frame( + A = sample(c("A","B"), 10, TRUE), + B = factor(sample(c("A", "B"), 10, replace = T)) + ) + test_data = data.frame( + A = sample(c("A","B"), 10, TRUE), + B = factor(sample(c("A", "B","C"), 10, replace = T)) + ) + lrn = makeLearner("classif.ksvm", fix.factors.prediction = TRUE) + train_task = makeClassifTask(data = train_data, target = "A") + model = train(lrn, train_task) + expect_warning({ + expect_error(predict(model, newdata = test_data), "has returned .+ instead of 10") + }, "produced NAs because of new factor levels") +}) From f19ae655b1fb259ad44b111181d6d681679db696 Mon Sep 17 00:00:00 2001 From: Jakob Richter Date: Mon, 26 Oct 2020 13:18:47 +0100 Subject: [PATCH 2/4] fix --- tests/testthat/test_base_resample.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/testthat/test_base_resample.R b/tests/testthat/test_base_resample.R index 73870636ea..2bc60341a2 100644 --- a/tests/testthat/test_base_resample.R +++ b/tests/testthat/test_base_resample.R @@ -210,6 +210,5 @@ test_that("resample drops unseen factors in predict data set", { train_task = makeClassifTask("unseen.factors", data[1:4,], "trg", fixup = "quiet") # quiet becasue # we get dropped factors warning (which we want here) model = train(lrn, train_task) - predict(model, newdata = data[5:6,]) - + expect_warning(predict(model, newdata = data[5:6,]), "produced NAs because of new factor levels") }) From aa00188b7ccde6ae049e7c728b85e16ac2b94cc4 Mon Sep 17 00:00:00 2001 From: Jakob Richter Date: Mon, 26 Oct 2020 17:42:19 +0100 Subject: [PATCH 3/4] fix --- man/getFeatureImportance.Rd | 2 +- tests/testthat/test_base_resample.R | 4 ++-- tests/testthat/test_classif_ksvm.R | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/man/getFeatureImportance.Rd b/man/getFeatureImportance.Rd index 0d6f63232f..89c0b47145 100644 --- a/man/getFeatureImportance.Rd +++ b/man/getFeatureImportance.Rd @@ -54,7 +54,7 @@ This is identical to randomForest. \item randomForestSRC \cr This method can calculate feature importance for various measures. By default the Breiman-Cutler permutation method is used. See -\code{\link[randomForestSRC:vimp.rfsrc]{randomForestSRC::vimp()}} for details. +\code{\link[randomForestSRC:vimp]{randomForestSRC::vimp()}} for details. \item ranger \cr Supports both measures mentioned above for the randomForest learner. Note, that you need to specifically set the learners parameter diff --git a/tests/testthat/test_base_resample.R b/tests/testthat/test_base_resample.R index 2bc60341a2..628490b370 100644 --- a/tests/testthat/test_base_resample.R +++ b/tests/testthat/test_base_resample.R @@ -203,8 +203,8 @@ test_that("resample drops unseen factors in predict data set", { lrn = makeLearner("classif.logreg", fix.factors.prediction = TRUE) model = train(lrn, subsetTask(task, 1:4)) - predict(model, subsetTask(task, 5:6)) - resample(lrn, task, resinst) + expect_warning(predict(model, subsetTask(task, 5:6)), "produced NAs because of new factor levels") + expect_warning(resample(lrn, task, resinst), "produced NAs because of new factor levels") # do it manually train_task = makeClassifTask("unseen.factors", data[1:4,], "trg", fixup = "quiet") # quiet becasue diff --git a/tests/testthat/test_classif_ksvm.R b/tests/testthat/test_classif_ksvm.R index 7375816e8a..dbaa2559c7 100644 --- a/tests/testthat/test_classif_ksvm.R +++ b/tests/testthat/test_classif_ksvm.R @@ -49,7 +49,7 @@ test_that("classif_ksvm", { parset = list(kernel = "polydot", degree = 3, offset = 2, scale = 1.5)) }) -test_that("classif_ksvm produces error for new factor levels on predict" { +test_that("classif_ksvm produces error for new factor levels on predict", { # https://github.com/mlr-org/mlr/issues/2771 train_data = data.frame( A = sample(c("A","B"), 10, TRUE), From bfc2cf92fc6ac475e13f89d666857e0a3a3a66c8 Mon Sep 17 00:00:00 2001 From: Jakob Richter Date: Wed, 28 Oct 2020 14:08:33 +0100 Subject: [PATCH 4/4] news --- NEWS.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index cb3ba38197..f230bd663b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,8 @@ # mlr 2.18.0.9000 -- Internal changes only. +- Warning if `fix.factors.prediction = TRUE` causes the generation of NAs for new factor levels in prediction. +- Clear error message if prediction of wrapped learner has not the same length as `newdata`. +- Internal changes. # mlr 2.18.0