1
- # ' @title Factor Encoding
1
+ # ' @title Piecewise Linear Encoding Base Class
2
2
# '
3
3
# ' @usage NULL
4
- # ' @name mlr_pipeops_encode
5
- # ' @format [`R6Class`][R6::R6Class] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
4
+ # ' @name mlr_pipeops_encodepl
5
+ # ' @format Abstract [`R6Class`][R6::R6Class] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
6
6
# '
7
7
# ' @description
8
+ # ' Abstract base class for piecewise linear encoding.
9
+ # '
8
10
# ' Encodes columns of type `numeric` and `integer`.
9
11
# '
10
12
# '
37
39
# ' Initialized to `""`. One of:
38
40
# '
39
41
# ' @section Methods:
40
- # ' Only methods inherited from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
42
+ # ' Methods inherited from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`], as well as
43
+ # ' * `.get_bins(task, cols)`\cr
44
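+ # ' ([`Task`][mlr3::Task], `character`) -> `list` \cr
+ # ' Abstract private method that concrete subclasses must implement. Receives the training task and the names of the columns to encode, and returns a `list` with one numeric vector of bin boundaries per column.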
45
+ # '
41
46
# '
42
47
# ' @references
43
48
# ' `r format_bib("gorishniy_2022")`
44
49
# '
45
50
# ' @family PipeOps
51
+ # ' @family PipeOpsPLE
46
52
# ' @template seealso_pipeopslist
47
53
# ' @include PipeOpTaskPreproc.R
48
54
# ' @export
49
- # ' @examples
50
- # ' library("mlr3")
51
- # '
52
55
PipeOpEncodePL = R6Class(" PipeOpEncodePL" ,
53
56
inherit = PipeOpTaskPreprocSimple ,
54
57
public = list (
55
# Constructor of the abstract piecewise linear encoding base class.
#
# * id: identifier of the resulting PipeOp.
# * param_set: hyperparameters contributed by the concrete subclass.
# * param_vals: hyperparameter values set during construction.
# * packages: packages required by the concrete subclass (e.g. "stats").
# * task_type: task class the PipeOp operates on; subclasses that depend on
#   the task type (e.g. tree-based binning) pass it through.
#
# `packages` and `task_type` are trailing arguments with defaults so that
# existing calls using only `id`, `param_set` and `param_vals` keep working,
# while subclasses can forward them to the parent constructor.
initialize = function(id = "encodepl", param_set = ps(), param_vals = list(),
  packages = character(0), task_type = "Task") {
  super$initialize(id, param_set = param_set, param_vals = param_vals, packages = packages,
    task_type = task_type, tags = "encode", feature_types = c("numeric", "integer"))
}
76
62
),
77
63
private = list (
78
64
79
- .tree_learner = NULL ,
80
- .encodepl_param_set = NULL ,
65
# Abstract hook: concrete subclasses override this to compute the bin
# boundaries for the columns `cols` of `task`. The base implementation only
# signals that it has not been overridden.
.get_bins = function(task, cols) {
  stop("Abstract.")
},
81
68
82
69
# Computes the state stored at training time: a list with a single element
# `bins` holding the bin boundaries returned by the subclass' `.get_bins()`.
#
# * task: the training task.
#
# Returns `list(bins = numeric(0))` when no column is selected for encoding.
.get_state = function(task) {
  cols = private$.select_cols(task)
  if (length(cols) == 0L) {
    # Early exit: nothing to encode, so there are no bins to compute.
    return(list(bins = numeric(0)))
  }
  # `.get_bins()` is a private R6 method and must be reached through
  # `private$`; a bare `.get_bins()` call is not resolved against the object.
  list(bins = private$.get_bins(task, cols))
},
113
76
114
77
.transform = function (task ) {
@@ -126,8 +89,6 @@ PipeOpEncodePL = R6Class("PipeOpEncodePL",
126
89
)
127
90
)
128
91
129
- mlr_pipeops $ add(" encodepl" , PipeOpEncodePL , list (task_type = " TaskRegr" ))
130
-
131
92
# Helper function to implement piecewise linear encoding.
132
93
# * column: numeric vector
133
94
# * colname: name of `column`
@@ -149,3 +110,68 @@ encode_piecewise_linear = function(column, colname, bins) {
149
110
150
111
dt
151
112
}
113
+
114
#' PipeOpEncodePLQuantiles
#'
#' Piecewise linear encoding whose bin boundaries are the minimum, the
#' `numsplits - 1` equally spaced inner quantiles, and the maximum of each
#' encoded column. Duplicate boundaries are dropped.
#'
#' Hyperparameter `numsplits` (integer, at least 2) is initialized to `2`.
PipeOpEncodePLQuantiles = R6Class("PipeOpEncodePLQuantiles",
  inherit = PipeOpEncodePL,
  public = list(
    initialize = function(id = "encodeplquantiles", param_vals = list()) {
      # Named `param_set` instead of `ps` to avoid shadowing the `ps()` constructor.
      param_set = ps(
        numsplits = p_int(lower = 2, tags = c("train", "predict", "required"))
      )
      # A required parameter needs an actual value rather than a `default`;
      # values given in `param_vals` still take precedence over this one.
      param_set$values = list(numsplits = 2)
      super$initialize(id, param_set = param_set, param_vals = param_vals, packages = "stats")
    }
  ),
  private = list(

    # Returns a list with one numeric vector of bin boundaries per column in `cols`.
    .get_bins = function(task, cols) {
      numsplits = self$param_set$values$numsplits %??% 2
      # The probabilities are the same for every column, so compute them once.
      probs = seq_len(numsplits - 1) / numsplits
      lapply(task$data(cols = cols), function(d) {
        # Missing values are ignored for the outer boundaries as well, so they
        # are treated the same way as in the quantile computation.
        unique(c(min(d, na.rm = TRUE), stats::quantile(d, probs, na.rm = TRUE), max(d, na.rm = TRUE)))
      })
    }
  )
)

mlr_pipeops$add("encodeplquantiles", PipeOpEncodePLQuantiles)
137
+
138
#' PipeOpEncodePLTree
#'
#' Piecewise linear encoding whose inner bin boundaries are taken from the
#' `index` column of the `splits` of an rpart learner that is trained on each
#' encoded column separately. The column minimum and maximum are used as the
#' outer boundaries.
PipeOpEncodePLTree = R6Class("PipeOpEncodePLTree",
  inherit = PipeOpEncodePL,
  public = list(
    initialize = function(task_type, id = "encodepltree", param_vals = list()) {
      assert_choice(task_type, mlr_reflections$task_types$task)
      # Pick the rpart learner that matches the task type; anything else is rejected.
      private$.tree_learner = switch(task_type,
        TaskRegr = LearnerRegrRpart$new(),
        TaskClassif = LearnerClassifRpart$new(),
        stopf("Task type %s not supported.", task_type)
      )

      # The parameter set is handed over unevaluated (`alist`), so it has to be
      # referenced through `private$` rather than through a local variable.
      super$initialize(
        id,
        param_set = alist(private$.tree_learner$param_set),
        param_vals = param_vals,
        packages = private$.tree_learner$packages,
        task_type = task_type
      )
    }
  ),
  private = list(

    # Learner used to derive the split points.
    .tree_learner = NULL,

    # Returns a list, named by column, with the bin boundaries of each column in `cols`.
    .get_bins = function(task, cols) {
      tree = private$.tree_learner

      bins = lapply(cols, function(col) {
        # Train on a copy of the task that only contains the current feature.
        single_feature_task = task$clone(deep = TRUE)$select(col)
        fitted = tree$train(single_feature_task)
        # Split points are stored in column "index" of the model's splits.
        inner = unname(sort(fitted$model$splits[, "index"]))
        values = task$data(cols = col)
        c(min(values), inner, max(values))
      })
      names(bins) = cols
      bins
    }
  )
)

# The dictionary needs a concrete `task_type` to construct the PipeOp. "TaskRegr" is
# registered here, but "TaskClassif" is equally acceptable.
# TODO: link the issue that tracks this.
mlr_pipeops$add("encodepltree", PipeOpEncodePLTree, list(task_type = "TaskRegr"))
0 commit comments