Skip to content

Commit e512171

Browse files
committed
Review changes.
1 parent 5f1f4d8 commit e512171

File tree

9 files changed

+91
-76
lines changed

9 files changed

+91
-76
lines changed

src/Impute.jl

+18-22
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ function __init__()
2828

2929
@warn(
3030
"""
31-
All matrix imputation methods will be switching to the JuliaStats column-major convention
31+
All matrix imputation methods will be switching to the column-major convention
3232
(e.g., each column corresponds to an observation, and each row corresponds to a variable).
3333
"""
3434
)
@@ -51,31 +51,27 @@ Base.showerror(io::IO, err::ImputeError) = println(io, "ImputeError: $(err.msg)"
5151
include("context.jl")
5252
include("imputors.jl")
5353

54-
const global imputation_methods = Dict{Symbol, Type}(
55-
:drop => DropObs,
56-
:dropobs => DropObs,
57-
:dropvars => DropVars,
58-
:interp => Interpolate,
59-
:fill => Fill,
60-
:locf => LOCF,
61-
:nocb => NOCB,
54+
const global imputation_methods = (
55+
drop = DropObs,
56+
dropobs = DropObs,
57+
dropvars = DropVars,
58+
interp = Interpolate,
59+
fill = Fill,
60+
locf = LOCF,
61+
nocb = NOCB,
6262
)
6363

6464
include("deprecated.jl")
6565

66-
let
67-
for (k, v) in imputation_methods
68-
local typename = nameof(v)
69-
local f = k
70-
local f! = Symbol(k, :!)
71-
72-
# NOTE: The
73-
@eval begin
74-
$f(data; kwargs...) = impute($typename(; _extract_context_kwargs(kwargs...)...), data)
75-
$f!(data; kwargs...) = impute!($typename(; _extract_context_kwargs(kwargs...)...), data)
76-
$f(; kwargs...) = data -> impute($typename(; _extract_context_kwargs(kwargs...)...), data)
77-
$f!(; kwargs...) = data -> impute!($typename(; _extract_context_kwargs(kwargs...)...), data)
78-
end
66+
for (f, v) in pairs(imputation_methods)
67+
typename = nameof(v)
68+
f! = Symbol(f, :!)
69+
70+
@eval begin
71+
$f(data; kwargs...) = impute($typename(; _extract_context_kwargs(kwargs...)...), data)
72+
$f!(data; kwargs...) = impute!($typename(; _extract_context_kwargs(kwargs...)...), data)
73+
$f(; kwargs...) = data -> impute($typename(; _extract_context_kwargs(kwargs...)...), data)
74+
$f!(; kwargs...) = data -> impute!($typename(; _extract_context_kwargs(kwargs...)...), data)
7975
end
8076
end
8177

src/context.jl

+40-33
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ An imputation context records summary information about missing data for an impu
55
"""
66
abstract type AbstractContext end
77

8+
# We implement a version of copy for all contexts which reconstructs the context from the
9+
# raw fields.
10+
Base.copy(ctx::T) where {T <: AbstractContext} = T(fieldvalues(ctx)...)
11+
812
"""
913
ismissing(ctx::AbstractContext, x) -> Bool
1014
@@ -18,23 +22,23 @@ exceeds our `ctx.limit` we throw an `ImputeError`
1822
* `x`: the value to check (may be a single value, abstract array or row)
1923
"""
2024
function Base.ismissing(ctx::AbstractContext, x)
21-
missing = if isa(x, NamedTuple)
22-
any(entry -> ctx.is_missing(entry[2]), pairs(x))
25+
was_missing = if isa(x, NamedTuple)
26+
any(ctx.is_missing, Tuple(x))
2327
elseif isa(x, AbstractArray)
2428
any(ctx.is_missing, x)
2529
else
2630
ctx.is_missing(x)
2731
end
2832

29-
missing_update!(ctx, missing)
33+
missing_update!(ctx, was_missing)
3034

31-
return missing
35+
return was_missing
3236
end
3337

3438
"""
3539
findfirst(ctx::AbstractContext, data::AbstractVector) -> Int
3640
37-
Returns the first not missing index in `data`.
41+
Returns the first non-missing index in `data`.
3842
3943
# Arguments
4044
* `ctx::AbstractContext`: the context to pass into `ismissing`
@@ -50,7 +54,7 @@ end
5054
"""
5155
findlast(ctx::AbstractContext, data::AbstractVector) -> Int
5256
53-
Returns the last not missing index in `data`.
57+
Returns the last non-missing index in `data`.
5458
5559
# Arguments
5660
* `ctx::AbstractContext`: the context to pass into `ismissing`
@@ -66,7 +70,7 @@ end
6670
"""
6771
findnext(ctx::AbstractContext, data::AbstractVector) -> Int
6872
69-
Returns the next not missing index in `data`.
73+
Returns the next non-missing index in `data`.
7074
7175
# Arguments
7276
* `ctx::AbstractContext`: the context to pass into `ismissing`
@@ -88,7 +92,7 @@ weighted.
8892
# Fields
8993
* `num::Int`: number of observations
9094
* `count::Int`: number of missing values found
91-
* `limit::Float64`: allowable limit for missing values to impute
95+
* `limit::Float64`: portion of total values allowed to be imputed (should be between 0.0 and 1.0).
9296
* `is_missing::Function`: returns a Bool if the value counts as missing
9397
* `on_complete::Function`: a function to run when imputation is complete
9498
"""
@@ -105,37 +109,35 @@ function Context(;
105109
is_missing::Function=ismissing,
106110
on_complete::Function=complete
107111
)
108-
Context(0, 0, limit, is_missing, on_complete)
112+
return Context(0, 0, limit, is_missing, on_complete)
109113
end
110114

111-
function (ctx::Context)(f::Function)
115+
function Base.empty(ctx::Context)
112116
_ctx = copy(ctx)
113117
_ctx.num = 0
114118
_ctx.count = 0
115119

116-
result = f(_ctx)
117-
ctx.on_complete(_ctx)
118-
return result
120+
return _ctx
119121
end
120122

121-
Base.copy(x::Context) = Context(x.num, x.count, x.limit, x.is_missing, x.on_complete)
122-
123-
function missing_update!(ctx::Context, miss)
123+
function missing_update!(ctx::Context, was_missing)
124124
ctx.num += 1
125125

126-
if miss
126+
if was_missing
127127
ctx.count += 1
128128
end
129129
end
130130

131-
function complete(ctx::Context)
131+
function complete(ctx::Context, data)
132132
missing_ratio = ctx.count / ctx.num
133133

134134
if missing_ratio > ctx.limit
135135
throw(ImputeError(
136136
"More than $(ctx.limit * 100)% of values were missing ($missing_ratio)."
137137
))
138138
end
139+
140+
return data
139141
end
140142

141143

@@ -149,11 +151,11 @@ This context type can be useful if some missing observation are more important t
149151
# Fields
150152
* `num::Int`: number of observations
151153
* `s::Float64`: sum of missing values weights
152-
* `limit::Float64`: allowable limit for missing values to impute
154+
* `limit::Float64`: portion of total values allowed to be imputed (should be between 0.0 and 1.0).
153155
* `is_missing::Function`: returns a Bool if the value counts as missing
154-
* `on_complete::Function`: a function to run when imputation is complete
156+
* `on_complete::Function`: a function to run when imputation is complete
155157
* `wv::AbstractWeights`: a set of statistical weights to use when evaluating the importance
156-
of each observation
158+
of each observation. Will be accumulated during imputation.
157159
"""
158160
mutable struct WeightedContext <: AbstractContext
159161
num::Int
@@ -170,37 +172,42 @@ function WeightedContext(
170172
is_missing::Function=ismissing,
171173
on_complete::Function=complete
172174
)
173-
WeightedContext(0, 0.0, limit, is_missing, on_complete, wv)
175+
return WeightedContext(0, 0.0, limit, is_missing, on_complete, wv)
174176
end
175177

176-
function (ctx::WeightedContext)(f::Function)
178+
function Base.empty(ctx::WeightedContext)
177179
_ctx = copy(ctx)
178180
_ctx.num = 0
179181
_ctx.s = 0.0
180182

181-
result = f(_ctx)
182-
ctx.on_complete(_ctx)
183-
return result
184-
end
185-
186-
function Base.copy(x::WeightedContext)
187-
WeightedContext(x.num, x.s, x.limit, x.is_missing, x.on_complete, x.wv)
183+
return _ctx
188184
end
189185

190-
function missing_update!(ctx::WeightedContext, miss)
186+
function missing_update!(ctx::WeightedContext, was_missing)
191187
ctx.num += 1
192188

193-
if miss
189+
if was_missing
194190
ctx.s += ctx.wv[ctx.num]
195191
end
196192
end
197193

198-
function complete(ctx::WeightedContext)
194+
function complete(ctx::WeightedContext, data)
199195
missing_ratio = ctx.s / sum(ctx.wv)
200196

201197
if missing_ratio > ctx.limit
202198
throw(ImputeError(
203199
"More than $(ctx.limit * 100)% of weighted values were missing ($missing_ratio)."
204200
))
205201
end
202+
203+
return data
204+
end
205+
206+
for T in [Context, WeightedContext]
207+
@eval begin
208+
function (ctx::$T)(f::Function)
209+
_ctx = empty(ctx)
210+
return ctx.on_complete(_ctx, f(_ctx))
211+
end
212+
end
206213
end

src/deprecated.jl

+4-2
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ Base.@deprecate(
1010
impute!(imp::Imputor, context::AbstractContext, data; kwargs...),
1111
impute!(typeof(imp)(; context=context), data; kwargs...)
1212
)
13+
1314
#####################################################################
1415
# Deprecate all impute calls where the first argument is an Imputor #
1516
#####################################################################
@@ -88,6 +89,7 @@ function impute(data, args...; kwargs...)
8889
""",
8990
:impute
9091
)
92+
# Call `deepcopy` because we can trust that it's available for all types.
9193
return impute!(deepcopy(data), args...; kwargs...)
9294
end
9395

@@ -148,8 +150,8 @@ function chain(data, args...; kwargs...)
148150
""",
149151
:chain
150152
)
151-
result = deepcopy(data)
152-
return chain!(data, args...; kwargs...)
153+
# Call `deepcopy` because we can trust that it's available for all types.
154+
return chain!(deepcopy(data), args...; kwargs...)
153155
end
154156

155157
#####################

src/imputors.jl

+4-3
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,11 @@ abstract type Imputor end
1313
"""
1414
impute(imp::Imputor, data)
1515
16-
Copies the `data` before calling the corresponding `impute!(imp, ...)` call.
16+
Returns a new copy of the `data` with the missing data imputed by the imputor `imp`.
1717
"""
1818
function impute(imp::Imputor, data)
19-
impute!(imp, deepcopy(data))
19+
# Call `deepcopy` because we can trust that it's available for all types.
20+
return impute!(imp, deepcopy(data))
2021
end
2122

2223
"""
@@ -33,7 +34,7 @@ if this is not the desired behaviour custom imputor methods should overload this
3334
* `AbstractMatrix`: the input `data` with values imputed
3435
"""
3536
function impute!(imp::Imputor, data::AbstractMatrix)
36-
for i in 1:size(data, 2)
37+
for i in axes(data, 2)
3738
impute!(imp, view(data, :, i))
3839
end
3940
return data

src/imputors/drop.jl

+5-5
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ struct DropObs <: Imputor
1111
context::AbstractContext
1212
end
1313

14-
"""DropObs(; context=Context()) -> DropObs"""
1514
DropObs(; context=Context()) = DropObs(context)
1615

1716
"""
@@ -54,7 +53,9 @@ NOTES (or premature optimizations):
5453
"""
5554
function impute!(imp::DropObs, data::AbstractMatrix)
5655
imp.context() do c
57-
mask = map(i -> !ismissing(c, data[i, :]), 1:size(data, 1))
56+
mask = map(axes(data, 1)) do i
57+
!ismissing(c, view(data, i, :))
58+
end
5859
return data[mask, :]
5960
end
6061
end
@@ -101,7 +102,6 @@ struct DropVars <: Imputor
101102
context::AbstractContext
102103
end
103104

104-
"""DropVars(; context=Context()) -> DropVars"""
105105
DropVars(; context=Context()) = DropVars(context)
106106

107107
"""
@@ -119,10 +119,10 @@ requires copying the matrix.
119119
* `AbstractMatrix`: a new matrix with missing columns removed
120120
"""
121121
function impute!(imp::DropVars, data::AbstractMatrix)
122-
mask = map(1:size(data, 2)) do i
122+
mask = map(axes(data, 2)) do i
123123
try
124124
imp.context() do c
125-
for j in 1:size(data, 1)
125+
for j in axes(data, 1)
126126
ismissing(c, data[j, i])
127127
end
128128
end

src/imputors/fill.jl

+5-3
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,15 @@ Fill(; value=mean, context=Context()) = Fill(value, context)
2020
"""
2121
impute!(imp::Fill, data::AbstractVector)
2222
23-
Computes the fill value if `imp.value` is a `Function` (i.e., `imp.value(drop(copy(data)))`)
24-
and replaces all missing values in the `data` with that value.
23+
Fill in missing values with a value determined by `imp.value`.
24+
If `imp.value` is a function then the fill value is calculated by invoking that function on
25+
the collection of all nonmissing values.
2526
"""
2627
function impute!(imp::Fill, data::AbstractVector)
2728
imp.context() do c
2829
fill_val = if isa(imp.value, Function)
29-
imp.value(Iterators.drop(copy(data); context=c))
30+
# Call `deepcopy` because we can trust that it's available for all types.
31+
imp.value(Iterators.drop(deepcopy(data); context=c))
3032
else
3133
imp.value
3234
end

src/imputors/interp.jl

+1-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ function impute!(imp::Interpolate, data::AbstractVector{<:Union{T, Missing}}) wh
2323
imp.context() do c
2424
i = findfirst(c, data) + 1
2525

26-
while i < length(data)
26+
while i < lastindex(data)
2727
if ismissing(c, data[i])
2828
prev_idx = i - 1
2929
next_idx = findnext(c, data, i + 1)

src/imputors/locf.jl

+10-6
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
"""
2+
LOCF <: Imputor
3+
4+
Last observation carried forward. Fill in missing values with the most recent
5+
nonmissing value.
6+
7+
See also:
8+
- [NOCB](@ref): Next Observation Carried Backward
9+
"""
110
struct LOCF <: Imputor
211
context::AbstractContext
312
end
@@ -14,16 +23,11 @@ existing observation.
1423
WARNING: missing elements at the head of the array may not be imputed if there is no
1524
existing observation to carry forward. As a result, this method does not guarantee
1625
that all missing values will be imputed.
17-
18-
# Usage
19-
```
20-
21-
```
2226
"""
2327
function impute!(imp::LOCF, data::AbstractVector)
2428
imp.context() do c
2529
start_idx = findfirst(c, data) + 1
26-
for i in start_idx:length(data)
30+
for i in start_idx:lastindex(data)
2731
if ismissing(c, data[i])
2832
data[i] = data[i-1]
2933
end

0 commit comments

Comments
 (0)