diff --git a/Project.toml b/Project.toml index b66b346..08facf1 100644 --- a/Project.toml +++ b/Project.toml @@ -17,9 +17,11 @@ Tables = "0.2" julia = "1" [extras] +AxisArrays = "39de3d68-74b9-583c-8d2d-e117c070f3a9" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["DataFrames", "RDatasets", "Test"] +test = ["AxisArrays", "DataFrames", "Dates", "RDatasets", "Test"] diff --git a/README.md b/README.md index 6943409..c744373 100644 --- a/README.md +++ b/README.md @@ -111,4 +111,10 @@ julia> Impute.interp(df) |> Impute.locf() |> Impute.nocb() │ 469 │ -247.6 │ -180.7 │ -70.9 │ 33.7 │ 114.8 │ 222.5 │ ``` -**Warning**: Your approach should depend on the properties of you data (e.g., [MCAR, MAR, MNAR](https://en.wikipedia.org/wiki/Missing_data#Types_of_missing_data)). +**Warning:** + +- Your approach should depend on the properties of you data (e.g., [MCAR, MAR, MNAR](https://en.wikipedia.org/wiki/Missing_data#Types_of_missing_data)). +- In-place calls aren't guaranteedto mutate the original data, but it will try avoid copying if possible. + In the future, it may be possible to detect whether in-place operations are permitted on an array or table using traits: + - https://github.com/JuliaData/Tables.jl/issues/116 + - https://github.com/JuliaDiffEq/ArrayInterface.jl/issues/22 diff --git a/docs/src/index.md b/docs/src/index.md index 31a547f..a2ddf2e 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -52,4 +52,10 @@ Finally, we can chain multiple simple methods together to give a complete datase Impute.interp(df) |> Impute.locf() |> Impute.nocb() ``` -Warning: Your approach should depend on the properties of you data (e.g., [MCAR, MAR, MNAR](https://en.wikipedia.org/wiki/Missing_data#Types_of_missing_data)). +**Warning:** + +- Your approach should depend on the properties of you data (e.g., [MCAR, MAR, MNAR](https://en.wikipedia.org/wiki/Missing_data#Types_of_missing_data)). +- In-place calls aren't guaranteedto mutate the original data, but it will try avoid copying if possible. + In the future, it may be possible to detect whether in-place operations are permitted on an array or table using traits: + - https://github.com/JuliaData/Tables.jl/issues/116 + - https://github.com/JuliaDiffEq/ArrayInterface.jl/issues/22 diff --git a/src/imputors/drop.jl b/src/imputors/drop.jl index ab7a628..2b86da6 100644 --- a/src/imputors/drop.jl +++ b/src/imputors/drop.jl @@ -29,13 +29,16 @@ end # TODO: Switch to using Base.@kwdef on 1.1 DropObs(; context=Context()) = DropObs(context) -function impute!(data::AbstractVector, imp::DropObs) - imp.context() do c - filter!(x -> !ismissing!(c, x), data) - end +# Special case impute! for vectors because we know filter! will work +function impute!(data::Vector, imp::DropObs) + imp.context(c -> filter!(x -> !ismissing!(c, x), data)) +end + +function impute(data::AbstractVector, imp::DropObs) + imp.context(c -> filter(x -> !ismissing!(c, x), data)) end -function impute!(data::AbstractMatrix, imp::DropObs; dims=1) +function impute(data::AbstractMatrix, imp::DropObs; dims=1) imp.context() do c return filterobs(data; dims=dims) do obs !ismissing!(c, obs) @@ -43,11 +46,7 @@ function impute!(data::AbstractMatrix, imp::DropObs; dims=1) end end -# Deleting elements from subarrays doesn't work so we need to collect that data into -# a separate array. -impute!(data::SubArray, imp::DropObs) = impute!(collect(data), imp::DropObs) - -function impute!(table, imp::DropObs) +function impute(table, imp::DropObs) imp.context() do c @assert istable(table) rows = Tables.rows(table) @@ -96,48 +95,29 @@ end # TODO: Switch to using Base.@kwdef on 1.1 DropVars(; context=Context()) = DropVars(context) -function impute!(data::AbstractMatrix, imp::DropVars; dims=1) - return filtervars(data; dims=dims) do var - try - imp.context() do c - for x in var - ismissing!(c, x) - end - end - return true - catch e - if isa(e, ImputeError) - return false - else - rethrow(e) - end +function impute(data::AbstractMatrix, imp::DropVars; dims=1) + imp.context() do c + return filtervars(data; dims=dims) do vars + !ismissing!(c, vars) end end end -function impute!(table, imp::DropVars) +function impute(table, imp::DropVars) istable(table) || throw(MethodError(impute!, (table, imp))) cols = Tables.columns(table) - cnames = Iterators.filter(propertynames(cols)) do cname - try - imp.context() do c - col = getproperty(cols, cname) - for i in eachindex(col) - ismissing!(c, col[i]) - end - end - return true - catch e - if isa(e, ImputeError) - return false - else - rethrow(e) - end + imp.context() do c + cnames = Iterators.filter(propertynames(cols)) do cname + !ismissing!(c, getproperty(cols, cname)) end - end - selected = Tables.select(table, cnames...) - table = materializer(table)(selected) - return table + selected = Tables.select(table, cnames...) + table = materializer(table)(selected) + return table + end end + +# Add impute! methods to override the default behaviour in imputors.jl +impute!(data::AbstractMatrix, imp::Union{DropObs, DropVars}) = impute(data, imp) +impute!(data, imp::Union{DropObs, DropVars}) = impute(data, imp) diff --git a/src/imputors/fill.jl b/src/imputors/fill.jl index 395fb9d..abaefc6 100644 --- a/src/imputors/fill.jl +++ b/src/imputors/fill.jl @@ -38,7 +38,7 @@ function impute!(data::AbstractVector, imp::Fill) imp.context() do c fill_val = if isa(imp.value, Function) # Call `deepcopy` because we can trust that it's available for all types. - imp.value(Impute.drop(deepcopy(data); context=c)) + imp.value(Impute.drop(data; context=c)) else imp.value end diff --git a/test/runtests.jl b/test/runtests.jl index e5cd3ac..3a134c9 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,7 +1,9 @@ using Impute using Tables using Test +using AxisArrays using DataFrames +using Dates using RDatasets using Statistics using StatsBase @@ -18,67 +20,211 @@ import Impute: WeightedContext, ImputeError + @testset "Impute" begin + # Defining our missing datasets a = allowmissing(1.0:1.0:20.0) a[[2, 3, 7]] .= missing mask = map(!ismissing, a) ctx = Context(; limit=0.2) - @testset "Equality $T" for T in (DropObs, DropVars, Interpolate, Fill, LOCF, NOCB) - @test T() == T() + # We call collect to not have a wrapper type that references the same data. + m = collect(reshape(a, 5, 4)) + + aa = AxisArray( + deepcopy(m), + Axis{:time}(DateTime(2017, 6, 5, 5):Hour(1):DateTime(2017, 6, 5, 9)), + Axis{:id}(1:4) + ) + + table = DataFrame( + :sin => allowmissing(sin.(1.0:1.0:20.0)), + :cos => allowmissing(sin.(1.0:1.0:20.0)), + ) + + table.sin[[2, 3, 7, 12, 19]] .= missing + + @testset "Equality" begin + @testset "$T" for T in (DropObs, DropVars, Interpolate, Fill, LOCF, NOCB) + @test T() == T() + end end @testset "Drop" begin @testset "DropObs" begin - result = impute(a, DropObs(; context=ctx)) - expected = copy(a) - deleteat!(expected, [2, 3, 7]) + @testset "Vector" begin + result = impute(a, DropObs(; context=ctx)) + expected = deleteat!(deepcopy(a), [2, 3, 7]) - @test result == expected - @test result == Impute.dropobs(a; context=ctx) + @test result == expected + @test result == Impute.dropobs(a; context=ctx) - a2 = copy(a) - Impute.dropobs!(a2; context=ctx) - @test a2 == expected + a2 = deepcopy(a) + Impute.dropobs!(a2; context=ctx) + @test a2 == expected + end + + @testset "Matrix" begin + # Because we're removing 2 of our 5 rows we need to change the limit. + ctx = Context(; limit=0.4) + result = impute(m, DropObs(; context=ctx)) + expected = m[[1, 4, 5], :] + + @test isequal(result, expected) + @test isequal(result, Impute.dropobs(m; context=ctx)) + @test isequal(collect(result'), Impute.dropobs(collect(m'); dims=2, context=ctx)) + + m_ = Impute.dropobs!(m; context=ctx) + # The mutating test is broken because we need to making a copy of + # the original matrix + @test_broken isequal(m, expected) + @test isequal(m_, expected) + end + + @testset "Tables" begin + ctx = Context(; limit=0.4) + @testset "DataFrame" begin + df = deepcopy(table) + result = impute(df, DropObs(; context=ctx)) + expected = dropmissing(df) + + @test isequal(result, expected) + @test isequal(result, Impute.dropobs(df; context=ctx)) + + df_ = Impute.dropobs!(df; context=ctx) + # The mutating test is broken because we need to making a copy of + # the original table + @test_broken isequal(df, expected) + @test isequal(df_, expected) + end + + @testset "Column Table" begin + coltab = Tables.columntable(table) + + result = impute(coltab, DropObs(; context=ctx)) + expected = Tables.columntable(dropmissing(table)) + + @test isequal(result, expected) + @test isequal(result, Impute.dropobs(coltab; context=ctx)) + + coltab_ = Impute.dropobs!(coltab; context=ctx) + # The mutating test is broken because we need to making a copy of + # the original table + @test_broken isequal(coltab, expected) + @test isequal(coltab_, expected) + end + + @testset "Row Table" begin + rowtab = Tables.rowtable(table) + result = impute(rowtab, DropObs(; context=ctx)) + expected = Tables.rowtable(dropmissing(table)) + + @test isequal(result, expected) + @test isequal(result, Impute.dropobs(rowtab; context=ctx)) + + rowtab_ = Impute.dropobs!(rowtab; context=ctx) + # The mutating test is broken because we need to making a copy of + # the original table + # @test_broken isequal(rowtab, expected) + @test isequal(rowtab_, expected) + end + end + + @testset "AxisArray" begin + # Because we're removing 2 of our 5 rows we need to change the limit. + ctx = Context(; limit=0.4) + result = impute(aa, DropObs(; context=ctx)) + expected = aa[[1, 4, 5], :] + + @test isequal(result, expected) + @test isequal(result, Impute.dropobs(aa; context=ctx)) + + aa_ = Impute.dropobs!(aa; context=ctx) + # The mutating test is broken because we need to making a copy of + # the original matrix + @test_broken isequal(aa, expected) + @test isequal(aa_, expected) + end end + @testset "DropVars" begin @testset "Vector" begin @test_throws MethodError Impute.dropvars(a) end @testset "Matrix" begin - m = reshape(a, 5, 4) - + ctx = Context(; limit=0.5) result = impute(m, DropVars(; context=ctx)) - expected = copy(m)[:, 2:4] + expected = copy(m)[:, 3:4] @test isequal(result, expected) @test isequal(result, Impute.dropvars(m; context=ctx)) - @test isequal(result', Impute.dropvars(m'; dims=2, context=ctx)) + @test isequal(collect(result'), Impute.dropvars(collect(m'); dims=2, context=ctx)) - Impute.dropvars!(m; context=ctx) + m_ = Impute.dropvars!(m; context=ctx) # The mutating test is broken because we need to making a copy of # the original matrix @test_broken isequal(m, expected) + @test isequal(m_, expected) end - @testset "DataFrame" begin - df = DataFrame( - :sin => allowmissing(sin.(1.0:1.0:20.0)), - :cos => allowmissing(sin.(1.0:1.0:20.0)), - ) - df.sin[[2, 3, 7, 12, 19]] .= missing - df.cos[[4, 9]] .= missing - result = impute(df, DropVars(; context=ctx)) - expected = select(df, :cos) + @testset "Tables" begin + @testset "DataFrame" begin + df = deepcopy(table) + result = impute(df, DropVars(; context=ctx)) + expected = select(df, :cos) + + @test isequal(result, expected) + @test isequal(result, Impute.dropvars(df; context=ctx)) + + Impute.dropvars!(df; context=ctx) + # The mutating test is broken because we need to making a copy of + # the original table + @test_broken isequal(df, expected) + end + + @testset "Column Table" begin + coltab = Tables.columntable(table) + + result = impute(coltab, DropVars(; context=ctx)) + expected = Tables.columntable(Tables.select(coltab, :cos)) + + @test isequal(result, expected) + @test isequal(result, Impute.dropvars(coltab; context=ctx)) + + Impute.dropvars!(coltab; context=ctx) + # The mutating test is broken because we need to making a copy of + # the original table + @test_broken isequal(coltab, expected) + end + + @testset "Row Table" begin + rowtab = Tables.rowtable(table) + result = impute(rowtab, DropVars(; context=ctx)) + expected = Tables.rowtable(Tables.select(rowtab, :cos)) + + @test isequal(result, expected) + @test isequal(result, Impute.dropvars(rowtab; context=ctx)) + + Impute.dropvars!(rowtab; context=ctx) + # The mutating test is broken because we need to making a copy of + # the original table + @test_broken isequal(rowtab, expected) + end + end + @testset "AxisArray" begin + ctx = Context(; limit=0.5) + result = impute(aa, DropVars(; context=ctx)) + expected = copy(aa)[:, 3:4] @test isequal(result, expected) - @test isequal(result, Impute.dropvars(df; context=ctx)) + @test isequal(result, Impute.dropvars(aa; context=ctx)) - Impute.dropvars!(df; context=ctx) + aa_ = Impute.dropvars!(aa; context=ctx) # The mutating test is broken because we need to making a copy of - # the original table - @test_broken isequal(df, expected) + # the original matrix + @test_broken isequal(aa, expected) + @test isequal(aa_, expected) end end end @@ -129,6 +275,20 @@ import Impute: Impute.fill!(a2; context=ctx) @test a2 == result end + + @testset "Matrix" begin + ctx = Context(; limit=1.0) + expected = Matrix(Impute.dropobs(dataset("boot", "neuro"); context=ctx)) + data = Matrix(dataset("boot", "neuro")) + + result = impute(data, Fill(; value=0.0, context=ctx)) + @test size(result) == size(data) + @test result == Impute.fill(data; value=0.0, context=ctx) + + data2 = copy(data) + Impute.fill!(data2; value=0.0, context=ctx) + @test data2 == result + end end @testset "LOCF" begin @@ -161,69 +321,6 @@ import Impute: @test a2 == result end - @testset "DataFrame" begin - ctx = Context(; limit=1.0) - @testset "Single DataFrame" begin - data = dataset("boot", "neuro") - df = impute(data, Interpolate(; context=ctx)) - @test isequal(df, Impute.interp(data; context=ctx)) - end - @testset "GroupedDataFrame" begin - hod = repeat(1:24, 12 * 10) - obj = repeat(1:12, 24 * 10) - n = length(hod) - - df = DataFrame( - :hod => hod, - :obj => obj, - :val => allowmissing( - [sin(x) * cos(y) for (x, y) in zip(hod, obj)] - ), - ) - - df.val[rand(1:n, 20)] .= missing - gdf1 = groupby(deepcopy(df), [:hod, :obj]) - gdf2 = groupby(df, [:hod, :obj]) - - f1 = Impute.interp(; context=ctx) ∘ Impute.locf!() ∘ Impute.nocb!() - f2 = Impute.interp!(; context=ctx) ∘ Impute.locf!() ∘ Impute.nocb!() - - result = vcat(f1.(gdf1)...) - @test df != result - @test size(result) == (24 * 12 * 10, 3) - @test all(!ismissing, Tables.matrix(result)) - - # Test that we can also mutate the dataframe directly - f2.(gdf2) - @test result == sort(df, (:hod, :obj)) - end - end - - @testset "Matrix" begin - ctx = Context(; limit=1.0) - expected = Matrix(Impute.dropobs(dataset("boot", "neuro"); context=ctx)) - data = Matrix(dataset("boot", "neuro")) - - @testset "Drop" begin - result = impute(data, DropObs(; context=ctx)) - @test size(result, 1) == 4 - @test result == Impute.dropobs(data; context=ctx, dims=1) - - @test result == expected - @test Impute.dropobs(data'; dims=2, context=ctx) == expected' - end - - @testset "Fill" begin - result = impute(data, Fill(; value=0.0, context=ctx)) - @test size(result) == size(data) - @test result == Impute.fill(data; value=0.0, context=ctx) - - data2 = copy(data) - Impute.fill!(data2; value=0.0, context=ctx) - @test data2 == result - end - end - @testset "Not enough data" begin ctx = Context(; limit=0.1) @test_throws ImputeError impute(a, DropObs(; context=ctx)) @@ -256,6 +353,36 @@ import Impute: result3 = impute(orig, imp) @test result == result2 @test result == result3 + + @testset "GroupedDataFrame" begin + hod = repeat(1:24, 12 * 10) + obj = repeat(1:12, 24 * 10) + n = length(hod) + + df = DataFrame( + :hod => hod, + :obj => obj, + :val => allowmissing( + [sin(x) * cos(y) for (x, y) in zip(hod, obj)] + ), + ) + + df.val[rand(1:n, 20)] .= missing + gdf1 = groupby(deepcopy(df), [:hod, :obj]) + gdf2 = groupby(df, [:hod, :obj]) + + f1 = Impute.interp(; context=ctx) ∘ Impute.locf!() ∘ Impute.nocb!() + f2 = Impute.interp!(; context=ctx) ∘ Impute.locf!() ∘ Impute.nocb!() + + result = vcat(f1.(gdf1)...) + @test df != result + @test size(result) == (24 * 12 * 10, 3) + @test all(!ismissing, Tables.matrix(result)) + + # Test that we can also mutate the dataframe directly + f2.(gdf2) + @test result == sort(df, (:hod, :obj)) + end end @testset "Column Table" begin @@ -270,6 +397,18 @@ import Impute: @test all(!ismissing, result) end + @testset "Row Table" begin + result = Tables.rowtable(orig) |> + Impute.interp!(; context=ctx) |> + Impute.locf!() |> + Impute.nocb!() |> + Tables.matrix + + @test size(result) == size(orig) + # Confirm that we don't have any more missing values + @test all(!ismissing, result) + end + @testset "Matrix" begin data = Matrix(orig) result = Impute.interp(data; context=ctx) |> Impute.locf!() |> Impute.nocb!() @@ -278,6 +417,19 @@ import Impute: # Confirm that we don't have any more missing values @test all(!ismissing, result) end + + @testset "AxisArray" begin + data = AxisArray( + Matrix(orig), + Axis{:row}(1:size(orig, 1)), + Axis{:V}(names(orig)), + ) + result = Impute.interp(data; context=ctx) |> Impute.locf!() |> Impute.nocb!() + + @test size(result) == size(data) + # Confirm that we don't have any more missing values + @test all(!ismissing, result) + end end @testset "Alternate missing functions" begin @@ -288,8 +440,8 @@ import Impute: @test Impute.dropobs(data1; context=ctx1) == dropmissing(data1) - result1 = Impute.interp(data1; context=ctx1) |> Impute.dropobs!() - result2 = Impute.interp(data2; context=ctx2) |> Impute.dropobs!(; context=ctx2) + result1 = Impute.interp(data1; context=ctx1) |> Impute.dropobs() + result2 = Impute.interp(data2; context=ctx2) |> Impute.dropobs(; context=ctx2) @test result1 == result2 end