Skip to content

WIP Groupby perf #124

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions benchmark/REQUIRE
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
DataTables
22 changes: 22 additions & 0 deletions benchmark/benchmarks.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
using PkgBenchmark
using Query
using DataTables

# Benchmark group "Query": group-by queries run over a two-column DataTable.
@benchgroup "Query" begin
# Input data: N random floats (column A) and N random integers drawn from
# 1:100 (column B, used below as the grouping key).
N = 100_000_000;
A = rand(N);
B = rand(1:100, N);
dt = DataTable([A, B], [:A, :B]);

# Group column A by column B, take the mean of each group, and collect the
# result into a DataTable.
@bench "group" @from i in $dt begin
@group i.A by i.B into g
@select {m = mean(g)}
@collect DataTable
end

# NOTE(review): this query is textually identical to "group" above --
# presumably a placeholder for a second variant; confirm or remove.
@bench "group2" @from i in $dt begin
@group i.A by i.B into g
@select {m = mean(g)}
@collect DataTable
end
end
32 changes: 32 additions & 0 deletions benchmark/perf.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
using DataTables, Query

# Test data: N random floats (column A) and N random integers drawn from
# 1:100 (column B, the grouping key).
N = 100_000_000;
A = rand(N);
B = rand(1:100, N);

# Build the table exactly once, with both columns wrapped in NullableArray.
# Constructing any other table into `dt` first would be dead work at this
# size, since only the last assignment is ever measured.
dt = DataTable(A = NullableArray(A), B = NullableArray(B));

# Baseline: DataTables' `by`, applying `mean` to column :A of each :B group.
@time by(dt, :B, d -> mean(d[:A]));

# The same aggregation expressed as a Query.jl group-by.
@time x = @from i in dt begin
    @group i.A by i.B into g
    @select {m = mean(g)}
    @collect DataTable
end;

"""
    foo1(dt)

Call `by(dt, :B, ...)`, applying `mean` to column `:A` of each sub-table it is handed,
and return the result.
"""
foo1(dt) = by(dt, :B, group -> mean(group[:A]))

"""
    foo2(dt)

Run a Query.jl group-by over `dt` (values of `A` grouped by `B`, one mean `m` per
group) and return the result collected as a `DataTable`.
"""
function foo2(dt)
    return @from row in dt begin
        @group row.A by row.B into grp
        @select {m = mean(grp)}
        @collect DataTable
    end
end

# Time both implementations through their function wrappers.
@time foo1(dt);
@time foo2(dt);

# Collect a sampling profile of the Query.jl implementation.
@profile foo2(dt);
39 changes: 26 additions & 13 deletions src/enumerable/enumerable_groupby.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ immutable Grouping{TKey,T} <: AbstractArray{T,1}
end

import Base.size
# A Grouping is a one-dimensional array over its stored elements, so its size is the
# 1-tuple holding the element count. Exactly one `size` method is defined for this
# signature; a second definition would only overwrite the first.
size{TKey,T}(A::Grouping{TKey,T}) = (length(A.elements),)
# Declare linear indexing for every Grouping type.
Base.IndexStyle(::Type{<:Grouping}) = IndexLinear()
import Base.getindex
# Indexing a Grouping indexes straight into its stored elements.
function getindex{TKey,T}(A::Grouping{TKey,T}, i)
    return A.elements[i]
end
import Base.length
Expand Down Expand Up @@ -63,6 +64,10 @@ immutable EnumerableGroupBy{T,TKey,TR,SO,ES<:Function,RS<:Function} <: Enumerabl
resultSelector::RS
end

# IterableTables size trait: every EnumerableGroupBy type reports `HasLengthAfterStart()`.
IterableTables.iteratorsize2(::Type{<:EnumerableGroupBy}) = IterableTables.HasLengthAfterStart()

# Length given an iteration state: the length of the state's first component.
Base.length(iter::EnumerableGroupBy, state) = length(state[1])

# Element type of an instance: the first type parameter.
Base.eltype{T,TKey,TR,SO,ES}(iter::EnumerableGroupBy{T,TKey,TR,SO,ES}) = T

# Element type from the type itself. All six type parameters are spelled out: a
# `Type{EnumerableGroupBy{T,TKey,TR,SO,ES}}` signature leaves `RS` free and therefore
# only matches that partially-applied type, never a concrete EnumerableGroupBy type.
Base.eltype{T,TKey,TR,SO,ES,RS}(iter::Type{EnumerableGroupBy{T,TKey,TR,SO,ES,RS}}) = T
Expand All @@ -85,25 +90,33 @@ end

# TODO This should be rewritten as a lazy iterator
# Begin iteration: eagerly bucket every source element by its key, then return a lazy
# iterator over the resulting groups together with that iterator's own start state.
# The iteration state is the tuple `(values_iterator, values_iterator_state)`.
function start{T,TKey,TR,SO,ES}(iter::EnumerableGroupBy{T,TKey,TR,SO,ES})
    result = OrderedDict{TKey,Grouping{TKey,TR}}()
    for i in iter.source
        key = iter.elementSelector(i)
        let key=key
            # Single lookup: fetch the group for this key, creating an empty one the
            # first time the key is seen.
            g = get!(result, key) do
                return Grouping{TKey, TR}(key,Array{TR,1}(0))
            end
            # Each source element contributes exactly one entry to its group.
            push!(g.elements,iter.resultSelector(i))
        end
    end
    dict_iterator = values(result)
    return dict_iterator,start(dict_iterator)
end

# Produce the next group and the advanced state by delegating to the wrapped values
# iterator; the iterator itself is carried along unchanged in the state tuple.
function next{T,TKey,TR,SO,ES}(iter::EnumerableGroupBy{T,TKey,TR,SO,ES}, state)
    dict_iterator, dict_iterator_state = state
    v, dict_iterator_state_new = next(dict_iterator, dict_iterator_state)
    return v, (dict_iterator, dict_iterator_state_new)
end

# Iteration is finished exactly when the wrapped values iterator is finished.
function done{T,TKey,TR,SO,ES}(iter::EnumerableGroupBy{T,TKey,TR,SO,ES}, state)
    dict_iterator, dict_iterator_state = state
    return done(dict_iterator, dict_iterator_state)
end
4 changes: 4 additions & 0 deletions src/enumerable/enumerable_select.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ immutable EnumerableSelect{T, S, Q<:Function} <: Enumerable
f::Q
end

# Forward the IterableTables size trait to type parameter `S`
# (presumably the type of the wrapped source -- confirm against the struct definition).
function IterableTables.iteratorsize2{T,S,Q}(::Type{EnumerableSelect{T,S,Q}})
    return IterableTables.iteratorsize2(S)
end

# Forward Base's iterator-size trait to type parameter `S` in the same way.
function Base.iteratorsize{T,S,Q}(::Type{EnumerableSelect{T,S,Q}})
    return Base.iteratorsize(S)
end

# Element type of an instance: the first type parameter.
function Base.eltype{T,S,Q}(iter::EnumerableSelect{T,S,Q})
    return T
end
Expand All @@ -11,6 +13,8 @@ Base.eltype{T,S,Q}(iter::Type{EnumerableSelect{T,S,Q}}) = T

# Length of a select: the length of its source.
function Base.length{T,S,Q}(iter::EnumerableSelect{T,S,Q})
    return length(iter.source)
end

# State-aware length: forwarded to the source together with the iteration state.
function Base.length{T,S,Q}(iter::EnumerableSelect{T,S,Q}, state)
    return length(iter.source, state)
end

function select(source::Enumerable, f::Function, f_expr::Expr)
TS = eltype(source)
T = Base.return_types(f, (TS,))[1]
Expand Down
5 changes: 2 additions & 3 deletions src/sources/source_iterable.jl
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,8 @@ function start{T,S}(iter::EnumerableIterable{T,S})
return start(iter.source)
end

# Advance iteration by delegating directly to the wrapped source. The (value, state)
# tuple from the source is returned as-is, and the method is marked `@inline` since it
# only forwards the call.
@inline function next{T,S}(iter::EnumerableIterable{T,S}, state)
    return next(iter.source, state)
end

function done{T,S}(iter::EnumerableIterable{T,S}, state)
Expand Down