Skip to content

Implement GroupBy.idxmin and GroupBy.idxmax #585

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 53 additions & 1 deletion dask_expr/_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,16 @@ class Size(SingleAggregation):
groupby_aggregate = M.sum


class IdxMin(SingleAggregation):
    """Groupby reduction that pairs ``idxmin`` with ``first``.

    ``groupby_chunk`` is set to ``M.idxmin`` and ``groupby_aggregate`` to
    ``M.first``; how the two hooks are scheduled is decided by
    ``SingleAggregation`` (not visible here).
    """

    # NOTE(review): presumably the aggregate hook combines the per-chunk
    # results; ``first`` keeps a single chunk's answer per group -- confirm
    # this is the intended result when a group spans several chunks.
    groupby_aggregate = M.first
    groupby_chunk = M.idxmin


class IdxMax(IdxMin):
    """Groupby reduction that pairs ``idxmax`` with ``first``.

    Mirrors ``IdxMin`` with the chunk hook swapped for ``M.idxmax``.
    """

    # Restated explicitly even though the parent class sets the same value.
    groupby_aggregate = M.first
    groupby_chunk = M.idxmax


class ValueCounts(SingleAggregation):
groupby_chunk = staticmethod(_value_counts)
groupby_aggregate = staticmethod(_value_counts_aggregate)
Expand Down Expand Up @@ -1010,7 +1020,7 @@ def __init__(

def _numeric_only_kwargs(self, numeric_only):
kwargs = {"numeric_only": numeric_only}
return {"chunk_kwargs": kwargs, "aggregate_kwargs": kwargs}
return {"chunk_kwargs": kwargs.copy(), "aggregate_kwargs": kwargs.copy()}

def _single_agg(
self,
Expand Down Expand Up @@ -1128,6 +1138,26 @@ def size(self, **kwargs):
def value_counts(self, **kwargs):
    """Run the ``ValueCounts`` aggregation, forwarding every keyword argument."""
    result = self._single_agg(ValueCounts, **kwargs)
    return result

def idxmin(
    self, split_every=None, split_out=1, skipna=True, numeric_only=False, **kwargs
):
    """Dispatch the ``IdxMin`` aggregation through ``_single_agg``.

    Parameters
    ----------
    split_every, split_out
        Forwarded unchanged to ``_single_agg``.
    skipna
        Added to the chunk keyword arguments only.
    numeric_only
        Passed to ``_numeric_only_kwargs``, which places it in both the chunk
        and the aggregate keyword arguments.
    **kwargs
        Accepted but currently ignored (see the TODO below).
    """
    # TODO: Add shuffle and remove kwargs
    agg_options = self._numeric_only_kwargs(numeric_only)
    # ``skipna`` is attached to the chunk step only.
    agg_options["chunk_kwargs"].update(skipna=skipna)
    return self._single_agg(
        IdxMin,
        split_every=split_every,
        split_out=split_out,
        **agg_options,
    )

def idxmax(
    self, split_every=None, split_out=1, skipna=True, numeric_only=False, **kwargs
):
    """Dispatch the ``IdxMax`` aggregation through ``_single_agg``.

    Parameters
    ----------
    split_every, split_out
        Forwarded unchanged to ``_single_agg``.
    skipna
        Added to the chunk keyword arguments only.
    numeric_only
        Passed to ``_numeric_only_kwargs``, which places it in both the chunk
        and the aggregate keyword arguments.
    **kwargs
        Accepted but currently ignored (see the TODO below).
    """
    # TODO: Add shuffle and remove kwargs
    agg_options = self._numeric_only_kwargs(numeric_only)
    # ``skipna`` is attached to the chunk step only.
    agg_options["chunk_kwargs"].update(skipna=skipna)
    return self._single_agg(
        IdxMax,
        split_every=split_every,
        split_out=split_out,
        **agg_options,
    )

def head(self, n=5, split_every=None, split_out=1):
chunk_kwargs = {"n": n}
aggregate_kwargs = {
Expand Down Expand Up @@ -1372,6 +1402,28 @@ def aggregate(self, arg=None, split_every=8, split_out=1, **kwargs):

# Short alias: ``agg`` refers to the same callable as ``aggregate``.
agg = aggregate

def idxmin(
    self, split_every=None, split_out=1, skipna=True, numeric_only=False, **kwargs
):
    """Dispatch the ``IdxMin`` aggregation through ``_single_agg``.

    Parameters
    ----------
    split_every, split_out
        Forwarded unchanged to ``_single_agg``.
    skipna
        Passed to the chunk step as ``chunk_kwargs={"skipna": skipna}``.
    numeric_only
        Accepted for signature compatibility but not forwarded.
    **kwargs
        Accepted but ignored.
    """
    # pandas doesn't support numeric_only here, which is odd
    return self._single_agg(
        IdxMin,
        # Honour the caller's value; a hard-coded ``None`` here silently
        # discarded the argument, unlike the sibling ``idxmax``.
        split_every=split_every,
        split_out=split_out,
        chunk_kwargs=dict(skipna=skipna),
    )

def idxmax(
    self, split_every=None, split_out=1, skipna=True, numeric_only=False, **kwargs
):
    """Dispatch the ``IdxMax`` aggregation through ``_single_agg``.

    Parameters
    ----------
    split_every, split_out
        Forwarded unchanged to ``_single_agg``.
    skipna
        Passed to the chunk step as ``chunk_kwargs={"skipna": skipna}``.
    numeric_only
        Accepted for signature compatibility but not forwarded.
    **kwargs
        Accepted but ignored.
    """
    # pandas doesn't support numeric_only here, which is odd
    chunk_options = {"skipna": skipna}
    return self._single_agg(
        IdxMax,
        split_every=split_every,
        split_out=split_out,
        chunk_kwargs=chunk_options,
    )

def nunique(self, split_every=None, split_out=True):
slice = self._slice or self.obj.name
return new_collection(
Expand Down
3 changes: 2 additions & 1 deletion dask_expr/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ def test_groupby_unsupported_by(pdf, df):

@pytest.mark.parametrize("split_every", [None, 5])
@pytest.mark.parametrize(
"api", ["sum", "mean", "min", "max", "prod", "first", "last", "var", "std"]
"api",
["sum", "mean", "min", "max", "prod", "first", "last", "var", "std", "idxmin"],
)
@pytest.mark.parametrize(
"numeric_only",
Expand Down