Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 17ea905

Browse files
authored Jan 11, 2022
Merge pull request #366 from machow/support-pandas-1.3
Support pandas 1.3
2 parents 6bb551a + 380eba0 commit 17ea905

16 files changed

+72
-48
lines changed
 

‎.github/workflows/ci.yml

+8-8
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,6 @@ jobs:
1717
requirements: ['-r requirements.txt']
1818
include:
1919
# historical requirements
20-
- name: "Minimum install_requires versions"
21-
requirements: numpy~=1.12.0 pandas~=0.24.0 SQLAlchemy~=1.2.19 psycopg2~=2.7.0 PyMySQL==1.0.2
22-
pytest_flags: --ignore=siuba/dply/forcats.py siuba
23-
python-version: 3.6
24-
- name: "2019-late dependencies"
25-
requirements: numpy==1.17.4 pandas==0.24.2 SQLAlchemy==1.2.19 psycopg2==2.8.4 PyMySQL==1.0.2
26-
pytest_flags: --ignore=siuba/dply/forcats.py siuba
27-
python-version: 3.6
2820
- name: "2020-early dependencies"
2921
requirements: numpy==1.17.4 pandas~=0.25.3 SQLAlchemy~=1.3.11 psycopg2~=2.8.4 PyMySQL==1.0.2
3022
pytest_flags: --ignore=siuba/dply/forcats.py siuba
@@ -37,6 +29,14 @@ jobs:
3729
python-version: 3.8
3830
requirements: numpy~=1.19.1 pandas~=1.1.0 SQLAlchemy~=1.4.13 psycopg2~=2.8.5 PyMySQL==1.0.2
3931
latest: true
32+
- name: "2022-early dependencies"
33+
python-version: 3.8
34+
requirements: numpy~=1.22.0 pandas~=1.3.5 SQLAlchemy~=1.4.29 psycopg2-binary~=2.9.3 PyMySQL==1.0.2
35+
latest: true
36+
- name: "2022-early dependencies"
37+
python-version: 3.10.1
38+
requirements: numpy~=1.22.0 pandas~=1.3.5 SQLAlchemy~=1.4.29 psycopg2-binary~=2.9.3 PyMySQL==1.0.2
39+
latest: true
4040

4141
steps:
4242
- uses: actions/checkout@v2

‎requirements-dev.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ pytest==5.3.5
6161
python-dateutil==2.8.1
6262
pytz==2020.1
6363
PyYAML==5.3.1
64-
pyzmq==19.0.0
64+
pyzmq==22.3.0
6565
requests==2.24.0
6666
scipy==1.5.2
6767
six==1.14.0

‎requirements-test.txt

+5-5
Original file line numberDiff line numberDiff line change
@@ -13,22 +13,22 @@ jsonschema==3.2.0
1313
jupyter-client==6.0.0
1414
jupyter-core==4.6.3
1515
more-itertools==8.2.0
16-
nbformat==5.0.4
17-
nbval==0.9.5
16+
nbformat==5.1.3
17+
nbval==0.9.6
1818
packaging==20.3
1919
parso==0.6.2
2020
pexpect==4.8.0
2121
pickleshare==0.7.5
2222
pluggy==0.13.1
2323
prompt-toolkit==3.0.3
2424
ptyprocess==0.6.0
25-
py==1.8.1
25+
py==1.11.0
2626
Pygments==2.5.2
2727
pyparsing==2.4.6
2828
pyrsistent==0.15.7
29-
pytest==5.3.5
29+
pytest==6.2.5
3030
python-dateutil==2.8.1
31-
pyzmq==19.0.0
31+
pyzmq==22.3.0
3232
six==1.14.0
3333
sortedcontainers==2.1.0
3434
tornado==6.0.4

‎siuba/dply/forcats.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@ def fct_reorder(fct, x, func = np.median, desc = False) -> pd.Categorical:
1616
func: function run over all values within a level of the categorical.
1717
desc: whether to sort in descending order.
1818
19+
Note that NaN categories can't be ordered. When func returns NaN, sorting
20+
is always done with NaNs last.
21+
22+
1923
Examples:
2024
>>> fct_reorder(['a', 'a', 'b'], [4, 3, 2])
2125
['a', 'a', 'b']
@@ -34,11 +38,11 @@ def fct_reorder(fct, x, func = np.median, desc = False) -> pd.Categorical:
3438
x_vals = x.values if isinstance(x, pd.Series) else x
3539
s = pd.Series(x_vals, index = fct)
3640

37-
# for each cat, calc agg func, make values of ordered the codes
41+
# sort groups by calculated agg func. note that groupby uses dropna=True by default,
42+
# but that's okay, since pandas categoricals can't order the NA category
3843
ordered = s.groupby(level = 0).agg(func).sort_values(ascending = not desc)
39-
ordered[:] = np.arange(len(ordered))
40-
codes = ordered[s.index.values]
41-
return pd.Categorical.from_codes(codes, list(ordered.index))
44+
45+
return pd.Categorical(fct, categories=ordered.index)
4246

4347

4448
# fct_recode ------------------------------------------------------------------

‎siuba/dply/vector.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -270,19 +270,19 @@ def coalesce(x, *args):
270270
*args: other Series that are the same length as x, or a scalar
271271
272272
Examples:
273-
>>> x = pd.Series([1., None, None])
273+
>>> x = pd.Series([1.1, None, None])
274274
>>> abc = pd.Series(['a', 'b', None])
275275
>>> xyz = pd.Series(['x', 'y', 'z'])
276276
>>> coalesce(x, abc)
277-
0 1
277+
0 1.1
278278
1 b
279279
2 None
280280
dtype: object
281281
282282
>>> coalesce(x, abc, xyz)
283-
0 1
284-
1 b
285-
2 z
283+
0 1.1
284+
1 b
285+
2 z
286286
dtype: object
287287
288288
"""

‎siuba/dply/verbs.py

+11-1
Original file line numberDiff line numberDiff line change
@@ -992,8 +992,18 @@ def nest(__data, *args, key = "data"):
992992
g_df = __data.groupby(grp_keys)
993993
splitter = g_df.grouper._get_splitter(g_df.obj[nest_keys])
994994

995+
# TODO: iterating over splitter now only produces 1 item (the dataframe)
996+
# check backwards compat
997+
def _extract_subdf_pandas_1_3(entry):
998+
# in pandas < 1.3, splitter.__iter__ returns tuple entries (ii, df)
999+
if isinstance(entry, tuple):
1000+
return entry[1]
1001+
1002+
# in pandas 1.3, each entry is just the dataframe
1003+
return entry
1004+
9951005
result_index = g_df.grouper.result_index
996-
nested_dfs = [x for ii, x in splitter]
1006+
nested_dfs = [_extract_subdf_pandas_1_3(x) for x in splitter]
9971007

9981008
out = pd.DataFrame({key: nested_dfs}, index = result_index).reset_index()
9991009

‎siuba/experimental/datetime.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ def _get_series_dispatcher(f, x):
104104
105105
>>> per = pd.PeriodIndex([a_date], freq = "S")
106106
>>> floor_date(per, "M")
107-
PeriodIndex(['2020-02'], dtype='period[M]', freq='M')
107+
PeriodIndex(['2020-02'], dtype='period[M]'...
108108
109109
"""
110110

‎siuba/experimental/pd_groups/groupby.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,11 @@
88
from pandas import Series
99
from pandas.api.types import is_scalar
1010
from pandas.core.groupby import SeriesGroupBy, DataFrameGroupBy
11-
from pandas.core import algorithms
11+
12+
try:
13+
from pandas.core.algorithms import take_1d
14+
except ImportError:
15+
from pandas.core.array_algos.take import take_1d
1216

1317

1418
# Custom SeriesGroupBy class ==================================================
@@ -114,7 +118,7 @@ def _broadcast_agg_gba(groupby):
114118

115119
src = groupby._orig_obj
116120
ids, _, ngroup = groupby._orig_grouper.group_info
117-
out = algorithms.take_1d(groupby.obj._values, ids)
121+
out = take_1d(groupby.obj._values, ids)
118122

119123
# Note: reductions like siuba.dply.vector.n(_) map DataFrameGroupBy -> GroupByAgg,
120124
# so the underlying object is a DataFrame, and does not have a .name attribute.

‎siuba/meta_hook.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
"""
2+
DEPRECATED.
3+
4+
Note that this module was experimental, and created very early in siuba's development.
5+
You should not rely on it for anything important.
6+
"""
7+
18
from importlib.abc import Loader, MetaPathFinder
29
from importlib.machinery import ModuleSpec
310
from importlib.util import find_spec
@@ -55,7 +62,8 @@ def exec_module(self, module):
5562
#self.orig_loader.exec_module(self.orig_module)
5663

5764
#for k,v in self.orig_module.__dict__.items():
58-
for k,v in self.orig_module.__dict__.items():
65+
all_items = list(self.orig_module.__dict__.items())
66+
for k,v in all_items:
5967
if k.startswith('_'):
6068
module.__dict__[k] = v
6169
else:

‎siuba/ops/generics.py

-2
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
ops_infix = Namespace(
66
__add__ = operation('__add__', 'elwise', 2),
77
__and__ = operation('__and__', 'elwise', 2),
8-
__div__ = operation('__div__', 'elwise', 2),
98
__eq__ = operation('__eq__', 'elwise', 2),
109
__floordiv__ = operation('__floordiv__', 'elwise', 2),
1110
__ge__ = operation('__ge__', 'elwise', 2),
@@ -22,7 +21,6 @@
2221
__pow__ = operation('__pow__', 'elwise', 2),
2322
__radd__ = operation('__radd__', 'elwise', 2),
2423
__rand__ = operation('__rand__', 'elwise', 2),
25-
__rdiv__ = operation('__rdiv__', 'elwise', 2),
2624
__rfloordiv__ = operation('__rfloordiv__', 'elwise', 2),
2725
__rmod__ = operation('__rmod__', 'elwise', 2),
2826
__rmul__ = operation('__rmul__', 'elwise', 2),

‎siuba/ops/support/examples.yml

-2
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ T: _.T
22
__add__: _ + _
33
__and__: _ & _
44
__array__: _.__array__()
5-
__div__: _.__div__(_)
65
__eq__: _ == _
76
__floordiv__: _ // _
87
__ge__: _ >= _
@@ -19,7 +18,6 @@ __pos__: +_
1918
__pow__: _**_
2019
__radd__: _ + _
2120
__rand__: _ & _
22-
__rdiv__: _.__rdiv__(_)
2321
__rfloordiv__: _ // _
2422
__rmod__: _ % _
2523
__rmul__: _ * _

‎siuba/sql/dialects/base.py

+3-5
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,6 @@ def req_bool(f):
157157
# infix ----
158158
__add__ = sql_colmeth("__add__"),
159159
__and__ = req_bool(sql_colmeth("__and__")),
160-
__div__ = sql_colmeth("__div__"),
161160
__eq__ = sql_colmeth("__eq__"),
162161
__floordiv__ = sql_func_floordiv,
163162
__ge__ = sql_colmeth("__ge__"),
@@ -174,7 +173,6 @@ def req_bool(f):
174173
__pow__ = sql_not_impl(),
175174
__radd__ = sql_colmeth("__radd__"),
176175
__rand__ = req_bool(sql_colmeth("__rand__")),
177-
__rdiv__ = sql_colmeth("__rdiv__"),
178176
__rfloordiv__ = lambda x, y: sql_func_floordiv(y, x),
179177
__rmod__ = sql_colmeth("__rmod__"),
180178
__rmul__ = sql_colmeth("__rmul__"),
@@ -193,8 +191,8 @@ def req_bool(f):
193191

194192
add = sql_colmeth("__add__"),
195193
#and =
196-
div = sql_colmeth("__div__"),
197-
divide = sql_colmeth("__div__"),
194+
div = sql_colmeth("__truediv__"),
195+
divide = sql_colmeth("__truediv__"),
198196
#divmod =
199197
eq = sql_colmeth("__eq__"),
200198
#floordiv = sql_colmeth("__floordiv__"),
@@ -208,7 +206,7 @@ def req_bool(f):
208206
ne = sql_colmeth("__ne__"),
209207
pow = sql_not_impl(),
210208
radd = sql_colmeth("__radd__"),
211-
rdiv = sql_colmeth("__rdiv__"),
209+
rdiv = sql_colmeth("__rtruediv__"),
212210
#rdivmod =
213211
#rfloordiv = sql_colmeth("__pow__"),
214212
rmod = sql_colmeth("__rmod__"),

‎siuba/sql/dialects/mysql.py

-2
Original file line numberDiff line numberDiff line change
@@ -73,11 +73,9 @@ def sql_func_between(col, left, right, inclusive=True):
7373

7474
# copied from postgres. MYSQL does true division over ints by default,
7575
# but it does not produce double precision.
76-
__div__ = sql_func_truediv,
7776
div = sql_func_truediv,
7877
divide = sql_func_truediv,
7978
rdiv = lambda x,y: sql_func_truediv(y, x),
80-
__rdiv__ = lambda x, y: sql_func_truediv(y, x),
8179

8280
__truediv__ = sql_func_truediv,
8381
truediv = sql_func_truediv,

‎siuba/sql/dialects/postgresql.py

-2
Original file line numberDiff line numberDiff line change
@@ -72,11 +72,9 @@ def sql_func_truediv(x, y):
7272

7373
# infix and infix methods ----
7474

75-
__div__ = sql_func_truediv,
7675
div = sql_func_truediv,
7776
divide = sql_func_truediv,
7877
rdiv = lambda x,y: sql_func_truediv(y, x),
79-
__rdiv__ = lambda x, y: sql_func_truediv(y, x),
8078

8179
__truediv__ = sql_func_truediv,
8280
truediv = sql_func_truediv,

‎siuba/tests/test_dply_forcats.py

+6
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,12 @@ def test_fct_reorder_custom_func():
3434

3535
assert_fct_equal(res, dst)
3636

37+
def test_fct_reorder_na_fct():
38+
import numpy as np
39+
res = fct_reorder([None, 'x', 'y'], [4, 3, 2], np.max)
40+
dst = Categorical([None, 'x', 'y'], ['y', 'x'])
41+
42+
assert_fct_equal(res, dst)
3743

3844
# fct_recode ------------------------------------------------------------------
3945

‎siuba/tests/test_dply_series_methods.py

+9-7
Original file line numberDiff line numberDiff line change
@@ -263,15 +263,17 @@ def test_pandas_grouped_frame_fast_mutate(entry):
263263
res = fast_mutate(gdf, result = call_expr)
264264
dst = mutate(gdf, result = call_expr)
265265

266-
# TODO: apply mark to skip failing tests, rather than downcast
267-
# pandas grouped aggs, when not using cython, _try_cast back to original type
268-
# but since mutate uses apply, it doesn't :/. Currently only affects median func.
269-
dst_obj = dst.obj
266+
# TODO: apply mark to skip failing tests, rather than casting?
267+
# in pandas 1.2, grouped agg returns int, ungrouped agg returns float
268+
# in pandas 1.3, grouped agg returns float, same as ungrouped agg
269+
# (the difference is because the grouped agg in 1.2 did not use cython,
270+
# and tries casting back to the original column dtype)
271+
res_obj = res.obj
270272
if str_expr == '_.x.median()':
271-
dst_obj['result'] = dst_obj['result'].astype(gdf.x.obj.dtype)
273+
res_obj['result'] = res_obj['result'].astype(float)
272274

273275
assert isinstance(dst, DataFrameGroupBy)
274-
assert_frame_equal(res.obj, dst_obj)
276+
assert_frame_equal(res_obj, dst.obj)
275277

276278

277279
@pytest.mark.skip_backend('sqlite')
@@ -324,7 +326,7 @@ def test_pandas_grouped_frame_fast_summarize(agg_entry):
324326
# pandas grouped aggs, when not using cython, _try_cast back to original type
325327
# but since summarize uses apply, it doesn't :/. Currently only affects median func.
326328
if str_expr == '_.x.median()':
327-
dst['result'] = dst['result'].astype(gdf.x.obj.dtype)
329+
res['result'] = res['result'].astype(float)
328330

329331
assert_frame_equal(res, dst)
330332

0 commit comments

Comments
 (0)
Please sign in to comment.