Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 17ea905

Browse files
authored Jan 11, 2022
Merge pull request #366 from machow/support-pandas-1.3
Support pandas 1.3
2 parents 6bb551a + 380eba0 commit 17ea905

16 files changed

+72
-48
lines changed
 

‎.github/workflows/ci.yml

+8-8
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,6 @@ jobs:
1717
requirements: ['-r requirements.txt']
1818
include:
1919
# historical requirements
20-
- name: "Minimum install_requires versions"
21-
requirements: numpy~=1.12.0 pandas~=0.24.0 SQLAlchemy~=1.2.19 psycopg2~=2.7.0 PyMySQL==1.0.2
22-
pytest_flags: --ignore=siuba/dply/forcats.py siuba
23-
python-version: 3.6
24-
- name: "2019-late dependencies"
25-
requirements: numpy==1.17.4 pandas==0.24.2 SQLAlchemy==1.2.19 psycopg2==2.8.4 PyMySQL==1.0.2
26-
pytest_flags: --ignore=siuba/dply/forcats.py siuba
27-
python-version: 3.6
2820
- name: "2020-early dependencies"
2921
requirements: numpy==1.17.4 pandas~=0.25.3 SQLAlchemy~=1.3.11 psycopg2~=2.8.4 PyMySQL==1.0.2
3022
pytest_flags: --ignore=siuba/dply/forcats.py siuba
@@ -37,6 +29,14 @@ jobs:
3729
python-version: 3.8
3830
requirements: numpy~=1.19.1 pandas~=1.1.0 SQLAlchemy~=1.4.13 psycopg2~=2.8.5 PyMySQL==1.0.2
3931
latest: true
32+
- name: "2022-early dependencies"
33+
python-version: 3.8
34+
requirements: numpy~=1.22.0 pandas~=1.3.5 SQLAlchemy~=1.4.29 psycopg2-binary~=2.9.3 PyMySQL==1.0.2
35+
latest: true
36+
- name: "2022-early dependencies"
37+
python-version: 3.10.1
38+
requirements: numpy~=1.22.0 pandas~=1.3.5 SQLAlchemy~=1.4.29 psycopg2-binary~=2.9.3 PyMySQL==1.0.2
39+
latest: true
4040

4141
steps:
4242
- uses: actions/checkout@v2

‎requirements-dev.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ pytest==5.3.5
6161
python-dateutil==2.8.1
6262
pytz==2020.1
6363
PyYAML==5.3.1
64-
pyzmq==19.0.0
64+
pyzmq==22.3.0
6565
requests==2.24.0
6666
scipy==1.5.2
6767
six==1.14.0

‎requirements-test.txt

+5-5
Original file line numberDiff line numberDiff line change
@@ -13,22 +13,22 @@ jsonschema==3.2.0
1313
jupyter-client==6.0.0
1414
jupyter-core==4.6.3
1515
more-itertools==8.2.0
16-
nbformat==5.0.4
17-
nbval==0.9.5
16+
nbformat==5.1.3
17+
nbval==0.9.6
1818
packaging==20.3
1919
parso==0.6.2
2020
pexpect==4.8.0
2121
pickleshare==0.7.5
2222
pluggy==0.13.1
2323
prompt-toolkit==3.0.3
2424
ptyprocess==0.6.0
25-
py==1.8.1
25+
py==1.11.0
2626
Pygments==2.5.2
2727
pyparsing==2.4.6
2828
pyrsistent==0.15.7
29-
pytest==5.3.5
29+
pytest==6.2.5
3030
python-dateutil==2.8.1
31-
pyzmq==19.0.0
31+
pyzmq==22.3.0
3232
six==1.14.0
3333
sortedcontainers==2.1.0
3434
tornado==6.0.4

‎siuba/dply/forcats.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@ def fct_reorder(fct, x, func = np.median, desc = False) -> pd.Categorical:
1616
func: function run over all values within a level of the categorical.
1717
desc: whether to sort in descending order.
1818
19+
Note that NaN categories can't be ordered. When func returns NaN, sorting
20+
is always done with NaNs last.
21+
22+
1923
Examples:
2024
>>> fct_reorder(['a', 'a', 'b'], [4, 3, 2])
2125
['a', 'a', 'b']
@@ -34,11 +38,11 @@ def fct_reorder(fct, x, func = np.median, desc = False) -> pd.Categorical:
3438
x_vals = x.values if isinstance(x, pd.Series) else x
3539
s = pd.Series(x_vals, index = fct)
3640

37-
# for each cat, calc agg func, make values of ordered the codes
41+
# sort groups by calculated agg func. note that groupby uses dropna=True by default,
42+
# but that's okay, since pandas categoricals can't order the NA category
3843
ordered = s.groupby(level = 0).agg(func).sort_values(ascending = not desc)
39-
ordered[:] = np.arange(len(ordered))
40-
codes = ordered[s.index.values]
41-
return pd.Categorical.from_codes(codes, list(ordered.index))
44+
45+
return pd.Categorical(fct, categories=ordered.index)
4246

4347

4448
# fct_recode ------------------------------------------------------------------

‎siuba/dply/vector.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -270,19 +270,19 @@ def coalesce(x, *args):
270270
*args: other Series that are the same length as x, or a scalar
271271
272272
Examples:
273-
>>> x = pd.Series([1., None, None])
273+
>>> x = pd.Series([1.1, None, None])
274274
>>> abc = pd.Series(['a', 'b', None])
275275
>>> xyz = pd.Series(['x', 'y', 'z'])
276276
>>> coalesce(x, abc)
277-
0 1
277+
0 1.1
278278
1 b
279279
2 None
280280
dtype: object
281281
282282
>>> coalesce(x, abc, xyz)
283-
0 1
284-
1 b
285-
2 z
283+
0 1.1
284+
1 b
285+
2 z
286286
dtype: object
287287
288288
"""

‎siuba/dply/verbs.py

+11-1
Original file line numberDiff line numberDiff line change
@@ -992,8 +992,18 @@ def nest(__data, *args, key = "data"):
992992
g_df = __data.groupby(grp_keys)
993993
splitter = g_df.grouper._get_splitter(g_df.obj[nest_keys])
994994

995+
# TODO: iterating over splitter now only produces 1 item (the dataframe)
996+
# check backwards compat
997+
def _extract_subdf_pandas_1_3(entry):
998+
# in pandas < 1.3, splitter.__iter__ returns tuple entries (ii, df)
999+
if isinstance(entry, tuple):
1000+
return entry[1]
1001+
1002+
# in pandas 1.3, each entry is just the dataframe
1003+
return entry
1004+
9951005
result_index = g_df.grouper.result_index
996-
nested_dfs = [x for ii, x in splitter]
1006+
nested_dfs = [_extract_subdf_pandas_1_3(x) for x in splitter]
9971007

9981008
out = pd.DataFrame({key: nested_dfs}, index = result_index).reset_index()
9991009

‎siuba/experimental/datetime.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ def _get_series_dispatcher(f, x):
104104
105105
>>> per = pd.PeriodIndex([a_date], freq = "S")
106106
>>> floor_date(per, "M")
107-
PeriodIndex(['2020-02'], dtype='period[M]', freq='M')
107+
PeriodIndex(['2020-02'], dtype='period[M]'...
108108
109109
"""
110110

‎siuba/experimental/pd_groups/groupby.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,11 @@
88
from pandas import Series
99
from pandas.api.types import is_scalar
1010
from pandas.core.groupby import SeriesGroupBy, DataFrameGroupBy
11-
from pandas.core import algorithms
11+
12+
try:
13+
from pandas.core.algorithms import take_1d
14+
except ImportError:
15+
from pandas.core.array_algos.take import take_1d
1216

1317

1418
# Custom SeriesGroupBy class ==================================================
@@ -114,7 +118,7 @@ def _broadcast_agg_gba(groupby):
114118

115119
src = groupby._orig_obj
116120
ids, _, ngroup = groupby._orig_grouper.group_info
117-
out = algorithms.take_1d(groupby.obj._values, ids)
121+
out = take_1d(groupby.obj._values, ids)
118122

119123
# Note: reductions like siuba.dply.vector.n(_) map DataFrameGroupBy -> GroupByAgg,
120124
# so the underlying object is a DataFrame, and does not have a .name attribute.

‎siuba/meta_hook.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
"""
2+
DEPRECATED.
3+
4+
Note that this module was experimental, and created very early in siuba's development.
5+
You should not rely on it for anything important.
6+
"""
7+
18
from importlib.abc import Loader, MetaPathFinder
29
from importlib.machinery import ModuleSpec
310
from importlib.util import find_spec
@@ -55,7 +62,8 @@ def exec_module(self, module):
5562
#self.orig_loader.exec_module(self.orig_module)
5663

5764
#for k,v in self.orig_module.__dict__.items():
58-
for k,v in self.orig_module.__dict__.items():
65+
all_items = list(self.orig_module.__dict__.items())
66+
for k,v in all_items:
5967
if k.startswith('_'):
6068
module.__dict__[k] = v
6169
else:

‎siuba/ops/generics.py

-2
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
ops_infix = Namespace(
66
__add__ = operation('__add__', 'elwise', 2),
77
__and__ = operation('__and__', 'elwise', 2),
8-
__div__ = operation('__div__', 'elwise', 2),
98
__eq__ = operation('__eq__', 'elwise', 2),
109
__floordiv__ = operation('__floordiv__', 'elwise', 2),
1110
__ge__ = operation('__ge__', 'elwise', 2),
@@ -22,7 +21,6 @@
2221
__pow__ = operation('__pow__', 'elwise', 2),
2322
__radd__ = operation('__radd__', 'elwise', 2),
2423
__rand__ = operation('__rand__', 'elwise', 2),
25-
__rdiv__ = operation('__rdiv__', 'elwise', 2),
2624
__rfloordiv__ = operation('__rfloordiv__', 'elwise', 2),
2725
__rmod__ = operation('__rmod__', 'elwise', 2),
2826
__rmul__ = operation('__rmul__', 'elwise', 2),

‎siuba/ops/support/examples.yml

-2
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ T: _.T
22
__add__: _ + _
33
__and__: _ & _
44
__array__: _.__array__()
5-
__div__: _.__div__(_)
65
__eq__: _ == _
76
__floordiv__: _ // _
87
__ge__: _ >= _
@@ -19,7 +18,6 @@ __pos__: +_
1918
__pow__: _**_
2019
__radd__: _ + _
2120
__rand__: _ & _
22-
__rdiv__: _.__rdiv__(_)
2321
__rfloordiv__: _ // _
2422
__rmod__: _ % _
2523
__rmul__: _ * _

‎siuba/sql/dialects/base.py

+3-5
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,6 @@ def req_bool(f):
157157
# infix ----
158158
__add__ = sql_colmeth("__add__"),
159159
__and__ = req_bool(sql_colmeth("__and__")),
160-
__div__ = sql_colmeth("__div__"),
161160
__eq__ = sql_colmeth("__eq__"),
162161
__floordiv__ = sql_func_floordiv,
163162
__ge__ = sql_colmeth("__ge__"),
@@ -174,7 +173,6 @@ def req_bool(f):
174173
__pow__ = sql_not_impl(),
175174
__radd__ = sql_colmeth("__radd__"),
176175
__rand__ = req_bool(sql_colmeth("__rand__")),
177-
__rdiv__ = sql_colmeth("__rdiv__"),
178176
__rfloordiv__ = lambda x, y: sql_func_floordiv(y, x),
179177
__rmod__ = sql_colmeth("__rmod__"),
180178
__rmul__ = sql_colmeth("__rmul__"),
@@ -193,8 +191,8 @@ def req_bool(f):
193191

194192
add = sql_colmeth("__add__"),
195193
#and =
196-
div = sql_colmeth("__div__"),
197-
divide = sql_colmeth("__div__"),
194+
div = sql_colmeth("__truediv__"),
195+
divide = sql_colmeth("__truediv__"),
198196
#divmod =
199197
eq = sql_colmeth("__eq__"),
200198
#floordiv = sql_colmeth("__floordiv__"),
@@ -208,7 +206,7 @@ def req_bool(f):
208206
ne = sql_colmeth("__ne__"),
209207
pow = sql_not_impl(),
210208
radd = sql_colmeth("__radd__"),
211-
rdiv = sql_colmeth("__rdiv__"),
209+
rdiv = sql_colmeth("__rtruediv__"),
212210
#rdivmod =
213211
#rfloordiv = sql_colmeth("__pow__"),
214212
rmod = sql_colmeth("__rmod__"),

‎siuba/sql/dialects/mysql.py

-2
Original file line numberDiff line numberDiff line change
@@ -73,11 +73,9 @@ def sql_func_between(col, left, right, inclusive=True):
7373

7474
# copied from postgres. MYSQL does true division over ints by default,
7575
# but it does not produce double precision.
76-
__div__ = sql_func_truediv,
7776
div = sql_func_truediv,
7877
divide = sql_func_truediv,
7978
rdiv = lambda x,y: sql_func_truediv(y, x),
80-
__rdiv__ = lambda x, y: sql_func_truediv(y, x),
8179

8280
__truediv__ = sql_func_truediv,
8381
truediv = sql_func_truediv,

‎siuba/sql/dialects/postgresql.py

-2
Original file line numberDiff line numberDiff line change
@@ -72,11 +72,9 @@ def sql_func_truediv(x, y):
7272

7373
# infix and infix methods ----
7474

75-
__div__ = sql_func_truediv,
7675
div = sql_func_truediv,
7776
divide = sql_func_truediv,
7877
rdiv = lambda x,y: sql_func_truediv(y, x),
79-
__rdiv__ = lambda x, y: sql_func_truediv(y, x),
8078

8179
__truediv__ = sql_func_truediv,
8280
truediv = sql_func_truediv,

‎siuba/tests/test_dply_forcats.py

+6
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,12 @@ def test_fct_reorder_custom_func():
3434

3535
assert_fct_equal(res, dst)
3636

37+
def test_fct_reorder_na_fct():
38+
import numpy as np
39+
res = fct_reorder([None, 'x', 'y'], [4, 3, 2], np.max)
40+
dst = Categorical([None, 'x', 'y'], ['y', 'x'])
41+
42+
assert_fct_equal(res, dst)
3743

3844
# fct_recode ------------------------------------------------------------------
3945

‎siuba/tests/test_dply_series_methods.py

+9-7
Original file line numberDiff line numberDiff line change
@@ -263,15 +263,17 @@ def test_pandas_grouped_frame_fast_mutate(entry):
263263
res = fast_mutate(gdf, result = call_expr)
264264
dst = mutate(gdf, result = call_expr)
265265

266-
# TODO: apply mark to skip failing tests, rather than downcast
267-
# pandas grouped aggs, when not using cython, _try_cast back to original type
268-
# but since mutate uses apply, it doesn't :/. Currently only affects median func.
269-
dst_obj = dst.obj
266+
# TODO: apply mark to skip failing tests, rather than casting?
267+
# in pandas 1.2, grouped agg returns int, ungrouped agg returns float
268+
# in pandas 1.3, grouped agg returns float, same as ungrouped agg
269+
# (the difference is because the grouped agg in 1.2 did not use cython,
270+
# and tries casting back to the original column dtype)
271+
res_obj = res.obj
270272
if str_expr == '_.x.median()':
271-
dst_obj['result'] = dst_obj['result'].astype(gdf.x.obj.dtype)
273+
res_obj['result'] = res_obj['result'].astype(float)
272274

273275
assert isinstance(dst, DataFrameGroupBy)
274-
assert_frame_equal(res.obj, dst_obj)
276+
assert_frame_equal(res_obj, dst.obj)
275277

276278

277279
@pytest.mark.skip_backend('sqlite')
@@ -324,7 +326,7 @@ def test_pandas_grouped_frame_fast_summarize(agg_entry):
324326
# pandas grouped aggs, when not using cython, _try_cast back to original type
325327
# but since summarize uses apply, it doesn't :/. Currently only affects median func.
326328
if str_expr == '_.x.median()':
327-
dst['result'] = dst['result'].astype(gdf.x.obj.dtype)
329+
res['result'] = res['result'].astype(float)
328330

329331
assert_frame_equal(res, dst)
330332

0 commit comments

Comments
 (0)
Please sign in to comment.