diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index d721213dc38e7..13ec91343f644 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -200,6 +200,7 @@ Other enhancements - :class:`Holiday` has gained the constructor argument and field ``exclude_dates`` to exclude specific datetimes from a custom holiday calendar (:issue:`54382`) - :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`) - :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`) +- :class:`StringDtype` now supports addition to Series/DataFrame with floats, ints, and strings (:issue:`61581`) - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`) - :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`) - :meth:`DataFrame.apply` supports using third-party execution engines like the Bodo.ai JIT compiler (:issue:`60668`) @@ -227,7 +228,6 @@ Other enhancements - Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`) - Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`) - Switched wheel upload to **PyPI Trusted Publishing** (OIDC) for release-tag pushes in ``wheels.yml``. (:issue:`61718`) -- .. --------------------------------------------------------------------------- .. _whatsnew_300.notable_bug_fixes: @@ -998,6 +998,7 @@ MultiIndex - :func:`MultiIndex.get_level_values` accessing a :class:`DatetimeIndex` does not carry the frequency attribute along (:issue:`58327`, :issue:`57949`) - Bug in :class:`DataFrame` arithmetic operations in case of unaligned MultiIndex columns (:issue:`60498`) - Bug in :class:`DataFrame` arithmetic operations with :class:`Series` in case of unaligned MultiIndex (:issue:`61009`) +- Bug in :class:`DataFrame` arithmetic operations with :class:`Series` now works with ``fill_value`` parameter (:issue:`61581`) - Bug in :meth:`MultiIndex.from_tuples` causing wrong output with input of type tuples having NaN values (:issue:`60695`, :issue:`60988`) - Bug in :meth:`DataFrame.__setitem__` where column alignment logic would reindex the assigned value with an empty index, incorrectly setting all values to ``NaN``.(:issue:`61841`) - Bug in :meth:`DataFrame.reindex` and :meth:`Series.reindex` where reindexing :class:`Index` to a :class:`MultiIndex` would incorrectly set all values to ``NaN``.(:issue:`60923`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 2eed608908440..322f118da6d10 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -890,7 +890,17 @@ def _op_method_error_message(self, other, op) -> str: def _evaluate_op_method(self, other, op, arrow_funcs) -> Self: pa_type = self._pa_array.type other_original = other - other = self._box_pa(other) + try: + other = self._box_pa(other) + except pa.lib.ArrowTypeError: + # was expecting time dtype but received non-temporal dtype (time offset) + from pandas.core.tools.timedeltas import to_timedelta + + other = self._box_pa(to_timedelta(other)) + except ValueError as err: + raise TypeError( + "Incompatible type when converting to PyArrow dtype for operation." + ) from err if ( pa.types.is_string(pa_type) @@ -898,17 +908,36 @@ def _evaluate_op_method(self, other, op, arrow_funcs) -> Self: or pa.types.is_binary(pa_type) ): if op in [operator.add, roperator.radd]: - sep = pa.scalar("", type=pa_type) - try: - if op is operator.add: - result = pc.binary_join_element_wise(self._pa_array, other, sep) - elif op is roperator.radd: - result = pc.binary_join_element_wise(other, self._pa_array, sep) - except pa.ArrowNotImplementedError as err: + # pyarrow gets upset if you try to join a NullArray + if ( + pa.types.is_integer(other.type) + or pa.types.is_floating(other.type) + or pa.types.is_null(other.type) + or pa.types.is_string(other.type) + or pa.types.is_large_string(other.type) + or pa.types.is_binary(other.type) + ): + other = other.cast(pa_type) + sep = pa.scalar("", type=pa_type) + try: + if op is operator.add: + result = pc.binary_join_element_wise( + self._pa_array, other, sep + ) + elif op is roperator.radd: + result = pc.binary_join_element_wise( + other, self._pa_array, sep + ) + except pa.ArrowNotImplementedError as err: + raise TypeError( + self._op_method_error_message(other_original, op) + ) from err + return self._from_pyarrow_array(result) + else: raise TypeError( - self._op_method_error_message(other_original, op) - ) from err - return self._from_pyarrow_array(result) + "Can only add string arrays to dtypes " + "null, int, float, str, and binary." + ) elif op in [operator.mul, roperator.rmul]: binary = self._pa_array integral = other diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 7a61a252d86a6..7665182587e25 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -45,6 +45,7 @@ from pandas.core.dtypes.common import ( is_array_like, is_bool_dtype, + is_float_dtype, is_integer_dtype, is_object_dtype, is_string_dtype, @@ -1110,10 +1111,28 @@ def _cmp_method(self, other, op): if op.__name__ in ops.ARITHMETIC_BINOPS: result = np.empty_like(self._ndarray, dtype="object") result[mask] = self.dtype.na_value + if op.__name__ in ["add", "radd"]: + if isinstance(other, str) or is_string_dtype(other): + pass + elif is_float_dtype(other) or is_integer_dtype(other): + if is_float_dtype(other): + # Shorten whole number floats to match pyarrow behavior + other = [ + str(int(x)) if x.is_integer() else str(x) for x in other + ] + else: + other = other.astype(str) + else: + raise TypeError( + f"Only supports op({op.__name__}) between StringArray and " + "dtypes int, float, and str." + ) + result[valid] = op(self._ndarray[valid], other) if isinstance(other, Path): # GH#61940 return result + return self._from_backing_data(result) else: # logical diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 91f5cd1679a61..524240ba3ac66 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8468,27 +8468,34 @@ def _maybe_align_series_as_frame(self, series: Series, axis: AxisInt): blockwise. """ rvalues = series._values - if not isinstance(rvalues, np.ndarray): - # TODO(EA2D): no need to special-case with 2D EAs - if rvalues.dtype in ("datetime64[ns]", "timedelta64[ns]"): - # We can losslessly+cheaply cast to ndarray - rvalues = np.asarray(rvalues) + if lib.is_np_dtype(rvalues.dtype): + # We can losslessly+cheaply cast to ndarray + # i.e. ndarray or dt64[naive], td64 + # TODO(EA2D): no need to special case with 2D EAs + rvalues = np.asarray(rvalues) + + if axis == 0: + rvalues = rvalues.reshape(-1, 1) else: - return series + rvalues = rvalues.reshape(1, -1) - if axis == 0: - rvalues = rvalues.reshape(-1, 1) - else: - rvalues = rvalues.reshape(1, -1) + rvalues = np.broadcast_to(rvalues, self.shape) + # pass dtype to avoid doing inference + df = self._constructor(rvalues, dtype=rvalues.dtype) - rvalues = np.broadcast_to(rvalues, self.shape) - # pass dtype to avoid doing inference - return self._constructor( - rvalues, - index=self.index, - columns=self.columns, - dtype=rvalues.dtype, - ).__finalize__(series) + else: + # GH#61581 + if axis == 0: + df = DataFrame(dict.fromkeys(range(self.shape[1]), rvalues)) + else: + nrows = self.shape[0] + df = DataFrame( + {i: rvalues[[i]].repeat(nrows) for i in range(self.shape[1])}, + dtype=rvalues.dtype, + ) + df.index = self.index + df.columns = self.columns + return df.__finalize__(series) def _flex_arith_method( self, other, op, *, axis: Axis = "columns", level=None, fill_value=None @@ -8498,11 +8505,6 @@ def _flex_arith_method( if self._should_reindex_frame_op(other, op, axis, fill_value, level): return self._arith_method_with_reindex(other, op) - if isinstance(other, Series) and fill_value is not None: - # TODO: We could allow this in cases where we end up going - # through the DataFrame path - raise NotImplementedError(f"fill_value {fill_value} not supported.") - other = ops.maybe_prepare_scalar_for_op(other, self.shape) self, other = self._align_for_op(other, axis, flex=True, level=level) diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index 67762e0b89c73..86f9bae51557b 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -1361,12 +1361,7 @@ def test_period_add_timestamp_raises(self, box_with_array): arr + ts with pytest.raises(TypeError, match=msg): ts + arr - if box_with_array is pd.DataFrame: - # TODO: before implementing resolution-inference we got the same - # message with DataFrame and non-DataFrame. Why did that change? - msg = "cannot add PeriodArray and Timestamp" - else: - msg = "cannot add PeriodArray and DatetimeArray" + msg = "cannot add PeriodArray and DatetimeArray" with pytest.raises(TypeError, match=msg): arr + Series([ts]) with pytest.raises(TypeError, match=msg): @@ -1376,16 +1371,10 @@ def test_period_add_timestamp_raises(self, box_with_array): with pytest.raises(TypeError, match=msg): pd.Index([ts]) + arr - if box_with_array is pd.DataFrame: - msg = "cannot add PeriodArray and DatetimeArray" - else: - msg = r"unsupported operand type\(s\) for \+: 'Period' and 'DatetimeArray" + msg = "cannot add PeriodArray and DatetimeArray" + with pytest.raises(TypeError, match=msg): arr + pd.DataFrame([ts]) - if box_with_array is pd.DataFrame: - msg = "cannot add PeriodArray and DatetimeArray" - else: - msg = r"unsupported operand type\(s\) for \+: 'DatetimeArray' and 'Period'" with pytest.raises(TypeError, match=msg): pd.DataFrame([ts]) + arr diff --git a/pandas/tests/arrays/boolean/test_arithmetic.py b/pandas/tests/arrays/boolean/test_arithmetic.py index 312dfb72e0950..555e69dc82589 100644 --- a/pandas/tests/arrays/boolean/test_arithmetic.py +++ b/pandas/tests/arrays/boolean/test_arithmetic.py @@ -118,7 +118,7 @@ def test_error_invalid_values(data, all_arithmetic_operators): ops(pd.Timestamp("20180101")) # invalid array-likes - if op not in ("__mul__", "__rmul__"): + if op not in ("__mul__", "__rmul__", "__add__", "__radd__"): # TODO(extension) numpy's mul with object array sees booleans as numbers msg = "|".join( [ diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py index 777099e76fc73..d14e507a79643 100644 --- a/pandas/tests/arrays/floating/test_arithmetic.py +++ b/pandas/tests/arrays/floating/test_arithmetic.py @@ -144,6 +144,7 @@ def test_error_invalid_values(data, all_arithmetic_operators): "not implemented", "not supported for dtype", "Can only string multiply by an integer", + "can't multiply sequence by non-int of type 'str'", ] ) with pytest.raises(TypeError, match=msg): @@ -152,8 +153,42 @@ def test_error_invalid_values(data, all_arithmetic_operators): ops(pd.Timestamp("20180101")) # invalid array-likes - with pytest.raises(TypeError, match=msg): - ops(pd.Series("foo", index=s.index)) + str_ser = pd.Series("foo", index=s.index) + if ( + all_arithmetic_operators + in [ + "__add__", + "__radd__", + ] + and pd.options.future.infer_string + ): + res = ops(str_ser) + if all_arithmetic_operators == "__radd__": + data_expected = [] + for i in data: + if pd.isna(i): + data_expected.append(i) + elif i.is_integer(): + data_expected.append("foo" + str(int(i))) + else: + data_expected.append("foo" + str(i)) + + expected = pd.Series(data_expected, index=s.index) + else: + data_expected = [] + for i in data: + if pd.isna(i): + data_expected.append(i) + elif i.is_integer(): + data_expected.append(str(int(i)) + "foo") + else: + data_expected.append(str(i) + "foo") + + expected = pd.Series(data_expected, index=s.index) + tm.assert_series_equal(res, expected) + else: + with pytest.raises(TypeError, match=msg): + ops(str_ser) msg = "|".join( [ diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index aeceb9b8a3cb1..9aece48f2ea38 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -197,6 +197,26 @@ def test_error_invalid_values(data, all_arithmetic_operators): # assert_almost_equal stricter, but the expected with pd.NA seems # more-correct than np.nan here. tm.assert_series_equal(res, expected) + elif ( + all_arithmetic_operators + in [ + "__add__", + "__radd__", + ] + and pd.options.future.infer_string + ): + res = ops(str_ser) + if all_arithmetic_operators == "__radd__": + expected = pd.Series( + [np.nan if pd.isna(x) == 1 else "foo" + str(x) for x in data], + index=s.index, + ) + else: + expected = pd.Series( + [np.nan if pd.isna(x) == 1 else str(x) + "foo" for x in data], + index=s.index, + ) + tm.assert_series_equal(res, expected) else: with tm.external_error_raised(TypeError): ops(str_ser) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index f10ebda94dc6a..65c2715600e56 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -4,6 +4,7 @@ """ import operator +from re import escape import numpy as np import pytest @@ -249,7 +250,32 @@ def test_mul(dtype): tm.assert_extension_array_equal(result, expected) -@pytest.mark.xfail(reason="GH-28527") +def test_add_series(dtype): + arr = pd.array(["a", "b", "c", "d"], dtype=dtype) + df = pd.Series(["t", "y", "v", "w"], dtype=object) + + result = arr + df + expected = pd.Series(["at", "by", "cv", "dw"]).astype(dtype) + tm.assert_series_equal(result, expected) + + result = df + arr + expected = pd.Series(["ta", "yb", "vc", "wd"]).astype(dtype) + tm.assert_series_equal(result, expected) + + +def test_add_series_float(dtype): + arr = pd.array(["a", "b", "c", "d"], dtype=dtype) + df = pd.Series([1, 2.0, 3.5, 4]) + + result = arr + df + expected = pd.Series(["a1", "b2", "c3.5", "d4"]).astype(dtype) + tm.assert_series_equal(result, expected) + + result = df + arr + expected = pd.Series(["1a", "2b", "3.5c", "4d"]).astype(dtype) + tm.assert_series_equal(result, expected) + + def test_add_strings(dtype): arr = pd.array(["a", "b", "c", "d"], dtype=dtype) df = pd.DataFrame([["t", "y", "v", "w"]], dtype=object) @@ -264,7 +290,6 @@ def test_add_strings(dtype): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(reason="GH-28527") def test_add_frame(dtype): arr = pd.array(["a", "b", np.nan, np.nan], dtype=dtype) df = pd.DataFrame([["x", np.nan, "y", np.nan]]) @@ -273,11 +298,60 @@ def test_add_frame(dtype): result = arr + df expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]]).astype(dtype) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_dtype=False) result = df + arr expected = pd.DataFrame([["xa", np.nan, np.nan, np.nan]]).astype(dtype) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_dtype=False) + + +def test_add_frame_int(dtype): + arr = pd.array(["a", "b", "c", np.nan], dtype=dtype) + df = pd.DataFrame([[1, np.nan, 3, np.nan]]) + + result = arr + df + expected = pd.DataFrame([["a1", np.nan, "c3", np.nan]]).astype(dtype) + tm.assert_frame_equal(result, expected, check_dtype=False) + + result = df + arr + expected = pd.DataFrame([["1a", np.nan, "3c", np.nan]]).astype(dtype) + tm.assert_frame_equal(result, expected, check_dtype=False) + + +@pytest.mark.parametrize( + "invalid", + [ + pd.Timedelta(hours=31), + pd.Timestamp("2021-01-01"), + np.datetime64("NaT", "ns"), + pd.NaT, + True, + pd.Period("2025-09"), + pd.Categorical(["test"]), + pd.offsets.Minute(3), + pd.Interval(1, 2, closed="right"), + ], +) +def test_add_frame_invalid(dtype, invalid): + arr = pd.array(["a", np.nan], dtype=dtype) + df = pd.DataFrame([[invalid, invalid]]) + + if dtype.storage == "pyarrow": + if invalid == pd.Categorical(["test"]): + msg = "Incompatible type when converting to PyArrow dtype for operation." + else: + msg = ( + "Can only add string arrays to dtypes " + "null, int, float, str, and binary." + ) + with pytest.raises(TypeError, match=msg): + arr + df + else: + msg = escape( + "Only supports op(add) between StringArray and dtypes int, float, and str." + ) + with pytest.raises(TypeError, match=msg): + arr + df def test_comparison_methods_scalar(comparison_op, dtype): diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index 583435b674ba1..888beb3bb5354 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -152,6 +152,9 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators): # ndarray & other series op_name = all_arithmetic_operators ser = pd.Series(data) + if op_name in ["__add__", "__radd__"]: + pytest.mark.xfail(reason="Failed: DID NOT RAISE ") + self.check_opname(ser, op_name, pd.Series([ser.iloc[0]] * len(ser))) def test_divmod(self, data): diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index a9a98a5005bb3..48880469173cb 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -626,12 +626,6 @@ def test_arith_flex_frame_corner(self, float_frame): expected = float_frame.sort_index() * np.nan tm.assert_frame_equal(result, expected) - with pytest.raises(NotImplementedError, match="fill_value"): - float_frame.add(float_frame.iloc[0], fill_value=3) - - with pytest.raises(NotImplementedError, match="fill_value"): - float_frame.add(float_frame.iloc[0], axis="index", fill_value=3) - @pytest.mark.parametrize("op", ["add", "sub", "mul", "mod"]) def test_arith_flex_series_ops(self, simple_frame, op): # after arithmetic refactor, add truediv here @@ -672,11 +666,11 @@ def test_arith_flex_zero_len_raises(self): df_len0 = DataFrame(columns=["A", "B"]) df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) - with pytest.raises(NotImplementedError, match="fill_value"): + msg = r"unsupported operand type\(s\) for \+: 'int' and 'str'" + with pytest.raises(TypeError, match=msg): df.add(ser_len0, fill_value="E") - with pytest.raises(NotImplementedError, match="fill_value"): - df_len0.sub(df["A"], axis=None, fill_value=3) + df_len0.sub(df["A"], axis=None, fill_value=3) def test_flex_add_scalar_fill_value(self): # GH#12723 @@ -2192,3 +2186,61 @@ def test_mixed_col_index_dtype(string_dtype_no_object): expected.columns = expected.columns.astype(string_dtype_no_object) tm.assert_frame_equal(result, expected) + + +dt_params = [ + (tm.ALL_INT_NUMPY_DTYPES[0], 5), + (tm.ALL_INT_EA_DTYPES[0], 5), + (tm.FLOAT_NUMPY_DTYPES[0], 4.9), + (tm.FLOAT_EA_DTYPES[0], 4.9), +] + +axes = [0, 1] + + +@pytest.mark.parametrize( + "data_type,fill_val, axis", + [(dt, val, axis) for axis in axes for dt, val in dt_params], +) +def test_df_fill_value_dtype(data_type, fill_val, axis): + # GH 61581 + base_data = np.arange(25).reshape(5, 5) + mult_list = [1, np.nan, 5, np.nan, 3] + np_int_flag = 0 + + try: + mult_data = pd.array(mult_list, dtype=data_type) + except ValueError as e: + # Numpy int type cannot represent NaN, it will end up here + if "cannot convert float NaN to integer" in str(e): + mult_data = np.asarray(mult_list) + np_int_flag = 1 + + columns = list("ABCDE") + df = DataFrame(base_data, columns=columns) + + for i in range(df.shape[0]): + try: + df.iat[i, i] = np.nan + df.iat[i + 1, i] = pd.NA + df.iat[i + 3, i] = pd.NA + except IndexError: + pass + + mult_mat = np.broadcast_to(mult_data, df.shape) + if axis == 0: + mask = np.isnan(mult_mat).T + else: + mask = np.isnan(mult_mat) + mask = df.isna().values & mask + + df_result = df.mul(mult_data, axis=axis, fill_value=fill_val) + if np_int_flag == 1: + mult_np = np.nan_to_num(mult_data, nan=fill_val) + df_expected = (df.fillna(fill_val).mul(mult_np, axis=axis)).mask(mask, np.nan) + else: + df_expected = ( + df.fillna(fill_val).mul(mult_data.fillna(fill_val), axis=axis) + ).mask(mask, np.nan) + + tm.assert_frame_equal(df_result, df_expected)