diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 933e8fbc175d8..422ba5201bc4e 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -199,8 +199,8 @@ class SortIntegerArray: params = [10**3, 10**5] def setup(self, N): - data = np.arange(N, dtype=float) - data[40] = np.nan + data = np.arange(N, dtype=float).astype(object) + data[40] = pd.NA self.array = pd.array(data, dtype="Int64") def time_argsort(self, N): diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index cd7851acae3f2..14fa64c01f1a5 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -4,6 +4,7 @@ import numpy as np from pandas import ( + NA, DataFrame, Index, MultiIndex, @@ -445,6 +446,8 @@ def setup(self, inplace, dtype): values[::2] = np.nan if dtype == "Int64": values = values.round() + values = values.astype(object) + values[::2] = NA self.df = DataFrame(values, dtype=dtype) self.fill_values = self.df.iloc[self.df.first_valid_index()].to_dict() diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 19c556dfe9d1f..7c1d6457eea15 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -689,6 +689,10 @@ def setup(self, dtype, method, with_nans): null_vals = vals.astype(float, copy=True) null_vals[::2, :] = np.nan null_vals[::3, :] = np.nan + if dtype in ["Int64", "Float64"]: + null_vals = null_vals.astype(object) + null_vals[::2, :] = NA + null_vals[::3, :] = NA df = DataFrame(null_vals, columns=list("abcde"), dtype=dtype) df["key"] = keys self.df = df diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 3bb151a2dd339..11d5ab86e76ef 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -75,7 +75,7 @@ or convert from existing pandas data: .. 
ipython:: python - s1 = pd.Series([1, 2, np.nan], dtype="Int64") + s1 = pd.Series([1, 2, pd.NA], dtype="Int64") s1 s2 = s1.astype("string") s2 diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 0f40f5bfa5fc9..27d5a65a08467 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -50,7 +50,7 @@ marker of ``np.nan`` will infer to integer dtype. The display of the ``Series`` .. ipython:: python - s = pd.Series([1, 2, np.nan], dtype='Int64') + s = pd.Series([1, 2, pd.NA], dtype='Int64') s @@ -166,7 +166,7 @@ See the :ref:`dtypes docs ` for more on extension arrays. .. ipython:: python - pd.array([1, 2, np.nan], dtype='Int64') + pd.array([1, 2, pd.NA], dtype='Int64') pd.array(['a', 'b', 'c'], dtype='category') Passing data for which there isn't dedicated extension type (e.g. float, integer, etc.) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index d721213dc38e7..99a6be03c84d3 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -465,6 +465,55 @@ small behavior differences as collateral: - Adding or subtracting a :class:`Day` with a :class:`Timedelta` is no longer supported. - Adding or subtracting a :class:`Day` offset to a timezone-aware :class:`Timestamp` or datetime-like may lead to an ambiguous or non-existent time, which will raise. +.. _whatsnew_300.api_breaking.nan_vs_na: + +Changed treatment of NaN values in pyarrow and numpy-nullable floating dtypes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, when dealing with a nullable dtype (e.g. ``Float64Dtype`` or ``int64[pyarrow]``), ``NaN`` was treated as interchangeable with :class:`NA` in some circumstances but not others. This was done to make adoption easier, but caused some confusion (:issue:`32265`). In 3.0, an option ``"mode.nan_is_na"`` (default ``True``) controls whether to treat ``NaN`` as equivalent to :class:`NA`. 
+ +With ``pd.set_option("mode.nan_is_na", True)`` (again, this is the default), ``NaN`` can be passed to constructors, ``__setitem__``, ``__contains__`` and be treated the same as :class:`NA`. The only change users will see is that arithmetic and ``np.ufunc`` operations that previously introduced ``NaN`` entries produce :class:`NA` entries instead: + +*Old behavior:* + +.. code-block:: ipython + + In [2]: ser = pd.Series([0, None], dtype=pd.Float64Dtype()) + In [3]: ser / 0 + Out[3]: + 0 NaN + 1 <NA> + dtype: Float64 + +*New behavior:* + +.. ipython:: python + + ser = pd.Series([0, None], dtype=pd.Float64Dtype()) + ser / 0 + +By contrast, with ``pd.set_option("mode.nan_is_na", False)``, ``NaN`` is always treated as distinct from :class:`NA`, and specifically as a floating-point value, so it cannot be used with integer dtypes: + +*Old behavior:* + +.. code-block:: ipython + + In [2]: ser = pd.Series([1, np.nan], dtype=pd.Float64Dtype()) + In [3]: ser[1] + Out[3]: <NA> + +*New behavior:* + +.. ipython:: python + + pd.set_option("mode.nan_is_na", False) + ser = pd.Series([1, np.nan], dtype=pd.Float64Dtype()) + ser[1] + +If we had passed ``pd.Int64Dtype()`` or ``"int64[pyarrow]"`` for the dtype in the latter example, this would raise, as a float ``NaN`` cannot be held by an integer dtype. + +With ``"mode.nan_is_na"`` set to ``False``, ``ser.to_numpy()`` (and ``frame.values`` and ``np.asarray(obj)``) will convert to ``object`` dtype if :class:`NA` entries are present, where before they would coerce to ``NaN``. To retain a float numpy dtype, explicitly pass ``na_value=np.nan`` to :meth:`Series.to_numpy`. + .. 
_whatsnew_300.api_breaking.deps: Increased minimum version for Python diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py index 463e8af7cc561..ee709eff2eeae 100644 --- a/pandas/_config/__init__.py +++ b/pandas/_config/__init__.py @@ -33,3 +33,8 @@ def using_string_dtype() -> bool: _mode_options = _global_config["future"] return _mode_options["infer_string"] + + +def is_nan_na() -> bool: + _mode_options = _global_config["mode"] + return _mode_options["nan_is_na"] diff --git a/pandas/_libs/missing.pyi b/pandas/_libs/missing.pyi index 6bf30a03cef32..64256ae4b36ad 100644 --- a/pandas/_libs/missing.pyi +++ b/pandas/_libs/missing.pyi @@ -14,3 +14,4 @@ def isneginf_scalar(val: object) -> bool: ... def checknull(val: object) -> bool: ... def isnaobj(arr: np.ndarray) -> npt.NDArray[np.bool_]: ... def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ... +def is_pdna_or_none(values: np.ndarray) -> npt.NDArray[np.bool_]: ... diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index c7f905c4d0be0..164a47cb5adb7 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -249,6 +249,24 @@ cdef bint checknull_with_nat_and_na(object obj): return checknull_with_nat(obj) or obj is C_NA +@cython.wraparound(False) +@cython.boundscheck(False) +def is_pdna_or_none(values: ndarray) -> ndarray: + cdef: + ndarray[uint8_t] result + Py_ssize_t i, N + object val + + N = len(values) + result = np.zeros(N, dtype=np.uint8) + + for i in range(N): + val = values[i] + if val is None or val is C_NA: + result[i] = True + return result.view(bool) + + @cython.wraparound(False) @cython.boundscheck(False) def is_numeric_na(values: ndarray) -> ndarray: diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 87a305ede481e..91eddc3261164 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -8,6 +8,8 @@ from csv import ( ) import warnings +from pandas._config import is_nan_na + from pandas.util._exceptions import 
find_stack_level from pandas import ( @@ -1469,7 +1471,7 @@ def _maybe_upcast( if isinstance(arr, IntegerArray) and arr.isna().all(): # use null instead of int64 in pyarrow arr = arr.to_numpy(na_value=None) - arr = ArrowExtensionArray(pa.array(arr, from_pandas=True)) + arr = ArrowExtensionArray(pa.array(arr, from_pandas=is_nan_na())) return arr diff --git a/pandas/conftest.py b/pandas/conftest.py index 774936be33631..d69c7e0113310 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -2116,3 +2116,10 @@ def temp_file(tmp_path): def monkeysession(): with pytest.MonkeyPatch.context() as mp: yield mp + + +@pytest.fixture(params=[True, False]) +def using_nan_is_na(request): + opt = request.param + with pd.option_context("mode.nan_is_na", opt): + yield opt diff --git a/pandas/core/arrays/_utils.py b/pandas/core/arrays/_utils.py index 4957fb0fa2069..277c7240cf552 100644 --- a/pandas/core/arrays/_utils.py +++ b/pandas/core/arrays/_utils.py @@ -7,7 +7,10 @@ import numpy as np +from pandas._config import is_nan_na + from pandas._libs import lib +from pandas._libs.missing import NA from pandas.errors import LossySetitemError from pandas.core.dtypes.cast import np_can_hold_element @@ -22,7 +25,10 @@ def to_numpy_dtype_inference( - arr: ExtensionArray, dtype: npt.DTypeLike | None, na_value, hasna: bool + arr: ExtensionArray, + dtype: npt.DTypeLike | None, + na_value, + hasna: bool, ) -> tuple[np.dtype | None, Any]: result_dtype: np.dtype | None inferred_numeric_dtype = False @@ -37,7 +43,11 @@ def to_numpy_dtype_inference( else: result_dtype = arr.dtype.numpy_dtype # type: ignore[attr-defined] if na_value is lib.no_default: - na_value = np.nan + if not is_nan_na(): + na_value = NA + dtype = np.dtype(object) + else: + na_value = np.nan else: result_dtype = arr.dtype.numpy_dtype # type: ignore[attr-defined] elif dtype is not None: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 2eed608908440..db6b58c8dbc7f 100644 --- 
a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -22,7 +22,10 @@ import numpy as np +from pandas._config import is_nan_na + from pandas._libs import lib +from pandas._libs.missing import is_pdna_or_none from pandas._libs.tslibs import ( Timedelta, Timestamp, @@ -38,6 +41,7 @@ from pandas.core.dtypes.cast import ( can_hold_element, + construct_1d_object_array_from_listlike, infer_dtype_from_scalar, ) from pandas.core.dtypes.common import ( @@ -331,6 +335,11 @@ def _from_sequence_of_strings( """ Construct a new ExtensionArray from a sequence of strings. """ + mask = isna(strings) + + if isinstance(strings, cls): + strings = strings._pa_array + pa_type = to_pyarrow_type(dtype) if ( pa_type is None @@ -349,17 +358,19 @@ def _from_sequence_of_strings( from pandas.core.tools.datetimes import to_datetime scalars = to_datetime(strings, errors="raise").date + scalars = pa.array(scalars, type=pa_type, mask=mask) elif pa.types.is_duration(pa_type): from pandas.core.tools.timedeltas import to_timedelta scalars = to_timedelta(strings, errors="raise") + if pa_type.unit != "ns": # GH51175: test_from_sequence_of_strings_pa_array # attempt to parse as int64 reflecting pyarrow's # duration to string casting behavior mask = isna(scalars) if not isinstance(strings, (pa.Array, pa.ChunkedArray)): - strings = pa.array(strings, type=pa.string(), from_pandas=True) + strings = pa.array(strings, type=pa.string(), mask=mask) strings = pc.if_else(mask, None, strings) try: scalars = strings.cast(pa.int64()) @@ -380,7 +391,7 @@ def _from_sequence_of_strings( if isinstance(strings, (pa.Array, pa.ChunkedArray)): scalars = strings else: - scalars = pa.array(strings, type=pa.string(), from_pandas=True) + scalars = pa.array(strings, type=pa.string(), mask=mask) scalars = pc.if_else(pc.equal(scalars, "1.0"), "1", scalars) scalars = pc.if_else(pc.equal(scalars, "0.0"), "0", scalars) scalars = scalars.cast(pa.bool_()) @@ -392,6 +403,11 @@ def _from_sequence_of_strings( from 
pandas.core.tools.numeric import to_numeric scalars = to_numeric(strings, errors="raise") + if isinstance(strings, (pa.Array, pa.ChunkedArray)): + scalars = strings.cast(pa_type) + elif mask is not None: + scalars = pa.array(scalars, mask=mask, type=pa_type) + else: raise NotImplementedError( f"Converting strings to {pa_type} is not implemented." @@ -411,7 +427,12 @@ def _cast_pointwise_result(self, values) -> ArrayLike: return self[:0].copy() try: - arr = pa.array(values, from_pandas=True) + if self.dtype.kind in "iufc" and not is_nan_na(): + values = np.asarray(values, dtype=object) + mask = is_pdna_or_none(values) + arr = pa.array(values, mask=mask) + else: + arr = pa.array(values, from_pandas=True) except (ValueError, TypeError): # e.g. test_by_column_values_with_same_starting_value with nested # values, one entry of which is an ArrowStringArray @@ -508,7 +529,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: """ if isinstance(value, pa.Scalar): pa_scalar = value - elif isna(value): + elif isna(value) and not (lib.is_float(value) and not is_nan_na()): pa_scalar = pa.scalar(None, type=pa_type) else: # Workaround https://github.com/apache/arrow/issues/37291 @@ -525,7 +546,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: value = value.as_unit(pa_type.unit) value = value._value - pa_scalar = pa.scalar(value, type=pa_type, from_pandas=True) + pa_scalar = pa.scalar(value, type=pa_type) if pa_type is not None and pa_scalar.type != pa_type: pa_scalar = pa_scalar.cast(pa_type) @@ -557,6 +578,13 @@ def _box_pa_array( if copy: value = value.copy() pa_array = value.__arrow_array__() + + elif hasattr(value, "__arrow_array__"): + # e.g. 
StringArray + if copy: + value = value.copy() + pa_array = value.__arrow_array__() + else: if ( isinstance(value, np.ndarray) @@ -610,11 +638,32 @@ def _box_pa_array( pa_array = pa.array(dta._ndarray, type=pa_type, mask=dta_mask) return pa_array + mask = None + if is_nan_na(): + try: + arr_value = np.asarray(value) + if arr_value.ndim > 1: + # e.g. test_fixed_size_list we have list data. ndim > 1 + # means there were no scalar (NA) entries. + mask = np.zeros(len(value), dtype=np.bool_) + else: + mask = isna(arr_value) + except ValueError: + # Ragged data that numpy raises on + arr_value = construct_1d_object_array_from_listlike(value) + mask = isna(arr_value) + elif ( + getattr(value, "dtype", None) is None or value.dtype.kind not in "iumMf" + ): + arr_value = np.asarray(value, dtype=object) + # similar to isna(value) but exclude NaN, NaT, nat-like, nan-like + mask = is_pdna_or_none(arr_value) # type: ignore[assignment] + try: - pa_array = pa.array(value, type=pa_type, from_pandas=True) + pa_array = pa.array(value, type=pa_type, mask=mask) except (pa.ArrowInvalid, pa.ArrowTypeError): # GH50430: let pyarrow infer type, then cast - pa_array = pa.array(value, from_pandas=True) + pa_array = pa.array(value, mask=mask) if pa_type is None and pa.types.is_duration(pa_array.type): # Workaround https://github.com/apache/arrow/issues/37291 @@ -622,7 +671,7 @@ def _box_pa_array( value = to_timedelta(value) value = value.to_numpy() - pa_array = pa.array(value, type=pa_type, from_pandas=True) + pa_array = pa.array(value, type=pa_type) if pa.types.is_duration(pa_array.type) and pa_array.null_count > 0: # GH52843: upstream bug for duration types when originally @@ -993,7 +1042,14 @@ def _arith_method(self, other, op) -> Self | npt.NDArray[np.object_]: ], dtype=object, ) - return self._evaluate_op_method(other, op, ARROW_ARITHMETIC_FUNCS) + + result = self._evaluate_op_method(other, op, ARROW_ARITHMETIC_FUNCS) + if is_nan_na() and result.dtype.kind == "f": + parr = result._pa_array 
+ mask = pc.is_nan(parr).to_numpy() + arr = pc.replace_with_mask(parr, mask, pa.scalar(None, type=parr.type)) + result = type(self)(arr) + return result def equals(self, other) -> bool: if not isinstance(other, ArrowExtensionArray): @@ -1029,7 +1085,10 @@ def __len__(self) -> int: def __contains__(self, key) -> bool: # https://github.com/pandas-dev/pandas/pull/51307#issuecomment-1426372604 if isna(key) and key is not self.dtype.na_value: - if self.dtype.kind == "f" and lib.is_float(key): + if lib.is_float(key) and is_nan_na(): + return self.dtype.na_value in self + elif self.dtype.kind == "f" and lib.is_float(key): + # Check specifically for NaN return pc.any(pc.is_nan(self._pa_array)).as_py() # e.g. date or timestamp types we do not allow None here to match pd.NA @@ -1330,7 +1389,7 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: if not len(values): return np.zeros(len(self), dtype=bool) - result = pc.is_in(self._pa_array, value_set=pa.array(values, from_pandas=True)) + result = pc.is_in(self._pa_array, value_set=pa.array(values)) # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls # to False return np.array(result, dtype=np.bool_) @@ -1613,7 +1672,11 @@ def to_numpy( pa.types.is_floating(pa_type) and ( na_value is np.nan - or (original_na_value is lib.no_default and is_float_dtype(dtype)) + or ( + original_na_value is lib.no_default + and is_float_dtype(dtype) + and is_nan_na() + ) ) ): result = data._pa_array.to_numpy() @@ -2140,7 +2203,7 @@ def __setitem__(self, key, value) -> None: raise ValueError("Length of indexer and values mismatch") chunks = [ *self._pa_array[:key].chunks, - pa.array([value], type=self._pa_array.type, from_pandas=True), + pa.array([value], type=self._pa_array.type, from_pandas=is_nan_na()), *self._pa_array[key + 1 :].chunks, ] data = pa.chunked_array(chunks).combine_chunks() @@ -2194,7 +2257,7 @@ def _rank_calc( pa_type = pa.float64() else: pa_type = pa.uint64() - result = pa.array(ranked, 
type=pa_type, from_pandas=True) + result = pa.array(ranked, type=pa_type, from_pandas=is_nan_na()) return result data = self._pa_array.combine_chunks() @@ -2445,7 +2508,7 @@ def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]: right, right_type = _to_numpy_and_type(right) pa_type = left_type or right_type result = np.where(cond, left, right) - return pa.array(result, type=pa_type, from_pandas=True) + return pa.array(result, type=pa_type, from_pandas=is_nan_na()) @classmethod def _replace_with_mask( @@ -2486,9 +2549,10 @@ def _replace_with_mask( replacements = np.array(replacements, dtype=object) elif isinstance(replacements, pa.Scalar): replacements = replacements.as_py() + result = np.array(values, dtype=object) result[mask] = replacements - return pa.array(result, type=values.type, from_pandas=True) + return pa.array(result, type=values.type, from_pandas=is_nan_na()) # ------------------------------------------------------------------ # GroupBy Methods @@ -2567,7 +2631,7 @@ def _groupby_op( return self._from_pyarrow_array(pa_result) else: # DatetimeArray, TimedeltaArray - pa_result = pa.array(result, from_pandas=True) + pa_result = pa.array(result) return self._from_pyarrow_array(pa_result) def _apply_elementwise(self, func: Callable) -> list[list[Any]]: diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index bddca5bed6ff8..25d7c792e5810 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -12,6 +12,8 @@ import numpy as np +from pandas._config import is_nan_na + from pandas._libs import ( algos as libalgos, lib, @@ -326,7 +328,9 @@ def __setitem__(self, key, value) -> None: def __contains__(self, key) -> bool: if isna(key) and key is not self.dtype.na_value: # GH#52840 - if self._data.dtype.kind == "f" and lib.is_float(key): + if lib.is_float(key) and is_nan_na(): + key = self.dtype.na_value + elif self._data.dtype.kind == "f" and lib.is_float(key): return bool((np.isnan(self._data) & 
~self._mask).any()) return bool(super().__contains__(key)) @@ -684,6 +688,8 @@ def reconstruct(x: np.ndarray): # reached in e.g. np.sqrt on BooleanArray # we don't support float16 x = x.astype(np.float32) + if is_nan_na(): + m[np.isnan(x)] = True return FloatingArray(x, m) else: x[mask] = np.nan @@ -890,6 +896,9 @@ def _maybe_mask_result( if result.dtype.kind == "f": from pandas.core.arrays import FloatingArray + if is_nan_na(): + mask[np.isnan(result)] = True + return FloatingArray(result, mask, copy=False) elif result.dtype.kind == "b": diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 324e2fc3bf108..80fbbf99a5494 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -9,6 +9,8 @@ import numpy as np +from pandas._config import is_nan_na + from pandas._libs import ( lib, missing as libmissing, @@ -101,6 +103,8 @@ def __from_arrow__( array = array.combine_chunks() data, mask = pyarrow_array_to_numpy_and_mask(array, dtype=self.numpy_dtype) + if data.dtype.kind == "f" and is_nan_na(): + mask[np.isnan(data)] = False return array_class(data.copy(), ~mask, copy=False) @classmethod @@ -195,9 +199,21 @@ def _coerce_to_data_and_mask( elif values.dtype.kind == "f": # np.isnan is faster than is_numeric_na() for floats # github issue: #60066 - mask = np.isnan(values) + if is_nan_na(): + mask = np.isnan(values) + else: + mask = np.zeros(len(values), dtype=np.bool_) + if dtype_cls.__name__.strip("_").startswith(("I", "U")): + wrong = np.isnan(values) + if wrong.any(): + raise ValueError("Cannot cast NaN value to Integer dtype.") else: - mask = libmissing.is_numeric_na(values) + if is_nan_na(): + mask = libmissing.is_numeric_na(values) + else: + # is_numeric_na will raise on non-numeric NAs + libmissing.is_numeric_na(values) + mask = libmissing.is_pdna_or_none(values) else: assert len(mask) == len(values) @@ -236,7 +252,6 @@ def _coerce_to_data_and_mask( values = values.astype(dtype, copy=copy) else: values = 
dtype_cls._safe_cast(values, dtype, copy=False) - return values, mask, dtype, inferred_type @@ -265,6 +280,10 @@ def __init__( # If we don't raise here, then accessing self.dtype would raise raise TypeError("FloatingArray does not support np.float16 dtype.") + # NB: if is_nan_na() is True + # then caller is responsible for ensuring + # assert mask[np.isnan(values)].all() + super().__init__(values, mask, copy=copy) @cache_readonly diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 7a61a252d86a6..4801c21aa325a 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -493,6 +493,12 @@ def _str_map_str_or_object( if self.dtype.storage == "pyarrow": import pyarrow as pa + # TODO: shouldn't this already be caught my passed mask? + # it isn't in test_extract_expand_capture_groups_index + # mask = mask | np.array( + # [x is libmissing.NA for x in result], dtype=bool + # ) + result = pa.array( result, mask=mask, type=pa.large_string(), from_pandas=True ) @@ -778,7 +784,7 @@ def __arrow_array__(self, type=None): values = self._ndarray.copy() values[self.isna()] = None - return pa.array(values, type=type, from_pandas=True) + return pa.array(values, type=type) def _values_for_factorize(self) -> tuple[np.ndarray, libmissing.NAType | float]: # type: ignore[override] arr = self._ndarray diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index bf7e8fb02b58e..9b317b51cabdc 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -427,6 +427,15 @@ def is_terminal() -> bool: validator=is_one_of_factory([True, False, "warn"]), ) + cf.register_option( + "nan_is_na", + os.environ.get("PANDAS_NAN_IS_NA", "1") == "1", + "Whether to treat NaN entries as interchangeable with pd.NA in " + "numpy-nullable and pyarrow float dtypes. 
See discussion in " + "https://github.com/pandas-dev/pandas/issues/32265", + validator=is_one_of_factory([True, False]), + ) + # user warnings chained_assignment = """ diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index a0f2d0447ea8c..3b615c70ebea2 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -18,7 +18,10 @@ import numpy as np -from pandas._config import using_string_dtype +from pandas._config import ( + is_nan_na, + using_string_dtype, +) from pandas._libs import ( Interval, @@ -954,7 +957,10 @@ def convert_dtypes( elif input_array.dtype.kind in "fcb": # TODO: de-dup with maybe_cast_to_integer_array? arr = input_array[notna(input_array)] - if (arr.astype(int) == arr).all(): + if len(arr) < len(input_array) and not is_nan_na(): + # In the presence of NaNs, we cannot convert to IntegerDtype + pass + elif (arr.astype(int) == arr).all(): inferred_dtype = target_int_dtype else: inferred_dtype = input_array.dtype @@ -978,7 +984,10 @@ def convert_dtypes( if convert_integer: # TODO: de-dup with maybe_cast_to_integer_array? 
arr = input_array[notna(input_array)] - if (arr.astype(int) == arr).all(): + if len(arr) < len(input_array) and not is_nan_na(): + # In the presence of NaNs, we can't convert to IntegerDtype + inferred_dtype = inferred_float_dtype + elif (arr.astype(int) == arr).all(): inferred_dtype = pandas_dtype_func("Int64") else: inferred_dtype = inferred_float_dtype diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7b91ca3d564a2..65d9829a4cce9 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9936,7 +9936,7 @@ def where( def where( self, cond, - other=np.nan, + other=lib.no_default, *, inplace: bool = False, axis: Axis | None = None, diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d75479da70d11..84ab9c6ec9b09 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -22,6 +22,7 @@ from pandas._config import ( get_option, + is_nan_na, using_string_dtype, ) @@ -160,6 +161,7 @@ ExtensionArray, TimedeltaArray, ) +from pandas.core.arrays.floating import FloatingDtype from pandas.core.arrays.string_ import ( StringArray, StringDtype, @@ -6594,6 +6596,14 @@ def _maybe_cast_indexer(self, key): If we have a float key and are not a floating index, then try to cast to an int if equivalent. """ + if ( + is_float(key) + and np.isnan(key) + and isinstance(self.dtype, FloatingDtype) + and is_nan_na() + ): + # TODO: better place to do this? 
+ key = self.dtype.na_value return key def _maybe_cast_listlike_indexer(self, target) -> Index: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index a8c143bda7190..a295c6ff602ee 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -33,7 +33,10 @@ is_object_dtype, is_scalar, ) -from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.dtypes import ( + BaseMaskedDtype, + ExtensionDtype, +) from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, @@ -374,7 +377,11 @@ def dict_to_mgr( if columns is not None: columns = ensure_index(columns) - arrays = [np.nan] * len(columns) + if dtype is not None and not isinstance(dtype, np.dtype): + # e.g. test_dataframe_from_dict_of_series + arrays = [dtype.na_value] * len(columns) + else: + arrays = [np.nan] * len(columns) midxs = set() data_keys = ensure_index(data.keys()) # type: ignore[arg-type] data_values = list(data.values()) @@ -963,10 +970,13 @@ def convert_object_array( def convert(arr): if dtype != np.dtype("O"): + # e.g. if dtype is UInt32 then we want to cast Nones to NA instead of + # NaN in maybe_convert_objects. 
+ to_nullable = dtype_backend != "numpy" or isinstance(dtype, BaseMaskedDtype) arr = lib.maybe_convert_objects( arr, try_float=coerce_float, - convert_to_nullable_dtype=dtype_backend != "numpy", + convert_to_nullable_dtype=to_nullable, ) # Notes on cases that get here 2023-02-15 # 1) we DO get here when arr is all Timestamps and dtype=None diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index e1286eee65128..32e932b70e761 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -19,6 +19,8 @@ import numpy as np +from pandas._config import option_context + from pandas._libs import lib from pandas._libs.json import ( ujson_dumps, @@ -994,9 +996,10 @@ def _read_ujson(self) -> DataFrame | Series: else: obj = self._get_object_parser(self.data) if self.dtype_backend is not lib.no_default: - return obj.convert_dtypes( - infer_objects=False, dtype_backend=self.dtype_backend - ) + with option_context("mode.nan_is_na", True): + return obj.convert_dtypes( + infer_objects=False, dtype_backend=self.dtype_backend + ) else: return obj @@ -1071,9 +1074,10 @@ def __next__(self) -> DataFrame | Series: raise ex if self.dtype_backend is not lib.no_default: - return obj.convert_dtypes( - infer_objects=False, dtype_backend=self.dtype_backend - ) + with option_context("mode.nan_is_na", True): + return obj.convert_dtypes( + infer_objects=False, dtype_backend=self.dtype_backend + ) else: return obj diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index feca60c6e28a2..5510036e542f5 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -13,6 +13,8 @@ ) import warnings +from pandas._config import option_context + from pandas._libs import lib from pandas._libs.json import ujson_loads from pandas._libs.tslibs import timezones @@ -384,7 +386,8 @@ def parse_table_schema(json, precise_float: bool) -> DataFrame: 'table="orient" can not yet read ISO-formatted Timedelta data' ) - df = df.astype(dtypes) + with 
option_context("mode.nan_is_na", True): + df = df.astype(dtypes) if "primaryKey" in table["schema"]: df = df.set_index(table["schema"]["primaryKey"]) diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py index 777099e76fc73..e4e26383ae42c 100644 --- a/pandas/tests/arrays/floating/test_arithmetic.py +++ b/pandas/tests/arrays/floating/test_arithmetic.py @@ -35,21 +35,24 @@ def test_array_op(dtype, opname, exp): @pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)]) -def test_divide_by_zero(dtype, zero, negative): +def test_divide_by_zero(dtype, zero, negative, using_nan_is_na): # TODO pending NA/NaN discussion # https://github.com/pandas-dev/pandas/issues/32265/ a = pd.array([0, 1, -1, None], dtype=dtype) result = a / zero + exp_mask = np.array([False, False, False, True]) + if using_nan_is_na: + exp_mask[[0, -1]] = True expected = FloatingArray( np.array([np.nan, np.inf, -np.inf, np.nan], dtype=dtype.numpy_dtype), - np.array([False, False, False, True]), + exp_mask, ) if negative: expected *= -1 tm.assert_extension_array_equal(result, expected) -def test_pow_scalar(dtype): +def test_pow_scalar(dtype, using_nan_is_na): a = pd.array([-1, 0, 1, None, 2], dtype=dtype) result = a**0 expected = pd.array([1, 1, 1, 1, 1], dtype=dtype) @@ -64,11 +67,14 @@ def test_pow_scalar(dtype): tm.assert_extension_array_equal(result, expected) result = a**np.nan - # TODO np.nan should be converted to pd.NA / missing before operation? - expected = FloatingArray( - np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype=dtype.numpy_dtype), - mask=a._mask, - ) + if using_nan_is_na: + expected = pd.array([None, None, 1, None, None], dtype=dtype) + else: + # TODO np.nan should be converted to pd.NA / missing before operation? 
+ expected = FloatingArray( + np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype=dtype.numpy_dtype), + mask=a._mask, + ) tm.assert_extension_array_equal(result, expected) # reversed @@ -87,9 +93,11 @@ def test_pow_scalar(dtype): tm.assert_extension_array_equal(result, expected) result = np.nan**a - expected = FloatingArray( - np.array([1, np.nan, np.nan, np.nan], dtype=dtype.numpy_dtype), mask=a._mask - ) + if not using_nan_is_na: + # Otherwise the previous `expected` can be reused + expected = FloatingArray( + np.array([1, np.nan, np.nan, np.nan], dtype=dtype.numpy_dtype), mask=a._mask + ) tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/floating/test_comparison.py b/pandas/tests/arrays/floating/test_comparison.py index a429649f1ce1d..0990757964267 100644 --- a/pandas/tests/arrays/floating/test_comparison.py +++ b/pandas/tests/arrays/floating/test_comparison.py @@ -38,11 +38,15 @@ def test_equals(): assert a1.equals(a2) is False -def test_equals_nan_vs_na(): +def test_equals_nan_vs_na(using_nan_is_na): # GH#44382 mask = np.zeros(3, dtype=bool) data = np.array([1.0, np.nan, 3.0], dtype=np.float64) + if using_nan_is_na: + # Under PDEP16, all callers of the FloatingArray constructor should + # ensure that mask[np.isnan(data)] = True + mask[1] = True left = FloatingArray(data, mask) assert left.equals(left) @@ -57,7 +61,11 @@ def test_equals_nan_vs_na(): assert right.equals(right) tm.assert_extension_array_equal(right, right) - assert not left.equals(right) + if not using_nan_is_na: + assert not left.equals(right) + else: + # the constructor will set the NaN locations to NA + assert left.equals(right) # with mask[1] = True, the only difference is data[1], which should # not matter for equals diff --git a/pandas/tests/arrays/floating/test_construction.py b/pandas/tests/arrays/floating/test_construction.py index e1d237205a753..9c383efa3216c 100644 --- a/pandas/tests/arrays/floating/test_construction.py +++ 
b/pandas/tests/arrays/floating/test_construction.py @@ -85,9 +85,12 @@ def test_to_array(): ([np.nan], [pd.NA]), ], ) -def test_to_array_none_is_nan(a, b): +def test_to_array_none_is_nan(a, b, using_nan_is_na): result = pd.array(a, dtype="Float64") expected = pd.array(b, dtype="Float64") + if not using_nan_is_na and a[-1] is np.nan: + assert np.isnan(result[-1]) + expected._mask[-1] = False tm.assert_extension_array_equal(result, expected) @@ -189,13 +192,17 @@ def test_to_array_bool(bool_values, values, target_dtype, expected_dtype): tm.assert_extension_array_equal(result, expected) -def test_series_from_float(data): +def test_series_from_float(data, using_nan_is_na): # construct from our dtype & string dtype dtype = data.dtype # from float expected = pd.Series(data) - result = pd.Series(data.to_numpy(na_value=np.nan, dtype="float"), dtype=str(dtype)) + np_res = data.to_numpy(na_value=np.nan, dtype="float") + if not using_nan_is_na: + np_res = np_res.astype(object) + np_res[data.isna()] = pd.NA + result = pd.Series(np_res, dtype=str(dtype)) tm.assert_series_equal(result, expected) # from list diff --git a/pandas/tests/arrays/floating/test_contains.py b/pandas/tests/arrays/floating/test_contains.py index 956642697bf32..5dff4b803d87d 100644 --- a/pandas/tests/arrays/floating/test_contains.py +++ b/pandas/tests/arrays/floating/test_contains.py @@ -3,10 +3,13 @@ import pandas as pd -def test_contains_nan(): +def test_contains_nan(using_nan_is_na): # GH#52840 arr = pd.array(range(5)) / 0 assert np.isnan(arr._data[0]) - assert not arr.isna()[0] + if using_nan_is_na: + assert arr.isna()[0] + else: + assert not arr.isna()[0] assert np.nan in arr diff --git a/pandas/tests/arrays/floating/test_function.py b/pandas/tests/arrays/floating/test_function.py index dffb2a1f6e1f5..e03e8f30197b9 100644 --- a/pandas/tests/arrays/floating/test_function.py +++ b/pandas/tests/arrays/floating/test_function.py @@ -10,10 +10,13 @@ @pytest.mark.parametrize("ufunc", [np.abs, np.sign]) # 
np.sign emits a warning with nans, @pytest.mark.filterwarnings("ignore:invalid value encountered in sign:RuntimeWarning") -def test_ufuncs_single(ufunc): +def test_ufuncs_single(ufunc, using_nan_is_na): a = pd.array([1, 2, -3, pd.NA], dtype="Float64") result = ufunc(a) - expected = pd.array(ufunc(a.astype(float)), dtype="Float64") + np_res = ufunc(a.astype(float)) + np_res = np_res.astype(object) + np_res[a.isna()] = pd.NA + expected = pd.array(np_res, dtype="Float64") tm.assert_extension_array_equal(result, expected) s = pd.Series(a) @@ -23,45 +26,66 @@ def test_ufuncs_single(ufunc): @pytest.mark.parametrize("ufunc", [np.log, np.exp, np.sin, np.cos, np.sqrt]) -def test_ufuncs_single_float(ufunc): +def test_ufuncs_single_float(ufunc, using_nan_is_na): a = pd.array([1.0, 0.2, 3.0, pd.NA], dtype="Float64") with np.errstate(invalid="ignore"): result = ufunc(a) - expected = pd.array(ufunc(a.astype(float)), dtype="Float64") + np_res = ufunc(a.astype(float)) + np_res = np_res.astype(object) + np_res[a.isna()] = pd.NA + expected = pd.array(np_res, dtype="Float64") tm.assert_extension_array_equal(result, expected) s = pd.Series(a) with np.errstate(invalid="ignore"): result = ufunc(s) - expected = pd.Series(ufunc(s.astype(float)), dtype="Float64") + np_res = ufunc(s.astype(float)) + np_res = np_res.astype(object) + np_res[a.isna()] = pd.NA + expected = pd.Series(np_res, dtype="Float64") tm.assert_series_equal(result, expected) @pytest.mark.parametrize("ufunc", [np.add, np.subtract]) -def test_ufuncs_binary_float(ufunc): +def test_ufuncs_binary_float(ufunc, using_nan_is_na): # two FloatingArrays a = pd.array([1, 0.2, -3, pd.NA], dtype="Float64") result = ufunc(a, a) - expected = pd.array(ufunc(a.astype(float), a.astype(float)), dtype="Float64") + np_res = ufunc(a.astype(float), a.astype(float)) + np_res = np_res.astype(object) + np_res[a.isna()] = pd.NA + expected = pd.array(np_res, dtype="Float64") tm.assert_extension_array_equal(result, expected) # FloatingArray with numpy 
array arr = np.array([1, 2, 3, 4]) result = ufunc(a, arr) - expected = pd.array(ufunc(a.astype(float), arr), dtype="Float64") + np_res = ufunc(a.astype(float), arr) + np_res = np_res.astype(object) + np_res[a.isna()] = pd.NA + expected = pd.array(np_res, dtype="Float64") tm.assert_extension_array_equal(result, expected) result = ufunc(arr, a) - expected = pd.array(ufunc(arr, a.astype(float)), dtype="Float64") + np_res = ufunc(arr, a.astype(float)) + np_res = np_res.astype(object) + np_res[a.isna()] = pd.NA + expected = pd.array(np_res, dtype="Float64") tm.assert_extension_array_equal(result, expected) # FloatingArray with scalar result = ufunc(a, 1) - expected = pd.array(ufunc(a.astype(float), 1), dtype="Float64") + np_res = ufunc(a.astype(float), 1) + np_res = np_res.astype(object) + np_res[a.isna()] = pd.NA + expected = pd.array(np_res, dtype="Float64") tm.assert_extension_array_equal(result, expected) result = ufunc(1, a) - expected = pd.array(ufunc(1, a.astype(float)), dtype="Float64") + np_res = ufunc(1, a.astype(float)) + np_res = np_res.astype(object) + np_res[a.isna()] = pd.NA + expected = pd.array(np_res, dtype="Float64") tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/floating/test_to_numpy.py b/pandas/tests/arrays/floating/test_to_numpy.py index e954cecba417a..fc9e260923d32 100644 --- a/pandas/tests/arrays/floating/test_to_numpy.py +++ b/pandas/tests/arrays/floating/test_to_numpy.py @@ -7,18 +7,23 @@ @pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) -def test_to_numpy(box): +def test_to_numpy(box, using_nan_is_na): con = pd.Series if box else pd.array # default (with or without missing values) -> object dtype arr = con([0.1, 0.2, 0.3], dtype="Float64") result = arr.to_numpy() expected = np.array([0.1, 0.2, 0.3], dtype="float64") + # TODO: should this be object with `not using_nan_is_na` to avoid + # values-dependent behavior? 
tm.assert_numpy_array_equal(result, expected) arr = con([0.1, 0.2, None], dtype="Float64") result = arr.to_numpy() - expected = np.array([0.1, 0.2, np.nan], dtype="float64") + if using_nan_is_na: + expected = np.array([0.1, 0.2, np.nan], dtype="float64") + else: + expected = np.array([0.1, 0.2, pd.NA], dtype=object) tm.assert_numpy_array_equal(result, expected) @@ -81,11 +86,18 @@ def test_to_numpy_na_value(box): tm.assert_numpy_array_equal(result, expected) -def test_to_numpy_na_value_with_nan(): +def test_to_numpy_na_value_with_nan(using_nan_is_na): # array with both NaN and NA -> only fill NA with `na_value` - arr = FloatingArray(np.array([0.0, np.nan, 0.0]), np.array([False, False, True])) + mask = np.array([False, False, True]) + if using_nan_is_na: + mask[1] = True + arr = FloatingArray(np.array([0.0, np.nan, 0.0]), mask) result = arr.to_numpy(dtype="float64", na_value=-1) - expected = np.array([0.0, np.nan, -1.0], dtype="float64") + if using_nan_is_na: + # the NaN passed to the constructor is considered as NA + expected = np.array([0.0, -1.0, -1.0], dtype="float64") + else: + expected = np.array([0.0, np.nan, -1.0], dtype="float64") tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index aeceb9b8a3cb1..e16ab6f23b417 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -52,13 +52,16 @@ def test_div(dtype): @pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)]) -def test_divide_by_zero(zero, negative): +def test_divide_by_zero(zero, negative, using_nan_is_na): # https://github.com/pandas-dev/pandas/issues/27398, GH#22793 a = pd.array([0, 1, -1, None], dtype="Int64") result = a / zero + exp_mask = np.array([False, False, False, True]) + if using_nan_is_na: + exp_mask[0] = True expected = FloatingArray( np.array([np.nan, np.inf, -np.inf, 1], dtype="float64"), - 
np.array([False, False, False, True]), + exp_mask, ) if negative: expected *= -1 @@ -99,7 +102,7 @@ def test_mod(dtype): tm.assert_extension_array_equal(result, expected) -def test_pow_scalar(): +def test_pow_scalar(using_nan_is_na): a = pd.array([-1, 0, 1, None, 2], dtype="Int64") result = a**0 expected = pd.array([1, 1, 1, 1, 1], dtype="Int64") @@ -114,10 +117,13 @@ def test_pow_scalar(): tm.assert_extension_array_equal(result, expected) result = a**np.nan - expected = FloatingArray( - np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64"), - np.array([False, False, False, True, False]), - ) + if using_nan_is_na: + expected = expected.astype("Float64") + else: + expected = FloatingArray( + np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64"), + np.array([False, False, False, True, False]), + ) tm.assert_extension_array_equal(result, expected) # reversed @@ -136,10 +142,13 @@ def test_pow_scalar(): tm.assert_extension_array_equal(result, expected) result = np.nan**a - expected = FloatingArray( - np.array([1, np.nan, np.nan, np.nan], dtype="float64"), - np.array([False, False, True, False]), - ) + if using_nan_is_na: + expected = expected.astype("Float64") + else: + expected = FloatingArray( + np.array([1, np.nan, np.nan, np.nan], dtype="float64"), + np.array([False, False, True, False]), + ) tm.assert_extension_array_equal(result, expected) @@ -212,7 +221,7 @@ def test_error_invalid_values(data, all_arithmetic_operators): # TODO test unsigned overflow -def test_arith_coerce_scalar(data, all_arithmetic_operators): +def test_arith_coerce_scalar(data, all_arithmetic_operators, using_nan_is_na): op = tm.get_op_from_name(all_arithmetic_operators) s = pd.Series(data) other = 0.01 @@ -220,9 +229,11 @@ def test_arith_coerce_scalar(data, all_arithmetic_operators): result = op(s, other) expected = op(s.astype(float), other) expected = expected.astype("Float64") + if not using_nan_is_na: + expected[s.isna()] = pd.NA # rmod results in NaN that wasn't NA in 
original nullable Series -> unmask it - if all_arithmetic_operators == "__rmod__": + if all_arithmetic_operators == "__rmod__" and not using_nan_is_na: mask = (s == 0).fillna(False).to_numpy(bool) expected.array._mask[mask] = False diff --git a/pandas/tests/arrays/integer/test_construction.py b/pandas/tests/arrays/integer/test_construction.py index 8eaa9ace027c9..ab52fbec45f79 100644 --- a/pandas/tests/arrays/integer/test_construction.py +++ b/pandas/tests/arrays/integer/test_construction.py @@ -26,14 +26,20 @@ def test_uses_pandas_na(): assert a[1] is pd.NA -def test_from_dtype_from_float(data): +def test_from_dtype_from_float(data, using_nan_is_na): # construct from our dtype & string dtype dtype = data.dtype # from float expected = pd.Series(data) - result = pd.Series(data.to_numpy(na_value=np.nan, dtype="float"), dtype=str(dtype)) - tm.assert_series_equal(result, expected) + arr = data.to_numpy(na_value=np.nan, dtype="float") + if using_nan_is_na: + result = pd.Series(arr, dtype=str(dtype)) + tm.assert_series_equal(result, expected) + else: + msg = "Cannot cast NaN value to Integer dtype" + with pytest.raises(ValueError, match=msg): + pd.Series(arr, dtype=str(dtype)) # from int / list expected = pd.Series(data) @@ -116,10 +122,15 @@ def test_integer_array_constructor_copy(): ([np.nan, np.nan], [np.nan, np.nan]), ], ) -def test_to_integer_array_none_is_nan(a, b): - result = pd.array(a, dtype="Int64") - expected = pd.array(b, dtype="Int64") - tm.assert_extension_array_equal(result, expected) +def test_to_integer_array_none_is_nan(a, b, using_nan_is_na): + if using_nan_is_na: + result = pd.array(a, dtype="Int64") + expected = pd.array(b, dtype="Int64") + tm.assert_extension_array_equal(result, expected) + else: + msg = "Cannot cast NaN value to Integer dtype" + with pytest.raises(ValueError, match=msg): + pd.array(b, dtype="Int64") @pytest.mark.parametrize( @@ -139,6 +150,7 @@ def test_to_integer_array_error(values): # error in converting existing arrays to 
IntegerArrays msg = "|".join( [ + "cannot convert float NaN to integer", # with not using_nan_is_na r"cannot be converted to IntegerDtype", r"invalid literal for int\(\) with base 10:", r"values must be a 1D list-like", @@ -214,8 +226,16 @@ def test_to_integer_array_str(): ], ) def test_to_integer_array_bool( - constructor, bool_values, int_values, target_dtype, expected_dtype + constructor, bool_values, int_values, target_dtype, expected_dtype, using_nan_is_na ): + if not using_nan_is_na and np.isnan(bool_values[-1]): + msg = "Cannot cast NaN value to Integer dtype" + with pytest.raises(ValueError, match=msg): + constructor(bool_values, dtype=target_dtype) + with pytest.raises(ValueError, match=msg): + pd.array(int_values, dtype=target_dtype) + return + result = constructor(bool_values, dtype=target_dtype) assert result.dtype == expected_dtype expected = pd.array(int_values, dtype=target_dtype) @@ -230,8 +250,16 @@ def test_to_integer_array_bool( (np.array([1, np.nan]), "int8", Int8Dtype), ], ) -def test_to_integer_array(values, to_dtype, result_dtype): +def test_to_integer_array(values, to_dtype, result_dtype, using_nan_is_na): # convert existing arrays to IntegerArrays + if not using_nan_is_na and np.isnan(values[-1]): + msg = "Cannot cast NaN value to Integer dtype" + with pytest.raises(ValueError, match=msg): + IntegerArray._from_sequence(values, dtype=to_dtype) + with pytest.raises(ValueError, match=msg): + pd.array(values, dtype=result_dtype()) + return + result = IntegerArray._from_sequence(values, dtype=to_dtype) assert result.dtype == result_dtype() expected = pd.array(values, dtype=result_dtype()) diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py index 77a0dd12534cc..892a7a2be7b5c 100644 --- a/pandas/tests/arrays/integer/test_function.py +++ b/pandas/tests/arrays/integer/test_function.py @@ -9,24 +9,33 @@ @pytest.mark.parametrize("ufunc", [np.abs, np.sign]) # np.sign emits a warning with nans, 
@pytest.mark.filterwarnings("ignore:invalid value encountered in sign:RuntimeWarning") -def test_ufuncs_single_int(ufunc): +def test_ufuncs_single_int(ufunc, using_nan_is_na): a = pd.array([1, 2, -3, pd.NA], dtype="Int64") result = ufunc(a) - expected = pd.array(ufunc(a.astype(float)), dtype="Int64") + np_res = ufunc(a.astype(float)) + np_res = np_res.astype(object) + np_res[-1] = pd.NA + expected = pd.array(np_res, dtype="Int64") tm.assert_extension_array_equal(result, expected) s = pd.Series(a) result = ufunc(s) - expected = pd.Series(pd.array(ufunc(a.astype(float)), dtype="Int64")) + np_res = ufunc(a.astype(float)) + np_res = np_res.astype(object) + np_res[-1] = pd.NA + expected = pd.Series(pd.array(np_res, dtype="Int64")) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("ufunc", [np.log, np.exp, np.sin, np.cos, np.sqrt]) -def test_ufuncs_single_float(ufunc): +def test_ufuncs_single_float(ufunc, using_nan_is_na): a = pd.array([1, 2, -3, pd.NA], dtype="Int64") with np.errstate(invalid="ignore"): result = ufunc(a) - expected = FloatingArray(ufunc(a.astype(float)), mask=a._mask) + if using_nan_is_na: + expected = pd.array(ufunc(a.astype(float)), dtype="Float64") + else: + expected = FloatingArray(ufunc(a.astype(float)), mask=a._mask) tm.assert_extension_array_equal(result, expected) s = pd.Series(a) @@ -41,34 +50,56 @@ def test_ufuncs_binary_int(ufunc): # two IntegerArrays a = pd.array([1, 2, -3, pd.NA], dtype="Int64") result = ufunc(a, a) - expected = pd.array(ufunc(a.astype(float), a.astype(float)), dtype="Int64") + np_res = ufunc(a.astype(float), a.astype(float)) + np_res = np_res.astype(object) + np_res[a.isna()] = pd.NA + expected = pd.array(np_res, dtype="Int64") tm.assert_extension_array_equal(result, expected) # IntegerArray with numpy array arr = np.array([1, 2, 3, 4]) result = ufunc(a, arr) - expected = pd.array(ufunc(a.astype(float), arr), dtype="Int64") + np_res = ufunc(a.astype(float), arr) + np_res = np_res.astype(object) + 
np_res[a.isna()] = pd.NA + expected = pd.array(np_res, dtype="Int64") tm.assert_extension_array_equal(result, expected) result = ufunc(arr, a) - expected = pd.array(ufunc(arr, a.astype(float)), dtype="Int64") + np_res = ufunc(arr, a.astype(float)) + np_res = np_res.astype(object) + np_res[a.isna()] = pd.NA + expected = pd.array(np_res, dtype="Int64") tm.assert_extension_array_equal(result, expected) # IntegerArray with scalar result = ufunc(a, 1) - expected = pd.array(ufunc(a.astype(float), 1), dtype="Int64") + np_res = ufunc(a.astype(float), 1) + np_res = np_res.astype(object) + np_res[a.isna()] = pd.NA + expected = pd.array(np_res, dtype="Int64") tm.assert_extension_array_equal(result, expected) result = ufunc(1, a) - expected = pd.array(ufunc(1, a.astype(float)), dtype="Int64") + np_res = ufunc(1, a.astype(float)) + np_res = np_res.astype(object) + np_res[a.isna()] = pd.NA + expected = pd.array(np_res, dtype="Int64") tm.assert_extension_array_equal(result, expected) -def test_ufunc_binary_output(): - a = pd.array([1, 2, np.nan]) +def test_ufunc_binary_output(using_nan_is_na): + a = pd.array([1, 2, pd.NA], dtype="Int64") result = np.modf(a) - expected = np.modf(a.to_numpy(na_value=np.nan, dtype="float")) - expected = (pd.array(expected[0]), pd.array(expected[1])) + np_res = np.modf(a.to_numpy(na_value=np.nan, dtype="float")) + + np_res = list(np_res) + np_res[0] = np_res[0].astype(object) + np_res[1] = np_res[1].astype(object) + np_res[0][-1] = pd.NA + np_res[1][-1] = pd.NA + + expected = (pd.array(np_res[0]), pd.array(np_res[1])) assert isinstance(result, tuple) assert len(result) == 2 diff --git a/pandas/tests/arrays/integer/test_reduction.py b/pandas/tests/arrays/integer/test_reduction.py index 1c91cd25ba69c..f456d06a49fe5 100644 --- a/pandas/tests/arrays/integer/test_reduction.py +++ b/pandas/tests/arrays/integer/test_reduction.py @@ -96,8 +96,8 @@ def test_groupby_reductions(op, expected): ["median", Series([2, 2], index=["B", "C"], dtype="Float64")], 
["var", Series([2, 2], index=["B", "C"], dtype="Float64")], ["std", Series([2**0.5, 2**0.5], index=["B", "C"], dtype="Float64")], - ["skew", Series([pd.NA, pd.NA], index=["B", "C"], dtype="Float64")], - ["kurt", Series([pd.NA, pd.NA], index=["B", "C"], dtype="Float64")], + ["skew", Series([np.nan, pd.NA], index=["B", "C"], dtype="Float64")], + ["kurt", Series([np.nan, pd.NA], index=["B", "C"], dtype="Float64")], ["any", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")], ["all", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")], ], diff --git a/pandas/tests/arrays/interval/test_interval_pyarrow.py b/pandas/tests/arrays/interval/test_interval_pyarrow.py index ef8701be81e2b..c8692bb98f346 100644 --- a/pandas/tests/arrays/interval/test_interval_pyarrow.py +++ b/pandas/tests/arrays/interval/test_interval_pyarrow.py @@ -51,7 +51,7 @@ def test_arrow_array(): pa.array(intervals, type=ArrowIntervalType(pa.float64(), "left")) -def test_arrow_array_missing(): +def test_arrow_array_missing(using_nan_is_na): pa = pytest.importorskip("pyarrow") from pandas.core.arrays.arrow.extension_types import ArrowIntervalType diff --git a/pandas/tests/arrays/masked/test_function.py b/pandas/tests/arrays/masked/test_function.py index b4b1761217826..38a9488e5707d 100644 --- a/pandas/tests/arrays/masked/test_function.py +++ b/pandas/tests/arrays/masked/test_function.py @@ -38,17 +38,18 @@ def numpy_dtype(data): def test_round(data, numpy_dtype): # No arguments result = data.round() - expected = pd.array( - np.round(data.to_numpy(dtype=numpy_dtype, na_value=None)), dtype=data.dtype - ) + np_result = np.round(data.to_numpy(dtype=numpy_dtype, na_value=None)) + exp_np = np_result.astype(object) + exp_np[data.isna()] = pd.NA + expected = pd.array(exp_np, dtype=data.dtype) tm.assert_extension_array_equal(result, expected) # Decimals argument result = data.round(decimals=2) - expected = pd.array( - np.round(data.to_numpy(dtype=numpy_dtype, na_value=None), 
decimals=2), - dtype=data.dtype, - ) + np_result = np.round(data.to_numpy(dtype=numpy_dtype, na_value=None), decimals=2) + exp_np = np_result.astype(object) + exp_np[data.isna()] = pd.NA + expected = pd.array(exp_np, dtype=data.dtype) tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index f10ebda94dc6a..1a71f6c41c4f1 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -525,7 +525,8 @@ def test_astype_float(dtype, any_float_dtype): # Don't compare arrays (37974) ser = pd.Series(["1.1", pd.NA, "3.3"], dtype=dtype) result = ser.astype(any_float_dtype) - expected = pd.Series([1.1, np.nan, 3.3], dtype=any_float_dtype) + item = np.nan if isinstance(result.dtype, np.dtype) else pd.NA + expected = pd.Series([1.1, item, 3.3], dtype=any_float_dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 8890b4509d954..3d075857c3fd9 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -353,7 +353,10 @@ def test_array_multiindex_raises(): ), ], ) -def test_to_numpy(arr, expected, zero_copy, index_or_series_or_array): +def test_to_numpy(arr, expected, zero_copy, index_or_series_or_array, using_nan_is_na): + if not using_nan_is_na and arr[-1] is pd.NA: + expected = np.array([0, pd.NA], dtype=object) + box = index_or_series_or_array with tm.assert_produces_warning(None): diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py index 7f094db6ea524..6e55531bbce8f 100644 --- a/pandas/tests/base/test_unique.py +++ b/pandas/tests/base/test_unique.py @@ -30,7 +30,7 @@ def test_unique(index_or_series_obj): @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") @pytest.mark.parametrize("null_obj", [np.nan, None]) -def test_unique_null(null_obj, 
index_or_series_obj): +def test_unique_null(null_obj, index_or_series_obj, using_nan_is_na): obj = index_or_series_obj if not allow_na_ops(obj): @@ -39,6 +39,12 @@ def test_unique_null(null_obj, index_or_series_obj): pytest.skip("Test doesn't make sense on empty data") elif isinstance(obj, pd.MultiIndex): pytest.skip(f"MultiIndex can't hold '{null_obj}'") + elif ( + null_obj is not None + and not using_nan_is_na + and obj.dtype in ["Int64", "UInt16", "Float32"] + ): + pytest.skip("NaN is not a valid NA for this dtype.") values = obj._values values[0:2] = null_obj diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index 883c0ba8e35b6..d01fe4183eeea 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -31,7 +31,7 @@ def test_can_hold_na_valid(self, data): # GH-20761 assert data._can_hold_na is True - def test_contains(self, data, data_missing): + def test_contains(self, data, data_missing, using_nan_is_na): # GH-37867 # Tests for membership checks. Membership checks for nan-likes is tricky and # the settled on rule is: `nan_like in arr` is True if nan_like is @@ -55,7 +55,21 @@ def test_contains(self, data, data_missing): # type check for e.g. 
two instances of Decimal("NAN") continue assert na_value_obj not in data - assert na_value_obj not in data_missing + if ( + using_nan_is_na + and isinstance(na_value_obj, float) + and isinstance( + data, + ( + pd.core.arrays.BaseMaskedArray, + pd.core.arrays.ArrowExtensionArray, + ), + ) + ): + # TODO: wrong place for this override + assert na_value_obj in data_missing + else: + assert na_value_obj not in data_missing def test_memory_usage(self, data): s = pd.Series(data) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 6e11b54e3dfee..d2d65c4b983a7 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -296,14 +296,14 @@ def test_compare_scalar(self, data, comparison_op): self._compare_other(ser, data, comparison_op, data[0]) @pytest.mark.parametrize("na_action", [None, "ignore"]) - def test_map(self, data_missing, na_action): + def test_map(self, data_missing, na_action, using_nan_is_na): if data_missing.dtype.kind in "mM": result = data_missing.map(lambda x: x, na_action=na_action) expected = data_missing.to_numpy(dtype=object) tm.assert_numpy_array_equal(result, expected) else: result = data_missing.map(lambda x: x, na_action=na_action) - if data_missing.dtype == "float32[pyarrow]": + if data_missing.dtype == "float32[pyarrow]" and using_nan_is_na: # map roundtrips through objects, which converts to float64 expected = data_missing.to_numpy(dtype="float64", na_value=np.nan) else: @@ -701,7 +701,7 @@ def test_setitem_preserves_views(self, data): @pytest.mark.parametrize("dtype_backend", ["pyarrow", no_default]) @pytest.mark.parametrize("engine", ["c", "python"]) - def test_EA_types(self, engine, data, dtype_backend, request): + def test_EA_types(self, engine, data, dtype_backend, request, using_nan_is_na): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_decimal(pa_dtype): request.applymarker( @@ -722,7 +722,10 @@ def test_EA_types(self, engine, data, dtype_backend, request): 
pytest.mark.xfail(reason="CSV parsers don't correctly handle binary") ) df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=str(data.dtype))}) - csv_output = df.to_csv(index=False, na_rep=np.nan) + if not using_nan_is_na: + csv_output = df.to_csv(index=False, na_rep="NA") + else: + csv_output = df.to_csv(index=False, na_rep=np.nan) if pa.types.is_binary(pa_dtype): csv_output = BytesIO(csv_output) else: @@ -1514,7 +1517,8 @@ def test_pickle_roundtrip(data): def test_astype_from_non_pyarrow(data): # GH49795 - pd_array = data._pa_array.to_pandas().array + np_arr = data.to_numpy() + pd_array = pd.array(np_arr, dtype=np_arr.dtype) result = pd_array.astype(data.dtype) assert not isinstance(pd_array.dtype, ArrowDtype) assert isinstance(result.dtype, ArrowDtype) @@ -1536,7 +1540,7 @@ def test_astype_errors_ignore(): tm.assert_frame_equal(result, expected) -def test_to_numpy_with_defaults(data): +def test_to_numpy_with_defaults(data, using_nan_is_na): # GH49973 result = data.to_numpy() @@ -1548,20 +1552,23 @@ def test_to_numpy_with_defaults(data): else: expected = np.array(data._pa_array) - if data._hasna and not is_numeric_dtype(data.dtype): + if data._hasna and (not is_numeric_dtype(data.dtype) or not using_nan_is_na): expected = expected.astype(object) expected[pd.isna(data)] = pd.NA tm.assert_numpy_array_equal(result, expected) -def test_to_numpy_int_with_na(): +def test_to_numpy_int_with_na(using_nan_is_na): # GH51227: ensure to_numpy does not convert int to float data = [1, None] arr = pd.array(data, dtype="int64[pyarrow]") result = arr.to_numpy() - expected = np.array([1, np.nan]) - assert isinstance(result[0], float) + if not using_nan_is_na: + expected = np.array([1, pd.NA], dtype=object) + else: + expected = np.array([1, np.nan]) + assert isinstance(result[0], float) tm.assert_numpy_array_equal(result, expected) @@ -3519,10 +3526,13 @@ def test_cast_dictionary_different_value_dtype(arrow_type): assert result.dtypes.iloc[0] == data_type -def 
test_map_numeric_na_action(): +def test_map_numeric_na_action(using_nan_is_na): ser = pd.Series([32, 40, None], dtype="int64[pyarrow]") result = ser.map(lambda x: 42, na_action="ignore") - expected = pd.Series([42.0, 42.0, np.nan], dtype="float64") + if not using_nan_is_na: + expected = pd.Series([42.0, 42.0, pd.NA], dtype="object") + else: + expected = pd.Series([42.0, 42.0, np.nan], dtype="float64") tm.assert_series_equal(result, expected) @@ -3628,3 +3638,56 @@ def test_date_vs_timestamp_array_comparison(): assert not (ser3 == ser).any() assert (ser != ser3).all() assert (ser3 != ser).all() + + +def test_ops_with_nan_is_na(using_nan_is_na): + # GH#61732 + ser = pd.Series([-1, 0, 1], dtype="int64[pyarrow]") + + result = ser - np.nan + if using_nan_is_na: + assert result.isna().all() + else: + assert not result.isna().any() + + result = ser * np.nan + if using_nan_is_na: + assert result.isna().all() + else: + assert not result.isna().any() + + result = ser / 0 + if using_nan_is_na: + assert result.isna()[1] + else: + assert not result.isna()[1] + + +def test_setitem_float_nan_is_na(using_nan_is_na): + # GH#61732 + ser = pd.Series([-1, 0, 1], dtype="int64[pyarrow]") + + if using_nan_is_na: + ser[1] = np.nan + assert ser.isna()[1] + else: + msg = "Could not convert nan with type float: tried to convert to int64" + with pytest.raises(pa.lib.ArrowInvalid, match=msg): + ser[1] = np.nan + + ser = pd.Series([-1, np.nan, 1], dtype="float64[pyarrow]") + if using_nan_is_na: + assert ser.isna()[1] + assert ser[1] is pd.NA + + ser[1] = np.nan + assert ser[1] is pd.NA + + else: + assert not ser.isna()[1] + assert isinstance(ser[1], float) + assert np.isnan(ser[1]) + + ser[2] = np.nan + assert isinstance(ser[2], float) + assert np.isnan(ser[2]) diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index c6df731069fee..a6373ed263499 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -172,20 
+172,23 @@ def skip_if_doesnt_support_2d(self, dtype, request): # override becomes unnecessary. @pytest.mark.parametrize("na_action", [None, "ignore"]) - def test_map(self, data_missing, na_action): + def test_map(self, data_missing, na_action, using_nan_is_na): result = data_missing.map(lambda x: x, na_action=na_action) - if data_missing.dtype == Float32Dtype(): + if data_missing.dtype == Float32Dtype() and using_nan_is_na: # map roundtrips through objects, which converts to float64 expected = data_missing.to_numpy(dtype="float64", na_value=np.nan) else: expected = data_missing.to_numpy() tm.assert_numpy_array_equal(result, expected) - def test_map_na_action_ignore(self, data_missing_for_sorting): + def test_map_na_action_ignore(self, data_missing_for_sorting, using_nan_is_na): zero = data_missing_for_sorting[2] result = data_missing_for_sorting.map(lambda x: zero, na_action="ignore") if data_missing_for_sorting.dtype.kind == "b": expected = np.array([False, pd.NA, False], dtype=object) + elif not using_nan_is_na: + # TODO: would we prefer to get NaN in this case to get a non-object? 
+ expected = np.array([zero, pd.NA, zero], dtype=object) else: expected = np.array([zero, np.nan, zero]) tm.assert_numpy_array_equal(result, expected) @@ -350,9 +353,11 @@ def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool): expected = pd.Series( pd.array( getattr(ser.astype("float64"), op_name)(skipna=skipna), - dtype=expected_dtype, + dtype="Float64", ) ) + expected[np.isnan(expected)] = pd.NA + expected = expected.astype(expected_dtype) tm.assert_series_equal(result, expected) def test_loc_setitem_with_expansion_preserves_ea_index_dtype(self, data, request): diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 39dde0ef7be3f..9fd1f3133c2f5 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -369,12 +369,22 @@ def test_astype_extension_dtypes_1d(self, any_int_ea_dtype): tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) @pytest.mark.parametrize("dtype", ["category", "Int64"]) - def test_astype_extension_dtypes_duplicate_col(self, dtype): + def test_astype_extension_dtypes_duplicate_col(self, dtype, using_nan_is_na): # GH#24704 a1 = Series([0, np.nan, 4], name="a") a2 = Series([np.nan, 3, 5], name="a") df = concat([a1, a2], axis=1) + if dtype == "Int64" and not using_nan_is_na: + msg = "Cannot cast NaN value to Integer dtype" + with pytest.raises(ValueError, match=msg): + df.astype(dtype) + with pytest.raises(ValueError, match=msg): + a1.astype(dtype) + with pytest.raises(ValueError, match=msg): + a2.astype(dtype) + return + result = df.astype(dtype) expected = concat([a1.astype(dtype), a2.astype(dtype)], axis=1) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index d79b836673225..e90786a43c483 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -59,7 
+59,7 @@ def test_convert_dtypes_retain_column_names(self): tm.assert_index_equal(result.columns, df.columns) assert result.columns.name == "cols" - def test_pyarrow_dtype_backend(self): + def test_pyarrow_dtype_backend(self, using_nan_is_na): pa = pytest.importorskip("pyarrow") df = pd.DataFrame( { @@ -73,6 +73,8 @@ def test_pyarrow_dtype_backend(self): } ) result = df.convert_dtypes(dtype_backend="pyarrow") + + item = None if using_nan_is_na else np.nan expected = pd.DataFrame( { "a": pd.arrays.ArrowExtensionArray( @@ -80,7 +82,7 @@ def test_pyarrow_dtype_backend(self): ), "b": pd.arrays.ArrowExtensionArray(pa.array(["x", "y", None])), "c": pd.arrays.ArrowExtensionArray(pa.array([True, False, None])), - "d": pd.arrays.ArrowExtensionArray(pa.array([None, 100.5, 200.0])), + "d": pd.arrays.ArrowExtensionArray(pa.array([item, 100.5, 200.0])), "e": pd.arrays.ArrowExtensionArray( pa.array( [ diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 9e302dc5f94ee..41f72d17ebef7 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -643,7 +643,7 @@ def test_replace_mixed3(self): def test_replace_nullable_int_with_string_doesnt_cast(self): # GH#25438 don't cast df['a'] to float64 - df = DataFrame({"a": [1, 2, 3, np.nan], "b": ["some", "strings", "here", "he"]}) + df = DataFrame({"a": [1, 2, 3, pd.NA], "b": ["some", "strings", "here", "he"]}) df["a"] = df["a"].astype("Int64") res = df.replace("", np.nan) @@ -681,7 +681,7 @@ def test_replace_simple_nested_dict_with_nonexistent_value(self): def test_replace_NA_with_None(self): # gh-45601 - df = DataFrame({"value": [42, None]}).astype({"value": "Int64"}) + df = DataFrame({"value": [42, pd.NA]}, dtype="Int64") result = df.replace({pd.NA: None}) expected = DataFrame({"value": [42, None]}, dtype=object) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_reductions.py 
b/pandas/tests/frame/test_reductions.py index a8f0e77df6af5..7c4ce4c67f13d 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -2121,7 +2121,9 @@ def test_fails_on_non_numeric(kernel): ], ) @pytest.mark.parametrize("min_count", [0, 2]) -def test_numeric_ea_axis_1(method, skipna, min_count, any_numeric_ea_dtype): +def test_numeric_ea_axis_1( + method, skipna, min_count, any_numeric_ea_dtype, using_nan_is_na +): # GH 54341 df = DataFrame( { @@ -2168,7 +2170,13 @@ def test_numeric_ea_axis_1(method, skipna, min_count, any_numeric_ea_dtype): result = getattr(df, method)(axis=1, **kwargs) expected = getattr(expected_df, method)(axis=1, **kwargs) if method not in ("idxmax", "idxmin"): - expected = expected.astype(expected_dtype) + if using_nan_is_na: + expected = expected.astype(expected_dtype) + else: + mask = np.isnan(expected) + expected[mask] = 0 + expected = expected.astype(expected_dtype) + expected[mask] = pd.NA tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py index 28cb25b515ed2..815513fe96009 100644 --- a/pandas/tests/groupby/methods/test_quantile.py +++ b/pandas/tests/groupby/methods/test_quantile.py @@ -255,7 +255,9 @@ def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only): def test_groupby_quantile_NA_float(any_float_dtype): # GH#42849 - df = DataFrame({"x": [1, 1], "y": [0.2, np.nan]}, dtype=any_float_dtype) + dtype = pd.Series([], dtype=any_float_dtype).dtype + item = np.nan if isinstance(dtype, np.dtype) else pd.NA + df = DataFrame({"x": [1, 1], "y": [0.2, item]}, dtype=any_float_dtype) result = df.groupby("x")["y"].quantile(0.5) exp_index = Index([1.0], dtype=any_float_dtype, name="x") @@ -353,7 +355,7 @@ def test_groupby_quantile_allNA_column(dtype): df = DataFrame({"x": [1, 1], "y": [pd.NA] * 2}, dtype=dtype) result = df.groupby("x")["y"].quantile(0.5) expected = pd.Series( - [np.nan], dtype=dtype, 
index=Index([1.0], dtype=dtype), name="y" + [pd.NA], dtype=dtype, index=Index([1.0], dtype=dtype), name="y" ) expected.index.name = "x" tm.assert_series_equal(expected, result) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index cac18c46341b5..5c8be57b10013 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -186,9 +186,10 @@ def test_masked_kleene_logic(all_boolean_reductions, skipna, data): ) def test_masked_mixed_types(dtype1, dtype2, exp_col1, exp_col2): # GH#37506 - data = [1.0, np.nan] + data1 = [1.0, np.nan] if dtype1.startswith("f") else [1.0, pd.NA] + data2 = [1.0, np.nan] if dtype2.startswith("f") else [1.0, pd.NA] df = DataFrame( - {"col1": pd.array(data, dtype=dtype1), "col2": pd.array(data, dtype=dtype2)} + {"col1": pd.array(data1, dtype=dtype1), "col2": pd.array(data2, dtype=dtype2)} ) result = df.groupby([1, 1]).agg("all", skipna=False) @@ -379,8 +380,10 @@ def test_first_last_skipna(any_real_nullable_dtype, sort, skipna, how): df = DataFrame( { "a": [2, 1, 1, 2, 3, 3], - "b": [na_value, 3.0, na_value, 4.0, np.nan, np.nan], - "c": [na_value, 3.0, na_value, 4.0, np.nan, np.nan], + # TODO: test that has mixed na_value and NaN either working for + # float or raising for int? 
+ "b": [na_value, 3.0, na_value, 4.0, na_value, na_value], + "c": [na_value, 3.0, na_value, 4.0, na_value, na_value], }, dtype=any_real_nullable_dtype, ) diff --git a/pandas/tests/indexes/multi/test_constructors.py b/pandas/tests/indexes/multi/test_constructors.py index cf5fc2977a28f..c134e44681122 100644 --- a/pandas/tests/indexes/multi/test_constructors.py +++ b/pandas/tests/indexes/multi/test_constructors.py @@ -671,7 +671,7 @@ def test_from_frame_missing_values_multiIndex(): multi_indexed = MultiIndex.from_frame(df) expected = MultiIndex.from_arrays( [ - Series([1, 2, None]).astype("Int64"), + Series([1, 2, None], dtype="Int64"), pd.Float64Dtype().__from_arrow__(pa.array([0.2, None, None])), ], names=["a", "b"], diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py index b29f783203177..2f37b15ca74f5 100644 --- a/pandas/tests/indexes/numeric/test_indexing.py +++ b/pandas/tests/indexes/numeric/test_indexing.py @@ -339,35 +339,50 @@ def test_get_loc_masked_na(self, any_numeric_ea_and_arrow_dtype): with pytest.raises(KeyError, match="NA"): idx.get_loc(NA) - def test_get_loc_masked_na_and_nan(self): + def test_get_loc_masked_na_and_nan(self, using_nan_is_na): # GH#39133 - idx = Index( - FloatingArray( - np.array([1, 2, 1, np.nan]), mask=np.array([False, False, True, False]) - ) - ) - result = idx.get_loc(NA) - assert result == 2 - result = idx.get_loc(np.nan) - assert result == 3 + mask = np.array([False, False, True, False]) + if using_nan_is_na: + mask[-1] = True + + idx = Index(FloatingArray(np.array([1, 2, 1, np.nan]), mask=mask)) + if using_nan_is_na: + # NaN and NA are consistently treated as the same + result = idx.get_loc(NA) + expected = np.array([False, False, True, True]) + tm.assert_numpy_array_equal(result, expected) + result = idx.get_loc(np.nan) + tm.assert_numpy_array_equal(result, expected) + else: + result = idx.get_loc(NA) + assert result == 2 + result = idx.get_loc(np.nan) + assert result == 3 
idx = Index( FloatingArray(np.array([1, 2, 1.0]), mask=np.array([False, False, True])) ) result = idx.get_loc(NA) assert result == 2 - with pytest.raises(KeyError, match="nan"): - idx.get_loc(np.nan) + if using_nan_is_na: + result = idx.get_loc(np.nan) + assert result == 2 + else: + with pytest.raises(KeyError, match="nan"): + idx.get_loc(np.nan) - idx = Index( - FloatingArray( - np.array([1, 2, np.nan]), mask=np.array([False, False, False]) - ) - ) + mask = np.array([False, False, False]) + if using_nan_is_na: + mask[-1] = True + idx = Index(FloatingArray(np.array([1, 2, np.nan]), mask=mask)) result = idx.get_loc(np.nan) assert result == 2 - with pytest.raises(KeyError, match="NA"): - idx.get_loc(NA) + if using_nan_is_na: + result = idx.get_loc(NA) + assert result == 2 + else: + with pytest.raises(KeyError, match="NA"): + idx.get_loc(NA) @pytest.mark.parametrize("val", [4, 2]) def test_get_indexer_masked_na(self, any_numeric_ea_and_arrow_dtype, val): diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 4c8a434462714..17aaf6a4b1108 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -10,6 +10,7 @@ from pandas.errors import IndexingError from pandas import ( + NA, Categorical, CategoricalDtype, DataFrame, @@ -1519,8 +1520,10 @@ def test_iloc_setitem_pure_position_based(self): def test_iloc_nullable_int64_size_1_nan(self): # GH 31861 result = DataFrame({"a": ["test"], "b": [np.nan]}) + + ser = Series([NA], name="b", dtype="Int64") with pytest.raises(TypeError, match="Invalid value"): - result.loc[:, "b"] = result.loc[:, "b"].astype("Int64") + result.loc[:, "b"] = ser def test_iloc_arrow_extension_array(self): # GH#61311 diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 6aa1666f1b912..8e4845a72ec35 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -2987,7 +2987,7 @@ def test_loc_getitem_multiindex_tuple_level(): 
def test_loc_getitem_nullable_index_with_duplicates(): # GH#34497 df = DataFrame( - data=np.array([[1, 2, 3, 4], [5, 6, 7, 8], [1, 2, np.nan, np.nan]]).T, + data=np.array([[1, 2, 3, 4], [5, 6, 7, 8], [1, 2, pd.NA, pd.NA]]).T, columns=["a", "b", "c"], dtype="Int64", ) diff --git a/pandas/tests/io/formats/style/test_highlight.py b/pandas/tests/io/formats/style/test_highlight.py index 5d19e9c14d534..98c1f70f08e89 100644 --- a/pandas/tests/io/formats/style/test_highlight.py +++ b/pandas/tests/io/formats/style/test_highlight.py @@ -15,8 +15,10 @@ @pytest.fixture(params=[(None, "float64"), (NA, "Int64")]) def df(request): # GH 45804 + dtype = request.param[1] + item = np.nan if dtype == "float64" else NA return DataFrame( - {"A": [0, np.nan, 10], "B": [1, request.param[0], 2]}, dtype=request.param[1] + {"A": [0, item, 10], "B": [1, request.param[0], 2]}, dtype=request.param[1] ) diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 63332fe4658e5..10335ff716c1f 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -656,8 +656,10 @@ def test_cut_incorrect_labels(labels): def test_cut_nullable_integer(bins, right, include_lowest): a = np.random.default_rng(2).integers(0, 10, size=50).astype(float) a[::2] = np.nan + b = a.astype(object) + b[::2] = pd.NA result = cut( - pd.array(a, dtype="Int64"), bins, right=right, include_lowest=include_lowest + pd.array(b, dtype="Int64"), bins, right=right, include_lowest=include_lowest ) expected = cut(a, bins, right=right, include_lowest=include_lowest) tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index f69c90ced2828..016b7db3e689a 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -733,9 +733,9 @@ def test_dt_timetz_accessor(self, tz_naive_fixture): "input_series, expected_output", [ 
[["2020-01-01"], [[2020, 1, 3]]], - [[pd.NaT], [[np.nan, np.nan, np.nan]]], + [[pd.NaT], [[None, None, None]]], [["2019-12-31", "2019-12-29"], [[2020, 1, 2], [2019, 52, 7]]], - [["2010-01-01", pd.NaT], [[2009, 53, 5], [np.nan, np.nan, np.nan]]], + [["2010-01-01", pd.NaT], [[2009, 53, 5], [None, None, None]]], # see GH#36032 [["2016-01-08", "2016-01-04"], [[2016, 1, 5], [2016, 1, 1]]], [["2016-01-07", "2016-01-01"], [[2016, 1, 4], [2015, 53, 5]]], diff --git a/pandas/tests/series/methods/test_case_when.py b/pandas/tests/series/methods/test_case_when.py index acfc58bea728e..7cb60a11644a3 100644 --- a/pandas/tests/series/methods/test_case_when.py +++ b/pandas/tests/series/methods/test_case_when.py @@ -2,7 +2,6 @@ import pytest from pandas import ( - NA, DataFrame, Series, array as pd_array, @@ -100,7 +99,7 @@ def test_case_when_multiple_conditions_replacement_extension_dtype(df): (df["a"].gt(1) & df["b"].eq(5), pd_array([1, 2, 3], dtype="Int64")), ], ) - expected = Series([1, 2, NA], dtype="Float64") + expected = Series([1, 2, np.nan], dtype="Float64") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_clip.py b/pandas/tests/series/methods/test_clip.py index 8ed422fc118dc..c1ee7f8c9e008 100644 --- a/pandas/tests/series/methods/test_clip.py +++ b/pandas/tests/series/methods/test_clip.py @@ -43,21 +43,16 @@ def test_clip_types_and_nulls(self): assert list(isna(s)) == list(isna(lower)) assert list(isna(s)) == list(isna(upper)) - def test_series_clipping_with_na_values(self, any_numeric_ea_dtype, nulls_fixture): + def test_series_clipping_with_na_values(self, any_numeric_ea_dtype): # Ensure that clipping method can handle NA values with out failing # GH#40581 - if nulls_fixture is pd.NaT: - # constructor will raise, see - # test_constructor_mismatched_null_nullable_dtype - pytest.skip("See test_constructor_mismatched_null_nullable_dtype") - - ser = Series([nulls_fixture, 1.0, 3.0], dtype=any_numeric_ea_dtype) + ser = Series([pd.NA, 1.0, 
3.0], dtype=any_numeric_ea_dtype) s_clipped_upper = ser.clip(upper=2.0) s_clipped_lower = ser.clip(lower=2.0) - expected_upper = Series([nulls_fixture, 1.0, 2.0], dtype=any_numeric_ea_dtype) - expected_lower = Series([nulls_fixture, 2.0, 3.0], dtype=any_numeric_ea_dtype) + expected_upper = Series([pd.NA, 1.0, 2.0], dtype=any_numeric_ea_dtype) + expected_lower = Series([pd.NA, 2.0, 3.0], dtype=any_numeric_ea_dtype) tm.assert_series_equal(s_clipped_upper, expected_upper) tm.assert_series_equal(s_clipped_lower, expected_lower) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 324e03894e92c..e36baba5e0108 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -182,6 +182,7 @@ def test_convert_dtypes( expected_other, params, using_infer_string, + using_nan_is_na, ): if ( hasattr(data, "dtype") @@ -224,6 +225,16 @@ def test_convert_dtypes( # If convert_string=False and infer_objects=True, we end up with the # default string dtype instead of preserving object for string data expected_dtype = pd.StringDtype(na_value=np.nan) + if ( + not using_nan_is_na + and expected_dtype == "Int64" + and isinstance(data[1], float) + and np.isnan(data[1]) + ): + if params_dict["convert_floating"]: + expected_dtype = "Float64" + else: + expected_dtype = "float64" expected = pd.Series(data, dtype=expected_dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index ecd52b2c8498a..357894cbd0fe3 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -273,14 +273,38 @@ def test_rank_signature(self): with pytest.raises(ValueError, match=msg): s.rank("average") - def test_rank_tie_methods(self, ser, results, dtype, using_infer_string): + def test_rank_tie_methods( + self, ser, results, dtype, using_infer_string, 
using_nan_is_na + ): method, exp = results - if dtype == "int64" or (not using_infer_string and dtype == "str"): + if ( + dtype == "int64" + or ( + # TODO: these can work but need to update ser construction. + dtype in ["int64[pyarrow]", "uint64[pyarrow]", "Int64"] + and not using_nan_is_na + ) + or (not using_infer_string and dtype == "str") + ): pytest.skip("int64/str does not support NaN") ser = ser if dtype is None else ser.astype(dtype) + if dtype in ["float64[pyarrow]", "Float64"] and not using_nan_is_na: + # TODO: use ser.replace(np.nan, NA) once that works + ser[np.isnan(ser.to_numpy(dtype=np.float64, na_value=np.nan))] = NA + mask = np.isnan(exp) + exp = exp.astype(object) + exp[mask] = NA + result = ser.rank(method=method) - tm.assert_series_equal(result, Series(exp, dtype=expected_dtype(dtype, method))) + + if dtype == "string[pyarrow]" and not using_nan_is_na: + mask = np.isnan(exp) + exp = exp.astype(object) + exp[mask] = NA + + expected = Series(exp, dtype=expected_dtype(dtype, method)) + tm.assert_series_equal(result, expected) @pytest.mark.parametrize("na_option", ["top", "bottom", "keep"]) @pytest.mark.parametrize( @@ -299,7 +323,15 @@ def test_rank_tie_methods(self, ser, results, dtype, using_infer_string): ], ) def test_rank_tie_methods_on_infs_nans( - self, rank_method, na_option, ascending, dtype, na_value, pos_inf, neg_inf + self, + rank_method, + na_option, + ascending, + dtype, + na_value, + pos_inf, + neg_inf, + using_nan_is_na, ): pytest.importorskip("scipy") if dtype == "float64[pyarrow]": @@ -331,6 +363,8 @@ def test_rank_tie_methods_on_infs_nans( order = [ranks[1], ranks[0], ranks[2]] elif na_option == "bottom": order = [ranks[0], ranks[2], ranks[1]] + elif dtype in ("float64[pyarrow]", "Float64") and not using_nan_is_na: + order = [ranks[0], [NA] * chunk, ranks[1]] else: order = [ranks[0], [np.nan] * chunk, ranks[1]] expected = order if ascending else order[::-1] @@ -395,10 +429,16 @@ def test_rank_dense_method(self, dtype, ser, exp): 
expected = Series(exp).astype(expected_dtype(dtype, "dense")) tm.assert_series_equal(result, expected) - def test_rank_descending(self, ser, results, dtype, using_infer_string): + def test_rank_descending( + self, ser, results, dtype, using_infer_string, using_nan_is_na + ): method, _ = results - if dtype == "int64" or (not using_infer_string and dtype == "str"): - s = ser.dropna() + if ( + dtype == "int64" + or (dtype in ["int64[pyarrow]", "Int64", "Float64"] and not using_nan_is_na) + or (not using_infer_string and dtype == "str") + ): + s = ser.dropna().astype(dtype) else: s = ser.astype(dtype) @@ -407,6 +447,8 @@ def test_rank_descending(self, ser, results, dtype, using_infer_string): expected = (s.astype("float64").max() - s.astype("float64")).rank() else: expected = (s.max() - s).rank() + if dtype == "string[pyarrow]" and not using_nan_is_na: + expected = expected.replace(np.nan, NA) tm.assert_series_equal(res, expected.astype(expected_dtype(dtype, "average"))) if dtype.startswith("str"): @@ -416,6 +458,8 @@ def test_rank_descending(self, ser, results, dtype, using_infer_string): else: expected = (s.max() - s).rank(method=method) res2 = s.rank(method=method, ascending=False) + if dtype == "string[pyarrow]" and not using_nan_is_na: + expected = expected.replace(np.nan, NA) tm.assert_series_equal(res2, expected.astype(expected_dtype(dtype, method))) def test_rank_int(self, ser, results): diff --git a/pandas/tests/series/test_npfuncs.py b/pandas/tests/series/test_npfuncs.py index 11a51c4700d5c..b72ac8efbaa6d 100644 --- a/pandas/tests/series/test_npfuncs.py +++ b/pandas/tests/series/test_npfuncs.py @@ -38,9 +38,15 @@ def test_numpy_argwhere(index): @td.skip_if_no("pyarrow") -def test_log_arrow_backed_missing_value(): +def test_log_arrow_backed_missing_value(using_nan_is_na): # GH#56285 ser = Series([1, 2, None], dtype="float64[pyarrow]") - result = np.log(ser) - expected = np.log(Series([1, 2, None], dtype="float64")) - tm.assert_series_equal(result, expected) + 
if using_nan_is_na: + result = np.log(ser) + expected = np.log(Series([1, 2, None], dtype="float64")) + tm.assert_series_equal(result, expected) + else: + # we get cast to object which raises + msg = "loop of ufunc does not support argument" + with pytest.raises(TypeError, match=msg): + np.log(ser)