Skip to content

Commit e4ca405

Browse files
authored
API: mode.nan_is_na to consistently distinguish NaN-vs-NA (#62040)
1 parent 53cb639 commit e4ca405

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

59 files changed

+775
-206
lines changed

asv_bench/benchmarks/algorithms.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -199,8 +199,8 @@ class SortIntegerArray:
199199
params = [10**3, 10**5]
200200

201201
def setup(self, N):
202-
data = np.arange(N, dtype=float)
203-
data[40] = np.nan
202+
data = np.arange(N, dtype=float).astype(object)
203+
data[40] = pd.NA
204204
self.array = pd.array(data, dtype="Int64")
205205

206206
def time_argsort(self, N):

asv_bench/benchmarks/frame_methods.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import numpy as np
55

66
from pandas import (
7+
NA,
78
DataFrame,
89
Index,
910
MultiIndex,
@@ -445,6 +446,8 @@ def setup(self, inplace, dtype):
445446
values[::2] = np.nan
446447
if dtype == "Int64":
447448
values = values.round()
449+
values = values.astype(object)
450+
values[::2] = NA
448451
self.df = DataFrame(values, dtype=dtype)
449452
self.fill_values = self.df.iloc[self.df.first_valid_index()].to_dict()
450453

asv_bench/benchmarks/groupby.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -689,6 +689,10 @@ def setup(self, dtype, method, with_nans):
689689
null_vals = vals.astype(float, copy=True)
690690
null_vals[::2, :] = np.nan
691691
null_vals[::3, :] = np.nan
692+
if dtype in ["Int64", "Float64"]:
693+
null_vals = null_vals.astype(object)
694+
null_vals[::2, :] = NA
695+
null_vals[::3, :] = NA
692696
df = DataFrame(null_vals, columns=list("abcde"), dtype=dtype)
693697
df["key"] = keys
694698
self.df = df

doc/source/user_guide/text.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ or convert from existing pandas data:
7575

7676
.. ipython:: python
7777
78-
s1 = pd.Series([1, 2, np.nan], dtype="Int64")
78+
s1 = pd.Series([1, 2, pd.NA], dtype="Int64")
7979
s1
8080
s2 = s1.astype("string")
8181
s2

doc/source/whatsnew/v0.24.0.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ marker of ``np.nan`` will infer to integer dtype. The display of the ``Series``
5050

5151
.. ipython:: python
5252
53-
s = pd.Series([1, 2, np.nan], dtype='Int64')
53+
s = pd.Series([1, 2, pd.NA], dtype='Int64')
5454
s
5555
5656
@@ -166,7 +166,7 @@ See the :ref:`dtypes docs <basics.dtypes>` for more on extension arrays.
166166

167167
.. ipython:: python
168168
169-
pd.array([1, 2, np.nan], dtype='Int64')
169+
pd.array([1, 2, pd.NA], dtype='Int64')
170170
pd.array(['a', 'b', 'c'], dtype='category')
171171
172172
Passing data for which there isn't a dedicated extension type (e.g. float, integer, etc.)

doc/source/whatsnew/v3.0.0.rst

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -465,6 +465,55 @@ small behavior differences as collateral:
465465
- Adding or subtracting a :class:`Day` with a :class:`Timedelta` is no longer supported.
466466
- Adding or subtracting a :class:`Day` offset to a timezone-aware :class:`Timestamp` or datetime-like may lead to an ambiguous or non-existent time, which will raise.
467467

468+
.. _whatsnew_300.api_breaking.nan_vs_na:
469+
470+
Changed treatment of NaN values in pyarrow and numpy-nullable floating dtypes
471+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
472+
473+
Previously, when dealing with a nullable dtype (e.g. ``Float64Dtype`` or ``int64[pyarrow]``), ``NaN`` was treated as interchangeable with :class:`NA` in some circumstances but not others. This was done to make adoption easier, but caused some confusion (:issue:`32265`). In 3.0, an option ``"mode.nan_is_na"`` (default ``True``) controls whether to treat ``NaN`` as equivalent to :class:`NA`.
474+
475+
With ``pd.set_option("mode.nan_is_na", True)`` (again, this is the default), ``NaN`` can be passed to constructors, ``__setitem__``, and ``__contains__``, and is treated the same as :class:`NA`. The only change users will see is that arithmetic and ``np.ufunc`` operations that previously introduced ``NaN`` entries produce :class:`NA` entries instead:
476+
477+
*Old behavior:*
478+
479+
.. code-block:: ipython
480+
481+
In [2]: ser = pd.Series([0, None], dtype=pd.Float64Dtype())
482+
In [3]: ser / 0
483+
Out[3]:
484+
0 NaN
485+
1 <NA>
486+
dtype: Float64
487+
488+
*New behavior:*
489+
490+
.. ipython:: python
491+
492+
ser = pd.Series([0, None], dtype=pd.Float64Dtype())
493+
ser / 0
494+
495+
By contrast, with ``pd.set_option("mode.nan_is_na", False)``, ``NaN`` is always considered distinct from :class:`NA` and is treated specifically as a floating-point value, so it cannot be used with integer dtypes:
496+
497+
*Old behavior:*
498+
499+
.. code-block:: ipython
500+
501+
In [2]: ser = pd.Series([1, np.nan], dtype=pd.Float64Dtype())
502+
In [3]: ser[1]
503+
Out[3]: <NA>
504+
505+
*New behavior:*
506+
507+
.. ipython:: python
508+
509+
pd.set_option("mode.nan_is_na", False)
510+
ser = pd.Series([1, np.nan], dtype=pd.Float64Dtype())
511+
ser[1]
512+
513+
If we had passed ``pd.Int64Dtype()`` or ``"int64[pyarrow]"`` for the dtype in the latter example, this would raise, as a float ``NaN`` cannot be held by an integer dtype.
514+
515+
With ``"mode.nan_is_na"`` set to ``False``, ``ser.to_numpy()`` (and ``frame.values`` and ``np.asarray(obj)``) will convert to ``object`` dtype if :class:`NA` entries are present, whereas previously those entries were coerced to ``NaN``. To retain a float numpy dtype, explicitly pass ``na_value=np.nan`` to :meth:`Series.to_numpy`.
516+
468517
.. _whatsnew_300.api_breaking.deps:
469518

470519
Increased minimum version for Python

pandas/_config/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,8 @@
3333
def using_string_dtype() -> bool:
3434
_mode_options = _global_config["future"]
3535
return _mode_options["infer_string"]
36+
37+
38+
def is_nan_na() -> bool:
39+
_mode_options = _global_config["mode"]
40+
return _mode_options["nan_is_na"]

pandas/_libs/missing.pyi

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,4 @@ def isneginf_scalar(val: object) -> bool: ...
1414
def checknull(val: object) -> bool: ...
1515
def isnaobj(arr: np.ndarray) -> npt.NDArray[np.bool_]: ...
1616
def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ...
17+
def is_pdna_or_none(values: np.ndarray) -> npt.NDArray[np.bool_]: ...

pandas/_libs/missing.pyx

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,24 @@ cdef bint checknull_with_nat_and_na(object obj):
249249
return checknull_with_nat(obj) or obj is C_NA
250250

251251

252+
@cython.wraparound(False)
253+
@cython.boundscheck(False)
254+
def is_pdna_or_none(values: ndarray) -> ndarray:
255+
cdef:
256+
ndarray[uint8_t] result
257+
Py_ssize_t i, N
258+
object val
259+
260+
N = len(values)
261+
result = np.zeros(N, dtype=np.uint8)
262+
263+
for i in range(N):
264+
val = values[i]
265+
if val is None or val is C_NA:
266+
result[i] = True
267+
return result.view(bool)
268+
269+
252270
@cython.wraparound(False)
253271
@cython.boundscheck(False)
254272
def is_numeric_na(values: ndarray) -> ndarray:

pandas/_libs/parsers.pyx

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ from csv import (
88
)
99
import warnings
1010

11+
from pandas._config import is_nan_na
12+
1113
from pandas.util._exceptions import find_stack_level
1214

1315
from pandas import (
@@ -1469,7 +1471,7 @@ def _maybe_upcast(
14691471
if isinstance(arr, IntegerArray) and arr.isna().all():
14701472
# use null instead of int64 in pyarrow
14711473
arr = arr.to_numpy(na_value=None)
1472-
arr = ArrowExtensionArray(pa.array(arr, from_pandas=True))
1474+
arr = ArrowExtensionArray(pa.array(arr, from_pandas=is_nan_na()))
14731475

14741476
return arr
14751477

0 commit comments

Comments
 (0)