Skip to content

Commit 83c9e4a

Browse files
authored
DEPR: Categorical with values not present in categories (#62142)
1 parent 0bc7ea3 commit 83c9e4a

29 files changed

+260
-105
lines changed

doc/source/user_guide/categorical.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ By passing a :class:`pandas.Categorical` object to a ``Series`` or assigning it
7777
.. ipython:: python
7878
7979
raw_cat = pd.Categorical(
80-
["a", "b", "c", "a"], categories=["b", "c", "d"], ordered=False
80+
[None, "b", "c", None], categories=["b", "c", "d"], ordered=False
8181
)
8282
s = pd.Series(raw_cat)
8383
s
@@ -145,7 +145,7 @@ of :class:`~pandas.api.types.CategoricalDtype`.
145145
146146
from pandas.api.types import CategoricalDtype
147147
148-
s = pd.Series(["a", "b", "c", "a"])
148+
s = pd.Series([None, "b", "c", None])
149149
cat_type = CategoricalDtype(categories=["b", "c", "d"], ordered=True)
150150
s_cat = s.astype(cat_type)
151151
s_cat

doc/source/user_guide/io.rst

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -499,11 +499,14 @@ When using ``dtype=CategoricalDtype``, "unexpected" values outside of
499499
``dtype.categories`` are treated as missing values.
500500

501501
.. ipython:: python
502+
:okwarning:
502503
503504
dtype = CategoricalDtype(["a", "b", "d"]) # No 'c'
504505
pd.read_csv(StringIO(data), dtype={"col1": dtype}).col1
505506
506-
This matches the behavior of :meth:`Categorical.set_categories`.
507+
This matches the behavior of :meth:`Categorical.set_categories`. However, silently
508+
converting such values to missing during construction or casting is deprecated. In a
509+
future version, non-NA values that are not among the specified categories will raise.
507510

508511
.. note::
509512

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -647,6 +647,7 @@ Other Deprecations
647647
- Deprecated :meth:`Timestamp.utcfromtimestamp`, use ``Timestamp.fromtimestamp(ts, "UTC")`` instead (:issue:`56680`)
648648
- Deprecated :meth:`Timestamp.utcnow`, use ``Timestamp.now("UTC")`` instead (:issue:`56680`)
649649
- Deprecated ``pd.core.internals.api.maybe_infer_ndim`` (:issue:`40226`)
650+
- Deprecated constructing or casting to :class:`Categorical` with non-NA values that are not present in the specified ``dtype.categories`` (:issue:`40996`)
650651
- Deprecated allowing non-keyword arguments in :meth:`DataFrame.all`, :meth:`DataFrame.min`, :meth:`DataFrame.max`, :meth:`DataFrame.sum`, :meth:`DataFrame.prod`, :meth:`DataFrame.mean`, :meth:`DataFrame.median`, :meth:`DataFrame.sem`, :meth:`DataFrame.var`, :meth:`DataFrame.std`, :meth:`DataFrame.skew`, :meth:`DataFrame.kurt`, :meth:`Series.all`, :meth:`Series.min`, :meth:`Series.max`, :meth:`Series.sum`, :meth:`Series.prod`, :meth:`Series.mean`, :meth:`Series.median`, :meth:`Series.sem`, :meth:`Series.var`, :meth:`Series.std`, :meth:`Series.skew`, and :meth:`Series.kurt`. (:issue:`57087`)
651652
- Deprecated allowing non-keyword arguments in :meth:`Series.to_markdown` except ``buf``. (:issue:`57280`)
652653
- Deprecated allowing non-keyword arguments in :meth:`Series.to_string` except ``buf``. (:issue:`57280`)

pandas/core/arrays/categorical.py

Lines changed: 61 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
cast,
1212
overload,
1313
)
14+
import warnings
1415

1516
import numpy as np
1617

@@ -23,6 +24,8 @@
2324
)
2425
from pandas._libs.arrays import NDArrayBacked
2526
from pandas.compat.numpy import function as nv
27+
from pandas.errors import Pandas4Warning
28+
from pandas.util._exceptions import find_stack_level
2629
from pandas.util._validators import validate_bool_kwarg
2730

2831
from pandas.core.dtypes.cast import (
@@ -476,7 +479,11 @@ def __init__(
476479
elif isinstance(values.dtype, CategoricalDtype):
477480
old_codes = extract_array(values)._codes
478481
codes = recode_for_categories(
479-
old_codes, values.dtype.categories, dtype.categories, copy=copy
482+
old_codes,
483+
values.dtype.categories,
484+
dtype.categories,
485+
copy=copy,
486+
warn=True,
480487
)
481488

482489
else:
@@ -528,7 +535,12 @@ def _from_sequence(
528535

529536
def _cast_pointwise_result(self, values) -> ArrayLike:
530537
res = super()._cast_pointwise_result(values)
531-
cat = type(self)._from_sequence(res, dtype=self.dtype)
538+
with warnings.catch_warnings():
539+
warnings.filterwarnings(
540+
"ignore",
541+
"Constructing a Categorical with a dtype and values containing",
542+
)
543+
cat = type(self)._from_sequence(res, dtype=self.dtype)
532544
if (cat.isna() == isna(res)).all():
533545
# i.e. the conversion was non-lossy
534546
return cat
@@ -565,6 +577,15 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
565577
dtype = self.dtype.update_dtype(dtype)
566578
self = self.copy() if copy else self
567579
result = self._set_dtype(dtype, copy=False)
580+
wrong = result.isna() & ~self.isna()
581+
if wrong.any():
582+
warnings.warn(
583+
"Constructing a Categorical with a dtype and values containing "
584+
"non-null entries not in that dtype's categories is deprecated "
585+
"and will raise in a future version.",
586+
Pandas4Warning,
587+
stacklevel=find_stack_level(),
588+
)
568589

569590
elif isinstance(dtype, ExtensionDtype):
570591
return super().astype(dtype, copy=copy)
@@ -659,14 +680,16 @@ def _from_inferred_categories(
659680
if known_categories:
660681
# Recode from observation order to dtype.categories order.
661682
categories = dtype.categories
662-
codes = recode_for_categories(inferred_codes, cats, categories, copy=False)
683+
codes = recode_for_categories(
684+
inferred_codes, cats, categories, copy=False, warn=True
685+
)
663686
elif not cats.is_monotonic_increasing:
664687
# Sort categories and recode for unknown categories.
665688
unsorted = cats.copy()
666689
categories = cats.sort_values()
667690

668691
codes = recode_for_categories(
669-
inferred_codes, unsorted, categories, copy=False
692+
inferred_codes, unsorted, categories, copy=False, warn=True
670693
)
671694
dtype = CategoricalDtype(categories, ordered=False)
672695
else:
@@ -787,7 +810,7 @@ def categories(self) -> Index:
787810
>>> ser.cat.categories
788811
Index(['a', 'b', 'c'], dtype='str')
789812
790-
>>> raw_cat = pd.Categorical(["a", "b", "c", "a"], categories=["b", "c", "d"])
813+
>>> raw_cat = pd.Categorical([None, "b", "c", None], categories=["b", "c", "d"])
791814
>>> ser = pd.Series(raw_cat)
792815
>>> ser.cat.categories
793816
Index(['b', 'c', 'd'], dtype='str')
@@ -1095,7 +1118,7 @@ def set_categories(
10951118
For :class:`pandas.Series`:
10961119
10971120
>>> raw_cat = pd.Categorical(
1098-
... ["a", "b", "c", "A"], categories=["a", "b", "c"], ordered=True
1121+
... ["a", "b", "c", None], categories=["a", "b", "c"], ordered=True
10991122
... )
11001123
>>> ser = pd.Series(raw_cat)
11011124
>>> ser
@@ -1117,7 +1140,7 @@ def set_categories(
11171140
For :class:`pandas.CategoricalIndex`:
11181141
11191142
>>> ci = pd.CategoricalIndex(
1120-
... ["a", "b", "c", "A"], categories=["a", "b", "c"], ordered=True
1143+
... ["a", "b", "c", None], categories=["a", "b", "c"], ordered=True
11211144
... )
11221145
>>> ci
11231146
CategoricalIndex(['a', 'b', 'c', nan], categories=['a', 'b', 'c'],
@@ -1145,7 +1168,7 @@ def set_categories(
11451168
codes = cat._codes
11461169
else:
11471170
codes = recode_for_categories(
1148-
cat.codes, cat.categories, new_dtype.categories, copy=False
1171+
cat.codes, cat.categories, new_dtype.categories, copy=False, warn=False
11491172
)
11501173
NDArrayBacked.__init__(cat, codes, new_dtype)
11511174
return cat
@@ -2956,7 +2979,7 @@ def codes(self) -> Series:
29562979
29572980
Examples
29582981
--------
2959-
>>> raw_cate = pd.Categorical(["a", "b", "c", "a"], categories=["a", "b"])
2982+
>>> raw_cate = pd.Categorical(["a", "b", None, "a"], categories=["a", "b"])
29602983
>>> ser = pd.Series(raw_cate)
29612984
>>> ser.cat.codes
29622985
0 0
@@ -2991,11 +3014,25 @@ def _get_codes_for_values(
29913014
If `values` is known to be a Categorical, use recode_for_categories instead.
29923015
"""
29933016
codes = categories.get_indexer_for(values)
3017+
wrong = (codes == -1) & ~isna(values)
3018+
if wrong.any():
3019+
warnings.warn(
3020+
"Constructing a Categorical with a dtype and values containing "
3021+
"non-null entries not in that dtype's categories is deprecated "
3022+
"and will raise in a future version.",
3023+
Pandas4Warning,
3024+
stacklevel=find_stack_level(),
3025+
)
29943026
return coerce_indexer_dtype(codes, categories)
29953027

29963028

29973029
def recode_for_categories(
2998-
codes: np.ndarray, old_categories, new_categories, *, copy: bool
3030+
codes: np.ndarray,
3031+
old_categories,
3032+
new_categories,
3033+
*,
3034+
copy: bool = True,
3035+
warn: bool = False,
29993036
) -> np.ndarray:
30003037
"""
30013038
Convert a set of codes to a new set of categories
@@ -3006,6 +3043,8 @@ def recode_for_categories(
30063043
old_categories, new_categories : Index
30073044
copy : bool, default True
30083045
Whether to copy if the codes are unchanged.
3046+
warn : bool, default False
3047+
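Whether to emit a deprecation warning when any entry of ``old_categories`` is missing from ``new_categories`` (codes pointing at such entries are recoded to -1, i.e. NA).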
30093048
30103049
Returns
30113050
-------
@@ -3030,9 +3069,18 @@ def recode_for_categories(
30303069
return codes.copy()
30313070
return codes
30323071

3033-
indexer = coerce_indexer_dtype(
3034-
new_categories.get_indexer_for(old_categories), new_categories
3035-
)
3072+
codes_in_old_cats = new_categories.get_indexer_for(old_categories)
3073+
if warn:
3074+
wrong = codes_in_old_cats == -1
3075+
if wrong.any():
3076+
warnings.warn(
3077+
"Constructing a Categorical with a dtype and values containing "
3078+
"non-null entries not in that dtype's categories is deprecated "
3079+
"and will raise in a future version.",
3080+
Pandas4Warning,
3081+
stacklevel=find_stack_level(),
3082+
)
3083+
indexer = coerce_indexer_dtype(codes_in_old_cats, new_categories)
30363084
new_codes = take_nd(indexer, codes, fill_value=-1)
30373085
return new_codes
30383086

pandas/core/dtypes/dtypes.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,7 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype):
203203
Examples
204204
--------
205205
>>> t = pd.CategoricalDtype(categories=["b", "a"], ordered=True)
206-
>>> pd.Series(["a", "b", "a", "c"], dtype=t)
206+
>>> pd.Series(["a", "b", "a", None], dtype=t)
207207
0 a
208208
1 b
209209
2 a

pandas/core/groupby/ops.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -718,7 +718,7 @@ def groups(self) -> dict[Hashable, Index]:
718718
return self.groupings[0].groups
719719
result_index, ids = self.result_index_and_ids
720720
values = result_index._values
721-
categories = Categorical(ids, categories=range(len(result_index)))
721+
categories = Categorical.from_codes(ids, categories=range(len(result_index)))
722722
result = {
723723
# mypy is not aware that group has to be an integer
724724
values[group]: self.axis.take(axis_ilocs) # type: ignore[call-overload]

pandas/core/indexes/category.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
from pandas.core.dtypes.dtypes import CategoricalDtype
2323
from pandas.core.dtypes.missing import (
2424
is_valid_na_for_dtype,
25-
isna,
2625
)
2726

2827
from pandas.core.arrays.categorical import (
@@ -258,6 +257,12 @@ def _is_dtype_compat(self, other: Index) -> Categorical:
258257
else:
259258
values = other
260259

260+
codes = self.categories.get_indexer(values)
261+
if ((codes == -1) & ~values.isna()).any():
262+
# GH#37667 see test_equals_non_category
263+
raise TypeError(
264+
"categories must match existing categories when appending"
265+
)
261266
cat = Categorical(other, dtype=self.dtype)
262267
other = CategoricalIndex(cat)
263268
if not other.isin(values).all():
@@ -266,12 +271,6 @@ def _is_dtype_compat(self, other: Index) -> Categorical:
266271
)
267272
cat = other._values
268273

269-
if not ((cat == values) | (isna(cat) & isna(values))).all():
270-
# GH#37667 see test_equals_non_category
271-
raise TypeError(
272-
"categories must match existing categories when appending"
273-
)
274-
275274
return cat
276275

277276
def equals(self, other: object) -> bool:

pandas/tests/arrays/categorical/test_api.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import numpy as np
44
import pytest
55

6+
from pandas.errors import Pandas4Warning
7+
68
from pandas import (
79
Categorical,
810
CategoricalIndex,
@@ -289,8 +291,16 @@ def test_set_categories(self):
289291
],
290292
)
291293
def test_set_categories_many(self, values, categories, new_categories, ordered):
292-
c = Categorical(values, categories)
293-
expected = Categorical(values, new_categories, ordered)
294+
msg = "Constructing a Categorical with a dtype and values containing"
295+
296+
warn1 = Pandas4Warning if set(values).difference(categories) else None
297+
with tm.assert_produces_warning(warn1, match=msg):
298+
c = Categorical(values, categories)
299+
300+
warn2 = Pandas4Warning if set(values).difference(new_categories) else None
301+
with tm.assert_produces_warning(warn2, match=msg):
302+
expected = Categorical(values, new_categories, ordered)
303+
294304
result = c.set_categories(new_categories, ordered=ordered)
295305
tm.assert_categorical_equal(result, expected)
296306

pandas/tests/arrays/categorical/test_astype.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import numpy as np
22
import pytest
33

4+
from pandas.errors import Pandas4Warning
5+
46
from pandas import (
57
NA,
68
Categorical,
@@ -121,8 +123,11 @@ def test_astype_category(self, dtype_ordered, ordered):
121123

122124
# non-standard categories
123125
dtype = CategoricalDtype(list("adc"), dtype_ordered)
124-
result = cat.astype(dtype)
125-
expected = Categorical(data, dtype=dtype)
126+
msg = "Constructing a Categorical with a dtype and values containing"
127+
with tm.assert_produces_warning(Pandas4Warning, match=msg):
128+
result = cat.astype(dtype)
129+
with tm.assert_produces_warning(Pandas4Warning, match=msg):
130+
expected = Categorical(data, dtype=dtype)
126131
tm.assert_categorical_equal(result, expected)
127132

128133
if dtype_ordered is False:

0 commit comments

Comments
 (0)