Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.3.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ become the default string dtype in pandas 3.0. See

Bug fixes
^^^^^^^^^
- Fix :meth:`~Series.str.isdigit` to correctly recognize unicode superscript
characters as digits for :class:`StringDtype` backed by PyArrow (:issue:`61466`)
- Fix :meth:`~DataFrame.to_json` with ``orient="table"`` to correctly use the
"string" type in the JSON Table Schema for :class:`StringDtype` columns
(:issue:`61889`)
Expand Down
7 changes: 7 additions & 0 deletions pandas/core/arrays/_arrow_string_mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from pandas.compat import (
HAS_PYARROW,
pa_version_under17p0,
pa_version_under21p0,
)

if HAS_PYARROW:
Expand Down Expand Up @@ -267,6 +268,12 @@ def _str_isdecimal(self):
return self._convert_bool_result(result)

def _str_isdigit(self):
    """
    Check, per element, whether the string consists only of digits.

    With PyArrow 21 or newer the check is delegated to
    ``pc.utf8_is_digit``.  On older PyArrow versions Python's
    ``str.isdigit`` is applied element by element instead; see
    https://github.com/pandas-dev/pandas/issues/61466 for background.

    Either way the raw result is handed to ``self._convert_bool_result``,
    whose return value is returned unchanged.
    """
    if not pa_version_under21p0:
        # Recent PyArrow: use the native compute kernel directly.
        native = pc.utf8_is_digit(self._pa_array)
        return self._convert_bool_result(native)

    # https://github.com/pandas-dev/pandas/issues/61466
    per_element = self._apply_elementwise(str.isdigit)
    as_bool_chunks = pa.chunked_array(per_element, type=pa.bool_())
    return self._convert_bool_result(as_bool_chunks)

Expand Down
18 changes: 14 additions & 4 deletions pandas/core/strings/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3602,16 +3602,26 @@ def casefold(self):
Series.str.isupper : Check whether all characters are uppercase.
Series.str.istitle : Check whether all characters are titlecase.

Examples
--------
Notes
-----
Similar to ``str.isdecimal`` but also includes special digits, like
superscripted and subscripted digits in unicode.

The exact behavior of this method, i.e. which unicode characters are
considered as digits, depends on the backend used for string operations,
and there can be small differences.
For example, Python considers the ³ superscript character as a digit, but
not the ⅕ fraction character, while PyArrow considers both as digits. For
simple (ASCII) decimal numbers, the behavior is consistent.

Examples
--------

>>> s3 = pd.Series(['23', '³', '⅕', ''])
>>> s3.str.isdigit()
0 True
1 False
2 False
1 True
2 True
3 False
dtype: bool
"""
Expand Down
29 changes: 24 additions & 5 deletions pandas/tests/strings/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import numpy as np
import pytest

from pandas.compat import pa_version_under21p0
from pandas.errors import Pandas4Warning

from pandas import (
Expand All @@ -15,6 +16,7 @@
Index,
MultiIndex,
Series,
StringDtype,
option_context,
)
import pandas._testing as tm
Expand Down Expand Up @@ -249,8 +251,9 @@ def test_ismethods(method, expected, any_string_dtype):
@pytest.mark.parametrize(
"method, expected",
[
("isnumeric", [False, True, True, False, True, True, False]),
("isdecimal", [False, True, False, False, False, True, False]),
("isnumeric", [False, True, True, True, False, True, True, False]),
("isdecimal", [False, True, False, False, False, False, True, False]),
("isdigit", [False, True, True, False, False, False, True, False]),
],
)
def test_isnumeric_unicode(method, expected, any_string_dtype):
Expand All @@ -259,19 +262,35 @@ def test_isnumeric_unicode(method, expected, any_string_dtype):
# 0x1378: ፸ ETHIOPIC NUMBER SEVENTY
# 0xFF13: 3 Em 3 # noqa: RUF003
ser = Series(
["A", "3", "¼", "★", "፸", "3", "four"], # noqa: RUF001
["A", "3", "³", "¼", "★", "፸", "3", "four"], # noqa: RUF001
dtype=any_string_dtype,
)
expected_dtype = (
"bool" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
)
expected = Series(expected, dtype=expected_dtype)
if (
method == "isdigit"
and isinstance(ser.dtype, StringDtype)
and ser.dtype.storage == "pyarrow"
and not pa_version_under21p0
):
# known difference in behavior between python and pyarrow unicode handling
# pyarrow 21+ considers ¼ and ፸ to be digits, while python does not
expected.iloc[3] = True
expected.iloc[5] = True

result = getattr(ser.str, method)()
tm.assert_series_equal(result, expected)

# compare with standard library
expected = [getattr(item, method)() for item in ser]
assert list(result) == expected
# (only for non-pyarrow storage given the above differences)
if any_string_dtype == "object" or (
isinstance(any_string_dtype, StringDtype)
and any_string_dtype.storage == "python"
):
expected = [getattr(item, method)() for item in ser]
assert list(result) == expected


@pytest.mark.parametrize(
Expand Down
Loading