Skip to content

Commit 08d21d7

Browse files
BUG: fix bug in str.fullmatch for Arrow backend with optional groups (#61073)
Co-authored-by: Joris Van den Bossche <[email protected]>
1 parent 2b25842 commit 08d21d7

File tree

4 files changed

+54
-6
lines changed

4 files changed

+54
-6
lines changed

doc/source/whatsnew/v2.3.3.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ Bug fixes
3535
- Fix bug in :meth:`Series.str.replace` using named capture groups (e.g., ``\g<name>``) with the Arrow-backed dtype would raise an error (:issue:`57636`)
3636
- Fix regression in :meth:`~Series.str.contains`, :meth:`~Series.str.match` and :meth:`~Series.str.fullmatch`
3737
with a compiled regex and custom flags (:issue:`62240`)
38+
- Fix :meth:`Series.str.fullmatch` not matching patterns with groups correctly for the Arrow-backed string dtype (:issue:`61072`)
3839

3940
.. ---------------------------------------------------------------------------
4041
.. _whatsnew_233.contributors:

pandas/core/arrays/_arrow_string_mixins.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -326,8 +326,12 @@ def _str_fullmatch(
326326
flags: int = 0,
327327
na: Scalar | lib.NoDefault = lib.no_default,
328328
):
329-
if not pat.endswith("$") or pat.endswith("\\$"):
330-
pat = f"{pat}$"
329+
if (not pat.endswith("$") or pat.endswith("\\$")) and not pat.startswith("^"):
330+
pat = f"^({pat})$"
331+
elif not pat.endswith("$") or pat.endswith("\\$"):
332+
pat = f"^({pat[1:]})$"
333+
elif not pat.startswith("^"):
334+
pat = f"^({pat[0:-1]})$"
331335
return self._str_match(pat, case, flags, na)
332336

333337
def _str_find(self, sub: str, start: int = 0, end: int | None = None):

pandas/tests/extension/test_arrow.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1870,23 +1870,28 @@ def test_str_match(pat, case, na, exp):
18701870

18711871
@pytest.mark.parametrize(
18721872
"pat, case, na, exp",
1873+
# Note: keep cases in sync with
1874+
# pandas/tests/strings/test_find_replace.py::test_str_fullmatch_extra_cases
18731875
[
1874-
["abc", False, None, [True, True, False, None]],
1876+
["abc", False, None, [True, False, False, None]],
18751877
["Abc", True, None, [False, False, False, None]],
18761878
["bc", True, None, [False, False, False, None]],
1877-
["ab", False, None, [True, True, False, None]],
1878-
["a[a-z]{2}", False, None, [True, True, False, None]],
1879+
["ab", False, None, [False, False, False, None]],
1880+
["a[a-z]{2}", False, None, [True, False, False, None]],
18791881
["A[a-z]{1}", True, None, [False, False, False, None]],
18801882
# GH Issue: #56652
18811883
["abc$", False, None, [True, False, False, None]],
18821884
["abc\\$", False, None, [False, True, False, None]],
18831885
["Abc$", True, None, [False, False, False, None]],
18841886
["Abc\\$", True, None, [False, False, False, None]],
1887+
# https://github.com/pandas-dev/pandas/issues/61072
1888+
["(abc)|(abx)", True, None, [True, False, False, None]],
1889+
["((abc)|(abx))", True, None, [True, False, False, None]],
18851890
],
18861891
)
18871892
def test_str_fullmatch(pat, case, na, exp):
18881893
ser = pd.Series(["abc", "abc$", "$abc", None], dtype=ArrowDtype(pa.string()))
1889-
result = ser.str.match(pat, case=case, na=na)
1894+
result = ser.str.fullmatch(pat, case=case, na=na)
18901895
expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_()))
18911896
tm.assert_series_equal(result, expected)
18921897

pandas/tests/strings/test_find_replace.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1075,6 +1075,44 @@ def test_fullmatch_compiled_regex(any_string_dtype):
10751075
values.str.fullmatch(re.compile("ab"), flags=re.IGNORECASE)
10761076

10771077

1078+
@pytest.mark.parametrize(
    "pat, case, na, exp",
    # Note: keep cases in sync with
    # pandas/tests/extension/test_arrow.py::test_str_fullmatch
    [
        ["abc", False, None, [True, False, False, None]],
        ["Abc", True, None, [False, False, False, None]],
        ["bc", True, None, [False, False, False, None]],
        ["ab", False, None, [False, False, False, None]],
        ["a[a-z]{2}", False, None, [True, False, False, None]],
        ["A[a-z]{1}", True, None, [False, False, False, None]],
        # GH Issue: #56652
        ["abc$", False, None, [True, False, False, None]],
        ["abc\\$", False, None, [False, True, False, None]],
        ["Abc$", True, None, [False, False, False, None]],
        ["Abc\\$", True, None, [False, False, False, None]],
        # https://github.com/pandas-dev/pandas/issues/61072
        ["(abc)|(abx)", True, None, [True, False, False, None]],
        ["((abc)|(abx))", True, None, [True, False, False, None]],
    ],
)
def test_str_fullmatch_extra_cases(any_string_dtype, pat, case, na, exp):
    """Check ``Series.str.fullmatch`` against the shared pattern table.

    Each case supplies a pattern, the ``case`` and ``na`` arguments, and the
    expected per-element result for the fixed input
    ``["abc", "abc$", "$abc", None]``. The expected result dtype (and the
    value for the missing element) depends on ``any_string_dtype``.
    """
    ser = Series(["abc", "abc$", "$abc", None], dtype=any_string_dtype)
    result = ser.str.fullmatch(pat, case=case, na=na)

    # Work on a copy: pytest passes parametrized values without copying, so
    # mutating ``exp`` in place would leak into later runs of the same case
    # with a different ``any_string_dtype``.
    exp = list(exp)
    if any_string_dtype == "str":
        # NaN propagates as False
        exp[-1] = False
        expected_dtype = bool
    else:
        expected_dtype = (
            "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
        )
    expected = Series(exp, dtype=expected_dtype)
    tm.assert_series_equal(result, expected)
1114+
1115+
10781116
# --------------------------------------------------------------------------------------
10791117
# str.findall
10801118
# --------------------------------------------------------------------------------------

0 commit comments

Comments
 (0)