From b749da42748b2d946e4aa187dbc5fafc26efdee6 Mon Sep 17 00:00:00 2001 From: Benedikt Schlager Date: Fri, 20 Feb 2026 11:42:17 +0100 Subject: [PATCH] re2: Fix python matches multiple times --- python/re2.py | 10 ++++ python/re2_test.py | 131 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 141 insertions(+) diff --git a/python/re2.py b/python/re2.py index 12c7b10fb..86889d6f8 100644 --- a/python/re2.py +++ b/python/re2.py @@ -218,6 +218,11 @@ def decode(span): encoded_pos += _re2.CharLenToBytes(encoded_text, encoded_pos, 1) else: encoded_pos = spans[0][1] + if spans[0][0] == encoded_pos: + # Empty match; skip past it to avoid re-matching at same spot + if encoded_pos == encoded_endpos: + break + encoded_pos += _re2.CharLenToBytes(encoded_text, encoded_pos, 1) else: while True: spans = self._regexp.Match(anchor, text, pos, endpos) @@ -232,6 +237,11 @@ def decode(span): pos += 1 else: pos = spans[0][1] + if spans[0][0] == spans[0][1]: + # Empty match; skip past it to avoid re-matching at same spot + if pos == endpos: + break + pos += 1 def search(self, text, pos=None, endpos=None): return next(self._match(_Anchor.UNANCHORED, text, pos, endpos), None) diff --git a/python/re2_test.py b/python/re2_test.py index 146b55b48..708ed6c17 100644 --- a/python/re2_test.py +++ b/python/re2_test.py @@ -490,6 +490,137 @@ def test_issue_484(self): r'Match\(\) called before compiling'): f.Match('') +class Re2SubTest(absltest.TestCase): + + def test_dollar_sign_subn(self): + # $ should match once at end of string + end_of_string = re2.compile("$") + result, count = end_of_string.subn("EOS", "Hello World") + self.assertEqual(result, "Hello WorldEOS") + self.assertEqual(count, 1) + + def test_dollar_sign_finditer(self): + # $ should match once at end of string + matches = list(re2.finditer("$", "Hello World")) + self.assertEqual(len(matches), 1) + self.assertEqual(matches[0].span(), (11, 11)) + + def test_caret_subn(self): + # ^ should match once at start of string + 
result, count = re2.subn("^", "BOS", "Hello World") + self.assertEqual(result, "BOSHello World") + self.assertEqual(count, 1) + + def test_caret_finditer(self): + # ^ should match once at start of string + matches = list(re2.finditer("^", "Hello World")) + self.assertEqual(len(matches), 1) + self.assertEqual(matches[0].span(), (0, 0)) + + def test_empty_pattern_finditer(self): + # Empty pattern should match at every position including end + matches = list(re2.finditer("", "abc")) + self.assertEqual(len(matches), 4) + self.assertEqual([m.span() for m in matches], [(0, 0), (1, 1), (2, 2), (3, 3)]) + + def test_empty_pattern_subn(self): + # Empty pattern substitution at every position + result, count = re2.subn("", "-", "ab") + self.assertEqual(result, "-a-b-") + self.assertEqual(count, 3) + + def test_dollar_sign_bytes(self): + # Bytes version of $ test + result, count = re2.subn(b"$", b"EOS", b"Hello World") + self.assertEqual(result, b"Hello WorldEOS") + self.assertEqual(count, 1) + + def test_caret_bytes(self): + # Bytes version of ^ test + result, count = re2.subn(b"^", b"BOS", b"Hello World") + self.assertEqual(result, b"BOSHello World") + self.assertEqual(count, 1) + + def test_empty_string_input(self): + # $ on empty string should match once + matches = list(re2.finditer("$", "")) + self.assertEqual(len(matches), 1) + self.assertEqual(matches[0].span(), (0, 0)) + + def test_caret_dollar_empty_string(self): + # ^$ should match empty string once + matches = list(re2.finditer("^$", "")) + self.assertEqual(len(matches), 1) + self.assertEqual(matches[0].span(), (0, 0)) + + def test_optional_match_at_end(self): + # a? 
should match 'a' once, then the empty string once at the end + matches = list(re2.finditer("a?", "a")) + self.assertEqual(len(matches), 2) + self.assertEqual([m.span() for m in matches], [(0, 1), (1, 1)]) + + def test_star_match_at_end(self): + # a* at "aa" should match "aa", then empty at end + matches = list(re2.finditer("a*", "aa")) + self.assertEqual(len(matches), 2) + self.assertEqual([m.span() for m in matches], [(0, 2), (2, 2)]) + + def test_finditer_with_endpos(self): + # RE2's $ matches end of actual string, not endpos boundary + # Unlike Python's re module, which treats endpos as the end of the string + matches = list(re2.compile("$").finditer("Hello World", endpos=5)) + self.assertEqual(len(matches), 0) + + def test_empty_pattern_with_endpos(self): + # Empty pattern with endpos should match up to and including endpos + matches = list(re2.compile("").finditer("Hello", endpos=3)) + self.assertEqual(len(matches), 4) + self.assertEqual([m.span() for m in matches], [(0, 0), (1, 1), (2, 2), (3, 3)]) + + def test_backslash_z_end_of_string(self): + # \z is RE2's absolute end of string anchor + matches = list(re2.finditer(r"\z", "Hello")) + self.assertEqual(len(matches), 1) + self.assertEqual(matches[0].span(), (5, 5)) + + def test_backslash_A_start_of_string(self): + # \A is absolute start of string anchor + matches = list(re2.finditer(r"\A", "Hello")) + self.assertEqual(len(matches), 1) + self.assertEqual(matches[0].span(), (0, 0)) + + def test_word_boundary(self): + # \b matches at word boundaries + matches = list(re2.finditer(r"\b", "Hi there")) + # Boundaries: before H, after i, before t, after the final e + self.assertEqual(len(matches), 4) + self.assertEqual([m.span() for m in matches], [(0, 0), (2, 2), (3, 3), (8, 8)]) + + def test_word_boundary_subn(self): + # \b substitution at word boundaries + result, count = re2.subn(r"\b", "|", "Hi there") + self.assertEqual(result, "|Hi| |there|") + self.assertEqual(count, 4) + + def test_non_word_boundary(self): + # \B matches at non-word boundaries (inside 
words) + matches = list(re2.finditer(r"\B", "Hello")) + # Non-boundaries: between H-e, e-l, l-l, l-o + self.assertEqual(len(matches), 4) + self.assertEqual([m.span() for m in matches], [(1, 1), (2, 2), (3, 3), (4, 4)]) + + def test_backslash_z_bytes(self): + # Bytes version of \z + matches = list(re2.finditer(rb"\z", b"Hello")) + self.assertEqual(len(matches), 1) + self.assertEqual(matches[0].span(), (5, 5)) + + def test_backslash_A_bytes(self): + # Bytes version of \A + matches = list(re2.finditer(rb"\A", b"Hello")) + self.assertEqual(len(matches), 1) + self.assertEqual(matches[0].span(), (0, 0)) + if __name__ == '__main__': absltest.main()