diff --git a/docs/source/reference/pysupported.rst b/docs/source/reference/pysupported.rst index 2fb609602a2..d1b835c2075 100644 --- a/docs/source/reference/pysupported.rst +++ b/docs/source/reference/pysupported.rst @@ -248,35 +248,45 @@ The following functions, attributes and methods are currently supported: * ``*`` (repetition of strings) * ``in``, ``.contains()`` * ``==``, ``<``, ``<=``, ``>``, ``>=`` (comparison) -* ``.startswith()`` +* ``.capitalize()`` +* ``.casefold()`` +* ``.center()`` +* ``.count()`` * ``.endswith()`` * ``.expandtabs()`` -* ``.isspace()`` -* ``.isidentifier()`` * ``.find()`` -* ``.center()`` -* ``.ljust()`` -* ``.rjust()`` -* ``.split()`` -* ``.join()`` -* ``.lstrip()`` -* ``.rstrip()`` -* ``.strip()`` -* ``.isupper()`` -* ``.upper()`` +* ``.index()`` +* ``.isalnum()`` +* ``.isalpha()`` +* ``.isdecimal()`` +* ``.isdigit()`` +* ``.isidentifier()`` * ``.islower()`` -* ``.lower()`` +* ``.isnumeric()`` * ``.isprintable()`` -* ``.zfill()`` -* ``.rpartition()`` -* ``.count()`` +* ``.isspace()`` * ``.istitle()`` +* ``.isupper()`` +* ``.join()`` +* ``.ljust()`` +* ``.lower()`` +* ``.lstrip()`` +* ``.partition()`` +* ``.replace()`` * ``.rfind()`` * ``.rindex()`` -* ``.index()`` +* ``.rjust()`` +* ``.rpartition()`` +* ``.rsplit()`` +* ``.rstrip()`` +* ``.split()`` +* ``.splitlines()`` +* ``.startswith()`` +* ``.strip()`` +* ``.swapcase()`` * ``.title()`` -* ``.isalpha()`` -* ``.isalnum()`` +* ``.upper()`` +* ``.zfill()`` Additional operations as well as support for Python 2 strings / Python 3 bytes will be added in a future version of Numba. 
Python 2 Unicode objects will diff --git a/numba/tests/test_unicode.py b/numba/tests/test_unicode.py index 56b77aede25..3df0cb8a6f0 100644 --- a/numba/tests/test_unicode.py +++ b/numba/tests/test_unicode.py @@ -83,6 +83,10 @@ def ge_usecase(x, y): return x >= y +def partition_usecase(s, sep): + return s.partition(sep) + + def find_usecase(x, y): return x.find(y) @@ -123,6 +127,14 @@ def rfind_with_start_end_usecase(x, y, start, end): return x.rfind(y, start, end) +def replace_usecase(s, x, y): + return s.replace(x, y) + + +def replace_with_count_usecase(s, x, y, count): + return s.replace(x, y, count) + + def rindex_usecase(x, y): return x.rindex(y) @@ -167,6 +179,14 @@ def expandtabs_with_tabsize_kwarg_usecase(s, tabsize): return s.expandtabs(tabsize=tabsize) +def endswith_with_start_only_usecase(x, y, start): + return x.endswith(y, start) + + +def endswith_with_start_end_usecase(x, y, start, end): + return x.endswith(y, start, end) + + def split_usecase(x, y): return x.split(y) @@ -183,6 +203,34 @@ def split_whitespace_usecase(x): return x.split() +def splitlines_usecase(s): + return s.splitlines() + + +def splitlines_with_keepends_usecase(s, keepends): + return s.splitlines(keepends) + + +def splitlines_with_keepends_kwarg_usecase(s, keepends): + return s.splitlines(keepends=keepends) + + +def rsplit_usecase(s, sep): + return s.rsplit(sep) + + +def rsplit_with_maxsplit_usecase(s, sep, maxsplit): + return s.rsplit(sep, maxsplit) + + +def rsplit_with_maxsplit_kwarg_usecase(s, sep, maxsplit): + return s.rsplit(sep, maxsplit=maxsplit) + + +def rsplit_whitespace_usecase(s): + return s.rsplit() + + def lstrip_usecase(x): return x.lstrip() @@ -441,24 +489,6 @@ def test_bool(self, flags=no_pyobj_flags): for s in UNICODE_EXAMPLES: self.assertEqual(pyfunc(s), cfunc(s)) - def test_startswith(self, flags=no_pyobj_flags): - pyfunc = startswith_usecase - cfunc = njit(pyfunc) - for a in UNICODE_EXAMPLES: - for b in ['', 'x', a[:-2], a[3:], a, a + a]: - 
self.assertEqual(pyfunc(a, b), - cfunc(a, b), - '%s, %s' % (a, b)) - - def test_endswith(self, flags=no_pyobj_flags): - pyfunc = endswith_usecase - cfunc = njit(pyfunc) - for a in UNICODE_EXAMPLES: - for b in ['', 'x', a[:-2], a[3:], a, a + a]: - self.assertEqual(pyfunc(a, b), - cfunc(a, b), - '%s, %s' % (a, b)) - def test_expandtabs(self): pyfunc = expandtabs_usecase cfunc = njit(pyfunc) @@ -496,6 +526,133 @@ def test_expandtabs_exception_noninteger_tabsize(self): msg = '"tabsize" must be {}, not float'.format(accepted_types) self.assertIn(msg, str(raises.exception)) + def test_startswith(self, flags=no_pyobj_flags): + pyfunc = startswith_usecase + cfunc = njit(pyfunc) + for a in UNICODE_EXAMPLES: + for b in ['', 'x', a[:-2], a[3:], a, a + a]: + self.assertEqual(pyfunc(a, b), + cfunc(a, b), + '%s, %s' % (a, b)) + + def test_endswith(self, flags=no_pyobj_flags): + pyfunc = endswith_usecase + cfunc = njit(pyfunc) + for a in UNICODE_EXAMPLES: + for b in ['', 'x', a[:-2], a[3:], a, a + a]: + self.assertEqual(pyfunc(a, b), + cfunc(a, b), + '%s, %s' % (a, b)) + + def test_endswith_default(self): + pyfunc = endswith_usecase + cfunc = njit(pyfunc) + + # Samples taken from CPython testing: + # https://github.com/python/cpython/blob/865c3b257fe38154a4320c7ee6afb416f665b9c2/Lib/test/string_tests.py#L1049-L1099 # noqa: E501 + cpython_str = ['hello', 'helloworld', ''] + cpython_subs = [ + 'he', 'hello', 'helloworld', 'ello', + '', 'lowo', 'lo', 'he', 'lo', 'o', + ] + extra_subs = ['hellohellohello', ' '] + for s in cpython_str + UNICODE_EXAMPLES: + default_subs = ['', 'x', s[:-2], s[3:], s, s + s] + for sub_str in cpython_subs + default_subs + extra_subs: + msg = 'Results "{}".endswith("{}") must be equal' + self.assertEqual(pyfunc(s, sub_str), cfunc(s, sub_str), + msg=msg.format(s, sub_str)) + + def test_endswith_with_start(self): + pyfunc = endswith_with_start_only_usecase + cfunc = njit(pyfunc) + + # Samples taken from CPython testing: + # 
https://github.com/python/cpython/blob/865c3b257fe38154a4320c7ee6afb416f665b9c2/Lib/test/string_tests.py#L1049-L1099 # noqa: E501 + cpython_str = ['hello', 'helloworld', ''] + cpython_subs = [ + 'he', 'hello', 'helloworld', 'ello', + '', 'lowo', 'lo', 'he', 'lo', 'o', + ] + extra_subs = ['hellohellohello', ' '] + for s in cpython_str + UNICODE_EXAMPLES: + default_subs = ['', 'x', s[:-2], s[3:], s, s + s] + for sub_str in cpython_subs + default_subs + extra_subs: + for start in list(range(-20, 20)) + [None]: + msg = 'Results "{}".endswith("{}", {}) must be equal' + self.assertEqual(pyfunc(s, sub_str, start), + cfunc(s, sub_str, start), + msg=msg.format(s, sub_str, start)) + + def test_endswith_with_start_end(self): + pyfunc = endswith_with_start_end_usecase + cfunc = njit(pyfunc) + + # Samples taken from CPython testing: + # https://github.com/python/cpython/blob/865c3b257fe38154a4320c7ee6afb416f665b9c2/Lib/test/string_tests.py#L1049-L1099 # noqa: E501 + cpython_str = ['hello', 'helloworld', ''] + cpython_subs = [ + 'he', 'hello', 'helloworld', 'ello', + '', 'lowo', 'lo', 'he', 'lo', 'o', + ] + extra_subs = ['hellohellohello', ' '] + for s in cpython_str + UNICODE_EXAMPLES: + default_subs = ['', 'x', s[:-2], s[3:], s, s + s] + for sub_str in cpython_subs + default_subs + extra_subs: + for start in list(range(-20, 20)) + [None]: + for end in list(range(-20, 20)) + [None]: + msg = 'Results "{}".endswith("{}", {}, {})\ + must be equal' + self.assertEqual(pyfunc(s, sub_str, start, end), + cfunc(s, sub_str, start, end), + msg=msg.format(s, sub_str, start, end)) + + def test_endswith_tuple(self): + pyfunc = endswith_usecase + cfunc = njit(pyfunc) + + # Samples taken from CPython testing: + # https://github.com/python/cpython/blob/865c3b257fe38154a4320c7ee6afb416f665b9c2/Lib/test/string_tests.py#L1049-L1099 # noqa: E501 + cpython_str = ['hello', 'helloworld', ''] + cpython_subs = [ + 'he', 'hello', 'helloworld', 'ello', + '', 'lowo', 'lo', 'he', 'lo', 'o', + ] + 
extra_subs = ['hellohellohello', ' '] + for s in cpython_str + UNICODE_EXAMPLES: + default_subs = ['', 'x', s[:-2], s[3:], s, s + s] + for sub_str in cpython_subs + default_subs + extra_subs: + msg = 'Results "{}".endswith({}) must be equal' + tuple_subs = (sub_str, 'lo') + self.assertEqual(pyfunc(s, tuple_subs), + cfunc(s, tuple_subs), + msg=msg.format(s, tuple_subs)) + + def test_endswith_tuple_args(self): + pyfunc = endswith_with_start_end_usecase + cfunc = njit(pyfunc) + + # Samples taken from CPython testing: + # https://github.com/python/cpython/blob/865c3b257fe38154a4320c7ee6afb416f665b9c2/Lib/test/string_tests.py#L1049-L1099 # noqa: E501 + cpython_str = ['hello', 'helloworld', ''] + cpython_subs = [ + 'he', 'hello', 'helloworld', 'ello', + '', 'lowo', 'lo', 'he', 'lo', 'o', + ] + extra_subs = ['hellohellohello', ' '] + for s in cpython_str + UNICODE_EXAMPLES: + default_subs = ['', 'x', s[:-2], s[3:], s, s + s] + for sub_str in cpython_subs + default_subs + extra_subs: + for start in list(range(-20, 20)) + [None]: + for end in list(range(-20, 20)) + [None]: + msg = 'Results "{}".endswith("{}", {}, {})\ + must be equal' + tuple_subs = (sub_str, 'lo') + self.assertEqual(pyfunc(s, tuple_subs, start, end), + cfunc(s, tuple_subs, start, end), + msg=msg.format(s, tuple_subs, + start, end)) + def test_in(self, flags=no_pyobj_flags): pyfunc = in_usecase cfunc = njit(pyfunc) @@ -506,6 +663,44 @@ def test_in(self, flags=no_pyobj_flags): cfunc(substr, a), "'%s' in '%s'?" 
% (substr, a)) + def test_partition_exception_invalid_sep(self): + self.disable_leak_check() + + pyfunc = partition_usecase + cfunc = njit(pyfunc) + + # Handle empty separator exception + for func in [pyfunc, cfunc]: + with self.assertRaises(ValueError) as raises: + func('a', '') + self.assertIn('empty separator', str(raises.exception)) + + accepted_types = (types.UnicodeType, types.UnicodeCharSeq) + with self.assertRaises(TypingError) as raises: + cfunc('a', None) + msg = '"sep" must be {}, not none'.format(accepted_types) + self.assertIn(msg, str(raises.exception)) + + def test_partition(self): + pyfunc = partition_usecase + cfunc = njit(pyfunc) + + CASES = [ + ('', '⚑'), + ('abcabc', '⚑'), + ('🐍⚑', '⚑'), + ('🐍⚑🐍', '⚑'), + ('abababa', 'a'), + ('abababa', 'b'), + ('abababa', 'c'), + ('abababa', 'ab'), + ('abababa', 'aba'), + ] + msg = 'Results of "{}".partition("{}") must be equal' + for s, sep in CASES: + self.assertEqual(pyfunc(s, sep), cfunc(s, sep), + msg=msg.format(s, sep)) + def test_find(self, flags=no_pyobj_flags): pyfunc = find_usecase cfunc = njit(pyfunc) @@ -646,7 +841,7 @@ def test_count_with_start_end(self): "'{0}'.c_count('{1}', {2}, {3}) = {5}") for s, sub in UNICODE_COUNT_EXAMPLES: - for i , j in product(range(-18, 18), (-18, 18)): + for i, j in product(range(-18, 18), (-18, 18)): py_result = pyfunc(s, sub, i, j) c_result = cfunc(s, sub, i, j) self.assertEqual(py_result, c_result, @@ -1203,6 +1398,144 @@ def test_split_whitespace(self): cfunc(test_str), "'%s'.split()?" 
% (test_str,)) + def test_split_exception_invalid_keepends(self): + pyfunc = splitlines_with_keepends_usecase + cfunc = njit(pyfunc) + + accepted_types = (types.Integer, int, types.Boolean, bool) + for ty, keepends in (('none', None), ('unicode_type', 'None')): + with self.assertRaises(TypingError) as raises: + cfunc('\n', keepends) + msg = '"keepends" must be {}, not {}'.format(accepted_types, ty) + self.assertIn(msg, str(raises.exception)) + + def test_splitlines(self): + pyfunc = splitlines_usecase + cfunc = njit(pyfunc) + + cases = ['', '\n', 'abc\r\rabc\r\n', '🐍⚑\v', '\f🐍⚑\f\v\v🐍\x85', + '\u2028aba\u2029baba', '\n\r\na\v\fb\x0b\x0cc\x1c\x1d\x1e'] + + msg = 'Results of "{}".splitlines() must be equal' + for s in cases: + self.assertEqual(pyfunc(s), cfunc(s), msg=msg.format(s)) + + def test_splitlines_with_keepends(self): + pyfuncs = [ + splitlines_with_keepends_usecase, + splitlines_with_keepends_kwarg_usecase + ] + messages = [ + 'Results of "{}".splitlines({}) must be equal', + 'Results of "{}".splitlines(keepends={}) must be equal' + ] + cases = ['', '\n', 'abc\r\rabc\r\n', '🐍⚑\v', '\f🐍⚑\f\v\v🐍\x85', + '\u2028aba\u2029baba', '\n\r\na\v\fb\x0b\x0cc\x1c\x1d\x1e'] + all_keepends = [True, False, 0, 1, -1, 100] + + for pyfunc, msg in zip(pyfuncs, messages): + cfunc = njit(pyfunc) + for s, keepends in product(cases, all_keepends): + self.assertEqual(pyfunc(s, keepends), cfunc(s, keepends), + msg=msg.format(s, keepends)) + + def test_rsplit_exception_empty_sep(self): + self.disable_leak_check() + + pyfunc = rsplit_usecase + cfunc = njit(pyfunc) + + # Handle empty separator exception + for func in [pyfunc, cfunc]: + with self.assertRaises(ValueError) as raises: + func('a', '') + self.assertIn('empty separator', str(raises.exception)) + + def test_rsplit_exception_noninteger_maxsplit(self): + pyfunc = rsplit_with_maxsplit_usecase + cfunc = njit(pyfunc) + + accepted_types = (types.Integer, int) + for sep in [' ', None]: + with self.assertRaises(TypingError) as raises: 
+ cfunc('a', sep, 2.4) + msg = '"maxsplit" must be {}, not float'.format(accepted_types) + self.assertIn(msg, str(raises.exception)) + + def test_rsplit(self): + pyfunc = rsplit_usecase + cfunc = njit(pyfunc) + + CASES = [ + (' a ', None), + ('', '⚑'), + ('abcabc', '⚑'), + ('🐍⚑', '⚑'), + ('🐍⚑🐍', '⚑'), + ('abababa', 'a'), + ('abababa', 'b'), + ('abababa', 'c'), + ('abababa', 'ab'), + ('abababa', 'aba'), + ] + msg = 'Results of "{}".rsplit("{}") must be equal' + for s, sep in CASES: + self.assertEqual(pyfunc(s, sep), cfunc(s, sep), + msg=msg.format(s, sep)) + + def test_rsplit_with_maxsplit(self): + pyfuncs = [rsplit_with_maxsplit_usecase, + rsplit_with_maxsplit_kwarg_usecase] + CASES = [ + (' a ', None, 1), + ('', '⚑', 1), + ('abcabc', '⚑', 1), + ('🐍⚑', '⚑', 1), + ('🐍⚑🐍', '⚑', 1), + ('abababa', 'a', 2), + ('abababa', 'b', 1), + ('abababa', 'c', 2), + ('abababa', 'ab', 1), + ('abababa', 'aba', 5), + ] + messages = [ + 'Results of "{}".rsplit("{}", {}) must be equal', + 'Results of "{}".rsplit("{}", maxsplit={}) must be equal' + ] + + for pyfunc, msg in zip(pyfuncs, messages): + cfunc = njit(pyfunc) + for test_str, sep, maxsplit in CASES: + self.assertEqual(pyfunc(test_str, sep, maxsplit), + cfunc(test_str, sep, maxsplit), + msg=msg.format(test_str, sep, maxsplit)) + + def test_rsplit_whitespace(self): + pyfunc = rsplit_whitespace_usecase + cfunc = njit(pyfunc) + + # list copied from + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodetype_db.h#L5996-L6031 # noqa: E501 + all_whitespace = ''.join(map(chr, [ + 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x001C, 0x001D, 0x001E, + 0x001F, 0x0020, 0x0085, 0x00A0, 0x1680, 0x2000, 0x2001, 0x2002, + 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, + 0x2028, 0x2029, 0x202F, 0x205F, 0x3000 + ])) + + CASES = [ + '', + 'abcabc', + '🐍 ⚑', + '🐍 ⚑ 🐍', + '🐍 ⚑ 🐍 ', + ' 🐍 ⚑ 🐍', + ' 🐍' + all_whitespace + '⚑ 🐍 ', + ] + msg = 'Results of "{}".rsplit() must be equal' + for s in 
CASES: + self.assertEqual(pyfunc(s), cfunc(s), msg.format(s)) + def test_join_empty(self): # Can't pass empty list to nopython mode, so we have to make a # separate test case @@ -1637,6 +1970,23 @@ def pyfunc(x): self.assertEqual(pyfunc(*args), cfunc(*args), msg='failed on {}'.format(args)) + def test_capitalize(self): + def pyfunc(x): + return x.capitalize() + + cfunc = njit(pyfunc) + # Samples taken from CPython testing: + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L800-L815 # noqa: E501 + cpython = ['\U0001044F', '\U0001044F\U0001044F', '\U00010427\U0001044F', + '\U0001044F\U00010427', 'X\U00010427x\U0001044F', 'h\u0130', + '\u1fd2\u0130', 'finnish', 'A\u0345\u03a3'] + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L926 # noqa: E501 + cpython_extras = ['\U00010000\U00100000'] + + msg = 'Results of "{}".capitalize() must be equal' + for s in UNICODE_EXAMPLES + [''] + cpython + cpython_extras: + self.assertEqual(pyfunc(s), cfunc(s), msg=msg.format(s)) + def test_isupper(self): def pyfunc(x): return x.isupper() @@ -1666,6 +2016,21 @@ def pyfunc(x): self.assertEqual(pyfunc(*args), cfunc(*args), msg='failed on {}'.format(args)) + def test_casefold(self): + def pyfunc(x): + return x.casefold() + + cfunc = njit(pyfunc) + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L774-L781 # noqa: E501 + cpython = ['hello', 'hELlo', 'ß', 'fi', '\u03a3', + 'A\u0345\u03a3', '\u00b5'] + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L924 # noqa: E501 + cpython_extras = ['\U00010000\U00100000'] + + msg = 'Results of "{}".casefold() must be equal' + for s in UNICODE_EXAMPLES + [''] + cpython + cpython_extras: + self.assertEqual(pyfunc(s), cfunc(s), msg=msg.format(s)) + def test_isalpha(self): def pyfunc(x): return x.isalpha() @@ -1715,6 +2080,25 
@@ def test_title(self): for s in UNICODE_EXAMPLES + [''] + cpython: self.assertEqual(pyfunc(s), cfunc(s), msg=msg.format(s)) + def test_swapcase(self): + def pyfunc(x): + return x.swapcase() + + cfunc = njit(pyfunc) + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L834-L858 # noqa: E501 + cpython = ['\U0001044F', '\U00010427', '\U0001044F\U0001044F', + '\U00010427\U0001044F', '\U0001044F\U00010427', + 'X\U00010427x\U0001044F', 'fi', '\u0130', '\u03a3', + '\u0345\u03a3', 'A\u0345\u03a3', 'A\u0345\u03a3a', + 'A\u0345\u03a3', 'A\u03a3\u0345', '\u03a3\u0345 ', + '\u03a3', 'ß', '\u1fd2'] + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L928 # noqa: E501 + cpython_extras = ['\U00010000\U00100000'] + + msg = 'Results of "{}".swapcase() must be equal' + for s in UNICODE_EXAMPLES + [''] + cpython + cpython_extras: + self.assertEqual(pyfunc(s), cfunc(s), msg=msg.format(s)) + def test_islower(self): pyfunc = islower_usecase cfunc = njit(pyfunc) @@ -1772,6 +2156,133 @@ def test_lower(self): for s in UNICODE_EXAMPLES + [''] + extras + cpython + sigma: self.assertEqual(pyfunc(s), cfunc(s), msg=msg.format(s)) + def test_isnumeric(self): + def pyfunc(x): + return x.isnumeric() + + cfunc = njit(pyfunc) + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L676-L693 # noqa: E501 + cpython = ['', 'a', '0', '\u2460', '\xbc', '\u0660', '0123456789', + '0123456789a', '\U00010401', '\U00010427', '\U00010429', + '\U0001044E', '\U0001F40D', '\U0001F46F', '\U00011065', + '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107'] + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L742-L749 # noqa: E501 + cpython_extras = ['\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF', + 'a\uD800b\uDFFF', 'a\uDFFFb\uD800', 'a\uD800b\uDFFFa', + 'a\uDFFFb\uD800a'] + + msg = 
'Results of "{}".isnumeric() must be equal' + for s in UNICODE_EXAMPLES + [''] + cpython + cpython_extras: + self.assertEqual(pyfunc(s), cfunc(s), msg=msg.format(s)) + + def test_isdigit(self): + def pyfunc(x): + return x.isdigit() + + cfunc = njit(pyfunc) + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L664-L674 # noqa: E501 + cpython = ['\u2460', '\xbc', '\u0660', '\U00010401', '\U00010427', + '\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F', + '\U00011065', '\U0001D7F6', '\U00011066', '\U000104A0', + '\U0001F107'] + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L742-L749 # noqa: E501 + cpython_extras = ['\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF', + 'a\uD800b\uDFFF', 'a\uDFFFb\uD800', + 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'] + + msg = 'Results of "{}".isdigit() must be equal' + for s in UNICODE_EXAMPLES + [''] + cpython + cpython_extras: + self.assertEqual(pyfunc(s), cfunc(s), msg=msg.format(s)) + + def test_isdecimal(self): + def pyfunc(x): + return x.isdecimal() + + cfunc = njit(pyfunc) + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L646-L662 # noqa: E501 + cpython = ['', 'a', '0', '\u2460', '\xbc', '\u0660', '0123456789', + '0123456789a', '\U00010401', '\U00010427', '\U00010429', + '\U0001044E', '\U0001F40D', '\U0001F46F', '\U00011065', + '\U0001F107', '\U0001D7F6', '\U00011066', '\U000104A0'] + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L742-L749 # noqa: E501 + cpython_extras = ['\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF', + 'a\uD800b\uDFFF', 'a\uDFFFb\uD800', 'a\uD800b\uDFFFa', + 'a\uDFFFb\uD800a'] + + msg = 'Results of "{}".isdecimal() must be equal' + for s in UNICODE_EXAMPLES + [''] + cpython + cpython_extras: + self.assertEqual(pyfunc(s), cfunc(s), msg=msg.format(s)) + + def 
test_replace(self): + pyfunc = replace_usecase + cfunc = njit(pyfunc) + + CASES = [ + ('abc', '', 'A'), + ('', '⚑', 'A'), + ('abcabc', '⚑', 'A'), + ('🐍⚑', '⚑', 'A'), + ('🐍⚑🐍', '⚑', 'A'), + ('abababa', 'a', 'A'), + ('abababa', 'b', 'A'), + ('abababa', 'c', 'A'), + ('abababa', 'ab', 'A'), + ('abababa', 'aba', 'A'), + ] + + for test_str, old_str, new_str in CASES: + self.assertEqual(pyfunc(test_str, old_str, new_str), + cfunc(test_str, old_str, new_str), + "'%s'.replace('%s', '%s')?" % + (test_str, old_str, new_str)) + + def test_replace_with_count(self): + pyfunc = replace_with_count_usecase + cfunc = njit(pyfunc) + + CASES = [ + ('abc', '', 'A'), + ('', '⚑', 'A'), + ('abcabc', '⚑', 'A'), + ('🐍⚑', '⚑', 'A'), + ('🐍⚑🐍', '⚑', 'A'), + ('abababa', 'a', 'A'), + ('abababa', 'b', 'A'), + ('abababa', 'c', 'A'), + ('abababa', 'ab', 'A'), + ('abababa', 'aba', 'A'), + ] + + count_test = [-1, 1, 0, 5] + + for test_str, old_str, new_str in CASES: + for count in count_test: + self.assertEqual(pyfunc(test_str, old_str, new_str, count), + cfunc(test_str, old_str, new_str, count), + "'%s'.replace('%s', '%s', '%s')?" % + (test_str, old_str, new_str, count)) + + def test_replace_unsupported(self): + def pyfunc(s, x, y, count): + return s.replace(x, y, count) + + cfunc = njit(pyfunc) + + with self.assertRaises(TypingError) as raises: + cfunc('ababababab', 'ba', 'qqq', 3.5) + msg = 'Unsupported parameters. The parametrs must be Integer.' + self.assertIn(msg, str(raises.exception)) + + with self.assertRaises(TypingError) as raises: + cfunc('ababababab', 0, 'qqq', 3) + msg = 'The object must be a UnicodeType.' + self.assertIn(msg, str(raises.exception)) + + with self.assertRaises(TypingError) as raises: + cfunc('ababababab', 'ba', 0, 3) + msg = 'The object must be a UnicodeType.' 
+ self.assertIn(msg, str(raises.exception)) + @unittest.skipUnless(_py34_or_later, 'unicode support requires Python 3.4 or later') diff --git a/numba/unicode.py b/numba/unicode.py index 4af0729992e..f5b113d795e 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -34,16 +34,23 @@ from numba.errors import TypingError from .unicode_support import (_Py_TOUPPER, _Py_TOLOWER, _Py_UCS4, _Py_ISALNUM, _PyUnicode_ToUpperFull, _PyUnicode_ToLowerFull, + _PyUnicode_ToFoldedFull, _PyUnicode_ToTitleFull, _PyUnicode_IsPrintable, - _PyUnicode_IsSpace, + _PyUnicode_IsSpace, _Py_ISSPACE, _PyUnicode_IsXidStart, _PyUnicode_IsXidContinue, _PyUnicode_IsCased, _PyUnicode_IsCaseIgnorable, _PyUnicode_IsUppercase, _PyUnicode_IsLowercase, + _PyUnicode_IsLineBreak, _Py_ISLINEBREAK, + _Py_ISLINEFEED, _Py_ISCARRIAGERETURN, _PyUnicode_IsTitlecase, _Py_ISLOWER, _Py_ISUPPER, _Py_TAB, _Py_LINEFEED, _Py_CARRIAGE_RETURN, _Py_SPACE, _PyUnicode_IsAlpha, _PyUnicode_IsNumeric, - _Py_ISALPHA,) + _Py_ISALPHA, _PyUnicode_IsDigit, + _PyUnicode_IsDecimalDigit) + + +_py38_or_later = sys.version_info[:2] >= (3, 8) # DATA MODEL @@ -522,23 +529,6 @@ def contains_impl(a, b): return contains_impl -# https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L9342-L9354 # noqa: E501 -@register_jitable -def _adjust_indices(length, start, end): - if end > length: - end = length - if end < 0: - end += length - if end < 0: - end = 0 - if start < 0: - start += length - if start < 0: - start = 0 - - return start, end - - def unicode_idx_check_type(ty, name): """Check object belongs to one of specific types ty: type @@ -676,6 +666,42 @@ def index_impl(s, sub, start=None, end=None): return index_impl +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12922-L12976 # noqa: E501 +@overload_method(types.UnicodeType, 'partition') +def unicode_partition(data, sep): + """Implements str.partition()""" + thety = sep + # if the 
type is omitted, the concrete type is the value + if isinstance(sep, types.Omitted): + thety = sep.value + # if the type is optional, the concrete type is the captured type + elif isinstance(sep, types.Optional): + thety = sep.type + + accepted = (types.UnicodeType, types.UnicodeCharSeq) + if thety is not None and not isinstance(thety, accepted): + msg = '"{}" must be {}, not {}'.format('sep', accepted, sep) + raise TypingError(msg) + + def impl(data, sep): + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/stringlib/partition.h#L7-L60 # noqa: E501 + empty_str = _empty_string(data._kind, 0, data._is_ascii) + sep_length = len(sep) + if data._kind < sep._kind or len(data) < sep_length: + return data, empty_str, empty_str + + if sep_length == 0: + raise ValueError('empty separator') + + pos = data.find(sep) + if pos < 0: + return data, empty_str, empty_str + + return data[0:pos], sep, data[pos + sep_length:len(data)] + + return impl + + @overload_method(types.UnicodeType, 'count') def unicode_count(src, sub, start=None, end=None): @@ -760,18 +786,69 @@ def startswith_impl(a, b): return startswith_impl +# https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L9342-L9354 # noqa: E501 +@register_jitable +def _adjust_indices(length, start, end): + if end > length: + end = length + if end < 0: + end += length + if end < 0: + end = 0 + if start < 0: + start += length + if start < 0: + start = 0 + + return start, end + + @overload_method(types.UnicodeType, 'endswith') -def unicode_endswith(a, b): - if isinstance(b, types.UnicodeType): - def endswith_impl(a, b): - a_offset = len(a) - len(b) - if a_offset < 0: +def unicode_endswith(s, substr, start=None, end=None): + if not (start is None or isinstance(start, (types.Omitted, + types.Integer, + types.NoneType))): + raise TypingError('The arg must be a Integer or None') + + if not (end is None or isinstance(end, (types.Omitted, + 
types.Integer, + types.NoneType))): + raise TypingError('The arg must be a Integer or None') + + if isinstance(substr, (types.Tuple, types.UniTuple)): + def endswith_impl(s, substr, start=None, end=None): + for item in substr: + if s.endswith(item, start, end) is True: + return True + + return False + return endswith_impl + + if isinstance(substr, types.UnicodeType): + def endswith_impl(s, substr, start=None, end=None): + length = len(s) + sub_length = len(substr) + if start is None: + start = 0 + if end is None: + end = length + + start, end = _adjust_indices(length, start, end) + if end - start < sub_length: return False - return _cmp_region(a, a_offset, b, 0, len(b)) == 0 + + if sub_length == 0: + return True + + s = s[start:end] + offset = len(s) - sub_length + + return _cmp_region(s, offset, substr, 0, sub_length) == 0 return endswith_impl - if isinstance(b, types.UnicodeCharSeq): - def endswith_impl(a, b): - return a.endswith(str(b)) + + if isinstance(substr, types.UnicodeCharSeq): + def endswith_impl(s, substr, start=None, end=None): + return s.endswith(str(substr), start, end) return endswith_impl @@ -926,6 +1003,138 @@ def split_whitespace_impl(a, sep=None, maxsplit=-1): return split_whitespace_impl +def generate_rsplit_whitespace_impl(isspace_func): + """Generate whitespace rsplit func based on either ascii or unicode""" + + def rsplit_whitespace_impl(data, sep=None, maxsplit=-1): + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/stringlib/split.h#L192-L240 # noqa: E501 + if maxsplit < 0: + maxsplit = sys.maxsize + + result = [] + i = len(data) - 1 + while maxsplit > 0: + while i >= 0: + code_point = _get_code_point(data, i) + if not isspace_func(code_point): + break + i -= 1 + if i < 0: + break + j = i + i -= 1 + while i >= 0: + code_point = _get_code_point(data, i) + if isspace_func(code_point): + break + i -= 1 + result.append(data[i + 1:j + 1]) + maxsplit -= 1 + + if i >= 0: + # Only occurs when maxsplit was 
reached + # Skip any remaining whitespace and copy to beginning of string + while i >= 0: + code_point = _get_code_point(data, i) + if not isspace_func(code_point): + break + i -= 1 + if i >= 0: + result.append(data[0:i + 1]) + + return result[::-1] + + return rsplit_whitespace_impl + + +unicode_rsplit_whitespace_impl = register_jitable( + generate_rsplit_whitespace_impl(_PyUnicode_IsSpace)) +ascii_rsplit_whitespace_impl = register_jitable( + generate_rsplit_whitespace_impl(_Py_ISSPACE)) + + +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L13095-L13108 # noqa: E501 +@overload_method(types.UnicodeType, 'rsplit') +def unicode_rsplit(data, sep=None, maxsplit=-1): + """Implements str.rsplit()""" + + def _unicode_rsplit_check_type(ty, name, accepted): + """Check object belongs to one of specified types""" + thety = ty + # if the type is omitted, the concrete type is the value + if isinstance(ty, types.Omitted): + thety = ty.value + # if the type is optional, the concrete type is the captured type + elif isinstance(ty, types.Optional): + thety = ty.type + + if thety is not None and not isinstance(thety, accepted): + raise TypingError( + '"{}" must be {}, not {}'.format(name, accepted, ty)) + + _unicode_rsplit_check_type(sep, 'sep', (types.UnicodeType, + types.UnicodeCharSeq, + types.NoneType)) + _unicode_rsplit_check_type(maxsplit, 'maxsplit', (types.Integer, int)) + + if sep is None or isinstance(sep, (types.NoneType, types.Omitted)): + + def rsplit_whitespace_impl(data, sep=None, maxsplit=-1): + if data._is_ascii: + return ascii_rsplit_whitespace_impl(data, sep, maxsplit) + return unicode_rsplit_whitespace_impl(data, sep, maxsplit) + + return rsplit_whitespace_impl + + def rsplit_impl(data, sep=None, maxsplit=-1): + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/stringlib/split.h#L286-L333 # noqa: E501 + if data._kind < sep._kind or len(data) < len(sep): + 
return [data] + + def _rsplit_char(data, ch, maxsplit): + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/stringlib/split.h#L242-L284 # noqa: E501 + result = [] + ch_code_point = _get_code_point(ch, 0) + i = j = len(data) - 1 + while i >= 0 and maxsplit > 0: + data_code_point = _get_code_point(data, i) + if data_code_point == ch_code_point: + result.append(data[i + 1 : j + 1]) + j = i - 1 # i is stepped once, below + maxsplit -= 1 + i -= 1 + if j >= -1: + result.append(data[0 : j + 1]) + + return result[::-1] + + if maxsplit < 0: + maxsplit = sys.maxsize + + sep_length = len(sep) + + if sep_length == 0: + raise ValueError('empty separator') + if sep_length == 1: + return _rsplit_char(data, sep, maxsplit) + + result = [] + j = len(data) + while maxsplit > 0: + pos = data.rfind(sep, start=0, end=j) + if pos < 0: + break + result.append(data[pos + sep_length:j]) + j = pos + maxsplit -= 1 + + result.append(data[0:j]) + + return result[::-1] + + return rsplit_impl + + @overload_method(types.UnicodeType, 'center') def unicode_center(string, width, fillchar=' '): if not isinstance(width, types.Integer): @@ -964,66 +1173,115 @@ def center_impl(string, width, fillchar=' '): return center_impl -@overload_method(types.UnicodeType, 'ljust') -def unicode_ljust(string, width, fillchar=' '): - if not isinstance(width, types.Integer): - raise TypingError('The width must be an Integer') +def gen_unicode_Xjust(STRING_FIRST): + def unicode_Xjust(string, width, fillchar=' '): + if not isinstance(width, types.Integer): + raise TypingError('The width must be an Integer') - if isinstance(fillchar, types.UnicodeCharSeq): - def ljust_impl(string, width, fillchar=' '): - return string.ljust(width, str(fillchar)) - return ljust_impl + if isinstance(fillchar, types.UnicodeCharSeq): + if STRING_FIRST: + def ljust_impl(string, width, fillchar=' '): + return string.ljust(width, str(fillchar)) + return ljust_impl + else: + def rjust_impl(string, width, fillchar=' '): + return 
string.rjust(width, str(fillchar)) + return rjust_impl - if not (fillchar == ' ' or isinstance( - fillchar, (types.Omitted, types.UnicodeType))): - raise TypingError('The fillchar must be a UnicodeType') + if not (fillchar == ' ' or + isinstance(fillchar, (types.Omitted, types.UnicodeType))): + raise TypingError('The fillchar must be a UnicodeType') - def ljust_impl(string, width, fillchar=' '): - str_len = len(string) - fillchar_len = len(fillchar) + def impl(string, width, fillchar=' '): + str_len = len(string) + fillchar_len = len(fillchar) - if fillchar_len != 1: - raise ValueError('The fill character must be exactly one ' - 'character long') + if fillchar_len != 1: + raise ValueError('The fill character must be exactly one ' + 'character long') - if width <= str_len: - return string + if width <= str_len: + return string - newstr = string + (fillchar * (width - str_len)) + newstr = (fillchar * (width - str_len)) + if STRING_FIRST: + return string + newstr + else: + return newstr + string - return newstr - return ljust_impl + return impl + return unicode_Xjust -@overload_method(types.UnicodeType, 'rjust') -def unicode_rjust(string, width, fillchar=' '): - if not isinstance(width, types.Integer): - raise TypingError('The width must be an Integer') - if isinstance(fillchar, types.UnicodeCharSeq): - def rjust_impl(string, width, fillchar=' '): - return string.rjust(width, str(fillchar)) - return rjust_impl +overload_method(types.UnicodeType, 'rjust')(gen_unicode_Xjust(False)) +overload_method(types.UnicodeType, 'ljust')(gen_unicode_Xjust(True)) - if not (fillchar == ' ' or - isinstance(fillchar, (types.Omitted, types.UnicodeType))): - raise TypingError('The fillchar must be a UnicodeType') - def rjust_impl(string, width, fillchar=' '): - str_len = len(string) - fillchar_len = len(fillchar) +def generate_splitlines_func(is_line_break_func): + """Generate splitlines performer based on ascii or unicode line breaks.""" + def impl(data, keepends): + # 
https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/stringlib/split.h#L335-L389 # noqa: E501 + length = len(data) + result = [] + i = j = 0 + while i < length: + # find a line and append it + while i < length: + code_point = _get_code_point(data, i) + if is_line_break_func(code_point): + break + i += 1 + + # skip the line break reading CRLF as one line break + eol = i + if i < length: + if i + 1 < length: + cur_cp = _get_code_point(data, i) + next_cp = _get_code_point(data, i + 1) + if _Py_ISCARRIAGERETURN(cur_cp) and _Py_ISLINEFEED(next_cp): + i += 1 + i += 1 + if keepends: + eol = i + + result.append(data[j:eol]) + j = i - if fillchar_len != 1: - raise ValueError('The fill character must be exactly one ' - 'character long') + return result - if width <= str_len: - return string + return impl - newstr = (fillchar * (width - str_len)) + string - return newstr - return rjust_impl +_ascii_splitlines = register_jitable(generate_splitlines_func(_Py_ISLINEBREAK)) +_unicode_splitlines = register_jitable(generate_splitlines_func( + _PyUnicode_IsLineBreak)) + + +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L10196-L10229 # noqa: E501 +@overload_method(types.UnicodeType, 'splitlines') +def unicode_splitlines(data, keepends=False): + """Implements str.splitlines()""" + thety = keepends + # if the type is omitted, the concrete type is the value + if isinstance(keepends, types.Omitted): + thety = keepends.value + # if the type is optional, the concrete type is the captured type + elif isinstance(keepends, types.Optional): + thety = keepends.type + + accepted = (types.Integer, int, types.Boolean, bool) + if thety is not None and not isinstance(thety, accepted): + raise TypingError( + '"{}" must be {}, not {}'.format('keepends', accepted, keepends)) + + def splitlines_impl(data, keepends=False): + if data._is_ascii: + return _ascii_splitlines(data, keepends) + + return 
_unicode_splitlines(data, keepends) + + return splitlines_impl @register_jitable @@ -1111,30 +1369,9 @@ def zfill_impl(string, width): return zfill_impl -# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12126-L12161 # noqa: E501 -@overload_method(types.UnicodeType, 'isidentifier') -def unicode_isidentifier(data): - """Implements UnicodeType.isidentifier()""" - - def impl(data): - length = len(data) - if length == 0: - return False - - first_cp = _get_code_point(data, 0) - if not _PyUnicode_IsXidStart(first_cp) and first_cp != 0x5F: - return False - - for i in range(1, length): - code_point = _get_code_point(data, i) - if not _PyUnicode_IsXidContinue(code_point): - return False - - return True - - return impl - - +# ------------------------------------------------------------------------------ +# Strip functions +# ------------------------------------------------------------------------------ @register_jitable def unicode_strip_left_bound(string, chars): chars = ' ' if chars is None else chars @@ -1223,7 +1460,9 @@ def strip_impl(string, chars=None): return strip_impl -# String creation +# ------------------------------------------------------------------------------ +# Slice functions +# ------------------------------------------------------------------------------ @register_jitable def normalize_str_idx(idx, length, is_start=True): @@ -1390,6 +1629,11 @@ def getitem_slice(s, idx): return getitem_slice +# ------------------------------------------------------------------------------ +# String operations +# ------------------------------------------------------------------------------ + + @overload(operator.add) @overload(operator.iadd) def unicode_concat(a, b): @@ -1459,34 +1703,99 @@ def impl(a): return impl -# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11928-L11964 # noqa: E501 -@overload_method(types.UnicodeType, 'isalpha') -def 
unicode_isalpha(data): - """Implements UnicodeType.isalpha()""" +@overload_method(types.UnicodeType, 'replace') +def unicode_replace(s, old_str, new_str, count=-1): + thety = count + if isinstance(count, types.Omitted): + thety = count.value + elif isinstance(count, types.Optional): + thety = count.type + + if not isinstance(thety, (int, types.Integer)): + raise TypingError('Unsupported parameters. The parametrs ' + 'must be Integer. Given count: {}'.format(count)) + + if not isinstance(old_str, (types.UnicodeType, types.NoneType)): + raise TypingError('The object must be a UnicodeType.' + ' Given: {}'.format(old_str)) + + if not isinstance(new_str, types.UnicodeType): + raise TypingError('The object must be a UnicodeType.' + ' Given: {}'.format(new_str)) + + def impl(s, old_str, new_str, count=-1): + if count == 0: + return s + if old_str == '': + schars = list(s) + if count == -1: + return new_str + new_str.join(schars) + new_str + split_result = [new_str] + min_count = min(len(schars), count) + for i in range(min_count): + split_result.append(schars[i]) + if i + 1 != min_count: + split_result.append(new_str) + else: + split_result.append(''.join(schars[(i + 1):])) + if count > len(schars): + split_result.append(new_str) + return ''.join(split_result) + schars = s.split(old_str, count) + result = new_str.join(schars) + return result - def impl(data): - length = len(data) - if length == 0: - return False + return impl - if length == 1: - code_point = _get_code_point(data, 0) - return _PyUnicode_IsAlpha(code_point) +# ------------------------------------------------------------------------------ +# String `is*()` methods +# ------------------------------------------------------------------------------ + + +# generates isalpha/isalnum +def gen_isAlX(ascii_func, unicode_func): + def unicode_isAlX(data): + + def impl(data): + length = len(data) + if length == 0: + return False + + if length == 1: + code_point = _get_code_point(data, 0) + if data._is_ascii: + return 
ascii_func(code_point) + else: + return unicode_func(code_point) + + if data._is_ascii: + for i in range(length): + code_point = _get_code_point(data, i) + if not ascii_func(code_point): + return False - if data._is_ascii: for i in range(length): code_point = _get_code_point(data, i) - if not _Py_ISALPHA(code_point): + if not unicode_func(code_point): return False - for i in range(length): - code_point = _get_code_point(data, i) - if not _PyUnicode_IsAlpha(code_point): - return False + return True - return True + return impl + return unicode_isAlX - return impl + +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11928-L11964 # noqa: E501 +overload_method(types.UnicodeType, 'isalpha')(gen_isAlX(_Py_ISALPHA, + _PyUnicode_IsAlpha)) + +_unicode_is_alnum = register_jitable(lambda x: + (_PyUnicode_IsNumeric(x) or + _PyUnicode_IsAlpha(x))) + +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11975-L12006 # noqa: E501 +overload_method(types.UnicodeType, 'isalnum')(gen_isAlX(_Py_ISALNUM, + _unicode_is_alnum)) def _is_upper(is_lower, is_upper, is_title): @@ -1532,100 +1841,47 @@ def impl(a): return impl -@overload_method(types.UnicodeType, 'upper') -def unicode_upper(a): - """ - Implements .upper() - """ - def impl(a): - # main structure is a translation of: - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L13308-L13316 # noqa: E501 +if sys.version_info[:2] >= (3, 7): + @overload_method(types.UnicodeType, 'isascii') + def unicode_isascii(data): + """Implements UnicodeType.isascii()""" - # ASCII fast path - l = len(a) - if a._is_ascii: - # This is an approximate translation of: - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/bytes_methods.c#L300 # noqa: E501 - ret = _empty_string(a._kind, l, a._is_ascii) - for idx in range(l): - code_point = _get_code_point(a, 
idx) - _set_code_point(ret, idx, _Py_TOUPPER(code_point)) - return ret - else: - # This part in an amalgamation of two algorithms: - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9864-L9908 # noqa: E501 - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9787-L9805 # noqa: E501 - # - # The alg walks the string and writes the upper version of the code - # point into a 4byte kind unicode string and at the same time - # tracks the maximum width "upper" character encountered, following - # this the 4byte kind string is reinterpreted as needed into the - # maximum width kind string - tmp = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * l, a._is_ascii) - mapped = np.array((3,), dtype=_Py_UCS4) - maxchar = 0 - k = 0 - for idx in range(l): - mapped[:] = 0 - code_point = _get_code_point(a, idx) - n_res = _PyUnicode_ToUpperFull(_Py_UCS4(code_point), mapped) - for j in range(n_res): - maxchar = max(maxchar, mapped[j]) - _set_code_point(tmp, k, mapped[j]) - k += 1 - newlength = k - newkind = _codepoint_to_kind(maxchar) - ret = _empty_string(newkind, newlength, - _codepoint_is_ascii(maxchar)) - for i in range(newlength): - _set_code_point(ret, i, _get_code_point(tmp, i)) - return ret - return impl + def impl(data): + return data._is_ascii + return impl -# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11896-L11925 # noqa: E501 -@overload_method(types.UnicodeType, 'isspace') -def unicode_isspace(data): - """Implements UnicodeType.isspace()""" +@overload_method(types.UnicodeType, 'istitle') +def unicode_istitle(data): + """ + Implements UnicodeType.istitle() + The algorithm is an approximate translation from CPython: + https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11829-L11885 # noqa: E501 + """ def impl(data): length = len(data) if length == 1: - return 
_PyUnicode_IsSpace(_get_code_point(data, 0)) + char = _get_code_point(data, 0) + return _PyUnicode_IsUppercase(char) or _PyUnicode_IsTitlecase(char) if length == 0: return False - for i in range(length): - code_point = _get_code_point(data, i) - if not _PyUnicode_IsSpace(code_point): - return False - - return True - - return impl - - -@overload_method(types.UnicodeType, 'istitle') -def unicode_istitle(s): - """ - Implements UnicodeType.istitle() - The algorithm is an approximate translation from CPython: - https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11829-L11885 # noqa: E501 - """ - - def impl(s): cased = False previous_is_cased = False - for char in s: + for idx in range(length): + char = _get_code_point(data, idx) if _PyUnicode_IsUppercase(char) or _PyUnicode_IsTitlecase(char): if previous_is_cased: return False - cased = True previous_is_cased = True + cased = True elif _PyUnicode_IsLowercase(char): if not previous_is_cased: return False + previous_is_cased = True + cased = True else: previous_is_cased = False @@ -1633,54 +1889,50 @@ def impl(s): return impl -# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12188-L12213 # noqa: E501 -@overload_method(types.UnicodeType, 'isprintable') -def unicode_isprintable(data): - """Implements UnicodeType.isprintable()""" +@overload_method(types.UnicodeType, 'islower') +def unicode_islower(data): + """ + impl is an approximate translation of: + https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L11900-L11933 # noqa: E501 + mixed with: + https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/bytes_methods.c#L131-L156 # noqa: E501 + """ def impl(data): length = len(data) if length == 1: - return _PyUnicode_IsPrintable(_get_code_point(data, 0)) + return _PyUnicode_IsLowercase(_get_code_point(data, 0)) + if length == 0: + 
return False - for i in range(length): - code_point = _get_code_point(data, i) - if not _PyUnicode_IsPrintable(code_point): + cased = False + for idx in range(length): + cp = _get_code_point(data, idx) + if _PyUnicode_IsUppercase(cp) or _PyUnicode_IsTitlecase(cp): return False - - return True - + elif not cased and _PyUnicode_IsLowercase(cp): + cased = True + return cased return impl -# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11975-L12006 # noqa: E501 -@overload_method(types.UnicodeType, 'isalnum') -def unicode_isalnum(data): - """Implements UnicodeType.isalnum()""" +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12126-L12161 # noqa: E501 +@overload_method(types.UnicodeType, 'isidentifier') +def unicode_isidentifier(data): + """Implements UnicodeType.isidentifier()""" def impl(data): length = len(data) - - if length == 1: - code_point = _get_code_point(data, 0) - if data._is_ascii: - return _Py_ISALNUM(code_point) - return (_PyUnicode_IsNumeric(code_point) or - _PyUnicode_IsAlpha(code_point)) - if length == 0: return False - if data._is_ascii: - for i in range(length): - code_point = _get_code_point(data, i) - if not _Py_ISALNUM(code_point): - return False + first_cp = _get_code_point(data, 0) + if not _PyUnicode_IsXidStart(first_cp) and first_cp != 0x5F: + return False - for i in range(length): + for i in range(1, length): code_point = _get_code_point(data, i) - if (not _PyUnicode_IsNumeric(code_point) and - not _PyUnicode_IsAlpha(code_point)): + if not _PyUnicode_IsXidContinue(code_point): return False return True @@ -1688,40 +1940,76 @@ def impl(data): return impl -if sys.version_info[:2] >= (3, 7): - @overload_method(types.UnicodeType, 'isascii') - def unicode_isascii(data): - """Implements UnicodeType.isascii()""" - +# generator for simple unicode "isX" methods +def gen_isX(_PyUnicode_IS_func, empty_is_false=True): + def 
unicode_isX(data): def impl(data): - return data._is_ascii + length = len(data) + if length == 1: + return _PyUnicode_IS_func(_get_code_point(data, 0)) + + if empty_is_false and length == 0: + return False + + for i in range(length): + code_point = _get_code_point(data, i) + if not _PyUnicode_IS_func(code_point): + return False + + return True + return impl + return unicode_isX -@overload_method(types.UnicodeType, 'islower') -def unicode_islower(data): - """ - impl is an approximate translation of: - https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L11900-L11933 # noqa: E501 - mixed with: - https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/bytes_methods.c#L131-L156 # noqa: E501 - """ +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11896-L11925 # noqa: E501 +overload_method(types.UnicodeType, 'isspace')(gen_isX(_PyUnicode_IsSpace)) +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12096-L12124 # noqa: E501 +overload_method(types.UnicodeType, 'isnumeric')(gen_isX(_PyUnicode_IsNumeric)) + +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12056-L12085 # noqa: E501 +overload_method(types.UnicodeType, 'isdigit')(gen_isX(_PyUnicode_IsDigit)) + +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12017-L12045 # noqa: E501 +overload_method(types.UnicodeType, 'isdecimal')( + gen_isX(_PyUnicode_IsDecimalDigit)) + +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12188-L12213 # noqa: E501 +overload_method(types.UnicodeType, 'isprintable')( + gen_isX(_PyUnicode_IsPrintable, False)) + +# ------------------------------------------------------------------------------ +# String methods that apply a 
transformation to the characters themselves +# ------------------------------------------------------------------------------ + + +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9863-L9908 # noqa: E501 +def case_operation(ascii_func, unicode_func): + """Generate common case operation performer.""" def impl(data): length = len(data) - if length == 1: - return _PyUnicode_IsLowercase(_get_code_point(data, 0)) if length == 0: - return False + return _empty_string(data._kind, length, data._is_ascii) + + if data._is_ascii: + res = _empty_string(data._kind, length, 1) + ascii_func(data, res) + return res + + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9863-L9908 # noqa: E501 + tmp = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * length, data._is_ascii) + # maxchar should be inside of a list to be pass as argument by reference + maxchars = [0] + newlength = unicode_func(data, length, tmp, maxchars) + maxchar = maxchars[0] + newkind = _codepoint_to_kind(maxchar) + res = _empty_string(newkind, newlength, _codepoint_is_ascii(maxchar)) + for i in range(newlength): + _set_code_point(res, i, _get_code_point(tmp, i)) + + return res - cased = False - for idx in range(length): - cp = _get_code_point(data, idx) - if _PyUnicode_IsUppercase(cp) or _PyUnicode_IsTitlecase(cp): - return False - elif not cased and _PyUnicode_IsLowercase(cp): - cased = True - return cased return impl @@ -1759,13 +2047,138 @@ def _lower_ucs4(code_point, data, length, idx, mapped): return _PyUnicode_ToLowerFull(code_point, mapped) +# https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L9946-L9965 # noqa: E501 +def _gen_unicode_upper_or_lower(lower): + def _do_upper_or_lower(data, length, res, maxchars): + k = 0 + for idx in range(length): + mapped = np.zeros(3, dtype=_Py_UCS4) + code_point = _get_code_point(data, idx) + if lower: + n_res = 
_lower_ucs4(code_point, data, length, idx, mapped) + else: + # might be needed if call _do_upper_or_lower in unicode_upper + n_res = _PyUnicode_ToUpperFull(code_point, mapped) + for m in mapped[:n_res]: + maxchars[0] = max(maxchars[0], m) + _set_code_point(res, k, m) + k += 1 + return k + return _do_upper_or_lower + + +_unicode_upper = register_jitable(_gen_unicode_upper_or_lower(False)) +_unicode_lower = register_jitable(_gen_unicode_upper_or_lower(True)) + + +def _gen_ascii_upper_or_lower(func): + def _ascii_upper_or_lower(data, res): + for idx in range(len(data)): + code_point = _get_code_point(data, idx) + _set_code_point(res, idx, func(code_point)) + return _ascii_upper_or_lower + + +_ascii_upper = register_jitable(_gen_ascii_upper_or_lower(_Py_TOUPPER)) +_ascii_lower = register_jitable(_gen_ascii_upper_or_lower(_Py_TOLOWER)) + + +@overload_method(types.UnicodeType, 'lower') +def unicode_lower(data): + """Implements .lower()""" + return case_operation(_ascii_lower, _unicode_lower) + + +@overload_method(types.UnicodeType, 'upper') +def unicode_upper(data): + """Implements .upper()""" + return case_operation(_ascii_upper, _unicode_upper) + + +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9819-L9834 # noqa: E501 +@register_jitable +def _unicode_casefold(data, length, res, maxchars): + k = 0 + mapped = np.zeros(3, dtype=_Py_UCS4) + for idx in range(length): + mapped.fill(0) + code_point = _get_code_point(data, idx) + n_res = _PyUnicode_ToFoldedFull(code_point, mapped) + for m in mapped[:n_res]: + maxchar = maxchars[0] + maxchars[0] = max(maxchar, m) + _set_code_point(res, k, m) + k += 1 + + return k + + +@register_jitable +def _ascii_casefold(data, res): + for idx in range(len(data)): + code_point = _get_code_point(data, idx) + _set_code_point(res, idx, _Py_TOLOWER(code_point)) + + +@overload_method(types.UnicodeType, 'casefold') +def unicode_casefold(data): + """Implements str.casefold()""" + return 
case_operation(_ascii_casefold, _unicode_casefold) + + +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9737-L9759 # noqa: E501 +@register_jitable +def _unicode_capitalize(data, length, res, maxchars): + k = 0 + maxchar = 0 + mapped = np.zeros(3, dtype=_Py_UCS4) + code_point = _get_code_point(data, 0) + + # https://github.com/python/cpython/commit/b015fc86f7b1f35283804bfee788cce0a5495df7/Objects/unicodeobject.c#diff-220e5da0d1c8abf508b25c02da6ca16c # noqa: E501 + if _py38_or_later: + n_res = _PyUnicode_ToTitleFull(code_point, mapped) + else: + n_res = _PyUnicode_ToUpperFull(code_point, mapped) + + for m in mapped[:n_res]: + maxchar = max(maxchar, m) + _set_code_point(res, k, m) + k += 1 + for idx in range(1, length): + mapped.fill(0) + code_point = _get_code_point(data, idx) + n_res = _lower_ucs4(code_point, data, length, idx, mapped) + for m in mapped[:n_res]: + maxchar = max(maxchar, m) + _set_code_point(res, k, m) + k += 1 + maxchars[0] = maxchar + return k + + +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/bytes_methods.c#L361-L382 # noqa: E501 +@register_jitable +def _ascii_capitalize(data, res): + code_point = _get_code_point(data, 0) + _set_code_point(res, 0, _Py_TOUPPER(code_point)) + for idx in range(1, len(data)): + code_point = _get_code_point(data, idx) + _set_code_point(res, idx, _Py_TOLOWER(code_point)) + + +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L10765-L10774 # noqa: E501 +@overload_method(types.UnicodeType, 'capitalize') +def unicode_capitalize(data): + return case_operation(_ascii_capitalize, _unicode_capitalize) + + # https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L9996-L10021 # noqa: E501 @register_jitable -def _do_title(data, length, res, maxchars): +def _unicode_title(data, length, res, maxchars): """This is a translation 
of the function that titles a unicode string.""" k = 0 previous_cased = False - mapped = np.zeros(3, dtype=_Py_UCS4) + mapped = np.empty(3, dtype=_Py_UCS4) for idx in range(length): mapped.fill(0) code_point = _get_code_point(data, idx) @@ -1782,81 +2195,77 @@ def _do_title(data, length, res, maxchars): return k +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/bytes_methods.c#L332-L352 # noqa: E501 +@register_jitable +def _ascii_title(data, res): + """ Does .title() on an ASCII string """ + previous_is_cased = False + for idx in range(len(data)): + code_point = _get_code_point(data, idx) + if _Py_ISLOWER(code_point): + if not previous_is_cased: + code_point = _Py_TOUPPER(code_point) + previous_is_cased = True + elif _Py_ISUPPER(code_point): + if previous_is_cased: + code_point = _Py_TOLOWER(code_point) + previous_is_cased = True + else: + previous_is_cased = False + _set_code_point(res, idx, code_point) + + # https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L10023-L10069 # noqa: E501 @overload_method(types.UnicodeType, 'title') def unicode_title(data): """Implements str.title()""" # https://docs.python.org/3/library/stdtypes.html#str.title - def impl(data): - length = len(data) - tmp = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * length, data._is_ascii) - # maxchar should be inside of a list to be pass as argument by reference - maxchar = 0 - maxchars = [maxchar] - newlength = _do_title(data, length, tmp, maxchars) - maxchar, = maxchars - newkind = _codepoint_to_kind(maxchar) - res = _empty_string(newkind, newlength, _codepoint_is_ascii(maxchar)) - for i in range(newlength): - _set_code_point(res, i, _get_code_point(tmp, i)) - return res - return impl + return case_operation(_ascii_title, _unicode_title) -# https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L9946-L9965 # noqa: E501 +# 
https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/bytes_methods.c#L391-L408 # noqa: E501 @register_jitable -def _do_upper_or_lower(data, length, res, maxchars, lower): +def _ascii_swapcase(data, res): + for idx in range(len(data)): + code_point = _get_code_point(data, idx) + if _Py_ISUPPER(code_point): + code_point = _Py_TOLOWER(code_point) + elif _Py_ISLOWER(code_point): + code_point = _Py_TOUPPER(code_point) + _set_code_point(res, idx, code_point) + + +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9761-L9784 # noqa: E501 +@register_jitable +def _unicode_swapcase(data, length, res, maxchars): k = 0 + maxchar = 0 + mapped = np.empty(3, dtype=_Py_UCS4) for idx in range(length): - mapped = np.zeros(3, dtype=_Py_UCS4) + mapped.fill(0) code_point = _get_code_point(data, idx) - if lower: + if _PyUnicode_IsUppercase(code_point): n_res = _lower_ucs4(code_point, data, length, idx, mapped) - else: - # might be needed if call _do_upper_or_lower in unicode_upper + elif _PyUnicode_IsLowercase(code_point): n_res = _PyUnicode_ToUpperFull(code_point, mapped) + else: + n_res = 1 + mapped[0] = code_point for m in mapped[:n_res]: - maxchars[0] = max(maxchars[0], m) + maxchar = max(maxchar, m) _set_code_point(res, k, m) k += 1 + maxchars[0] = maxchar return k -@overload_method(types.UnicodeType, 'lower') -def unicode_lower(data): - """Implements .lower()""" - def impl(data): - # main structure is a translation of: - # https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L12380-L12388 # noqa: E501 - - # ASCII fast path - length = len(data) - if data._is_ascii: - # This is an approximate translation of: - # https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/bytes_methods.c#L247-L255 # noqa: E501 - res = _empty_string(data._kind, length, data._is_ascii) - for idx in range(length): - code_point = 
_get_code_point(data, idx) - _set_code_point(res, idx, _Py_TOLOWER(code_point)) - return res - else: - # This is an approximate translation of: - # https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L10023-L10069 # noqa: E501 - tmp = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * length, - data._is_ascii) - # maxchar is inside of a list to be pass as argument by reference - maxchars = [0] - newlength = _do_upper_or_lower(data, length, tmp, maxchars, - lower=True) - maxchar = maxchars[0] - newkind = _codepoint_to_kind(maxchar) - res = _empty_string(newkind, newlength, - _codepoint_is_ascii(maxchar)) - for i in range(newlength): - _set_code_point(res, i, _get_code_point(tmp, i)) - return res +@overload_method(types.UnicodeType, 'swapcase') +def unicode_swapcase(data): + return case_operation(_ascii_swapcase, _unicode_swapcase) - return impl +# ------------------------------------------------------------------------------ +# iteration +# ------------------------------------------------------------------------------ @lower_builtin('getiter', types.UnicodeType) diff --git a/numba/unicode_support.py b/numba/unicode_support.py index 9462aa84244..d170102e0cf 100644 --- a/numba/unicode_support.py +++ b/numba/unicode_support.py @@ -207,22 +207,19 @@ def _PyUnicode_IsXidContinue(ch): @register_jitable def _PyUnicode_ToDecimalDigit(ch): - raise NotImplementedError - - -@register_jitable -def _PyUnicode_IsDecimalDigit(ch): - raise NotImplementedError + ctype = _PyUnicode_gettyperecord(ch) + if ctype.flags & _PyUnicode_TyperecordMasks.DECIMAL_MASK: + return ctype.decimal + return -1 +# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L123-L1128 # noqa: E501 @register_jitable def _PyUnicode_ToDigit(ch): - raise NotImplementedError - - -@register_jitable -def _PyUnicode_IsDigit(ch): - raise NotImplementedError + ctype = _PyUnicode_gettyperecord(ch) + if ctype.flags & 
_PyUnicode_TyperecordMasks.DIGIT_MASK: + return ctype.digit + return -1 # From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L140-L145 # noqa: E501 @@ -253,6 +250,12 @@ def _PyUnicode_IsUppercase(ch): return ctype.flags & _PyUnicode_TyperecordMasks.UPPER_MASK != 0 +@register_jitable +def _PyUnicode_IsLineBreak(ch): + ctype = _PyUnicode_gettyperecord(ch) + return ctype.flags & _PyUnicode_TyperecordMasks.LINEBREAK_MASK != 0 + + @register_jitable def _PyUnicode_ToUppercase(ch): raise NotImplementedError @@ -306,10 +309,18 @@ def _PyUnicode_ToUpperFull(ch, res): return 1 +# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L259-L272 # noqa: E501 @register_jitable def _PyUnicode_ToFoldedFull(ch, res): - raise NotImplementedError - + ctype = _PyUnicode_gettyperecord(ch) + extended_case_mask = _PyUnicode_TyperecordMasks.EXTENDED_CASE_MASK + if ctype.flags & extended_case_mask and (ctype.lower >> 20) & 7: + index = (ctype.lower & 0xFFFF) + (ctype.lower >> 24) + n = (ctype.lower >> 20) & 7 + for i in range(n): + res[i] = _PyUnicode_ExtendedCase(index + i) + return n + return _PyUnicode_ToLowerFull(ch, res) # From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L274-L279 # noqa: E501 @register_jitable @@ -325,6 +336,22 @@ def _PyUnicode_IsCaseIgnorable(ch): return ctype.flags & _PyUnicode_TyperecordMasks.CASE_IGNORABLE_MASK != 0 +# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L123-L135 # noqa: E501 +@register_jitable +def _PyUnicode_IsDigit(ch): + if _PyUnicode_ToDigit(ch) < 0: + return 0 + return 1 + + +# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L106-L118 # noqa: E501 +@register_jitable +def _PyUnicode_IsDecimalDigit(ch): + if _PyUnicode_ToDecimalDigit(ch) < 0: + return 0 + 
return 1 + + # From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L291-L296 # noqa: E501 @register_jitable def _PyUnicode_IsSpace(ch): @@ -575,6 +602,40 @@ class _PY_CTF(IntEnum): ], dtype=np.uint8) +class _PY_CTF_LB(IntEnum): + LINE_BREAK = 0x01 + LINE_FEED = 0x02 + CARRIAGE_RETURN = 0x04 + + +_Py_ctype_islinebreak = np.array([ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + _PY_CTF_LB.LINE_BREAK | _PY_CTF_LB.LINE_FEED, # 0xa '\n' + _PY_CTF_LB.LINE_BREAK, # 0xb '\v' + _PY_CTF_LB.LINE_BREAK, # 0xc '\f' + _PY_CTF_LB.LINE_BREAK | _PY_CTF_LB.CARRIAGE_RETURN, # 0xd '\r' + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + _PY_CTF_LB.LINE_BREAK, # 0x1c '\x1c' + _PY_CTF_LB.LINE_BREAK, # 0x1d '\x1d' + _PY_CTF_LB.LINE_BREAK, # 0x1e '\x1e' + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + _PY_CTF_LB.LINE_BREAK, # 0x85 '\x85' + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, +], dtype=np.intc) + + # Translation of: # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/pymacro.h#L25 # noqa: E501 @register_jitable @@ -677,5 +738,24 @@ def _Py_ISSPACE(ch): """ return _Py_ctype_table[_Py_CHARMASK(ch)] & _PY_CTF.SPACE + +@register_jitable +def _Py_ISLINEBREAK(ch): + """Check if character is ASCII line break""" + return _Py_ctype_islinebreak[_Py_CHARMASK(ch)] & _PY_CTF_LB.LINE_BREAK + + +@register_jitable +def 
_Py_ISLINEFEED(ch): + """Check if character is line feed `\n`""" + return _Py_ctype_islinebreak[_Py_CHARMASK(ch)] & _PY_CTF_LB.LINE_FEED + + +@register_jitable +def _Py_ISCARRIAGERETURN(ch): + """Check if character is carriage return `\r`""" + return _Py_ctype_islinebreak[_Py_CHARMASK(ch)] & _PY_CTF_LB.CARRIAGE_RETURN + + # End code related to/from CPython's pyctype # ------------------------------------------------------------------------------