From 2ca47dc4e8422e5b0bc9290d1ec371199829432b Mon Sep 17 00:00:00 2001 From: Denis Date: Tue, 12 Nov 2019 10:57:21 +0300 Subject: [PATCH 01/68] Implement str.capitalize() based on CPython --- docs/source/reference/pysupported.rst | 1 + numba/tests/test_unicode.py | 17 +++++++++ numba/unicode.py | 50 +++++++++++++++++++++++++++ 3 files changed, 68 insertions(+) diff --git a/docs/source/reference/pysupported.rst b/docs/source/reference/pysupported.rst index f19d3da7a64..183e6bf2196 100644 --- a/docs/source/reference/pysupported.rst +++ b/docs/source/reference/pysupported.rst @@ -180,6 +180,7 @@ The following functions, attributes and methods are currently supported: * ``.lstrip()`` * ``.rstrip()`` * ``.strip()`` +* ``.capitalize()`` * ``.isupper()`` * ``.upper()`` * ``.islower()`` diff --git a/numba/tests/test_unicode.py b/numba/tests/test_unicode.py index c9f49623dbd..4d70872a684 100644 --- a/numba/tests/test_unicode.py +++ b/numba/tests/test_unicode.py @@ -1248,6 +1248,23 @@ def pyfunc(x): self.assertEqual(pyfunc(*args), cfunc(*args), msg='failed on {}'.format(args)) + def test_capitalize(self): + def pyfunc(x): + return x.capitalize() + + cfunc = njit(pyfunc) + # Samples taken from CPython testing: + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L800-L815 # noqa: E501 + cpython = ['\U0001044F', '\U0001044F\U0001044F', '\U00010427\U0001044F', + '\U0001044F\U00010427', 'X\U00010427x\U0001044F', 'h\u0130', + '\u1fd2\u0130', 'finnish', 'A\u0345\u03a3'] + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L926 # noqa: E501 + cpython_extras = ['\U00010000\U00100000'] + + msg = 'Results of "{}".capitalize() must be equal' + for s in UNICODE_EXAMPLES + [''] + cpython + cpython_extras: + self.assertEqual(pyfunc(s), cfunc(s), msg=msg.format(s)) + def test_isupper(self): def pyfunc(x): return x.isupper() diff --git a/numba/unicode.py b/numba/unicode.py index 673b437ec5d..d7d6ba0689d 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1266,6 +1266,56 @@ def impl(a): return len(a) == 0 return impl +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9737-L9759 # noqa: E501 +@register_jitable +def _do_capitalize(data, length, res, maxchars): + """This is a translation of the function to capitalize a unicode string.""" + k = 0 + mapped = np.zeros(3, dtype=_Py_UCS4) + + code_point = _get_code_point(data, 0) + n_res = _PyUnicode_ToUpperFull(code_point, mapped) + for m in mapped[:n_res]: + maxchar = maxchars[0] + maxchars[0] = max(maxchar, m) + _set_code_point(res, k, m) + k += 1 + + for idx in range(1, length): + code_point = _get_code_point(data, idx) + n_res = _lower_ucs4(code_point, data, length, idx, mapped) + for m in mapped[:n_res]: + maxchar = maxchars[0] + maxchars[0] = max(maxchar, m) + _set_code_point(res, k, m) + k += 1 + + return k + + +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L10765-L10774 # noqa: E501 +@overload_method(types.UnicodeType, 'capitalize') +def unicode_capitalize(data): + """Implements str.capitalize()""" + def impl(data): + length = len(data) + if length == 0: + return _empty_string(data._kind, length, data._is_ascii) + + tmp = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * length, data._is_ascii) + # maxchar should be inside of a list to be pass as argument by reference + maxchars = [0] + newlength = _do_capitalize(data, length, tmp, maxchars) + maxchar = maxchars[0] + newkind = _codepoint_to_kind(maxchar) + res = _empty_string(newkind, newlength, _codepoint_is_ascii(maxchar)) + for i in range(newlength): + _set_code_point(res, i, _get_code_point(tmp, i)) + + return res + + return impl + def _is_upper(is_lower, is_upper, is_title): # impl is an approximate translation of: From 3a4d6d07b82f496e38314a9f91da6b6fbabc7290 Mon Sep 17 00:00:00 2001 From: Denis Date: Tue, 12 Nov 2019 12:57:20 +0300 Subject: [PATCH 02/68] Implement str.casefold() based on CPython --- docs/source/reference/pysupported.rst | 1 + numba/tests/test_unicode.py | 15 +++++++++++ numba/unicode.py | 39 ++++++++++++++++++++++++++- numba/unicode_support.py | 12 +++++++-- 4 files changed, 64 insertions(+), 3 deletions(-) diff --git a/docs/source/reference/pysupported.rst b/docs/source/reference/pysupported.rst index f19d3da7a64..c7f14d85029 100644 --- a/docs/source/reference/pysupported.rst +++ b/docs/source/reference/pysupported.rst @@ -169,6 +169,7 @@ The following functions, attributes and methods are currently supported: * ``*`` (repetition of strings) * ``in``, ``.contains()`` * ``==``, ``<``, ``<=``, ``>``, ``>=`` (comparison) +* ``.casefold()`` * ``.startswith()`` * ``.endswith()`` * ``.find()`` diff --git a/numba/tests/test_unicode.py b/numba/tests/test_unicode.py index c9f49623dbd..eb620198466 100644 --- a/numba/tests/test_unicode.py +++ b/numba/tests/test_unicode.py @@ -1277,6 +1277,21 @@ def pyfunc(x): self.assertEqual(pyfunc(*args), cfunc(*args), msg='failed on {}'.format(args)) + def test_casefold(self): + def pyfunc(x): + return x.casefold() + + cfunc = njit(pyfunc) + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L774-L781 # noqa: E501 + cpython = ['hello', 'hELlo', 'ß', 'fi', '\u03a3', + 'A\u0345\u03a3', '\u00b5'] + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L924 # noqa: E501 + cpython_extras = ['\U00010000\U00100000'] + + msg = 'Results of "{}".casefold() must be equal' + for s in UNICODE_EXAMPLES + [''] + cpython + cpython_extras: + self.assertEqual(pyfunc(s), cfunc(s), msg=msg.format(s)) + def test_title(self): pyfunc = title cfunc = njit(pyfunc) diff --git a/numba/unicode.py b/numba/unicode.py index 673b437ec5d..672fdfd0824 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -33,7 +33,7 @@ from numba.errors import TypingError from .unicode_support import (_Py_TOUPPER, _Py_TOLOWER, _Py_UCS4, _PyUnicode_ToUpperFull, _PyUnicode_ToLowerFull, - _PyUnicode_ToTitleFull, + _PyUnicode_ToFoldedFull, _PyUnicode_ToTitleFull, _PyUnicode_IsCased, _PyUnicode_IsCaseIgnorable, _PyUnicode_IsUppercase, _PyUnicode_IsLowercase, _PyUnicode_IsTitlecase, _Py_ISLOWER, _Py_ISUPPER) @@ -1361,6 +1361,43 @@ def impl(a): return impl +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9819-L9834 # noqa: E501 +@overload_method(types.UnicodeType, 'casefold') +def unicode_casefold(data): + """Implements str.casefold()""" + def impl(data): + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L10782-L10791 # noqa: E501 + def _do_casefold(data, length, res, maxchars): + """Translation of the function to case fold a unicode string.""" + k = 0 + mapped = np.zeros(3, dtype=_Py_UCS4) + for idx in range(length): + mapped.fill(0) + code_point = _get_code_point(data, idx) + n_res = _PyUnicode_ToFoldedFull(code_point, mapped) + for m in mapped[:n_res]: + maxchar = maxchars[0] + maxchars[0] = max(maxchar, m) + _set_code_point(res, k, m) + k += 1 + return k + + length = len(data) + tmp = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * length, data._is_ascii) + # maxchar should be inside of a list to be pass as argument by reference + maxchars = [0] + newlength = _do_casefold(data, length, tmp, maxchars) + maxchar = maxchars[0] + newkind = _codepoint_to_kind(maxchar) + res = _empty_string(newkind, newlength, _codepoint_is_ascii(maxchar)) + for i in range(newlength): + _set_code_point(res, i, _get_code_point(tmp, i)) + + return res + + return impl + + @overload_method(types.UnicodeType, 'istitle') def unicode_istitle(s): """ diff --git a/numba/unicode_support.py b/numba/unicode_support.py index 021f36a8ec8..a8a88b268a9 100644 --- a/numba/unicode_support.py +++ b/numba/unicode_support.py @@ -292,10 +292,18 @@ def _PyUnicode_ToUpperFull(ch, res): return 1 +# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L259-L272 # noqa: E501 @register_jitable def _PyUnicode_ToFoldedFull(ch, res): - raise NotImplementedError - + ctype = _PyUnicode_gettyperecord(ch) + extended_case_mask = _PyUnicode_TyperecordMasks.EXTENDED_CASE_MASK + if ctype.flags & extended_case_mask and (ctype.lower >> 20) & 7: + index = (ctype.lower & 0xFFFF) + (ctype.lower >> 24) + n = (ctype.lower >> 20) & 7 + for i in range(n): + res[i] = _PyUnicode_ExtendedCase(index + i) + return n + return _PyUnicode_ToLowerFull(ch, res) # From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L274-L279 # noqa: E501 @register_jitable From 8a2079b11af68ff0c8389a4792a2111417d932b9 Mon Sep 17 00:00:00 2001 From: Denis Date: Tue, 12 Nov 2019 13:02:21 +0300 Subject: [PATCH 03/68] Move _do_capitalize into overload --- numba/unicode.py | 48 ++++++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index d7d6ba0689d..e83ef9988e6 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1266,38 +1266,34 @@ def impl(a): return len(a) == 0 return impl -# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9737-L9759 # noqa: E501 -@register_jitable -def _do_capitalize(data, length, res, maxchars): - """This is a translation of the function to capitalize a unicode string.""" - k = 0 - mapped = np.zeros(3, dtype=_Py_UCS4) - - code_point = _get_code_point(data, 0) - n_res = _PyUnicode_ToUpperFull(code_point, mapped) - for m in mapped[:n_res]: - maxchar = maxchars[0] - maxchars[0] = max(maxchar, m) - _set_code_point(res, k, m) - k += 1 - - for idx in range(1, length): - code_point = _get_code_point(data, idx) - n_res = _lower_ucs4(code_point, data, length, idx, mapped) - for m in mapped[:n_res]: - maxchar = maxchars[0] - maxchars[0] = max(maxchar, m) - _set_code_point(res, k, m) - k += 1 - - return k - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L10765-L10774 # noqa: E501 @overload_method(types.UnicodeType, 'capitalize') def unicode_capitalize(data): """Implements str.capitalize()""" def impl(data): + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9737-L9759 # noqa: E501 + def _do_capitalize(data, length, res, maxchars): + """Translation of the function to capitalize a unicode string.""" + k = 0 + mapped = np.zeros(3, dtype=_Py_UCS4) + code_point = _get_code_point(data, 0) + n_res = _PyUnicode_ToUpperFull(code_point, mapped) + for m in mapped[:n_res]: + maxchar = maxchars[0] + maxchars[0] = max(maxchar, m) + _set_code_point(res, k, m) + k += 1 + for idx in range(1, length): + code_point = _get_code_point(data, idx) + n_res = _lower_ucs4(code_point, data, length, idx, mapped) + for m in mapped[:n_res]: + maxchar = maxchars[0] + maxchars[0] = max(maxchar, m) + _set_code_point(res, k, m) + k += 1 + return k + length = len(data) if length == 0: return _empty_string(data._kind, length, data._is_ascii) From 82e20bc4c4f26c56cd8ed27f485e38e8250286c0 Mon Sep 17 00:00:00 2001 From: Denis Date: Tue, 12 Nov 2019 13:47:43 +0300 Subject: [PATCH 04/68] Implement str.swapcase() based on CPython --- docs/source/reference/pysupported.rst | 1 + numba/tests/test_unicode.py | 19 ++++++++++++ numba/unicode.py | 43 +++++++++++++++++++++++++++ 3 files changed, 63 insertions(+) diff --git a/docs/source/reference/pysupported.rst b/docs/source/reference/pysupported.rst index f19d3da7a64..12965a171df 100644 --- a/docs/source/reference/pysupported.rst +++ b/docs/source/reference/pysupported.rst @@ -188,6 +188,7 @@ The following functions, attributes and methods are currently supported: * ``.count()`` * ``.istitle()`` * ``.rfind()`` +* ``.swapcase()`` * ``.title()`` Additional operations as well as support for Python 2 strings / Python 3 bytes diff --git a/numba/tests/test_unicode.py b/numba/tests/test_unicode.py index c9f49623dbd..efa9798d367 100644 --- a/numba/tests/test_unicode.py +++ b/numba/tests/test_unicode.py @@ -1293,6 +1293,25 @@ def test_title(self): for s in UNICODE_EXAMPLES + [''] + cpython: self.assertEqual(pyfunc(s), cfunc(s), msg=msg.format(s)) + def test_swapcase(self): + def pyfunc(x): + return x.swapcase() + + cfunc = njit(pyfunc) + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L834-L858 # noqa: E501 + cpython = ['\U0001044F', '\U00010427', '\U0001044F\U0001044F', + '\U00010427\U0001044F', '\U0001044F\U00010427', + 'X\U00010427x\U0001044F', 'fi', '\u0130', '\u03a3', + '\u0345\u03a3', 'A\u0345\u03a3', 'A\u0345\u03a3a', + 'A\u0345\u03a3', 'A\u03a3\u0345', '\u03a3\u0345 ', + '\u03a3', 'ß', '\u1fd2'] + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L928 # noqa: E501 + cpython_extras = ['\U00010000\U00100000'] + + msg = 'Results of "{}".swapcase() must be equal' + for s in UNICODE_EXAMPLES + [''] + cpython + cpython_extras: + self.assertEqual(pyfunc(s), cfunc(s), msg=msg.format(s)) + def test_islower(self): pyfunc = islower_usecase cfunc = njit(pyfunc) diff --git a/numba/unicode.py b/numba/unicode.py index 673b437ec5d..9089b3f232b 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1493,6 +1493,49 @@ def impl(data): return impl +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L13140-L13147 # noqa: E501 +@overload_method(types.UnicodeType, 'swapcase') +def unicode_swapcase(data): + """Implements str.swapcase()""" + def impl(data): + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9761-L9784 # noqa: E501 + def _do_swapcase(data, length, res, maxchars): + """Translation of the function to swap cases of a unicode string.""" + k = 0 + mapped = np.zeros(3, dtype=_Py_UCS4) + for idx in range(length): + mapped.fill(0) + code_point = _get_code_point(data, idx) + if _PyUnicode_IsUppercase(code_point): + n_res = _lower_ucs4(code_point, data, length, idx, mapped) + elif _PyUnicode_IsLowercase(code_point): + n_res = _PyUnicode_ToUpperFull(code_point, mapped) + else: + n_res = 1 + mapped[0] = code_point + for m in mapped[:n_res]: + maxchar = maxchars[0] + maxchars[0] = max(maxchar, m) + _set_code_point(res, k, m) + k += 1 + return k + + length = len(data) + tmp = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * length, data._is_ascii) + # maxchar should be inside of a list to be pass as argument by reference + maxchars = [0] + newlength = _do_swapcase(data, length, tmp, maxchars) + maxchar = maxchars[0] + newkind = _codepoint_to_kind(maxchar) + res = _empty_string(newkind, newlength, _codepoint_is_ascii(maxchar)) + for i in range(newlength): + _set_code_point(res, i, _get_code_point(tmp, i)) + + return res + + return impl + + # https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L9946-L9965 # noqa: E501 @register_jitable def _do_upper_or_lower(data, length, res, maxchars, lower): From 7602c6351a760647ca4fd6e6196702ac150d3e8b Mon Sep 17 00:00:00 2001 From: Denis Date: Wed, 13 Nov 2019 09:17:08 +0300 Subject: [PATCH 05/68] Implement str.rsplit() based on CPython --- docs/source/reference/pysupported.rst | 1 + numba/tests/test_unicode.py | 113 ++++++++++++++++++++++++ numba/unicode.py | 118 +++++++++++++++++++++++++- numba/unicode_support.py | 6 ++ 4 files changed, 237 insertions(+), 1 deletion(-) diff --git a/docs/source/reference/pysupported.rst b/docs/source/reference/pysupported.rst index f19d3da7a64..f8000ed3fe6 100644 --- a/docs/source/reference/pysupported.rst +++ b/docs/source/reference/pysupported.rst @@ -176,6 +176,7 @@ The following functions, attributes and methods are currently supported: * ``.ljust()`` * ``.rjust()`` * ``.split()`` +* ``.rsplit()`` * ``.join()`` * ``.lstrip()`` * ``.rstrip()`` diff --git a/numba/tests/test_unicode.py b/numba/tests/test_unicode.py index c9f49623dbd..195858b7025 100644 --- a/numba/tests/test_unicode.py +++ b/numba/tests/test_unicode.py @@ -130,6 +130,22 @@ def split_whitespace_usecase(x): return x.split() +def rsplit_usecase(s, sep): + return s.rsplit(sep) + + +def rsplit_with_maxsplit_usecase(s, sep, maxsplit): + return s.rsplit(sep, maxsplit) + + +def rsplit_with_maxsplit_kwarg_usecase(s, sep, maxsplit): + return s.rsplit(sep, maxsplit=maxsplit) + + +def rsplit_whitespace_usecase(s): + return s.rsplit() + + def lstrip_usecase(x): return x.lstrip() @@ -864,6 +880,103 @@ def test_split_whitespace(self): cfunc(test_str), "'%s'.split()?" % (test_str,)) + def test_rsplit_exception_empty_sep(self): + self.disable_leak_check() + + pyfunc = rsplit_usecase + cfunc = njit(pyfunc) + + # Handle empty separator exception + for func in [pyfunc, cfunc]: + with self.assertRaises(ValueError) as raises: + func('a', '') + self.assertIn('empty separator', str(raises.exception)) + + def test_rsplit_exception_noninteger_maxsplit(self): + pyfunc = rsplit_with_maxsplit_usecase + cfunc = njit(pyfunc) + + accepted_types = (types.Integer, int) + for sep in [' ', None]: + with self.assertRaises(TypingError) as raises: + cfunc('a', sep, 2.4) + msg = '"maxsplit" must be {}, not float'.format(accepted_types) + self.assertIn(msg, str(raises.exception)) + + def test_rsplit(self): + pyfunc = rsplit_usecase + cfunc = njit(pyfunc) + + CASES = [ + (' a ', None), + ('', '⚡'), + ('abcabc', '⚡'), + ('🐍⚡', '⚡'), + ('🐍⚡🐍', '⚡'), + ('abababa', 'a'), + ('abababa', 'b'), + ('abababa', 'c'), + ('abababa', 'ab'), + ('abababa', 'aba'), + ] + msg = 'Results of "{}".rsplit("{}") must be equal' + for s, sep in CASES: + self.assertEqual(pyfunc(s, sep), cfunc(s, sep), + msg=msg.format(s, sep)) + + def test_rsplit_with_maxsplit(self): + pyfuncs = [rsplit_with_maxsplit_usecase, + rsplit_with_maxsplit_kwarg_usecase] + CASES = [ + (' a ', None, 1), + ('', '⚡', 1), + ('abcabc', '⚡', 1), + ('🐍⚡', '⚡', 1), + ('🐍⚡🐍', '⚡', 1), + ('abababa', 'a', 2), + ('abababa', 'b', 1), + ('abababa', 'c', 2), + ('abababa', 'ab', 1), + ('abababa', 'aba', 5), + ] + messages = [ + 'Results of "{}".rsplit("{}", {}) must be equal', + 'Results of "{}".rsplit("{}", maxsplit={}) must be equal' + ] + + for pyfunc, msg in zip(pyfuncs, messages): + cfunc = njit(pyfunc) + for test_str, sep, maxsplit in CASES: + self.assertEqual(pyfunc(test_str, sep, maxsplit), + cfunc(test_str, sep, maxsplit), + msg=msg.format(test_str, sep, maxsplit)) + + def test_rsplit_whitespace(self): + pyfunc = rsplit_whitespace_usecase + cfunc = njit(pyfunc) + + # list copied from + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodetype_db.h#L5996-L6031 # noqa: E501 + all_whitespace = ''.join(map(chr, [ + 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x001C, 0x001D, 0x001E, + 0x001F, 0x0020, 0x0085, 0x00A0, 0x1680, 0x2000, 0x2001, 0x2002, + 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, + 0x2028, 0x2029, 0x202F, 0x205F, 0x3000 + ])) + + CASES = [ + '', + 'abcabc', + '🐍 ⚡', + '🐍 ⚡ 🐍', + '🐍 ⚡ 🐍 ', + ' 🐍 ⚡ 🐍', + ' 🐍' + all_whitespace + '⚡ 🐍 ', + ] + msg = 'Results of "{}".rsplit() must be equal' + for s in CASES: + self.assertEqual(pyfunc(s), cfunc(s), msg.format(s)) + def test_join_empty(self): # Can't pass empty list to nopython mode, so we have to make a # separate test case diff --git a/numba/unicode.py b/numba/unicode.py index 673b437ec5d..1aa16b98df3 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1,4 +1,5 @@ import operator +import sys import numpy as np from llvmlite.ir import IntType, Constant @@ -33,7 +34,7 @@ from numba.errors import TypingError from .unicode_support import (_Py_TOUPPER, _Py_TOLOWER, _Py_UCS4, _PyUnicode_ToUpperFull, _PyUnicode_ToLowerFull, - _PyUnicode_ToTitleFull, + _PyUnicode_ToTitleFull, _PyUnicode_IsSpace, _PyUnicode_IsCased, _PyUnicode_IsCaseIgnorable, _PyUnicode_IsUppercase, _PyUnicode_IsLowercase, _PyUnicode_IsTitlecase, _Py_ISLOWER, _Py_ISUPPER) @@ -757,6 +758,121 @@ def split_whitespace_impl(a, sep=None, maxsplit=-1): return parts return split_whitespace_impl +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L13095-L13108 # noqa: E501 +@overload_method(types.UnicodeType, 'rsplit') +def unicode_rsplit(data, sep=None, maxsplit=-1): + """Implements str.unicode_rsplit()""" + + def _unicode_rsplit_check_type(ty, name, accepted): + """Check object belongs to one of specified types""" + thety = ty + # if the type is omitted, the concrete type is the value + if isinstance(ty, types.Omitted): + thety = ty.value + # if the type is optional, the concrete type is the captured type + elif isinstance(ty, types.Optional): + thety = ty.type + + if thety is not None and not isinstance(thety, accepted): + raise TypingError( + '"{}" must be {}, not {}'.format(name, accepted, ty)) + + _unicode_rsplit_check_type(sep, 'sep', (types.UnicodeType, + types.UnicodeCharSeq, + types.NoneType)) + _unicode_rsplit_check_type(maxsplit, 'maxsplit', (types.Integer, int)) + + if sep is None or isinstance(sep, (types.NoneType, types.Omitted)): + + def rsplit_whitespace_impl(data, sep=None, maxsplit=-1): + # https://github.com/python/cpython/blob/master/Objects/stringlib/split.h#L192-L235 # noqa: E501 + if maxsplit < 0: + maxsplit = sys.maxsize + + result = [] + i = len(data) - 1 + while maxsplit > 0: + while i >= 0: + code_point = _get_code_point(data, i) + if not _PyUnicode_IsSpace(code_point): + break + i -= 1 + if i < 0: + break + j = i + i -= 1 + while i >= 0: + code_point = _get_code_point(data, i) + if _PyUnicode_IsSpace(code_point): + break + i -= 1 + result.append(data[i+1:j+1]) + maxsplit -= 1 + + if i >= 0: + # Only occurs when maxsplit was reached + # Skip any remaining whitespace and copy to beginning of string + while i >= 0: + code_point = _get_code_point(data, i) + if not _PyUnicode_IsSpace(code_point): + break + i -= 1 + if i >= 0: + result.append(data[0:i+1]) + + return result[::-1] + + return rsplit_whitespace_impl + + def rsplit_impl(data, sep=None, maxsplit=-1): + # https://github.com/python/cpython/blob/master/Objects/stringlib/split.h#L286-L333 # noqa: E501 + if data._kind < sep._kind or len(data) < len(sep): + return [data] + + def _rsplit_char(data, ch, maxsplit): + # https://github.com/python/cpython/blob/master/Objects/stringlib/split.h#L242-L284 # noqa: E501 + result = [] + i = j = len(data) - 1 + while i >= 0 and maxsplit > 0: + while i >= 0: + data_code_point = _get_code_point(data, i) + ch_code_point = _get_code_point(ch, 0) + if data_code_point == ch_code_point: + result.append(data[i+1:j+1]) + j = i = i - 1 + break + i -= 1 + maxsplit -= 1 + if j >= -1: + result.append(data[0:j+1]) + + return result[::-1] + + if maxsplit < 0: + maxsplit = sys.maxsize + sep_length = len(sep) + + if sep_length == 0: + raise ValueError('empty separator') + if sep_length == 1: + return _rsplit_char(data, sep, maxsplit) + + result = [] + j = len(data) + while maxsplit > 0: + pos = data.rfind(sep, start=0, end=j) + if pos < 0: + break + result.append(data[pos+sep_length:j]) + j = pos + maxsplit -= 1 + + result.append(data[0:j]) + + return result[::-1] + + return rsplit_impl + @overload_method(types.UnicodeType, 'center') def unicode_center(string, width, fillchar=' '): diff --git a/numba/unicode_support.py b/numba/unicode_support.py index 021f36a8ec8..38f257f0ed0 100644 --- a/numba/unicode_support.py +++ b/numba/unicode_support.py @@ -311,6 +311,12 @@ def _PyUnicode_IsCaseIgnorable(ch): return ctype.flags & _PyUnicode_TyperecordMasks.CASE_IGNORABLE_MASK != 0 +@register_jitable +def _PyUnicode_IsSpace(ch): + ctype = _PyUnicode_gettyperecord(ch) + return ctype.flags & _PyUnicode_TyperecordMasks.SPACE_MASK != 0 + + @register_jitable def _PyUnicode_IsAlpha(ch): raise NotImplementedError From 56a930cd5bc7857af4f3b9fe854970116a8314d7 Mon Sep 17 00:00:00 2001 From: Denis Date: Wed, 13 Nov 2019 09:47:42 +0300 Subject: [PATCH 06/68] Fix linter issues for str.rsplit() --- numba/unicode.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index 1aa16b98df3..25455e9a9f8 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -806,7 +806,7 @@ def rsplit_whitespace_impl(data, sep=None, maxsplit=-1): if _PyUnicode_IsSpace(code_point): break i -= 1 - result.append(data[i+1:j+1]) + result.append(data[i + 1:j + 1]) maxsplit -= 1 if i >= 0: @@ -818,7 +818,7 @@ def rsplit_whitespace_impl(data, sep=None, maxsplit=-1): break i -= 1 if i >= 0: - result.append(data[0:i+1]) + result.append(data[0:i + 1]) return result[::-1] @@ -838,13 +838,13 @@ def _rsplit_char(data, ch, maxsplit): data_code_point = _get_code_point(data, i) ch_code_point = _get_code_point(ch, 0) if data_code_point == ch_code_point: - result.append(data[i+1:j+1]) + result.append(data[i + 1:j + 1]) j = i = i - 1 break i -= 1 maxsplit -= 1 if j >= -1: - result.append(data[0:j+1]) + result.append(data[0:j + 1]) return result[::-1] @@ -863,7 +863,7 @@ def _rsplit_char(data, ch, maxsplit): pos = data.rfind(sep, start=0, end=j) if pos < 0: break - result.append(data[pos+sep_length:j]) + result.append(data[pos + sep_length:j]) j = pos maxsplit -= 1 From 11f99f63fd5cd21fd4f9ad0a39d40c03f9fc1698 Mon Sep 17 00:00:00 2001 From: Denis Date: Wed, 13 Nov 2019 14:52:36 +0300 Subject: [PATCH 07/68] Add capitalization of ascii for str.capitalize() --- numba/unicode.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/numba/unicode.py b/numba/unicode.py index e83ef9988e6..c684008fd2d 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1298,6 +1298,18 @@ def _do_capitalize(data, length, res, maxchars): if length == 0: return _empty_string(data._kind, length, data._is_ascii) + if data._is_ascii: + # This is an approximate translation of: + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/pyctype.h#L29-L30 # noqa: E501 + res = _empty_string(data._kind, length, data._is_ascii) + code_point = _get_code_point(data, 0) + _set_code_point(res, 0, _Py_TOUPPER(code_point)) + for idx in range(1, length): + code_point = _get_code_point(data, idx) + _set_code_point(res, idx, _Py_TOLOWER(code_point)) + + return res + tmp = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * length, data._is_ascii) # maxchar should be inside of a list to be pass as argument by reference maxchars = [0] From 22a0ee2b09db34d6d79221cd0e8a98de12aa36dd Mon Sep 17 00:00:00 2001 From: Denis Date: Wed, 13 Nov 2019 14:54:59 +0300 Subject: [PATCH 08/68] Remove incorrect code comment for str.capitalize() --- numba/unicode.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index c684008fd2d..a4c4d1f08a9 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1299,8 +1299,6 @@ def _do_capitalize(data, length, res, maxchars): return _empty_string(data._kind, length, data._is_ascii) if data._is_ascii: - # This is an approximate translation of: - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/pyctype.h#L29-L30 # noqa: E501 res = _empty_string(data._kind, length, data._is_ascii) code_point = _get_code_point(data, 0) _set_code_point(res, 0, _Py_TOUPPER(code_point)) From 40069b5d97a98b8c94431dfac54dd8b5d6b51d6e Mon Sep 17 00:00:00 2001 From: Denis Date: Wed, 13 Nov 2019 15:52:22 +0300 Subject: [PATCH 09/68] Improve str.casefold() Added case folding of ascii, merged _do_casefold to the main code and added processing of empty input string. --- numba/unicode.py | 50 +++++++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index 672fdfd0824..4298f9e481b 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1361,36 +1361,42 @@ def impl(a): return impl -# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9819-L9834 # noqa: E501 +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L10782-L10791 # noqa: E501 @overload_method(types.UnicodeType, 'casefold') def unicode_casefold(data): """Implements str.casefold()""" def impl(data): - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L10782-L10791 # noqa: E501 - def _do_casefold(data, length, res, maxchars): - """Translation of the function to case fold a unicode string.""" - k = 0 - mapped = np.zeros(3, dtype=_Py_UCS4) + length = len(data) + if length == 0: + return _empty_string(data._kind, length, data._is_ascii) + + if data._is_ascii: + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9678-L9694 # noqa: E501 + res = _empty_string(data._kind, length, 1) for idx in range(length): - mapped.fill(0) code_point = _get_code_point(data, idx) - n_res = _PyUnicode_ToFoldedFull(code_point, mapped) - for m in mapped[:n_res]: - maxchar = maxchars[0] - maxchars[0] = max(maxchar, m) - _set_code_point(res, k, m) - k += 1 - return k + _set_code_point(res, idx, _Py_TOLOWER(code_point)) - length = len(data) - tmp = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * length, data._is_ascii) - # maxchar should be inside of a list to be pass as argument by reference - maxchars = [0] - newlength = _do_casefold(data, length, tmp, maxchars) - maxchar = maxchars[0] + return res + + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9863-L9908 # noqa: E501 + # mixed with: + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9819-L9834 # noqa: E501 + k = 0 + maxchar = 0 + mapped = np.zeros(3, dtype=_Py_UCS4) + tmp = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * length) + for idx in range(length): + mapped.fill(0) + code_point = _get_code_point(data, idx) + n_res = _PyUnicode_ToFoldedFull(code_point, mapped) + for m in mapped[:n_res]: + maxchar = max(maxchar, m) + _set_code_point(tmp, k, m) + k += 1 newkind = _codepoint_to_kind(maxchar) - res = _empty_string(newkind, newlength, _codepoint_is_ascii(maxchar)) - for i in range(newlength): + res = _empty_string(newkind, k) + for i in range(k): _set_code_point(res, i, _get_code_point(tmp, i)) return res From a2b178091de1e80fb3a6fc303c1a915185a5ce0c Mon Sep 17 00:00:00 2001 From: Denis Date: Wed, 13 Nov 2019 16:08:32 +0300 Subject: [PATCH 10/68] Merge _do_capitalize() to the main code --- numba/unicode.py | 54 +++++++++++++++++++++--------------------------- 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index a4c4d1f08a9..88a966e975d 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1272,34 +1272,12 @@ def impl(a): def unicode_capitalize(data): """Implements str.capitalize()""" def impl(data): - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9737-L9759 # noqa: E501 - def _do_capitalize(data, length, res, maxchars): - """Translation of the function to capitalize a unicode string.""" - k = 0 - mapped = np.zeros(3, dtype=_Py_UCS4) - code_point = _get_code_point(data, 0) - n_res = _PyUnicode_ToUpperFull(code_point, mapped) - for m in mapped[:n_res]: - maxchar = maxchars[0] - maxchars[0] = max(maxchar, m) - _set_code_point(res, k, m) - k += 1 - for idx in range(1, length): - code_point = _get_code_point(data, idx) - n_res = _lower_ucs4(code_point, data, length, idx, mapped) - for m in mapped[:n_res]: - maxchar = maxchars[0] - maxchars[0] = max(maxchar, m) - _set_code_point(res, k, m) - k += 1 - return k - length = len(data) if length == 0: return _empty_string(data._kind, length, data._is_ascii) if data._is_ascii: - res = _empty_string(data._kind, length, data._is_ascii) + res = _empty_string(data._kind, length, 1) code_point = _get_code_point(data, 0) _set_code_point(res, 0, _Py_TOUPPER(code_point)) for idx in range(1, length): @@ -1308,14 +1286,30 @@ def _do_capitalize(data, length, res, maxchars): return res - tmp = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * length, data._is_ascii) - # maxchar should be inside of a list to be pass as argument by reference - maxchars = [0] - newlength = _do_capitalize(data, length, tmp, maxchars) - maxchar = maxchars[0] + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9863-L9908 # noqa: E501 + # mixed with: + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9737-L9759 # noqa: E501 + k = 0 + maxchar = 0 + mapped = np.zeros(3, dtype=_Py_UCS4) + tmp = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * length) + code_point = _get_code_point(data, 0) + n_res = _PyUnicode_ToUpperFull(code_point, mapped) + for m in mapped[:n_res]: + maxchar = max(maxchar, m) + _set_code_point(tmp, k, m) + k += 1 + for idx in range(1, length): + mapped.fill(0) + code_point = _get_code_point(data, idx) + n_res = _lower_ucs4(code_point, data, length, idx, mapped) + for m in mapped[:n_res]: + maxchar = max(maxchar, m) + _set_code_point(tmp, k, m) + k += 1 newkind = _codepoint_to_kind(maxchar) - res = _empty_string(newkind, newlength, _codepoint_is_ascii(maxchar)) - for i in range(newlength): + res = _empty_string(newkind, k) + for i in range(k): _set_code_point(res, i, _get_code_point(tmp, i)) return res From d5191432d52586da4b3383b2abee0a58b1b6672a Mon Sep 17 00:00:00 2001 From: Denis Date: Wed, 13 Nov 2019 16:21:15 +0300 Subject: [PATCH 11/68] Improve str.swapcase() Added case swaping of ascii, merged _do_swapcase to the main code and added processing of empty input string. --- numba/unicode.py | 63 +++++++++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 27 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index 9089b3f232b..278fd39eb55 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1498,37 +1498,46 @@ def impl(data): def unicode_swapcase(data): """Implements str.swapcase()""" def impl(data): - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9761-L9784 # noqa: E501 - def _do_swapcase(data, length, res, maxchars): - """Translation of the function to swap cases of a unicode string.""" - k = 0 - mapped = np.zeros(3, dtype=_Py_UCS4) + length = len(data) + if length == 0: + return _empty_string(data._kind, length, data._is_ascii) + + if data._is_ascii: + res = _empty_string(data._kind, length, 1) for idx in range(length): - mapped.fill(0) code_point = _get_code_point(data, idx) - if _PyUnicode_IsUppercase(code_point): - n_res = _lower_ucs4(code_point, data, length, idx, mapped) - elif _PyUnicode_IsLowercase(code_point): - n_res = _PyUnicode_ToUpperFull(code_point, mapped) - else: - n_res = 1 - mapped[0] = code_point - for m in mapped[:n_res]: - maxchar = maxchars[0] - maxchars[0] = max(maxchar, m) - _set_code_point(res, k, m) - k += 1 - return k + if _Py_ISUPPER(code_point): + code_point = _Py_TOLOWER(code_point) + elif _Py_ISLOWER(code_point): + code_point = _Py_TOUPPER(code_point) + _set_code_point(res, idx, code_point) - length = len(data) - tmp = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * length, data._is_ascii) - # maxchar should be inside of a list to be pass as argument by reference - maxchars = [0] - newlength = _do_swapcase(data, length, tmp, maxchars) - maxchar = maxchars[0] + return res + + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9863-L9908 # noqa: E501 + # mixed with: + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9761-L9784 # noqa: E501 + k = 0 + maxchar = 0 + mapped = np.zeros(3, dtype=_Py_UCS4) + tmp = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * length) + for idx in range(length): + mapped.fill(0) + code_point = _get_code_point(data, idx) + if _PyUnicode_IsUppercase(code_point): + n_res = _lower_ucs4(code_point, data, length, idx, mapped) + elif _PyUnicode_IsLowercase(code_point): + n_res = _PyUnicode_ToUpperFull(code_point, mapped) + else: + n_res = 1 + mapped[0] = code_point + for m in mapped[:n_res]: + maxchar = max(maxchar, m) + _set_code_point(tmp, k, m) + k += 1 newkind = _codepoint_to_kind(maxchar) - res = _empty_string(newkind, newlength, _codepoint_is_ascii(maxchar)) - for i in range(newlength): + res = _empty_string(newkind, k) + for i in range(k): _set_code_point(res, i, _get_code_point(tmp, i)) return res From 7cd87c82cf6c01e7bd931b263de853bb42146308 Mon Sep 17 00:00:00 2001 From: Denis Date: Wed, 13 Nov 2019 17:53:47 +0300 Subject: [PATCH 12/68] Move common code from str.casefold() --- numba/unicode.py | 60 +++++++++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 21 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index 4298f9e481b..0e98598666a 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1361,6 +1361,44 @@ def impl(a): return impl +@register_jitable +def _do_casefold(data, length, res, maxchars): + """Translation of the function to case fold a unicode string.""" + k = 0 + mapped = np.zeros(3, dtype=_Py_UCS4) + for idx in range(length): + mapped.fill(0) + code_point = _get_code_point(data, idx) + n_res = _PyUnicode_ToFoldedFull(code_point, mapped) + for m in mapped[:n_res]: + maxchar = maxchars[0] + maxchars[0] = max(maxchar, m) + _set_code_point(res, k, m) + k += 1 + return k + + +def _case_operation(func): + def impl(data): + length = len(data) + tmp = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * length, data._is_ascii) + # maxchar should be inside of a list to be pass as argument by reference + maxchars = [0] + newlength = func(data, length, tmp, maxchars) + maxchar = maxchars[0] + newkind = _codepoint_to_kind(maxchar) + res = _empty_string(newkind, newlength, _codepoint_is_ascii(maxchar)) + for i in range(newlength): + _set_code_point(res, i, _get_code_point(tmp, i)) + + return res + + return impl + + +_apply_do_casefold = register_jitable(_case_operation(_do_casefold)) + + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L10782-L10791 # noqa: E501 @overload_method(types.UnicodeType, 'casefold') def unicode_casefold(data): @@ -1379,27 +1417,7 @@ def impl(data): return res - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9863-L9908 # noqa: E501 - # mixed with: - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9819-L9834 # noqa: E501 - k = 0 - maxchar = 0 - mapped = np.zeros(3, dtype=_Py_UCS4) - tmp = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * length) - for idx in range(length): - mapped.fill(0) - code_point = _get_code_point(data, idx) - n_res = _PyUnicode_ToFoldedFull(code_point, mapped) - for m in mapped[:n_res]: - maxchar = max(maxchar, m) - _set_code_point(tmp, k, m) - k += 1 - newkind = _codepoint_to_kind(maxchar) - res = _empty_string(newkind, k) - for i in range(k): - _set_code_point(res, i, _get_code_point(tmp, i)) - - return res + return _apply_do_casefold(data) return impl From e33fc47b5143bc15082e0b5284632f662edacf9b Mon Sep 17 00:00:00 2001 From: Denis Date: Wed, 13 Nov 2019 21:43:16 +0300 Subject: [PATCH 13/68] Add generating case operation performer --- numba/unicode.py | 88 ++++++++++++++++++++++++++++++------------------ 1 file changed, 56 insertions(+), 32 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index 0e98598666a..beed5cf455a 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1361,30 +1361,54 @@ def impl(a): return impl -@register_jitable -def _do_casefold(data, length, res, maxchars): - """Translation of the function to case fold a unicode string.""" - k = 0 - mapped = np.zeros(3, dtype=_Py_UCS4) - for idx in range(length): - mapped.fill(0) - code_point = _get_code_point(data, idx) - n_res = _PyUnicode_ToFoldedFull(code_point, mapped) - for m in mapped[:n_res]: - maxchar = maxchars[0] - maxchars[0] = max(maxchar, m) - _set_code_point(res, k, m) - k += 1 - return k +def generate_unicode_operation_doer(operation_func): + """Generate unicode case operation performer.""" + def impl(data, length, res, maxchars): + k = 0 + mapped = np.zeros(3, dtype=_Py_UCS4) + for idx in range(length): + mapped.fill(0) + code_point = _get_code_point(data, idx) + n_res = operation_func(code_point, mapped) + for m in mapped[:n_res]: + maxchar = maxchars[0] + maxchars[0] = max(maxchar, m) + _set_code_point(res, k, m) + k += 1 + + return k + + return impl -def _case_operation(func): +def generate_ascii_operation_doer(operation_func): + """Generate ascii case operation performer.""" + def impl(data, res): + for idx in range(len(data)): + code_point = _get_code_point(data, idx) + _set_code_point(res, idx, operation_func(code_point)) + + return impl + + +def generate_common_operation_doer(ascii_func, unicode_nres_func): + """Generate common case operation performer.""" def impl(data): length = len(data) + if length == 0: + return _empty_string(data._kind, length, data._is_ascii) + + if data._is_ascii: + res = _empty_string(data._kind, length, 1) + ascii_func(data, res) + + return res + + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9863-L9908 # noqa: E501 tmp = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * length, data._is_ascii) # maxchar should be inside of a list to be pass as argument by reference maxchars = [0] - newlength = func(data, length, tmp, maxchars) + newlength = unicode_nres_func(data, length, tmp, maxchars) maxchar = maxchars[0] newkind = _codepoint_to_kind(maxchar) res = _empty_string(newkind, newlength, _codepoint_is_ascii(maxchar)) @@ -1396,7 +1420,20 @@ def impl(data): return impl -_apply_do_casefold = register_jitable(_case_operation(_do_casefold)) +def generate_case_operation_func(ascii_func, unicode_nres_func): + """Generate function to perform case operation + on a string either ascii or unicode. + """ + ascii_operation_doer = register_jitable(generate_ascii_operation_doer( + ascii_func)) + unicode_operation_doer = register_jitable(generate_unicode_operation_doer( + unicode_nres_func)) + return generate_common_operation_doer(ascii_operation_doer, + unicode_operation_doer) + + +_do_casefold = register_jitable(generate_case_operation_func( + _Py_TOLOWER, _PyUnicode_ToFoldedFull)) # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L10782-L10791 # noqa: E501 @@ -1404,20 +1441,7 @@ def impl(data): def unicode_casefold(data): """Implements str.casefold()""" def impl(data): - length = len(data) - if length == 0: - return _empty_string(data._kind, length, data._is_ascii) - - if data._is_ascii: - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9678-L9694 # noqa: E501 - res = _empty_string(data._kind, length, 1) - for idx in range(length): - code_point = _get_code_point(data, idx) - _set_code_point(res, idx, _Py_TOLOWER(code_point)) - - return res - - return _apply_do_casefold(data) + return _do_casefold(data) return impl From 0246ad687e689757f197602e4c63ab0587546ab5 Mon Sep 17 00:00:00 2001 From: Denis Date: Thu, 14 Nov 2019 11:09:53 +0300 Subject: [PATCH 14/68] Add SHA references, fix nested loops for rsplit --- numba/unicode.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index 25455e9a9f8..59e379baddc 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -785,7 +785,7 @@ def _unicode_rsplit_check_type(ty, name, accepted): if sep is None or isinstance(sep, (types.NoneType, types.Omitted)): def rsplit_whitespace_impl(data, sep=None, maxsplit=-1): - # https://github.com/python/cpython/blob/master/Objects/stringlib/split.h#L192-L235 # noqa: E501 + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/stringlib/split.h#L192-L240 # noqa: E501 if maxsplit < 0: maxsplit = sys.maxsize @@ -825,24 +825,22 @@ def rsplit_whitespace_impl(data, sep=None, maxsplit=-1): return rsplit_whitespace_impl def rsplit_impl(data, sep=None, maxsplit=-1): - # https://github.com/python/cpython/blob/master/Objects/stringlib/split.h#L286-L333 # noqa: E501 + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/stringlib/split.h#L286-L333 # noqa: E501 if data._kind < sep._kind or len(data) < len(sep): return [data] def _rsplit_char(data, ch, maxsplit): - # https://github.com/python/cpython/blob/master/Objects/stringlib/split.h#L242-L284 # noqa: E501 + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/stringlib/split.h#L242-L284 # noqa: E501 result = [] i = j = len(data) - 1 while i >= 0 and maxsplit > 0: - while i >= 0: - data_code_point = _get_code_point(data, i) - ch_code_point = _get_code_point(ch, 0) - if data_code_point == ch_code_point: - result.append(data[i + 1:j + 1]) - j = i = i - 1 - break - i -= 1 - maxsplit -= 1 + data_code_point = _get_code_point(data, i) + ch_code_point = _get_code_point(ch, 0) + if data_code_point == ch_code_point: + result.append(data[i + 1:j + 1]) + j = i = i - 1 + maxsplit -= 1 + i -= 1 if j >= -1: result.append(data[0:j + 1]) From b38a4b2f00e9440cca6aa74a045a8f4630924ae4 Mon Sep 17 00:00:00 2001 From: Denis Date: Thu, 14 Nov 2019 13:19:24 +0300 Subject: [PATCH 15/68] Implement str.partition() based on CPython --- docs/source/reference/pysupported.rst | 1 + numba/tests/test_unicode.py | 42 +++++++++++++++++++++++++++ numba/unicode.py | 36 +++++++++++++++++++++++ 3 files changed, 79 insertions(+) diff --git a/docs/source/reference/pysupported.rst b/docs/source/reference/pysupported.rst index f19d3da7a64..106ca074543 100644 --- a/docs/source/reference/pysupported.rst +++ b/docs/source/reference/pysupported.rst @@ -184,6 +184,7 @@ The following functions, attributes and methods are currently supported: * ``.upper()`` * ``.islower()`` * ``.lower()`` +* ``.partition()`` * ``.zfill()`` * ``.count()`` * ``.istitle()`` diff --git a/numba/tests/test_unicode.py b/numba/tests/test_unicode.py index c9f49623dbd..75e8549026e 100644 --- a/numba/tests/test_unicode.py +++ b/numba/tests/test_unicode.py @@ -78,6 +78,10 @@ def ge_usecase(x, y): return x >= y +def partition_usecase(s, sep): + return s.partition(sep) + + def find_usecase(x, y): return x.find(y) @@ -404,6 +408,44 @@ def test_in(self, flags=no_pyobj_flags): cfunc(substr, a), "'%s' in '%s'?" % (substr, a)) + def test_partition_exception_invalid_sep(self): + self.disable_leak_check() + + pyfunc = partition_usecase + cfunc = njit(pyfunc) + + # Handle empty separator exception + for func in [pyfunc, cfunc]: + with self.assertRaises(ValueError) as raises: + func('a', '') + self.assertIn('empty separator', str(raises.exception)) + + accepted_types = (types.UnicodeType, types.UnicodeCharSeq) + with self.assertRaises(TypingError) as raises: + cfunc('a', None) + msg = '"sep" must be {}, not none'.format(accepted_types) + self.assertIn(msg, str(raises.exception)) + + def test_partition(self): + pyfunc = partition_usecase + cfunc = njit(pyfunc) + + CASES = [ + ('', '⚡'), + ('abcabc', '⚡'), + ('🐍⚡', '⚡'), + ('🐍⚡🐍', '⚡'), + ('abababa', 'a'), + ('abababa', 'b'), + ('abababa', 'c'), + ('abababa', 'ab'), + ('abababa', 'aba'), + ] + msg = 'Results of "{}".partition("{}") must be equal' + for s, sep in CASES: + self.assertEqual(pyfunc(s, sep), cfunc(s, sep), + msg=msg.format(s, sep)) + def test_find(self, flags=no_pyobj_flags): pyfunc = find_usecase cfunc = njit(pyfunc) diff --git a/numba/unicode.py b/numba/unicode.py index 673b437ec5d..5494e95c336 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -610,6 +610,42 @@ def _adjust_indices(length, start, end): return rfind_impl +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12922-L12976 # noqa: E501 +@overload_method(types.UnicodeType, 'partition') +def unicode_partition(data, sep): + """Implements str.partition()""" + thety = sep + # if the type is omitted, the concrete type is the value + if isinstance(sep, types.Omitted): + thety = sep.value + # if the type is optional, the concrete type is the captured type + elif isinstance(sep, types.Optional): + thety = sep.type + + accepted = (types.UnicodeType, types.UnicodeCharSeq) + if thety is not None and not isinstance(thety, accepted): + msg = '"{}" must be {}, not {}'.format('sep', accepted, sep) + raise TypingError(msg) + + def impl(data, sep): + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/stringlib/partition.h#L7-L60 # noqa: E501 + empty_str = _empty_string(data._kind, 0, data._is_ascii) + sep_length = len(sep) + if data._kind < sep._kind or len(data) < sep_length: + return data, empty_str, empty_str + + if sep_length == 0: + raise ValueError('empty separator') + + pos = data.find(sep) + if pos < 0: + return data, empty_str, empty_str + + return data[0:pos], sep, data[pos + sep_length:len(data)] + + return impl + + @overload_method(types.UnicodeType, 'count') def unicode_count(src, sub, start=None, end=None): From 8aae59e48fefffb5fd6a163abbcb7b4639443829 Mon Sep 17 00:00:00 2001 From: Denis Date: Fri, 15 Nov 2019 09:54:12 +0300 Subject: [PATCH 16/68] Implement str.splitlines() based on CPython --- docs/source/reference/pysupported.rst | 1 + numba/tests/test_unicode.py | 53 +++++++++++++++++++++ numba/unicode.py | 68 +++++++++++++++++++++++++++ numba/unicode_support.py | 59 +++++++++++++++++++++++ 4 files changed, 181 insertions(+) diff --git a/docs/source/reference/pysupported.rst b/docs/source/reference/pysupported.rst index f19d3da7a64..f74cb785839 100644 --- a/docs/source/reference/pysupported.rst +++ b/docs/source/reference/pysupported.rst @@ -176,6 +176,7 @@ The following functions, attributes and methods are currently supported: * ``.ljust()`` * ``.rjust()`` * ``.split()`` +* ``.splitlines()`` * ``.join()`` * ``.lstrip()`` * ``.rstrip()`` diff --git a/numba/tests/test_unicode.py b/numba/tests/test_unicode.py index c9f49623dbd..dbafc46e2d8 100644 --- a/numba/tests/test_unicode.py +++ b/numba/tests/test_unicode.py @@ -130,6 +130,18 @@ def split_whitespace_usecase(x): return x.split() +def splitlines_usecase(s): + return s.splitlines() + + +def splitlines_with_keepends_usecase(s, keepends): + return s.splitlines(keepends) + + +def splitlines_with_keepends_kwarg_usecase(s, keepends): + return s.splitlines(keepends=keepends) + + def lstrip_usecase(x): return x.lstrip() @@ -864,6 +876,47 @@ def test_split_whitespace(self): cfunc(test_str), "'%s'.split()?" % (test_str,)) + def test_split_exception_invalid_keepends(self): + pyfunc = splitlines_with_keepends_usecase + cfunc = njit(pyfunc) + + accepted_types = (types.Integer, int, types.Boolean, bool) + for ty, keepends in (('none', None), ('unicode_type', 'None')): + with self.assertRaises(TypingError) as raises: + cfunc('\n', keepends) + msg = '"keepends" must be {}, not {}'.format(accepted_types, ty) + self.assertIn(msg, str(raises.exception)) + + def test_splitlines(self): + pyfunc = splitlines_usecase + cfunc = njit(pyfunc) + + cases = ['', '\n', 'abc\r\rabc\r\n', '🐍⚡\v', '\f🐍⚡\f\v\v🐍\x85', + '\u2028aba\u2029baba', '\n\r\na\v\fb\x0b\x0cc\x1c\x1d\x1e'] + + msg = 'Results of "{}".splitlines() must be equal' + for s in cases: + self.assertEqual(pyfunc(s), cfunc(s), msg=msg.format(s)) + + def test_splitlines_with_keepends(self): + pyfuncs = [ + splitlines_with_keepends_usecase, + splitlines_with_keepends_kwarg_usecase + ] + messages = [ + 'Results of "{}".splitlines({}) must be equal', + 'Results of "{}".splitlines(keepends={}) must be equal' + ] + cases = ['', '\n', 'abc\r\rabc\r\n', '🐍⚡\v', '\f🐍⚡\f\v\v🐍\x85', + '\u2028aba\u2029baba', '\n\r\na\v\fb\x0b\x0cc\x1c\x1d\x1e'] + all_keepends = [True, False, 0, 1, -1, 100] + + for pyfunc, msg in zip(pyfuncs, messages): + cfunc = njit(pyfunc) + for s, keepends in product(cases, all_keepends): + self.assertEqual(pyfunc(s, keepends), cfunc(s, keepends), + msg=msg.format(s, keepends)) + def test_join_empty(self): # Can't pass empty list to nopython mode, so we have to make a # separate test case diff --git a/numba/unicode.py b/numba/unicode.py index 673b437ec5d..640b1222d73 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -36,6 +36,8 @@ _PyUnicode_ToTitleFull, _PyUnicode_IsCased, _PyUnicode_IsCaseIgnorable, _PyUnicode_IsUppercase, _PyUnicode_IsLowercase, + _PyUnicode_IsLineBreak, _Py_ISLINEBREAK, + _Py_ISLINEFEED, _Py_ISCARRIAGERETURN, _PyUnicode_IsTitlecase, _Py_ISLOWER, _Py_ISUPPER) # DATA MODEL @@ -858,6 +860,72 @@ def rjust_impl(string, width, fillchar=' '): return rjust_impl +def generate_splitlines_func(is_line_break_func): + """Generate splitlines performer based on ascii or unicode line breaks.""" + def impl(data, keepends): + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/stringlib/split.h#L335-L389 # noqa: E501 + length = len(data) + result = [] + i = j = 0 + while i < length: + # find a line and append it + while i < length: + code_point = _get_code_point(data, i) + if is_line_break_func(code_point): + break + i += 1 + + # skip the line break reading CRLF as one line break + eol = i + if i < length: + if i + 1 < length: + cur_cp = _get_code_point(data, i) + next_cp = _get_code_point(data, i + 1) + if _Py_ISCARRIAGERETURN(cur_cp) and _Py_ISLINEFEED(next_cp): + i += 1 + i += 1 + if keepends: + eol = i + + result.append(data[j:eol]) + j = i + + return result + + return impl + + +_ascii_splitlines = register_jitable(generate_splitlines_func(_Py_ISLINEBREAK)) +_unicode_splitlines = register_jitable(generate_splitlines_func( + _PyUnicode_IsLineBreak)) + + +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L10196-L10229 # noqa: E501 +@overload_method(types.UnicodeType, 'splitlines') +def unicode_splitlines(data, keepends=False): + """Implements str.splitlines()""" + thety = keepends + # if the type is omitted, the concrete type is the value + if isinstance(keepends, types.Omitted): + thety = keepends.value + # if the type is optional, the concrete type is the captured type + elif isinstance(keepends, types.Optional): + thety = keepends.type + + accepted = (types.Integer, int, types.Boolean, bool) + if thety is not None and not isinstance(thety, accepted): + raise TypingError( + '"{}" must be {}, not {}'.format('keepends', accepted, keepends)) + + def splitlines_impl(data, keepends=False): + if data._is_ascii: + return _ascii_splitlines(data, keepends) + + return _unicode_splitlines(data, keepends) + + return splitlines_impl + + @register_jitable def join_list(sep, parts): parts_len = len(parts) diff --git a/numba/unicode_support.py b/numba/unicode_support.py index 021f36a8ec8..84461d919b9 100644 --- a/numba/unicode_support.py +++ b/numba/unicode_support.py @@ -239,6 +239,12 @@ def _PyUnicode_IsUppercase(ch): return ctype.flags & _PyUnicode_TyperecordMasks.UPPER_MASK != 0 +@register_jitable +def _PyUnicode_IsLineBreak(ch): + ctype = _PyUnicode_gettyperecord(ch) + return ctype.flags & _PyUnicode_TyperecordMasks.LINEBREAK_MASK != 0 + + @register_jitable def _PyUnicode_ToUppercase(ch): raise NotImplementedError @@ -552,6 +558,40 @@ class _PY_CTF(IntEnum): ], dtype=np.uint8) +class _PY_CTF_LB(IntEnum): + LINE_BREAK = 0x01 + LINE_FEED = 0x02 + CARRIAGE_RETURN = 0x04 + + +_Py_ctype_islinebreak = np.array([ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + _PY_CTF_LB.LINE_BREAK | _PY_CTF_LB.LINE_FEED, # 0xa '\n' + _PY_CTF_LB.LINE_BREAK, # 0xb '\v' + _PY_CTF_LB.LINE_BREAK, # 0xc '\f' + _PY_CTF_LB.LINE_BREAK | _PY_CTF_LB.CARRIAGE_RETURN, # 0xd '\r' + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + _PY_CTF_LB.LINE_BREAK, # 0x1c '\x1c' + _PY_CTF_LB.LINE_BREAK, # 0x1d '\x1d' + _PY_CTF_LB.LINE_BREAK, # 0x1e '\x1e' + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + _PY_CTF_LB.LINE_BREAK, # 0x85 '\x85' + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, +], dtype=np.intc) + + # Translation of: # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/pymacro.h#L25 # noqa: E501 @register_jitable @@ -654,5 +694,24 @@ def _Py_ISSPACE(ch): """ return _Py_ctype_table[_Py_CHARMASK(ch)] & _PY_CTF.SPACE + +@register_jitable +def _Py_ISLINEBREAK(ch): + """Check if character is ASCII line break""" + return _Py_ctype_islinebreak[_Py_CHARMASK(ch)] & _PY_CTF_LB.LINE_BREAK + + +@register_jitable +def _Py_ISLINEFEED(ch): + """Check if character is line feed `\n`""" + return _Py_ctype_islinebreak[_Py_CHARMASK(ch)] & _PY_CTF_LB.LINE_FEED + + +@register_jitable +def _Py_ISCARRIAGERETURN(ch): + """Check if character is carriage return `\r`""" + return _Py_ctype_islinebreak[_Py_CHARMASK(ch)] & _PY_CTF_LB.CARRIAGE_RETURN + + # End code related to/from CPython's pyctype # ------------------------------------------------------------------------------ From 308ea68b2f5e44f439a008f4e51a592ea8b3a39a Mon Sep 17 00:00:00 2001 From: Denis Date: Fri, 15 Nov 2019 13:30:46 +0300 Subject: [PATCH 17/68] Add SHA for str.casefold() --- numba/unicode.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index beed5cf455a..fd14ee747ea 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1437,13 +1437,12 @@ def generate_case_operation_func(ascii_func, unicode_nres_func): # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L10782-L10791 # noqa: E501 +# mixed with +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9819-L9834 # noqa: E501 @overload_method(types.UnicodeType, 'casefold') def unicode_casefold(data): """Implements str.casefold()""" - def impl(data): - return _do_casefold(data) - - return impl + return _do_casefold @overload_method(types.UnicodeType, 'istitle') From a2cd602dab18150f226f6c7ce9ad3bee5dbfdf03 Mon Sep 17 00:00:00 2001 From: Denis Date: Fri, 15 Nov 2019 15:01:44 +0300 Subject: [PATCH 18/68] Fix generating of operation methods --- numba/unicode.py | 68 ++++++++++++++++++------------------------------ 1 file changed, 25 insertions(+), 43 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index fd14ee747ea..734108eba7e 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1361,37 +1361,7 @@ def impl(a): return impl -def generate_unicode_operation_doer(operation_func): - """Generate unicode case operation performer.""" - def impl(data, length, res, maxchars): - k = 0 - mapped = np.zeros(3, dtype=_Py_UCS4) - for idx in range(length): - mapped.fill(0) - code_point = _get_code_point(data, idx) - n_res = operation_func(code_point, mapped) - for m in mapped[:n_res]: - maxchar = maxchars[0] - maxchars[0] = max(maxchar, m) - _set_code_point(res, k, m) - k += 1 - - return k - - return impl - - -def generate_ascii_operation_doer(operation_func): - """Generate ascii case operation performer.""" - def impl(data, res): - for idx in range(len(data)): - code_point = _get_code_point(data, idx) - _set_code_point(res, idx, operation_func(code_point)) - - return impl - - -def generate_common_operation_doer(ascii_func, unicode_nres_func): +def generate_operation_func(ascii_func, unicode_nres_func): """Generate common case operation performer.""" def impl(data): length = len(data) @@ -1420,20 +1390,32 @@ def impl(data): return impl -def generate_case_operation_func(ascii_func, unicode_nres_func): - """Generate function to perform case operation - on a string either ascii or unicode. - """ - ascii_operation_doer = register_jitable(generate_ascii_operation_doer( - ascii_func)) - unicode_operation_doer = register_jitable(generate_unicode_operation_doer( - unicode_nres_func)) - return generate_common_operation_doer(ascii_operation_doer, - unicode_operation_doer) +@register_jitable +def _unicode_casefold_doer(data, length, res, maxchars): + k = 0 + mapped = np.zeros(3, dtype=_Py_UCS4) + for idx in range(length): + mapped.fill(0) + code_point = _get_code_point(data, idx) + n_res = _PyUnicode_ToFoldedFull(code_point, mapped) + for m in mapped[:n_res]: + maxchar = maxchars[0] + maxchars[0] = max(maxchar, m) + _set_code_point(res, k, m) + k += 1 + + return k + + +@register_jitable +def _ascii_casefold_doer(data, res): + for idx in range(len(data)): + code_point = _get_code_point(data, idx) + _set_code_point(res, idx, _Py_TOLOWER(code_point)) -_do_casefold = register_jitable(generate_case_operation_func( - _Py_TOLOWER, _PyUnicode_ToFoldedFull)) +_do_casefold = register_jitable(generate_operation_func(_ascii_casefold_doer, + _unicode_casefold_doer)) # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L10782-L10791 # noqa: E501 From 72485b455086762dd5e8193e9112fb6505c38807 Mon Sep 17 00:00:00 2001 From: Denis Date: Fri, 15 Nov 2019 15:42:03 +0300 Subject: [PATCH 19/68] Add generator of rsplit whitespace implementation --- numba/unicode.py | 92 +++++++++++++++++++++++++++++------------------- 1 file changed, 55 insertions(+), 37 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index 59e379baddc..053ce25ecfe 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -34,7 +34,8 @@ from numba.errors import TypingError from .unicode_support import (_Py_TOUPPER, _Py_TOLOWER, _Py_UCS4, _PyUnicode_ToUpperFull, _PyUnicode_ToLowerFull, - _PyUnicode_ToTitleFull, _PyUnicode_IsSpace, + _PyUnicode_ToTitleFull, + _PyUnicode_IsSpace, _Py_ISSPACE, _PyUnicode_IsCased, _PyUnicode_IsCaseIgnorable, _PyUnicode_IsUppercase, _PyUnicode_IsLowercase, _PyUnicode_IsTitlecase, _Py_ISLOWER, _Py_ISUPPER) @@ -758,6 +759,56 @@ def split_whitespace_impl(a, sep=None, maxsplit=-1): return parts return split_whitespace_impl + +def generate_rsplit_whitespace_impl(isspace_func): + """Generate whitespace rsplit func based on either ascii or unicode""" + + def rsplit_whitespace_impl(data, sep=None, maxsplit=-1): + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/stringlib/split.h#L192-L240 # noqa: E501 + if maxsplit < 0: + maxsplit = sys.maxsize + + result = [] + i = len(data) - 1 + while maxsplit > 0: + while i >= 0: + code_point = _get_code_point(data, i) + if not isspace_func(code_point): + break + i -= 1 + if i < 0: + break + j = i + i -= 1 + while i >= 0: + code_point = _get_code_point(data, i) + if isspace_func(code_point): + break + i -= 1 + result.append(data[i + 1:j + 1]) + maxsplit -= 1 + + if i >= 0: + # Only occurs when maxsplit was reached + # Skip any remaining whitespace and copy to beginning of string + while i >= 0: + code_point = _get_code_point(data, i) + if not isspace_func(code_point): + break + i -= 1 + if i >= 0: + result.append(data[0:i + 1]) + + return result[::-1] + + return rsplit_whitespace_impl + + +unicode_rsplit_whitespace_impl = register_jitable( + generate_rsplit_whitespace_impl(_PyUnicode_IsSpace)) +ascii_rsplit_whitespace_impl = register_jitable( + generate_rsplit_whitespace_impl(_Py_ISSPACE)) + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L13095-L13108 # noqa: E501 @overload_method(types.UnicodeType, 'rsplit') def unicode_rsplit(data, sep=None, maxsplit=-1): @@ -785,42 +836,9 @@ def _unicode_rsplit_check_type(ty, name, accepted): if sep is None or isinstance(sep, (types.NoneType, types.Omitted)): def rsplit_whitespace_impl(data, sep=None, maxsplit=-1): - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/stringlib/split.h#L192-L240 # noqa: E501 - if maxsplit < 0: - maxsplit = sys.maxsize - - result = [] - i = len(data) - 1 - while maxsplit > 0: - while i >= 0: - code_point = _get_code_point(data, i) - if not _PyUnicode_IsSpace(code_point): - break - i -= 1 - if i < 0: - break - j = i - i -= 1 - while i >= 0: - code_point = _get_code_point(data, i) - if _PyUnicode_IsSpace(code_point): - break - i -= 1 - result.append(data[i + 1:j + 1]) - maxsplit -= 1 - - if i >= 0: - # Only occurs when maxsplit was reached - # Skip any remaining whitespace and copy to beginning of string - while i >= 0: - code_point = _get_code_point(data, i) - if not _PyUnicode_IsSpace(code_point): - break - i -= 1 - if i >= 0: - result.append(data[0:i + 1]) - - return result[::-1] + if data._is_ascii: + return ascii_rsplit_whitespace_impl(data, sep, maxsplit) + return unicode_rsplit_whitespace_impl(data, sep, maxsplit) return rsplit_whitespace_impl From 761196f3b9ae05306f097b20d551e3873c314baa Mon Sep 17 00:00:00 2001 From: Denis Date: Fri, 15 Nov 2019 15:43:08 +0300 Subject: [PATCH 20/68] Minor fix for str.rsplit() --- numba/unicode.py | 1 + 1 file changed, 1 insertion(+) diff --git a/numba/unicode.py b/numba/unicode.py index 053ce25ecfe..70e1080494c 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -809,6 +809,7 @@ def rsplit_whitespace_impl(data, sep=None, maxsplit=-1): ascii_rsplit_whitespace_impl = register_jitable( generate_rsplit_whitespace_impl(_Py_ISSPACE)) + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L13095-L13108 # noqa: E501 @overload_method(types.UnicodeType, 'rsplit') def unicode_rsplit(data, sep=None, maxsplit=-1): From 1b5157d98b5d830130610e526694b72b8856672f Mon Sep 17 00:00:00 2001 From: mrubtsov Date: Mon, 18 Nov 2019 12:38:05 +0300 Subject: [PATCH 21/68] Implement str.replace --- docs/source/reference/pysupported.rst | 1 + numba/tests/test_unicode.py | 81 +++++++++++++++++++++++++++ numba/unicode.py | 52 +++++++++++++++++ 3 files changed, 134 insertions(+) diff --git a/docs/source/reference/pysupported.rst b/docs/source/reference/pysupported.rst index f19d3da7a64..266401d506d 100644 --- a/docs/source/reference/pysupported.rst +++ b/docs/source/reference/pysupported.rst @@ -187,6 +187,7 @@ The following functions, attributes and methods are currently supported: * ``.zfill()`` * ``.count()`` * ``.istitle()`` +* ``.replace()`` * ``.rfind()`` * ``.title()`` diff --git a/numba/tests/test_unicode.py b/numba/tests/test_unicode.py index c9f49623dbd..20ecbfdb052 100644 --- a/numba/tests/test_unicode.py +++ b/numba/tests/test_unicode.py @@ -106,6 +106,14 @@ def rfind_with_start_end_usecase(x, y, start, end): return x.rfind(y, start, end) +def replace_usecase(s, x, y): + return s.replace(x, y) + + +def replace_with_count_usecase(s, x, y, count): + return s.replace(x, y, count) + + def startswith_usecase(x, y): return x.startswith(y) @@ -908,6 +916,9 @@ def test_join(self): ] for sep, parts in CASES: + print(sep) + print(parts) + print(cfunc(sep, parts)) self.assertEqual(pyfunc(sep, parts), cfunc(sep, parts), "'%s'.join('%s')?" % (sep, parts)) @@ -1332,6 +1343,76 @@ def test_lower(self): for s in UNICODE_EXAMPLES + [''] + extras + cpython + sigma: self.assertEqual(pyfunc(s), cfunc(s), msg=msg.format(s)) + def test_replace(self): + pyfunc = replace_usecase + cfunc = njit(pyfunc) + + CASES = [ + ('abc', '', 'A'), + ('', '⚡', 'A'), + ('abcabc', '⚡', 'A'), + ('🐍⚡', '⚡', 'A'), + ('🐍⚡🐍', '⚡', 'A'), + ('abababa', 'a', 'A'), + ('abababa', 'b', 'A'), + ('abababa', 'c', 'A'), + ('abababa', 'ab', 'A'), + ('abababa', 'aba', 'A'), + ] + + for test_str, old_str, new_str in CASES: + self.assertEqual(pyfunc(test_str, old_str, new_str), + cfunc(test_str, old_str, new_str), + "'%s'.replace('%s', '%s')?" % + (test_str, old_str, new_str)) + + def test_replace_with_count(self): + pyfunc = replace_with_count_usecase + cfunc = njit(pyfunc) + + CASES = [ + ('abc', '', 'A'), + ('', '⚡', 'A'), + ('abcabc', '⚡', 'A'), + ('🐍⚡', '⚡', 'A'), + ('🐍⚡🐍', '⚡', 'A'), + ('abababa', 'a', 'A'), + ('abababa', 'b', 'A'), + ('abababa', 'c', 'A'), + ('abababa', 'ab', 'A'), + ('abababa', 'aba', 'A'), + ] + + count_test = [-1, 1, 0, 5] + + for test_str, old_str, new_str in CASES: + for count in count_test: + self.assertEqual(pyfunc(test_str, old_str, new_str, count), + cfunc(test_str, old_str, new_str, count), + "'%s'.replace('%s', '%s', '%s')?" % + (test_str, old_str, new_str, count)) + + def test_replace_unsupported(self): + def pyfunc(s, x, y, count): + return s.replace(x, y, count) + + cfunc = njit(pyfunc) + + with self.assertRaises(TypingError) as raises: + cfunc('ababababab', 'ba', 'qqq', 3.5) + msg = 'Unsupported parameters. The parametrs must be Integer.' + self.assertIn(msg, str(raises.exception)) + + with self.assertRaises(TypingError) as raises: + cfunc('ababababab', 0, 'qqq', 3) + msg = 'The object must be a UnicodeType.' + self.assertIn(msg, str(raises.exception)) + + with self.assertRaises(TypingError) as raises: + cfunc('ababababab', 'ba', 0, 3) + msg = 'The object must be a UnicodeType.' + self.assertIn(msg, str(raises.exception)) + @unittest.skipUnless(_py34_or_later, 'unicode support requires Python 3.4 or later') diff --git a/numba/unicode.py b/numba/unicode.py index 673b437ec5d..34831f6363b 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1297,6 +1297,58 @@ def impl(a): _PyUnicode_IsTitlecase)) +@overload_method(types.UnicodeType, 'replace') +def unicode_replace(s, old_str, new_str, count=-1): + thety = count + if isinstance(count, types.Omitted): + thety = count.value + elif isinstance(count, types.Optional): + thety = count.type + + if not isinstance(thety, (int, types.Integer)): + raise TypingError('Unsupported parameters. The parametrs ' + 'must be Integer. Given count: {}'.format(count)) + + if not isinstance(old_str, (types.UnicodeType, types.NoneType)): + raise TypingError('The object must be a UnicodeType.' + ' Given: {}'.format(old_str)) + + if not isinstance(new_str, types.UnicodeType): + raise TypingError('The object must be a UnicodeType.' + ' Given: {}'.format(new_str)) + + def impl(s, old_str, new_str, count=-1): + if count == 0: + return s + if old_str == '' or old_str is None: + q = list(s) + if count == -1: + str_res = new_str.join(q) + str_result = new_str + str_res + new_str + return str_result + i = 0 + str_result = new_str + if count > len(q): + counter = len(q) + else: + counter = count + while i < counter: + str_result += q[i] + if i + 1 != counter: + str_result += new_str + else: + str_result += ''.join(q[(i + 1):]) + i += 1 + if count > len(q): + str_result += new_str + return str_result + q = s.split(old_str, count) + str_result = new_str.join(q) + return str_result + + return impl + + @overload_method(types.UnicodeType, 'isupper') def unicode_isupper(a): """ From fbbf808db6af94ac18ac8c44c3499e33f63ead01 Mon Sep 17 00:00:00 2001 From: "elena.totmenina" Date: Mon, 18 Nov 2019 18:50:48 +0300 Subject: [PATCH 22/68] Add functionality for str.endswith() --- docs/source/reference/pysupported.rst | 1 + numba/tests/test_unicode.py | 117 ++++++++++++++++++++++++++ numba/unicode.py | 69 +++++++++++++-- 3 files changed, 178 insertions(+), 9 deletions(-) diff --git a/docs/source/reference/pysupported.rst b/docs/source/reference/pysupported.rst index b947e8d32ed..e096cdb006a 100644 --- a/docs/source/reference/pysupported.rst +++ b/docs/source/reference/pysupported.rst @@ -192,6 +192,7 @@ The following functions, attributes and methods are currently supported: * ``.istitle()`` * ``.rfind()`` * ``.title()`` +* ``.endswith()`` Additional operations as well as support for Python 2 strings / Python 3 bytes will be added in a future version of Numba. Python 2 Unicode objects will diff --git a/numba/tests/test_unicode.py b/numba/tests/test_unicode.py index c97bc1b6f50..ee64167641b 100644 --- a/numba/tests/test_unicode.py +++ b/numba/tests/test_unicode.py @@ -118,6 +118,14 @@ def endswith_usecase(x, y): return x.endswith(y) +def endswith_with_start_only_usecase(x, y, start): + return x.endswith(y, start) + + +def endswith_with_start_end_usecase(x, y, start, end): + return x.endswith(y, start, end) + + def split_usecase(x, y): return x.split(y) @@ -398,6 +406,115 @@ def test_endswith(self, flags=no_pyobj_flags): cfunc(a, b), '%s, %s' % (a, b)) + def test_endswith_default(self): + pyfunc = endswith_usecase + cfunc = njit(pyfunc) + + # Samples taken from CPython testing: + # https://github.com/python/cpython/blob/865c3b257fe38154a4320c7ee6afb416f665b9c2/Lib/test/string_tests.py#L1049-L1099 # noqa: E501 + cpython_str = ['hello', 'helloworld', ''] + cpython_subs = [ + 'he', 'hello', 'helloworld', 'ello', + '', 'lowo', 'lo', 'he', 'lo', 'o', + ] + extra_subs = ['hellohellohello', ' '] + for s in cpython_str + UNICODE_EXAMPLES: + default_subs = ['', 'x', s[:-2], s[3:], s, s + s] + for sub_str in cpython_subs + default_subs + extra_subs: + msg = 'Results "{}".endswith("{}") must be equal' + self.assertEqual(pyfunc(s, sub_str), cfunc(s, sub_str), + msg=msg.format(s, sub_str)) + + def test_endswith_with_start(self): + pyfunc = endswith_with_start_only_usecase + cfunc = njit(pyfunc) + + # Samples taken from CPython testing: + # https://github.com/python/cpython/blob/865c3b257fe38154a4320c7ee6afb416f665b9c2/Lib/test/string_tests.py#L1049-L1099 # noqa: E501 + cpython_str = ['hello', 'helloworld', ''] + cpython_subs = [ + 'he', 'hello', 'helloworld', 'ello', + '', 'lowo', 'lo', 'he', 'lo', 'o', + ] + extra_subs = ['hellohellohello', ' '] + for s in cpython_str + UNICODE_EXAMPLES: + default_subs = ['', 'x', s[:-2], s[3:], s, s + s] + for sub_str in cpython_subs + default_subs + extra_subs: + for start in list(range(-20, 20)) + [None]: + msg = 'Results "{}".endswith("{}", {}) must be equal' + self.assertEqual(pyfunc(s, sub_str, start), + cfunc(s, sub_str, start), + msg=msg.format(s, sub_str, start)) + + def test_endswith_with_start_end(self): + pyfunc = endswith_with_start_end_usecase + cfunc = njit(pyfunc) + + # Samples taken from CPython testing: + # https://github.com/python/cpython/blob/865c3b257fe38154a4320c7ee6afb416f665b9c2/Lib/test/string_tests.py#LL1049-L1099 # noqa: E501 + cpython_str = ['hello', 'helloworld', ''] + cpython_subs = [ + 'he', 'hello', 'helloworld', 'ello', + '', 'lowo', 'lo', 'he', 'lo', 'o', + ] + extra_subs = ['hellohellohello', ' '] + for s in cpython_str + UNICODE_EXAMPLES: + default_subs = ['', 'x', s[:-2], s[3:], s, s + s] + for sub_str in cpython_subs + default_subs + extra_subs: + for start in list(range(-20, 20)) + [None]: + for end in list(range(-20, 20)) + [None]: + msg = 'Results "{}".endswith("{}", {}, {})\ + must be equal' + self.assertEqual(pyfunc(s, sub_str, start, end), + cfunc(s, sub_str, start, end), + msg=msg.format(s, sub_str, start, end)) + + def test_endswith_tuple(self): + pyfunc = endswith_usecase + cfunc = njit(pyfunc) + + # Samples taken from CPython testing: + # https://github.com/python/cpython/blob/865c3b257fe38154a4320c7ee6afb416f665b9c2/Lib/test/string_tests.py#L1049-L1099 # noqa: E501 + cpython_str = ['hello', 'helloworld', ''] + cpython_subs = [ + 'he', 'hello', 'helloworld', 'ello', + '', 'lowo', 'lo', 'he', 'lo', 'o', + ] + extra_subs = ['hellohellohello', ' '] + for s in cpython_str + UNICODE_EXAMPLES: + default_subs = ['', 'x', s[:-2], s[3:], s, s + s] + for sub_str in cpython_subs + default_subs + extra_subs: + msg = 'Results "{}".endswith({}) must be equal' + tuple_subs = (sub_str, 'lo') + self.assertEqual(pyfunc(s, tuple_subs), + cfunc(s, tuple_subs), + msg=msg.format(s, tuple_subs)) + + def test_endswith_tuple_args(self): + pyfunc = endswith_with_start_end_usecase + cfunc = njit(pyfunc) + + # Samples taken from CPython testing: + # https://github.com/python/cpython/blob/865c3b257fe38154a4320c7ee6afb416f665b9c2/Lib/test/string_tests.py#L1049-L1099 # noqa: E501 + cpython_str = ['hello', 'helloworld', ''] + cpython_subs = [ + 'he', 'hello', 'helloworld', 'ello', + '', 'lowo', 'lo', 'he', 'lo', 'o', + ] + extra_subs = ['hellohellohello', ' '] + for s in cpython_str + UNICODE_EXAMPLES: + default_subs = ['', 'x', s[:-2], s[3:], s, s + s] + for sub_str in cpython_subs + default_subs + extra_subs: + for start in list(range(-20, 20)) + [None]: + for end in list(range(-20, 20)) + [None]: + msg = 'Results "{}".endswith("{}", {}, {})\ + must be equal' + tuple_subs = (sub_str, 'lo') + self.assertEqual(pyfunc(s, tuple_subs, start, end), + cfunc(s, tuple_subs, start, end), + msg=msg.format(s, tuple_subs, + start, end)) + def test_in(self, flags=no_pyobj_flags): pyfunc = in_usecase cfunc = njit(pyfunc) diff --git a/numba/unicode.py b/numba/unicode.py index 88d27610fe4..0eccfd8abfc 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -660,18 +660,69 @@ def startswith_impl(a, b): return startswith_impl +# https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L9342-L9354 # noqa: E501 +@register_jitable +def _adjust_indices(length, start, end): + if end > length: + end = length + if end < 0: + end += length + if end < 0: + end = 0 + if start < 0: + start += length + if start < 0: + start = 0 + + return start, end + + @overload_method(types.UnicodeType, 'endswith') -def unicode_endswith(a, b): - if isinstance(b, types.UnicodeType): - def endswith_impl(a, b): - a_offset = len(a) - len(b) - if a_offset < 0: +def unicode_endswith(s, substr, start=None, end=None): + if not (start is None or isinstance(start, (types.Omitted, + types.Integer, + types.NoneType))): + raise TypingError('The arg must be a Integer or None') + + if not (end is None or isinstance(end, (types.Omitted, + types.Integer, + types.NoneType))): + raise TypingError('The arg must be a Integer or None') + + if isinstance(substr, (types.Tuple, types.UniTuple)): + def endswith_impl(s, substr, start=None, end=None): + for item in substr: + if s.endswith(item, start, end) is True: + return True + + return False + return endswith_impl + + if isinstance(substr, types.UnicodeType): + def endswith_impl(s, substr, start=None, end=None): + length = len(s) + sub_length = len(substr) + if start is None: + start = 0 + if end is None: + end = length + + start, end = _adjust_indices(length, start, end) + if end - start < sub_length: return False - return _cmp_region(a, a_offset, b, 0, len(b)) == 0 + + if sub_length == 0: + return True + + s = s[start:end] + offset = len(s) - sub_length + + return _cmp_region(s, offset, substr, 0, sub_length) == 0 return endswith_impl - if isinstance(b, types.UnicodeCharSeq): - def endswith_impl(a, b): - return a.endswith(str(b)) + + if isinstance(substr, types.UnicodeCharSeq): + def endswith_impl(s, substr, start=None, end=None): + return s.endswith(str(substr), start, end) return endswith_impl From 0f3e6953bc657e2c199ac8736bc72f38c2e3f4c7 Mon Sep 17 00:00:00 2001 From: mrubtsov Date: Wed, 13 Nov 2019 16:37:21 +0300 Subject: [PATCH 23/68] Implement str.isdecimal --- docs/source/reference/pysupported.rst | 1 + numba/tests/test_unicode.py | 17 +++++++++++++++++ numba/unicode.py | 26 +++++++++++++++++++++++++- numba/unicode_support.py | 18 ++++++++++++------ 4 files changed, 55 insertions(+), 7 deletions(-) diff --git a/docs/source/reference/pysupported.rst b/docs/source/reference/pysupported.rst index b947e8d32ed..fa04fcb58ab 100644 --- a/docs/source/reference/pysupported.rst +++ b/docs/source/reference/pysupported.rst @@ -184,6 +184,7 @@ The following functions, attributes and methods are currently supported: * ``.strip()`` * ``.isupper()`` * ``.upper()`` +* ``.isdecimal()`` * ``.islower()`` * ``.lower()`` * ``.zfill()`` diff --git a/numba/tests/test_unicode.py b/numba/tests/test_unicode.py index c97bc1b6f50..a77949dc615 100644 --- a/numba/tests/test_unicode.py +++ b/numba/tests/test_unicode.py @@ -1408,6 +1408,23 @@ def test_lower(self): for s in UNICODE_EXAMPLES + [''] + extras + cpython + sigma: self.assertEqual(pyfunc(s), cfunc(s), msg=msg.format(s)) + def test_isdecimal(self): + def pyfunc(x): + return x.isdecimal() + + cfunc = njit(pyfunc) + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L646-L662 # noqa: E501 + cpython = ['', 'a', '0', '\u2460', '\xbc', '\u0660', '0123456789', '0123456789a', '\U00010401', '\U00010427', + '\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107', '\U0001D7F6', + '\U00011066', '\U000104A0'] + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L742-L749 # noqa: E501 + cpython_extras = ['\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF', 'a\uD800b\uDFFF', 'a\uDFFFb\uD800', + 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'] + + msg = 'Results of "{}".isspace() must be equal' + for s in UNICODE_EXAMPLES + [''] + cpython + cpython_extras: + self.assertEqual(pyfunc(s), cfunc(s), msg=msg.format(s)) + @unittest.skipUnless(_py34_or_later, 'unicode support requires Python 3.4 or later') diff --git a/numba/unicode.py b/numba/unicode.py index 88d27610fe4..5e6a5be7e74 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -37,7 +37,8 @@ _PyUnicode_IsXidStart, _PyUnicode_IsXidContinue, _PyUnicode_IsCased, _PyUnicode_IsCaseIgnorable, _PyUnicode_IsUppercase, _PyUnicode_IsLowercase, - _PyUnicode_IsTitlecase, _Py_ISLOWER, _Py_ISUPPER) + _PyUnicode_IsTitlecase, _Py_ISLOWER, _Py_ISUPPER, + _PyUnicode_IsDecimalDigit) # DATA MODEL @@ -1410,6 +1411,29 @@ def impl(data): return impl +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12017-L12045 # noqa: E501 +@overload_method(types.UnicodeType, 'isdecimal') +def unicode_isdecimal(data): + """Implements UnicodeType.isdecimal()""" + + def impl(data): + length = len(data) + + if length == 1: + return _PyUnicode_IsDecimalDigit(_get_code_point(data, 0)) + + if length == 0: + return False + + for i in range(length): + if not _PyUnicode_IsDecimalDigit(_get_code_point(data, i)): + return False + + return True + + return impl + + @overload_method(types.UnicodeType, 'istitle') def unicode_istitle(s): """ diff --git a/numba/unicode_support.py b/numba/unicode_support.py index f8b7b7a6c9c..44c63f68d1e 100644 --- a/numba/unicode_support.py +++ b/numba/unicode_support.py @@ -201,12 +201,10 @@ def _PyUnicode_IsXidContinue(ch): @register_jitable def _PyUnicode_ToDecimalDigit(ch): - raise NotImplementedError - - -@register_jitable -def _PyUnicode_IsDecimalDigit(ch): - raise NotImplementedError + ctype = _PyUnicode_gettyperecord(ch) + if ctype.flags & _PyUnicode_TyperecordMasks.DECIMAL_MASK: + return ctype.decimal + return -1 @register_jitable @@ -315,6 +313,14 @@ def _PyUnicode_IsCaseIgnorable(ch): return ctype.flags & _PyUnicode_TyperecordMasks.CASE_IGNORABLE_MASK != 0 +# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L106-L118 # noqa: E501 +@register_jitable +def _PyUnicode_IsDecimalDigit(ch): + if _PyUnicode_ToDecimalDigit(ch) < 0: + return 0 + return 1 + + @register_jitable def _PyUnicode_IsSpace(ch): ctype = _PyUnicode_gettyperecord(ch) From 4ae54efd657a0ffd699af1102f69856289845cf4 Mon Sep 17 00:00:00 2001 From: mrubtsov Date: Thu, 14 Nov 2019 09:28:59 +0300 Subject: [PATCH 24/68] Implement str.isdecimal --- numba/tests/test_unicode.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/numba/tests/test_unicode.py b/numba/tests/test_unicode.py index a77949dc615..09f531ae27b 100644 --- a/numba/tests/test_unicode.py +++ b/numba/tests/test_unicode.py @@ -1414,12 +1414,14 @@ def pyfunc(x): cfunc = njit(pyfunc) # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L646-L662 # noqa: E501 - cpython = ['', 'a', '0', '\u2460', '\xbc', '\u0660', '0123456789', '0123456789a', '\U00010401', '\U00010427', - '\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107', '\U0001D7F6', - '\U00011066', '\U000104A0'] + cpython = ['', 'a', '0', '\u2460', '\xbc', '\u0660', '0123456789', + '0123456789a', '\U00010401', '\U00010427', '\U00010429', + '\U0001044E', '\U0001F40D', '\U0001F46F', '\U00011065', + '\U0001F107', '\U0001D7F6', '\U00011066', '\U000104A0'] # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L742-L749 # noqa: E501 - cpython_extras = ['\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF', 'a\uD800b\uDFFF', 'a\uDFFFb\uD800', - 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'] + cpython_extras = ['\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF', + 'a\uD800b\uDFFF', 'a\uDFFFb\uD800', 'a\uD800b\uDFFFa', + 'a\uDFFFb\uD800a'] msg = 'Results of "{}".isspace() must be equal' for s in UNICODE_EXAMPLES + [''] + cpython + cpython_extras: From 57ad3d8f3c9b486891d1f59acd2a242224f43021 Mon Sep 17 00:00:00 2001 From: mrubtsov Date: Mon, 18 Nov 2019 13:02:10 +0300 Subject: [PATCH 25/68] Implement str.isdecimal --- numba/tests/test_unicode.py | 2 +- numba/unicode.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/numba/tests/test_unicode.py b/numba/tests/test_unicode.py index 09f531ae27b..07712b72c4b 100644 --- a/numba/tests/test_unicode.py +++ b/numba/tests/test_unicode.py @@ -1423,7 +1423,7 @@ def pyfunc(x): 'a\uD800b\uDFFF', 'a\uDFFFb\uD800', 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'] - msg = 'Results of "{}".isspace() must be equal' + msg = 'Results of "{}".isdecimal() must be equal' for s in UNICODE_EXAMPLES + [''] + cpython + cpython_extras: self.assertEqual(pyfunc(s), cfunc(s), msg=msg.format(s)) diff --git a/numba/unicode.py b/numba/unicode.py index 5e6a5be7e74..5390a020aea 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -953,6 +953,10 @@ def unicode_isidentifier(data): def impl(data): length = len(data) + + if length == 1: + return _PyUnicode_IsDecimalDigit(_get_code_point(data, 0)) + if length == 0: return False From ddd87c26cc28ed09ef8139b7b980d601d3aa39f3 Mon Sep 17 00:00:00 2001 From: mrubtsov Date: Wed, 13 Nov 2019 17:17:06 +0300 Subject: [PATCH 26/68] Implement str.isdigit --- docs/source/reference/pysupported.rst | 1 + numba/tests/test_unicode.py | 16 ++++++++++++++++ numba/unicode.py | 27 ++++++++++++++++++++++++++- numba/unicode_support.py | 18 ++++++++++++------ 4 files changed, 55 insertions(+), 7 deletions(-) diff --git a/docs/source/reference/pysupported.rst b/docs/source/reference/pysupported.rst index b947e8d32ed..5fba440083c 100644 --- a/docs/source/reference/pysupported.rst +++ b/docs/source/reference/pysupported.rst @@ -184,6 +184,7 @@ The following functions, attributes and methods are currently supported: * ``.strip()`` * ``.isupper()`` * ``.upper()`` +* ``.isdigit()`` * ``.islower()`` * ``.lower()`` * ``.zfill()`` diff --git a/numba/tests/test_unicode.py b/numba/tests/test_unicode.py index c97bc1b6f50..5ce22149f8c 100644 --- a/numba/tests/test_unicode.py +++ b/numba/tests/test_unicode.py @@ -1408,6 +1408,22 @@ def test_lower(self): for s in UNICODE_EXAMPLES + [''] + extras + cpython + sigma: self.assertEqual(pyfunc(s), cfunc(s), msg=msg.format(s)) + def test_isdigit(self): + def pyfunc(x): + return x.isdigit() + + cfunc = njit(pyfunc) + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L664-L674 # noqa: E501 + cpython = ['\u2460', '\xbc', '\u0660', '\U00010401', '\U00010427', '\U00010429', '\U0001044E', '\U0001F40D', + '\U0001F46F', '\U00011065', '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107'] + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L742-L749 # noqa: E501 + cpython_extras = ['\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF', 'a\uD800b\uDFFF', 'a\uDFFFb\uD800', + 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'] + + msg = 'Results of "{}".isspace() must be equal' + for s in UNICODE_EXAMPLES + [''] + cpython + cpython_extras: + self.assertEqual(pyfunc(s), cfunc(s), msg=msg.format(s)) + @unittest.skipUnless(_py34_or_later, 'unicode support requires Python 3.4 or later') diff --git a/numba/unicode.py b/numba/unicode.py index 88d27610fe4..e094db70fbf 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -37,7 +37,8 @@ _PyUnicode_IsXidStart, _PyUnicode_IsXidContinue, _PyUnicode_IsCased, _PyUnicode_IsCaseIgnorable, _PyUnicode_IsUppercase, _PyUnicode_IsLowercase, - _PyUnicode_IsTitlecase, _Py_ISLOWER, _Py_ISUPPER) + _PyUnicode_IsTitlecase, _Py_ISLOWER, _Py_ISUPPER, + _PyUnicode_IsDigit) # DATA MODEL @@ -1410,6 +1411,30 @@ def impl(data): return impl +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12056-L12085 # noqa: E501 +@overload_method(types.UnicodeType, 'isdigit') +def unicode_isdigit(data): + """Implements UnicodeType.isdigit()""" + + def impl(data): + length = len(data) + + if length == 1: + ch = _get_code_point(data, 0) + return _PyUnicode_IsDigit(ch) + + if length == 0: + return False + + for i in range(length): + if not _PyUnicode_IsDigit(_get_code_point(data, i)): + return False + + return True + + return impl + + @overload_method(types.UnicodeType, 'istitle') def unicode_istitle(s): """ diff --git a/numba/unicode_support.py b/numba/unicode_support.py index f8b7b7a6c9c..824b73eef59 100644 --- a/numba/unicode_support.py +++ b/numba/unicode_support.py @@ -211,12 +211,10 @@ def _PyUnicode_IsDecimalDigit(ch): @register_jitable def _PyUnicode_ToDigit(ch): - raise NotImplementedError - - -@register_jitable -def _PyUnicode_IsDigit(ch): - raise NotImplementedError + ctype = _PyUnicode_gettyperecord(ch) + if ctype.flags & _PyUnicode_TyperecordMasks.DIGIT_MASK: + return ctype.digit + return -1 @register_jitable @@ -315,6 +313,14 @@ def _PyUnicode_IsCaseIgnorable(ch): return ctype.flags & _PyUnicode_TyperecordMasks.CASE_IGNORABLE_MASK != 0 +# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L123-L135 # noqa: E501 +@register_jitable +def _PyUnicode_IsDigit(ch): + if _PyUnicode_ToDigit(ch) < 0: + return 0 + return 1 + + @register_jitable def _PyUnicode_IsSpace(ch): ctype = _PyUnicode_gettyperecord(ch) From 22acdb89707967ed8bad5e15bdcc6f49f8834630 Mon Sep 17 00:00:00 2001 From: mrubtsov Date: Thu, 14 Nov 2019 10:10:05 +0300 Subject: [PATCH 27/68] Implement str.isdigit --- numba/tests/test_unicode.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/numba/tests/test_unicode.py b/numba/tests/test_unicode.py index 5ce22149f8c..e4c17a42e7a 100644 --- a/numba/tests/test_unicode.py +++ b/numba/tests/test_unicode.py @@ -1414,10 +1414,13 @@ def pyfunc(x): cfunc = njit(pyfunc) # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L664-L674 # noqa: E501 - cpython = ['\u2460', '\xbc', '\u0660', '\U00010401', '\U00010427', '\U00010429', '\U0001044E', '\U0001F40D', - '\U0001F46F', '\U00011065', '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107'] + cpython = ['\u2460', '\xbc', '\u0660', '\U00010401', '\U00010427', + '\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F', + '\U00011065', '\U0001D7F6', '\U00011066', '\U000104A0', + '\U0001F107'] # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L742-L749 # noqa: E501 - cpython_extras = ['\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF', 'a\uD800b\uDFFF', 'a\uDFFFb\uD800', + cpython_extras = ['\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF', + 'a\uD800b\uDFFF', 'a\uDFFFb\uD800', 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'] msg = 'Results of "{}".isspace() must be equal' From 8bfb184d0db12e607bedf7b232be531aa49e9ace Mon Sep 17 00:00:00 2001 From: mrubtsov Date: Mon, 18 Nov 2019 13:16:43 +0300 Subject: [PATCH 28/68] Implement str.isdigit --- numba/tests/test_unicode.py | 2 +- numba/unicode.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/numba/tests/test_unicode.py b/numba/tests/test_unicode.py index e4c17a42e7a..0b424f1e67f 100644 --- a/numba/tests/test_unicode.py +++ b/numba/tests/test_unicode.py @@ -1423,7 +1423,7 @@ def pyfunc(x): 'a\uD800b\uDFFF', 'a\uDFFFb\uD800', 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'] - msg = 'Results of "{}".isspace() must be equal' + msg = 'Results of "{}".isdigit() must be equal' for s in UNICODE_EXAMPLES + [''] + cpython + cpython_extras: self.assertEqual(pyfunc(s), cfunc(s), msg=msg.format(s)) diff --git a/numba/unicode.py b/numba/unicode.py index e094db70fbf..49fa172c748 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -953,6 +953,11 @@ def unicode_isidentifier(data): def impl(data): length = len(data) + + if length == 1: + ch = _get_code_point(data, 0) + return _PyUnicode_IsDigit(ch) + if length == 0: return False From 1bb174672cbbf77f4286c4dd1872bab11f96abb7 Mon Sep 17 00:00:00 2001 From: mrubtsov Date: Mon, 18 Nov 2019 13:31:01 +0300 Subject: [PATCH 29/68] Implement str.isdigit --- numba/unicode_support.py | 1 + 1 file changed, 1 insertion(+) diff --git a/numba/unicode_support.py b/numba/unicode_support.py index 824b73eef59..7e11461530a 100644 --- a/numba/unicode_support.py +++ b/numba/unicode_support.py @@ -209,6 +209,7 @@ def _PyUnicode_IsDecimalDigit(ch): raise NotImplementedError +# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L123-L1128 # noqa: E501 @register_jitable def _PyUnicode_ToDigit(ch): ctype = _PyUnicode_gettyperecord(ch) From 57007309348813761271eb9ad16529d952364cc0 Mon Sep 17 00:00:00 2001 From: mrubtsov Date: Tue, 19 Nov 2019 09:27:14 +0300 Subject: [PATCH 30/68] Implement str.isdecimal --- numba/unicode.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index 5390a020aea..5e6a5be7e74 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -953,10 +953,6 @@ def unicode_isidentifier(data): def impl(data): length = len(data) - - if length == 1: - return _PyUnicode_IsDecimalDigit(_get_code_point(data, 0)) - if length == 0: return False From 53171aa6b9c4173fc590e76230b0036aa75b9aa6 Mon Sep 17 00:00:00 2001 From: mrubtsov Date: Tue, 19 Nov 2019 09:30:01 +0300 Subject: [PATCH 31/68] Implement str.isdigit --- numba/unicode.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index 49fa172c748..e094db70fbf 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -953,11 +953,6 @@ def unicode_isidentifier(data): def impl(data): length = len(data) - - if length == 1: - ch = _get_code_point(data, 0) - return _PyUnicode_IsDigit(ch) - if length == 0: return False From 77f8f0fa07b917d695a2a3c745d6bee8d4669c7f Mon Sep 17 00:00:00 2001 From: mrubtsov Date: Wed, 13 Nov 2019 17:49:34 +0300 Subject: [PATCH 32/68] Implement str.isnumeric --- docs/source/reference/pysupported.rst | 1 + numba/tests/test_unicode.py | 18 ++++++++++++++++++ numba/unicode.py | 26 +++++++++++++++++++++++++- numba/unicode_support.py | 3 ++- 4 files changed, 46 insertions(+), 2 deletions(-) diff --git a/docs/source/reference/pysupported.rst b/docs/source/reference/pysupported.rst index b947e8d32ed..783bcf178d4 100644 --- a/docs/source/reference/pysupported.rst +++ b/docs/source/reference/pysupported.rst @@ -184,6 +184,7 @@ The following functions, attributes and methods are currently supported: * ``.strip()`` * ``.isupper()`` * ``.upper()`` +* ``.isnumeric()`` * ``.islower()`` * ``.lower()`` * ``.zfill()`` diff --git a/numba/tests/test_unicode.py b/numba/tests/test_unicode.py index c97bc1b6f50..1f52230b872 100644 --- a/numba/tests/test_unicode.py +++ b/numba/tests/test_unicode.py @@ -1409,6 +1409,24 @@ def test_lower(self): self.assertEqual(pyfunc(s), cfunc(s), msg=msg.format(s)) + def test_isnumeric(self): + def pyfunc(x): + return x.isnumeric() + + cfunc = njit(pyfunc) + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L676-L693 # noqa: E501 + cpython = ['', 'a', '0', '\u2460', '\xbc', '\u0660', '0123456789', '0123456789a', '\U00010401', '\U00010427', + '\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001D7F6', '\U00011066', + '\U000104A0', '\U0001F107'] + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L742-L749 # noqa: E501 + cpython_extras = ['\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF', 'a\uD800b\uDFFF', 'a\uDFFFb\uD800', + 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'] + + msg = 'Results of "{}".isspace() must be equal' + for s in UNICODE_EXAMPLES + [''] + cpython + cpython_extras: + self.assertEqual(pyfunc(s), cfunc(s), msg=msg.format(s)) + + @unittest.skipUnless(_py34_or_later, 'unicode support requires Python 3.4 or later') class TestUnicodeInTuple(BaseTest): diff --git a/numba/unicode.py b/numba/unicode.py index 88d27610fe4..d6ed9a53004 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -37,7 +37,8 @@ _PyUnicode_IsXidStart, _PyUnicode_IsXidContinue, _PyUnicode_IsCased, _PyUnicode_IsCaseIgnorable, _PyUnicode_IsUppercase, _PyUnicode_IsLowercase, - _PyUnicode_IsTitlecase, _Py_ISLOWER, _Py_ISUPPER) + _PyUnicode_IsTitlecase, _Py_ISLOWER, _Py_ISUPPER, + _PyUnicode_IsNumeric,) # DATA MODEL @@ -1410,6 +1411,29 @@ def impl(data): return impl +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12096-L12124 # noqa: E501 +@overload_method(types.UnicodeType, 'isnumeric') +def unicode_isnumeric(data): + """Implements UnicodeType.isnumeric()""" + + def impl(data): + length = len(data) + + if length == 1: + return _PyUnicode_IsNumeric(_get_code_point(data, 0)) + + if length == 0: + return False + + for i in range(length): + if not _PyUnicode_IsNumeric(_get_code_point(data, i)): + return False + + return True + + return impl + + @overload_method(types.UnicodeType, 'istitle') def unicode_istitle(s): """ diff --git a/numba/unicode_support.py b/numba/unicode_support.py index f8b7b7a6c9c..c00da77c248 100644 --- a/numba/unicode_support.py +++ b/numba/unicode_support.py @@ -221,7 +221,8 @@ def _PyUnicode_IsDigit(ch): @register_jitable def _PyUnicode_IsNumeric(ch): - raise NotImplementedError + ctype = _PyUnicode_gettyperecord(ch) + return ctype.flags & _PyUnicode_TyperecordMasks.NUMERIC_MASK != 0 @register_jitable From ee4d8bac48fa54475c4313717da139447e1e87e4 Mon Sep 17 00:00:00 2001 From: mrubtsov Date: Wed, 13 Nov 2019 18:07:04 +0300 Subject: [PATCH 33/68] change --- numba/tests/test_unicode.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/numba/tests/test_unicode.py b/numba/tests/test_unicode.py index 1f52230b872..284e4374131 100644 --- a/numba/tests/test_unicode.py +++ b/numba/tests/test_unicode.py @@ -1415,12 +1415,12 @@ def pyfunc(x): cfunc = njit(pyfunc) # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L676-L693 # noqa: E501 - cpython = ['', 'a', '0', '\u2460', '\xbc', '\u0660', '0123456789', '0123456789a', '\U00010401', '\U00010427', - '\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001D7F6', '\U00011066', - '\U000104A0', '\U0001F107'] + cpython = ['', 'a', '0', '\u2460', '\xbc', '\u0660', '0123456789', '0123456789a', '\U00010401', + '\U00010427', '\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F', '\U00011065', + '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107'] # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L742-L749 # noqa: E501 - cpython_extras = ['\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF', 'a\uD800b\uDFFF', 'a\uDFFFb\uD800', - 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'] + cpython_extras = ['\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF', 'a\uD800b\uDFFF', + 'a\uDFFFb\uD800', 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'] msg = 'Results of "{}".isspace() must be equal' for s in UNICODE_EXAMPLES + [''] + cpython + cpython_extras: From 7c7025124bf3395160f76947c15b90f2452d51b2 Mon Sep 17 00:00:00 2001 From: Rubtsowa <36762665+Rubtsowa@users.noreply.github.com> Date: Wed, 13 Nov 2019 18:28:18 +0300 Subject: [PATCH 34/68] Update test_unicode.py --- numba/tests/test_unicode.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/numba/tests/test_unicode.py b/numba/tests/test_unicode.py index 284e4374131..daa5de4507a 100644 --- a/numba/tests/test_unicode.py +++ b/numba/tests/test_unicode.py @@ -1415,12 +1415,14 @@ def pyfunc(x): cfunc = njit(pyfunc) # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L676-L693 # noqa: E501 - cpython = ['', 'a', '0', '\u2460', '\xbc', '\u0660', '0123456789', '0123456789a', '\U00010401', - '\U00010427', '\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F', '\U00011065', - '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107'] + cpython = ['', 'a', '0', '\u2460', '\xbc', '\u0660', '0123456789', '0123456789a', + '\U00010401', '\U00010427', '\U00010429', '\U0001044E', '\U0001F40D', + '\U0001F46F', '\U00011065', '\U0001D7F6', '\U00011066', '\U000104A0', + '\U0001F107'] # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L742-L749 # noqa: E501 - cpython_extras = ['\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF', 'a\uD800b\uDFFF', - 'a\uDFFFb\uD800', 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'] + cpython_extras = ['\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF', + 'a\uD800b\uDFFF', 'a\uDFFFb\uD800', 'a\uD800b\uDFFFa', + 'a\uDFFFb\uD800a'] msg = 'Results of "{}".isspace() must be equal' for s in UNICODE_EXAMPLES + [''] + cpython + cpython_extras: From cd2ae60a846916ae72c2b9280c97701340cb9ec6 Mon Sep 17 00:00:00 2001 From: Rubtsowa <36762665+Rubtsowa@users.noreply.github.com> Date: Wed, 13 Nov 2019 19:22:21 +0300 Subject: [PATCH 35/68] Update test_unicode.py --- numba/tests/test_unicode.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/numba/tests/test_unicode.py b/numba/tests/test_unicode.py index daa5de4507a..9746b379c72 100644 --- a/numba/tests/test_unicode.py +++ b/numba/tests/test_unicode.py @@ -1415,10 +1415,10 @@ def pyfunc(x): cfunc = njit(pyfunc) # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L676-L693 # noqa: E501 - cpython = ['', 'a', '0', '\u2460', '\xbc', '\u0660', '0123456789', '0123456789a', - '\U00010401', '\U00010427', '\U00010429', '\U0001044E', '\U0001F40D', - '\U0001F46F', '\U00011065', '\U0001D7F6', '\U00011066', '\U000104A0', - '\U0001F107'] + cpython = ['', 'a', '0', '\u2460', '\xbc', '\u0660', '0123456789', + '0123456789a', '\U00010401', '\U00010427', '\U00010429', + '\U0001044E', '\U0001F40D', '\U0001F46F', '\U00011065', + '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107'] # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L742-L749 # noqa: E501 cpython_extras = ['\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF', 'a\uD800b\uDFFF', 'a\uDFFFb\uD800', 'a\uD800b\uDFFFa', From 1680025a4ded4ba9ec8d86689ad20bacad4e7714 Mon Sep 17 00:00:00 2001 From: mrubtsov Date: Thu, 14 Nov 2019 10:04:28 +0300 Subject: [PATCH 36/68] Implement str.isnumeric --- numba/tests/test_unicode.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/numba/tests/test_unicode.py b/numba/tests/test_unicode.py index 9746b379c72..c1360209ad4 100644 --- a/numba/tests/test_unicode.py +++ b/numba/tests/test_unicode.py @@ -1408,19 +1408,18 @@ def test_lower(self): for s in UNICODE_EXAMPLES + [''] + extras + cpython + sigma: self.assertEqual(pyfunc(s), cfunc(s), msg=msg.format(s)) - def test_isnumeric(self): def pyfunc(x): return x.isnumeric() cfunc = njit(pyfunc) # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L676-L693 # noqa: E501 - cpython = ['', 'a', '0', '\u2460', '\xbc', '\u0660', '0123456789', + cpython = ['', 'a', '0', '\u2460', '\xbc', '\u0660', '0123456789', '0123456789a', '\U00010401', '\U00010427', '\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107'] # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Lib/test/test_unicode.py#L742-L749 # noqa: E501 - cpython_extras = ['\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF', + cpython_extras = ['\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF', 'a\uD800b\uDFFF', 'a\uDFFFb\uD800', 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'] From 502b7558bfd32673151f32ae6abc165412d9a5a6 Mon Sep 17 00:00:00 2001 From: mrubtsov Date: Mon, 18 Nov 2019 13:26:48 +0300 Subject: [PATCH 37/68] Implement str.isnumeric --- numba/tests/test_unicode.py | 2 +- numba/unicode.py | 4 ++++ numba/unicode_support.py | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/numba/tests/test_unicode.py b/numba/tests/test_unicode.py index c1360209ad4..1873b3fd13f 100644 --- a/numba/tests/test_unicode.py +++ b/numba/tests/test_unicode.py @@ -1423,7 +1423,7 @@ def pyfunc(x): 'a\uD800b\uDFFF', 'a\uDFFFb\uD800', 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'] - msg = 'Results of "{}".isspace() must be equal' + msg = 'Results of "{}".isnumeric() must be equal' for s in UNICODE_EXAMPLES + [''] + cpython + cpython_extras: self.assertEqual(pyfunc(s), cfunc(s), msg=msg.format(s)) diff --git a/numba/unicode.py b/numba/unicode.py index d6ed9a53004..a4273e1ca7f 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -953,6 +953,10 @@ def unicode_isidentifier(data): def impl(data): length = len(data) + + if length == 1: + return _PyUnicode_IsNumeric(_get_code_point(data, 0)) + if length == 0: return False diff --git a/numba/unicode_support.py b/numba/unicode_support.py index c00da77c248..5e532b3c6ec 100644 --- a/numba/unicode_support.py +++ b/numba/unicode_support.py @@ -219,6 +219,7 @@ def _PyUnicode_IsDigit(ch): raise NotImplementedError +# From: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodectype.c#L140-L145 # noqa: E501 @register_jitable def _PyUnicode_IsNumeric(ch): ctype = _PyUnicode_gettyperecord(ch) From 3bce8373fd9897bccd28960e00c1d1c2699ac9f7 Mon Sep 17 00:00:00 2001 From: mrubtsov Date: Tue, 19 Nov 2019 09:56:33 +0300 Subject: [PATCH 38/68] Implement str.isnumeric --- numba/unicode.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index a4273e1ca7f..d6ed9a53004 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -953,10 +953,6 @@ def unicode_isidentifier(data): def impl(data): length = len(data) - - if length == 1: - return _PyUnicode_IsNumeric(_get_code_point(data, 0)) - if length == 0: return False From 7bbc28b66add01a80ae74c857ecdcfa7d8879792 Mon Sep 17 00:00:00 2001 From: mrubtsov Date: Wed, 20 Nov 2019 10:22:39 +0300 Subject: [PATCH 39/68] change --- numba/unicode.py | 41 +++++++++++++++++------------------------ 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index 34831f6363b..b4c64b0e82d 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1317,34 +1317,27 @@ def unicode_replace(s, old_str, new_str, count=-1): raise TypingError('The object must be a UnicodeType.' ' Given: {}'.format(new_str)) - def impl(s, old_str, new_str, count=-1): + def impl(s, old, new, count=-1): if count == 0: return s - if old_str == '' or old_str is None: - q = list(s) + if old == '': + schars = list(s) if count == -1: - str_res = new_str.join(q) - str_result = new_str + str_res + new_str - return str_result - i = 0 - str_result = new_str - if count > len(q): - counter = len(q) - else: - counter = count - while i < counter: - str_result += q[i] - if i + 1 != counter: - str_result += new_str + return new + new.join(schars) + new + split_result = [new] + min_count = min(len(schars), count) + for i in range(min_count): + split_result.append(schars[i]) + if i + 1 != min_count: + split_result.append(new) else: - str_result += ''.join(q[(i + 1):]) - i += 1 - if count > len(q): - str_result += new_str - return str_result - q = s.split(old_str, count) - str_result = new_str.join(q) - return str_result + split_result.append(''.join(schars[(i + 1):])) + if count > len(schars): + split_result.append(new) + return ''.join(split_result) + schars = s.split(old, count) + result = new.join(schars) + return result return impl From ace863124e2edc61f18b2a53b443fccc1becbcf9 Mon Sep 17 00:00:00 2001 From: mrubtsov Date: Mon, 25 Nov 2019 09:05:03 +0300 Subject: [PATCH 40/68] change --- numba/tests/test_unicode.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/numba/tests/test_unicode.py b/numba/tests/test_unicode.py index 4191fcfa08b..9f357621346 100644 --- a/numba/tests/test_unicode.py +++ b/numba/tests/test_unicode.py @@ -958,9 +958,6 @@ def test_join(self): ] for sep, parts in CASES: - print(sep) - print(parts) - print(cfunc(sep, parts)) self.assertEqual(pyfunc(sep, parts), cfunc(sep, parts), "'%s'.join('%s')?" % (sep, parts)) From 771782aad5be300ac9d710de69a2d3e9a4bfb04a Mon Sep 17 00:00:00 2001 From: Denis Date: Wed, 27 Nov 2019 10:41:45 +0300 Subject: [PATCH 41/68] Extend str.capitalize() for ascii --- numba/unicode.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index 88a966e975d..e50c2730a4c 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1277,12 +1277,20 @@ def impl(data): return _empty_string(data._kind, length, data._is_ascii) if data._is_ascii: + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/bytes_methods.c#L361-L382 # noqa: E501 res = _empty_string(data._kind, length, 1) code_point = _get_code_point(data, 0) - _set_code_point(res, 0, _Py_TOUPPER(code_point)) + if _Py_ISLOWER(code_point): + _set_code_point(res, 0, _Py_TOUPPER(code_point)) + else: + _set_code_point(res, 0, code_point) + for idx in range(1, length): code_point = _get_code_point(data, idx) - _set_code_point(res, idx, _Py_TOLOWER(code_point)) + if _Py_ISUPPER(code_point): + _set_code_point(res, idx, _Py_TOLOWER(code_point)) + else: + _set_code_point(res, idx, code_point) return res From 2900c16f42c7e8a86d0eec0e48a956cb2c9b310f Mon Sep 17 00:00:00 2001 From: Denis Date: Wed, 27 Nov 2019 11:14:51 +0300 Subject: [PATCH 42/68] Support newer version of ascii capitalization --- numba/unicode.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index e50c2730a4c..bc89691fa24 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1277,20 +1277,15 @@ def impl(data): return _empty_string(data._kind, length, data._is_ascii) if data._is_ascii: - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/bytes_methods.c#L361-L382 # noqa: E501 + # https://github.com/python/cpython/blob/593bb30e82eded7f2ec02f7d1aa49742e6962113/Objects/bytes_methods.c#L361-L368 # noqa: E501 + # mixed with: + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/bytes_methods.c#L299-L307 # noqa: E501 res = _empty_string(data._kind, length, 1) code_point = _get_code_point(data, 0) - if _Py_ISLOWER(code_point): - _set_code_point(res, 0, _Py_TOUPPER(code_point)) - else: - _set_code_point(res, 0, code_point) - + _set_code_point(res, 0, _Py_TOUPPER(code_point)) for idx in range(1, length): code_point = _get_code_point(data, idx) - if _Py_ISUPPER(code_point): - _set_code_point(res, idx, _Py_TOLOWER(code_point)) - else: - _set_code_point(res, idx, code_point) + _set_code_point(res, idx, _Py_TOLOWER(code_point)) return res From f2ac10c2213288267270d7530a04de1a6d37003e Mon Sep 17 00:00:00 2001 From: Denis Date: Wed, 27 Nov 2019 11:17:17 +0300 Subject: [PATCH 43/68] Change link to CPython for str.capitalize() --- numba/unicode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba/unicode.py b/numba/unicode.py index bc89691fa24..c3a24158bae 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1279,7 +1279,7 @@ def impl(data): if data._is_ascii: # https://github.com/python/cpython/blob/593bb30e82eded7f2ec02f7d1aa49742e6962113/Objects/bytes_methods.c#L361-L368 # noqa: E501 # mixed with: - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/bytes_methods.c#L299-L307 # noqa: E501 + # https://github.com/python/cpython/blob/593bb30e82eded7f2ec02f7d1aa49742e6962113/Objects/bytes_methods.c#L299-L307 # noqa: E501 res = _empty_string(data._kind, length, 1) code_point = _get_code_point(data, 0) _set_code_point(res, 0, _Py_TOUPPER(code_point)) From 302e0a50b49d7e8443fb6c33b6cdf0a68a23d19d Mon Sep 17 00:00:00 2001 From: Denis Date: Wed, 27 Nov 2019 13:42:27 +0300 Subject: [PATCH 44/68] Add support of python 3.8 for str.capitalize() --- numba/unicode.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/numba/unicode.py b/numba/unicode.py index c3a24158bae..42fad816dba 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1,3 +1,4 @@ +import sys import operator import numpy as np @@ -38,6 +39,8 @@ _PyUnicode_IsUppercase, _PyUnicode_IsLowercase, _PyUnicode_IsTitlecase, _Py_ISLOWER, _Py_ISUPPER) +_py38_or_later = sys.version_info[:2] >= (3, 8) + # DATA MODEL @@ -1297,7 +1300,13 @@ def impl(data): mapped = np.zeros(3, dtype=_Py_UCS4) tmp = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * length) code_point = _get_code_point(data, 0) - n_res = _PyUnicode_ToUpperFull(code_point, mapped) + + # https://github.com/python/cpython/commit/b015fc86f7b1f35283804bfee788cce0a5495df7/Objects/unicodeobject.c#diff-220e5da0d1c8abf508b25c02da6ca16c # noqa: E501 + if _py38_or_later: + n_res = _PyUnicode_ToTitleFull(code_point, mapped) + else: + n_res = _PyUnicode_ToUpperFull(code_point, mapped) + for m in mapped[:n_res]: maxchar = max(maxchar, m) _set_code_point(tmp, k, m) From d80571c5c375fcdfa287af9647368ebe3f33e8a0 Mon Sep 17 00:00:00 2001 From: Rubtsowa <36762665+Rubtsowa@users.noreply.github.com> Date: Wed, 4 Dec 2019 17:01:48 +0300 Subject: [PATCH 45/68] Update unicode.py --- numba/unicode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba/unicode.py b/numba/unicode.py index 275622b5495..efcd9386f5e 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -39,7 +39,7 @@ _PyUnicode_IsCased, _PyUnicode_IsCaseIgnorable, _PyUnicode_IsUppercase, _PyUnicode_IsLowercase, _PyUnicode_IsTitlecase, _Py_ISLOWER, _Py_ISUPPER, - _PyUnicode_IsAlpha, _PyUnicode_IsNumeric, + _PyUnicode_IsAlpha, _PyUnicode_IsNumeric, _PyUnicode_IsDecimalDigit) # DATA MODEL From 3c708879f1c2d1e5dda678600137f6241959ac92 Mon Sep 17 00:00:00 2001 From: Rubtsowa <36762665+Rubtsowa@users.noreply.github.com> Date: Wed, 4 Dec 2019 18:02:14 +0300 Subject: [PATCH 46/68] Update test_unicode.py --- numba/tests/test_unicode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba/tests/test_unicode.py b/numba/tests/test_unicode.py index 438fa0918e5..645e8b14a58 100644 --- a/numba/tests/test_unicode.py +++ b/numba/tests/test_unicode.py @@ -126,7 +126,7 @@ def replace_usecase(s, x, y): def replace_with_count_usecase(s, x, y, count): return s.replace(x, y, count) - + def index_usecase(x, y): return x.index(y) From b292caf70c85df364926283479cad62bed3eaf05 Mon Sep 17 00:00:00 2001 From: Denis Date: Tue, 10 Dec 2019 09:22:01 +0300 Subject: [PATCH 47/68] Rename unit test for str.splitlines() --- numba/tests/test_unicode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numba/tests/test_unicode.py b/numba/tests/test_unicode.py index ece09af668a..247b3c704a7 100644 --- a/numba/tests/test_unicode.py +++ b/numba/tests/test_unicode.py @@ -1150,7 +1150,7 @@ def test_split_whitespace(self): cfunc(test_str), "'%s'.split()?" % (test_str,)) - def test_split_exception_invalid_keepends(self): + def test_splitlines_exception_invalid_keepends(self): pyfunc = splitlines_with_keepends_usecase cfunc = njit(pyfunc) From 5b9969b15ae002199e4500055d8f2367fdcad25c Mon Sep 17 00:00:00 2001 From: Denis Date: Tue, 10 Dec 2019 09:26:39 +0300 Subject: [PATCH 48/68] Remove str.splitlines() from this branch --- docs/source/reference/pysupported.rst | 1 - numba/tests/test_unicode.py | 53 --------------------- numba/unicode.py | 68 --------------------------- numba/unicode_support.py | 58 ----------------------- 4 files changed, 180 deletions(-) diff --git a/docs/source/reference/pysupported.rst b/docs/source/reference/pysupported.rst index a7c6ad8b34b..382214294c3 100644 --- a/docs/source/reference/pysupported.rst +++ b/docs/source/reference/pysupported.rst @@ -179,7 +179,6 @@ The following functions, attributes and methods are currently supported: * ``.ljust()`` * ``.rjust()`` * ``.split()`` -* ``.splitlines()`` * ``.join()`` * ``.lstrip()`` * ``.rstrip()`` diff --git a/numba/tests/test_unicode.py b/numba/tests/test_unicode.py index 247b3c704a7..75304fdfcd5 100644 --- a/numba/tests/test_unicode.py +++ b/numba/tests/test_unicode.py @@ -167,18 +167,6 @@ def split_whitespace_usecase(x): return x.split() -def splitlines_usecase(s): - return s.splitlines() - - -def splitlines_with_keepends_usecase(s, keepends): - return s.splitlines(keepends) - - -def splitlines_with_keepends_kwarg_usecase(s, keepends): - return s.splitlines(keepends=keepends) - - def lstrip_usecase(x): return x.lstrip() @@ -1150,47 +1138,6 @@ def test_split_whitespace(self): cfunc(test_str), "'%s'.split()?" % (test_str,)) - def test_splitlines_exception_invalid_keepends(self): - pyfunc = splitlines_with_keepends_usecase - cfunc = njit(pyfunc) - - accepted_types = (types.Integer, int, types.Boolean, bool) - for ty, keepends in (('none', None), ('unicode_type', 'None')): - with self.assertRaises(TypingError) as raises: - cfunc('\n', keepends) - msg = '"keepends" must be {}, not {}'.format(accepted_types, ty) - self.assertIn(msg, str(raises.exception)) - - def test_splitlines(self): - pyfunc = splitlines_usecase - cfunc = njit(pyfunc) - - cases = ['', '\n', 'abc\r\rabc\r\n', '🐍⚡\v', '\f🐍⚡\f\v\v🐍\x85', - '\u2028aba\u2029baba', '\n\r\na\v\fb\x0b\x0cc\x1c\x1d\x1e'] - - msg = 'Results of "{}".splitlines() must be equal' - for s in cases: - self.assertEqual(pyfunc(s), cfunc(s), msg=msg.format(s)) - - def test_splitlines_with_keepends(self): - pyfuncs = [ - splitlines_with_keepends_usecase, - splitlines_with_keepends_kwarg_usecase - ] - messages = [ - 'Results of "{}".splitlines({}) must be equal', - 'Results of "{}".splitlines(keepends={}) must be equal' - ] - cases = ['', '\n', 'abc\r\rabc\r\n', '🐍⚡\v', '\f🐍⚡\f\v\v🐍\x85', - '\u2028aba\u2029baba', '\n\r\na\v\fb\x0b\x0cc\x1c\x1d\x1e'] - all_keepends = [True, False, 0, 1, -1, 100] - - for pyfunc, msg in zip(pyfuncs, messages): - cfunc = njit(pyfunc) - for s, keepends in product(cases, all_keepends): - self.assertEqual(pyfunc(s, keepends), cfunc(s, keepends), - msg=msg.format(s, keepends)) - def test_join_empty(self): # Can't pass empty list to nopython mode, so we have to make a # separate test case diff --git a/numba/unicode.py b/numba/unicode.py index 4fd375c71cc..6829b3bb67d 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -40,8 +40,6 @@ _PyUnicode_IsXidStart, _PyUnicode_IsXidContinue, _PyUnicode_IsCased, _PyUnicode_IsCaseIgnorable, _PyUnicode_IsUppercase, _PyUnicode_IsLowercase, - _PyUnicode_IsLineBreak, _Py_ISLINEBREAK, - _Py_ISLINEFEED, _Py_ISCARRIAGERETURN, _PyUnicode_IsTitlecase, _Py_ISLOWER, _Py_ISUPPER, _PyUnicode_IsAlpha, _PyUnicode_IsNumeric, _Py_ISALPHA,) @@ -933,72 +931,6 @@ def rjust_impl(string, width, fillchar=' '): return rjust_impl -def generate_splitlines_func(is_line_break_func): - """Generate splitlines performer based on ascii or unicode line breaks.""" - def impl(data, keepends): - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/stringlib/split.h#L335-L389 # noqa: E501 - length = len(data) - result = [] - i = j = 0 - while i < length: - # find a line and append it - while i < length: - code_point = _get_code_point(data, i) - if is_line_break_func(code_point): - break - i += 1 - - # skip the line break reading CRLF as one line break - eol = i - if i < length: - if i + 1 < length: - cur_cp = _get_code_point(data, i) - next_cp = _get_code_point(data, i + 1) - if _Py_ISCARRIAGERETURN(cur_cp) and _Py_ISLINEFEED(next_cp): - i += 1 - i += 1 - if keepends: - eol = i - - result.append(data[j:eol]) - j = i - - return result - - return impl - - -_ascii_splitlines = register_jitable(generate_splitlines_func(_Py_ISLINEBREAK)) -_unicode_splitlines = register_jitable(generate_splitlines_func( - _PyUnicode_IsLineBreak)) - - -# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L10196-L10229 # noqa: E501 -@overload_method(types.UnicodeType, 'splitlines') -def unicode_splitlines(data, keepends=False): - """Implements str.splitlines()""" - thety = keepends - # if the type is omitted, the concrete type is the value - if isinstance(keepends, types.Omitted): - thety = keepends.value - # if the type is optional, the concrete type is the captured type - elif isinstance(keepends, types.Optional): - thety = keepends.type - - accepted = (types.Integer, int, types.Boolean, bool) - if thety is not None and not isinstance(thety, accepted): - raise TypingError( - '"{}" must be {}, not {}'.format('keepends', accepted, keepends)) - - def splitlines_impl(data, keepends=False): - if data._is_ascii: - return _ascii_splitlines(data, keepends) - - return _unicode_splitlines(data, keepends) - - return splitlines_impl - - @register_jitable def join_list(sep, parts): parts_len = len(parts) diff --git a/numba/unicode_support.py b/numba/unicode_support.py index 917b0f99f43..480ab3be9b4 100644 --- a/numba/unicode_support.py +++ b/numba/unicode_support.py @@ -247,12 +247,6 @@ def _PyUnicode_IsUppercase(ch): return ctype.flags & _PyUnicode_TyperecordMasks.UPPER_MASK != 0 -@register_jitable -def _PyUnicode_IsLineBreak(ch): - ctype = _PyUnicode_gettyperecord(ch) - return ctype.flags & _PyUnicode_TyperecordMasks.LINEBREAK_MASK != 0 - - @register_jitable def _PyUnicode_ToUppercase(ch): raise NotImplementedError @@ -583,40 +577,6 @@ class _PY_CTF(IntEnum): ], dtype=np.uint8) -class _PY_CTF_LB(IntEnum): - LINE_BREAK = 0x01 - LINE_FEED = 0x02 - CARRIAGE_RETURN = 0x04 - - -_Py_ctype_islinebreak = np.array([ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - _PY_CTF_LB.LINE_BREAK | _PY_CTF_LB.LINE_FEED, # 0xa '\n' - _PY_CTF_LB.LINE_BREAK, # 0xb '\v' - _PY_CTF_LB.LINE_BREAK, # 0xc '\f' - _PY_CTF_LB.LINE_BREAK | _PY_CTF_LB.CARRIAGE_RETURN, # 0xd '\r' - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - _PY_CTF_LB.LINE_BREAK, # 0x1c '\x1c' - _PY_CTF_LB.LINE_BREAK, # 0x1d '\x1d' - _PY_CTF_LB.LINE_BREAK, # 0x1e '\x1e' - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - _PY_CTF_LB.LINE_BREAK, # 0x85 '\x85' - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, -], dtype=np.intc) - - # Translation of: # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/pymacro.h#L25 # noqa: E501 @register_jitable @@ -720,23 +680,5 @@ def _Py_ISSPACE(ch): return _Py_ctype_table[_Py_CHARMASK(ch)] & _PY_CTF.SPACE -@register_jitable -def _Py_ISLINEBREAK(ch): - """Check if character is ASCII line break""" - return _Py_ctype_islinebreak[_Py_CHARMASK(ch)] & _PY_CTF_LB.LINE_BREAK - - -@register_jitable -def _Py_ISLINEFEED(ch): - """Check if character is line feed `\n`""" - return _Py_ctype_islinebreak[_Py_CHARMASK(ch)] & _PY_CTF_LB.LINE_FEED - - -@register_jitable -def _Py_ISCARRIAGERETURN(ch): - """Check if character is carriage return `\r`""" - return _Py_ctype_islinebreak[_Py_CHARMASK(ch)] & _PY_CTF_LB.CARRIAGE_RETURN - - # End code related to/from CPython's pyctype # ------------------------------------------------------------------------------ From 51ba21c0937ac31920c205a678003a7a74846f95 Mon Sep 17 00:00:00 2001 From: Denis Date: Tue, 10 Dec 2019 09:28:40 +0300 Subject: [PATCH 49/68] Remove excess line --- numba/unicode_support.py | 1 - 1 file changed, 1 deletion(-) diff --git a/numba/unicode_support.py b/numba/unicode_support.py index 480ab3be9b4..1d1e8c4cd57 100644 --- a/numba/unicode_support.py +++ b/numba/unicode_support.py @@ -679,6 +679,5 @@ def _Py_ISSPACE(ch): """ return _Py_ctype_table[_Py_CHARMASK(ch)] & _PY_CTF.SPACE - # End code related to/from CPython's pyctype # ------------------------------------------------------------------------------ From aeff032a4e8c72090d1dbfc5260b7bb16c1a9a52 Mon Sep 17 00:00:00 2001 From: Denis Date: Tue, 10 Dec 2019 11:09:46 +0300 Subject: [PATCH 50/68] Remove excess import from unicode.py --- numba/unicode.py | 1 - 1 file changed, 1 deletion(-) diff --git a/numba/unicode.py b/numba/unicode.py index 8b91a7db8b5..b48974465c8 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1,4 +1,3 @@ -import sys import operator import sys From 0f7bd42e70087a0af4cdf7e3fb7f1962d630a207 Mon Sep 17 00:00:00 2001 From: mrubtsov Date: Mon, 30 Dec 2019 19:46:28 +0300 Subject: [PATCH 51/68] correction names in method --- numba/unicode.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index c7dc5cd07cb..fdda0201146 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1539,26 +1539,26 @@ def unicode_replace(s, old_str, new_str, count=-1): raise TypingError('The object must be a UnicodeType.' ' Given: {}'.format(new_str)) - def impl(s, old, new, count=-1): + def impl(s, old_str, new_str, count=-1): if count == 0: return s - if old == '': + if old_str == '': schars = list(s) if count == -1: - return new + new.join(schars) + new - split_result = [new] + return new_str + new_str.join(schars) + new_str + split_result = [new_str] min_count = min(len(schars), count) for i in range(min_count): split_result.append(schars[i]) if i + 1 != min_count: - split_result.append(new) + split_result.append(new_str) else: split_result.append(''.join(schars[(i + 1):])) if count > len(schars): - split_result.append(new) + split_result.append(new_str) return ''.join(split_result) - schars = s.split(old, count) - result = new.join(schars) + schars = s.split(old_str, count) + result = new_str.join(schars) return result return impl From aa63a0c1efdee2b1cea6c5b045b88e37d364fa39 Mon Sep 17 00:00:00 2001 From: Denis Date: Fri, 15 Nov 2019 09:54:12 +0300 Subject: [PATCH 52/68] Implement str.splitlines() based on CPython Conflicts: docs/source/reference/pysupported.rst numba/tests/test_unicode.py numba/unicode.py --- numba/unicode.py | 68 ++++++++++++++++++++++++++++++++++++++++ numba/unicode_support.py | 59 ++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+) diff --git a/numba/unicode.py b/numba/unicode.py index f3baeab3b62..e91676e2cee 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -48,6 +48,8 @@ _PyUnicode_IsAlpha, _PyUnicode_IsNumeric, _Py_ISALPHA, _PyUnicode_IsDigit, _PyUnicode_IsDecimalDigit) + _PyUnicode_IsTitlecase, _Py_ISLOWER, _Py_ISUPPER) + _py38_or_later = sys.version_info[:2] >= (3, 8) @@ -1250,6 +1252,72 @@ def rjust_impl(string, width, fillchar=' '): return rjust_impl +def generate_splitlines_func(is_line_break_func): + """Generate splitlines performer based on ascii or unicode line breaks.""" + def impl(data, keepends): + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/stringlib/split.h#L335-L389 # noqa: E501 + length = len(data) + result = [] + i = j = 0 + while i < length: + # find a line and append it + while i < length: + code_point = _get_code_point(data, i) + if is_line_break_func(code_point): + break + i += 1 + + # skip the line break reading CRLF as one line break + eol = i + if i < length: + if i + 1 < length: + cur_cp = _get_code_point(data, i) + next_cp = _get_code_point(data, i + 1) + if _Py_ISCARRIAGERETURN(cur_cp) and _Py_ISLINEFEED(next_cp): + i += 1 + i += 1 + if keepends: + eol = i + + result.append(data[j:eol]) + j = i + + return result + + return impl + + +_ascii_splitlines = register_jitable(generate_splitlines_func(_Py_ISLINEBREAK)) +_unicode_splitlines = register_jitable(generate_splitlines_func( + _PyUnicode_IsLineBreak)) + + +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L10196-L10229 # noqa: E501 +@overload_method(types.UnicodeType, 'splitlines') +def unicode_splitlines(data, keepends=False): + """Implements str.splitlines()""" + thety = keepends + # if the type is omitted, the concrete type is the value + if isinstance(keepends, types.Omitted): + thety = keepends.value + # if the type is optional, the concrete type is the captured type + elif isinstance(keepends, types.Optional): + thety = keepends.type + + accepted = (types.Integer, int, types.Boolean, bool) + if thety is not None and not isinstance(thety, accepted): + raise TypingError( + '"{}" must be {}, not {}'.format('keepends', accepted, keepends)) + + def splitlines_impl(data, keepends=False): + if data._is_ascii: + return _ascii_splitlines(data, keepends) + + return _unicode_splitlines(data, keepends) + + return splitlines_impl + + @register_jitable def join_list(sep, parts): parts_len = len(parts) diff --git a/numba/unicode_support.py b/numba/unicode_support.py index 8d9a61d26bd..d170102e0cf 100644 --- a/numba/unicode_support.py +++ b/numba/unicode_support.py @@ -250,6 +250,12 @@ def _PyUnicode_IsUppercase(ch): return ctype.flags & _PyUnicode_TyperecordMasks.UPPER_MASK != 0 +@register_jitable +def _PyUnicode_IsLineBreak(ch): + ctype = _PyUnicode_gettyperecord(ch) + return ctype.flags & _PyUnicode_TyperecordMasks.LINEBREAK_MASK != 0 + + @register_jitable def _PyUnicode_ToUppercase(ch): raise NotImplementedError @@ -596,6 +602,40 @@ class _PY_CTF(IntEnum): ], dtype=np.uint8) +class _PY_CTF_LB(IntEnum): + LINE_BREAK = 0x01 + LINE_FEED = 0x02 + CARRIAGE_RETURN = 0x04 + + +_Py_ctype_islinebreak = np.array([ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + _PY_CTF_LB.LINE_BREAK | _PY_CTF_LB.LINE_FEED, # 0xa '\n' + _PY_CTF_LB.LINE_BREAK, # 0xb '\v' + _PY_CTF_LB.LINE_BREAK, # 0xc '\f' + _PY_CTF_LB.LINE_BREAK | _PY_CTF_LB.CARRIAGE_RETURN, # 0xd '\r' + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + _PY_CTF_LB.LINE_BREAK, # 0x1c '\x1c' + _PY_CTF_LB.LINE_BREAK, # 0x1d '\x1d' + _PY_CTF_LB.LINE_BREAK, # 0x1e '\x1e' + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + _PY_CTF_LB.LINE_BREAK, # 0x85 '\x85' + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, +], dtype=np.intc) + + # Translation of: # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Include/pymacro.h#L25 # noqa: E501 @register_jitable @@ -698,5 +738,24 @@ def _Py_ISSPACE(ch): """ return _Py_ctype_table[_Py_CHARMASK(ch)] & _PY_CTF.SPACE + +@register_jitable +def _Py_ISLINEBREAK(ch): + """Check if character is ASCII line break""" + return _Py_ctype_islinebreak[_Py_CHARMASK(ch)] & _PY_CTF_LB.LINE_BREAK + + +@register_jitable +def _Py_ISLINEFEED(ch): + """Check if character is line feed `\n`""" + return _Py_ctype_islinebreak[_Py_CHARMASK(ch)] & _PY_CTF_LB.LINE_FEED + + +@register_jitable +def _Py_ISCARRIAGERETURN(ch): + """Check if character is carriage return `\r`""" + return _Py_ctype_islinebreak[_Py_CHARMASK(ch)] & _PY_CTF_LB.CARRIAGE_RETURN + + # End code related to/from CPython's pyctype # ------------------------------------------------------------------------------ From a9745952dbff01082f445c79de58106b1bb597f8 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Tue, 31 Dec 2019 11:07:31 +0000 Subject: [PATCH 53/68] Fix up --- numba/tests/test_unicode.py | 3 ++- numba/unicode.py | 20 ++------------------ 2 files changed, 4 insertions(+), 19 deletions(-) diff --git a/numba/tests/test_unicode.py b/numba/tests/test_unicode.py index 81392a12427..3df0cb8a6f0 100644 --- a/numba/tests/test_unicode.py +++ b/numba/tests/test_unicode.py @@ -186,6 +186,7 @@ def endswith_with_start_only_usecase(x, y, start): def endswith_with_start_end_usecase(x, y, start, end): return x.endswith(y, start, end) + def split_usecase(x, y): return x.split(y) @@ -840,7 +841,7 @@ def test_count_with_start_end(self): "'{0}'.c_count('{1}', {2}, {3}) = {5}") for s, sub in UNICODE_COUNT_EXAMPLES: - for i , j in product(range(-18, 18), (-18, 18)): + for i, j in product(range(-18, 18), (-18, 18)): py_result = pyfunc(s, sub, i, j) c_result = cfunc(s, sub, i, j) self.assertEqual(py_result, c_result, diff --git a/numba/unicode.py b/numba/unicode.py index e91676e2cee..b947f122bdf 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -48,7 +48,6 @@ _PyUnicode_IsAlpha, _PyUnicode_IsNumeric, _Py_ISALPHA, _PyUnicode_IsDigit, _PyUnicode_IsDecimalDigit) - _PyUnicode_IsTitlecase, _Py_ISLOWER, _Py_ISUPPER) _py38_or_later = sys.version_info[:2] >= (3, 8) @@ -530,23 +529,6 @@ def contains_impl(a, b): return contains_impl -# https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L9342-L9354 # noqa: E501 -@register_jitable -def _adjust_indices(length, start, end): - if end > length: - end = length - if end < 0: - end += length - if end < 0: - end = 0 - if start < 0: - start += length - if start < 0: - start = 0 - - return start, end - - def unicode_idx_check_type(ty, name): """Check object belongs to one of specific types ty: type @@ -2072,6 +2054,8 @@ def impl(data): return res + return impl + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12017-L12045 # noqa: E501 @overload_method(types.UnicodeType, 'isdecimal') From d3188b50a6a05628c8f704a75e9dbf44f3d817e2 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Tue, 31 Dec 2019 11:21:18 +0000 Subject: [PATCH 54/68] Refactor ljust/rjust --- numba/unicode.py | 90 ++++++++++++++++++------------------------------ 1 file changed, 34 insertions(+), 56 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index b947f122bdf..855fbaee279 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1172,66 +1172,44 @@ def center_impl(string, width, fillchar=' '): return center_impl -@overload_method(types.UnicodeType, 'ljust') -def unicode_ljust(string, width, fillchar=' '): - if not isinstance(width, types.Integer): - raise TypingError('The width must be an Integer') - - if isinstance(fillchar, types.UnicodeCharSeq): - def ljust_impl(string, width, fillchar=' '): - return string.ljust(width, str(fillchar)) - return ljust_impl - - if not (fillchar == ' ' or isinstance( - fillchar, (types.Omitted, types.UnicodeType))): - raise TypingError('The fillchar must be a UnicodeType') - - def ljust_impl(string, width, fillchar=' '): - str_len = len(string) - fillchar_len = len(fillchar) - - if fillchar_len != 1: - raise ValueError('The fill character must be exactly one ' - 'character long') - - if width <= str_len: - return string - - newstr = string + (fillchar * (width - str_len)) - - return newstr - return ljust_impl - - -@overload_method(types.UnicodeType, 'rjust') -def unicode_rjust(string, width, fillchar=' '): - if not isinstance(width, types.Integer): - raise TypingError('The width must be an Integer') - - if isinstance(fillchar, types.UnicodeCharSeq): - def rjust_impl(string, width, fillchar=' '): - return string.rjust(width, str(fillchar)) - return rjust_impl - - if not (fillchar == ' ' or - isinstance(fillchar, (types.Omitted, types.UnicodeType))): - raise TypingError('The fillchar must be a UnicodeType') - - def rjust_impl(string, width, fillchar=' '): - str_len = len(string) - fillchar_len = len(fillchar) +def gen_unicode_Xjust(STRING_FIRST): + def unicode_Xjust(string, width, fillchar=' '): + if not isinstance(width, types.Integer): + raise TypingError('The width must be an Integer') + + if isinstance(fillchar, types.UnicodeCharSeq): + def rjust_impl(string, width, fillchar=' '): + return string.rjust(width, str(fillchar)) + return rjust_impl + + if not (fillchar == ' ' or + isinstance(fillchar, (types.Omitted, types.UnicodeType))): + raise TypingError('The fillchar must be a UnicodeType') + + def impl(string, width, fillchar=' '): + str_len = len(string) + fillchar_len = len(fillchar) + + if fillchar_len != 1: + raise ValueError('The fill character must be exactly one ' + 'character long') + + if width <= str_len: + return string + + newstr = (fillchar * (width - str_len)) + if STRING_FIRST: + return string + newstr + else: + return newstr + string - if fillchar_len != 1: - raise ValueError('The fill character must be exactly one ' - 'character long') + return impl - if width <= str_len: - return string + return unicode_Xjust - newstr = (fillchar * (width - str_len)) + string - return newstr - return rjust_impl +overload_method(types.UnicodeType, 'rjust')(gen_unicode_Xjust(False)) +overload_method(types.UnicodeType, 'ljust')(gen_unicode_Xjust(True)) def generate_splitlines_func(is_line_break_func): From e32b88f06a0032164e3333e22cf11ea94bee88c0 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Tue, 31 Dec 2019 11:43:28 +0000 Subject: [PATCH 55/68] Refactor some isX methods --- docs/source/reference/pysupported.rst | 54 +++++------ numba/unicode.py | 127 +++++++------------------- 2 files changed, 59 insertions(+), 122 deletions(-) diff --git a/docs/source/reference/pysupported.rst b/docs/source/reference/pysupported.rst index e5bda3b997f..af0bcdc9e25 100644 --- a/docs/source/reference/pysupported.rst +++ b/docs/source/reference/pysupported.rst @@ -248,46 +248,46 @@ The following functions, attributes and methods are currently supported: * ``*`` (repetition of strings) * ``in``, ``.contains()`` * ``==``, ``<``, ``<=``, ``>``, ``>=`` (comparison) +* ``.capitalize()`` * ``.casefold()`` -* ``.startswith()`` +* ``.center()`` +* ``.count()`` +* ``.endswith()`` * ``.endswith()`` * ``.expandtabs()`` -* ``.isspace()`` -* ``.isidentifier()`` * ``.find()`` -* ``.center()`` -* ``.ljust()`` -* ``.rjust()`` -* ``.split()`` -* ``.splitlines()`` -* ``.rsplit()`` -* ``.join()`` -* ``.lstrip()`` -* ``.rstrip()`` -* ``.strip()`` -* ``.capitalize()`` -* ``.isupper()`` -* ``.upper()`` -* ``.isnumeric()`` -* ``.isdigit()`` +* ``.index()`` +* ``.isalnum()`` +* ``.isalpha()`` * ``.isdecimal()`` +* ``.isdigit()`` +* ``.isidentifier()`` * ``.islower()`` -* ``.lower()`` -* ``.partition()`` +* ``.isnumeric()`` * ``.isprintable()`` -* ``.zfill()`` -* ``.rpartition()`` -* ``.count()`` +* ``.isspace()`` * ``.istitle()`` +* ``.isupper()`` +* ``.join()`` +* ``.ljust()`` +* ``.lower()`` +* ``.lstrip()`` +* ``.partition()`` * ``.replace()`` * ``.rfind()`` * ``.rindex()`` -* ``.index()`` +* ``.rjust()`` +* ``.rpartition()`` +* ``.rsplit()`` +* ``.rstrip()`` +* ``.split()`` +* ``.splitlines()`` +* ``.startswith()`` +* ``.strip()`` * ``.swapcase()`` * ``.title()`` -* ``.isalpha()`` -* ``.isalnum()`` -* ``.endswith()`` +* ``.upper()`` +* ``.zfill()`` Additional operations as well as support for Python 2 strings / Python 3 bytes will be added in a future version of Numba. Python 2 Unicode objects will diff --git a/numba/unicode.py b/numba/unicode.py index 855fbaee279..919a6572523 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1938,72 +1938,52 @@ def impl(a): return ret return impl +# generator for simple unicode "isX" methods -# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11896-L11925 # noqa: E501 -@overload_method(types.UnicodeType, 'isspace') -def unicode_isspace(data): - """Implements UnicodeType.isspace()""" - - def impl(data): - length = len(data) - if length == 1: - return _PyUnicode_IsSpace(_get_code_point(data, 0)) - if length == 0: - return False +def gen_isX(_PyUnicode_IS_func, empty_is_false=True): + def unicode_isX(data): + def impl(data): + length = len(data) + if length == 1: + return _PyUnicode_IS_func(_get_code_point(data, 0)) - for i in range(length): - code_point = _get_code_point(data, i) - if not _PyUnicode_IsSpace(code_point): + if empty_is_false and length == 0: return False - return True - - return impl - - -# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12096-L12124 # noqa: E501 -@overload_method(types.UnicodeType, 'isnumeric') -def unicode_isnumeric(data): - """Implements UnicodeType.isnumeric()""" - def impl(data): - length = len(data) - if length == 1: - return _PyUnicode_IsNumeric(_get_code_point(data, 0)) + for i in range(length): + code_point = _get_code_point(data, i) + if not _PyUnicode_IS_func(code_point): + return False - if length == 0: - return False + return True - for i in range(length): - if not _PyUnicode_IsNumeric(_get_code_point(data, i)): - return False + return impl + return unicode_isX - return True - return impl +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11896-L11925 # noqa: E501 +overload_method(types.UnicodeType, 'isspace')(gen_isX(_PyUnicode_IsSpace)) +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12096-L12124 # noqa: E501 +overload_method(types.UnicodeType, 'isnumeric')(gen_isX(_PyUnicode_IsNumeric)) # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12056-L12085 # noqa: E501 -@overload_method(types.UnicodeType, 'isdigit') -def unicode_isdigit(data): - """Implements UnicodeType.isdigit()""" - - def impl(data): - length = len(data) +overload_method(types.UnicodeType, 'isdigit')(gen_isX(_PyUnicode_IsDigit)) - if length == 1: - ch = _get_code_point(data, 0) - return _PyUnicode_IsDigit(ch) - if length == 0: - return False - - for i in range(length): - if not _PyUnicode_IsDigit(_get_code_point(data, i)): - return False - - return True +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12017-L12045 # noqa: E501 +overload_method( + types.UnicodeType, + 'isdecimal')( + gen_isX(_PyUnicode_IsDecimalDigit)) - return impl +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12188-L12213 # noqa: E501 +overload_method( + types.UnicodeType, + 'isprintable')( + gen_isX( + _PyUnicode_IsPrintable, + False)) def generate_operation_func(ascii_func, unicode_nres_func): @@ -2035,29 +2015,6 @@ def impl(data): return impl -# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12017-L12045 # noqa: E501 -@overload_method(types.UnicodeType, 'isdecimal') -def unicode_isdecimal(data): - """Implements UnicodeType.isdecimal()""" - - def impl(data): - length = len(data) - - if length == 1: - return _PyUnicode_IsDecimalDigit(_get_code_point(data, 0)) - - if length == 0: - return False - - for i in range(length): - if not _PyUnicode_IsDecimalDigit(_get_code_point(data, i)): - return False - - return True - - return impl - - @register_jitable def _unicode_casefold_doer(data, length, res, maxchars): k = 0 @@ -2122,26 +2079,6 @@ def impl(s): return impl -# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12188-L12213 # noqa: E501 -@overload_method(types.UnicodeType, 'isprintable') -def unicode_isprintable(data): - """Implements UnicodeType.isprintable()""" - - def impl(data): - length = len(data) - if length == 1: - return _PyUnicode_IsPrintable(_get_code_point(data, 0)) - - for i in range(length): - code_point = _get_code_point(data, i) - if not _PyUnicode_IsPrintable(code_point): - return False - - return True - - return impl - - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11975-L12006 # noqa: E501 @overload_method(types.UnicodeType, 'isalnum') def unicode_isalnum(data): From 1561eb02eb7926e87fbab978846f7de1db21128e Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Tue, 31 Dec 2019 12:01:13 +0000 Subject: [PATCH 56/68] Factor out isalX methods As title. --- numba/unicode.py | 89 +++++++++++++++++++----------------------------- 1 file changed, 35 insertions(+), 54 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index 919a6572523..d64f9663456 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1711,34 +1711,50 @@ def impl(a): return impl -# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11928-L11964 # noqa: E501 -@overload_method(types.UnicodeType, 'isalpha') -def unicode_isalpha(data): - """Implements UnicodeType.isalpha()""" +# generates isalpha/isalnum +def gen_isAlX(ascii_func, unicode_func): + def unicode_isAlX(data): - def impl(data): - length = len(data) - if length == 0: - return False + def impl(data): + length = len(data) + if length == 0: + return False - if length == 1: - code_point = _get_code_point(data, 0) - return _PyUnicode_IsAlpha(code_point) + if length == 1: + code_point = _get_code_point(data, 0) + if data._is_ascii: + return ascii_func(code_point) + else: + return unicode_func(code_point) + + if data._is_ascii: + for i in range(length): + code_point = _get_code_point(data, i) + if not ascii_func(code_point): + return False - if data._is_ascii: for i in range(length): code_point = _get_code_point(data, i) - if not _Py_ISALPHA(code_point): + if not unicode_func(code_point): return False - for i in range(length): - code_point = _get_code_point(data, i) - if not _PyUnicode_IsAlpha(code_point): - return False + return True - return True + return impl + return unicode_isAlX - return impl + +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11928-L11964 # noqa: E501 +overload_method(types.UnicodeType, 'isalpha')(gen_isAlX(_Py_ISALPHA, + _PyUnicode_IsAlpha)) + +_unicode_is_alnum = register_jitable(lambda x: + (_PyUnicode_IsNumeric(x) or + _PyUnicode_IsAlpha(x))) + +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11975-L12006 # noqa: E501 +overload_method(types.UnicodeType, 'isalnum')(gen_isAlX(_Py_ISALNUM, + _unicode_is_alnum)) # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L10765-L10774 # noqa: E501 @@ -2079,41 +2095,6 @@ def impl(s): return impl -# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11975-L12006 # noqa: E501 -@overload_method(types.UnicodeType, 'isalnum') -def unicode_isalnum(data): - """Implements UnicodeType.isalnum()""" - - def impl(data): - length = len(data) - - if length == 1: - code_point = _get_code_point(data, 0) - if data._is_ascii: - return _Py_ISALNUM(code_point) - return (_PyUnicode_IsNumeric(code_point) or - _PyUnicode_IsAlpha(code_point)) - - if length == 0: - return False - - if data._is_ascii: - for i in range(length): - code_point = _get_code_point(data, i) - if not _Py_ISALNUM(code_point): - return False - - for i in range(length): - code_point = _get_code_point(data, i) - if (not _PyUnicode_IsNumeric(code_point) and - not _PyUnicode_IsAlpha(code_point)): - return False - - return True - - return impl - - if sys.version_info[:2] >= (3, 7): @overload_method(types.UnicodeType, 'isascii') def unicode_isascii(data): From c98755840fb227351c9906db5412df97d95ae1fc Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Tue, 31 Dec 2019 12:13:41 +0000 Subject: [PATCH 57/68] loop hoist invariant --- numba/unicode.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index d64f9663456..246bfad8295 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1094,22 +1094,23 @@ def rsplit_impl(data, sep=None, maxsplit=-1): def _rsplit_char(data, ch, maxsplit): # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/stringlib/split.h#L242-L284 # noqa: E501 result = [] + ch_code_point = _get_code_point(ch, 0) i = j = len(data) - 1 while i >= 0 and maxsplit > 0: data_code_point = _get_code_point(data, i) - ch_code_point = _get_code_point(ch, 0) if data_code_point == ch_code_point: - result.append(data[i + 1:j + 1]) + result.append(data[i + 1 : j + 1]) j = i = i - 1 maxsplit -= 1 i -= 1 if j >= -1: - result.append(data[0:j + 1]) + result.append(data[0 : j + 1]) return result[::-1] if maxsplit < 0: maxsplit = sys.maxsize + sep_length = len(sep) if sep_length == 0: From 9c80831d2f739ec20fe8ef40251d785870cb19b6 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Tue, 31 Dec 2019 12:48:42 +0000 Subject: [PATCH 58/68] Fix bug --- numba/unicode.py | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index 246bfad8295..8847a374c51 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -2068,44 +2068,52 @@ def unicode_casefold(data): """Implements str.casefold()""" return _do_casefold +if sys.version_info[:2] >= (3, 7): + @overload_method(types.UnicodeType, 'isascii') + def unicode_isascii(data): + """Implements UnicodeType.isascii()""" + + def impl(data): + return data._is_ascii + return impl @overload_method(types.UnicodeType, 'istitle') -def unicode_istitle(s): +def unicode_istitle(data): """ Implements UnicodeType.istitle() The algorithm is an approximate translation from CPython: https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11829-L11885 # noqa: E501 """ - def impl(s): + def impl(data): + length = len(data) + if length == 1: + char = _get_code_point(data, 0) + return _PyUnicode_IsUppercase(char) or _PyUnicode_IsTitlecase(char) + + if length == 0: + return False + cased = False previous_is_cased = False - for char in s: + for idx in range(length): + char = _get_code_point(data, idx) if _PyUnicode_IsUppercase(char) or _PyUnicode_IsTitlecase(char): if previous_is_cased: return False - cased = True previous_is_cased = True + cased = True elif _PyUnicode_IsLowercase(char): if not previous_is_cased: return False + previous_is_cased = True + cased = True else: previous_is_cased = False return cased return impl - -if sys.version_info[:2] >= (3, 7): - @overload_method(types.UnicodeType, 'isascii') - def unicode_isascii(data): - """Implements UnicodeType.isascii()""" - - def impl(data): - return data._is_ascii - return impl - - @overload_method(types.UnicodeType, 'islower') def unicode_islower(data): """ From 32e8372418e9e1aac1931c29cf2fb1c81cf837af Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Tue, 31 Dec 2019 13:04:07 +0000 Subject: [PATCH 59/68] Fix flake8 --- numba/unicode.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/numba/unicode.py b/numba/unicode.py index 8847a374c51..726def15159 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -2068,6 +2068,7 @@ def unicode_casefold(data): """Implements str.casefold()""" return _do_casefold + if sys.version_info[:2] >= (3, 7): @overload_method(types.UnicodeType, 'isascii') def unicode_isascii(data): @@ -2077,6 +2078,7 @@ def impl(data): return data._is_ascii return impl + @overload_method(types.UnicodeType, 'istitle') def unicode_istitle(data): """ @@ -2114,6 +2116,7 @@ def impl(data): return cased return impl + @overload_method(types.UnicodeType, 'islower') def unicode_islower(data): """ From a37c1a089368551230976a88b28d81490aff087b Mon Sep 17 00:00:00 2001 From: Siu Kwan Lam Date: Tue, 31 Dec 2019 10:40:10 -0600 Subject: [PATCH 60/68] Fix ljust --- numba/unicode.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index 726def15159..3eb89de4463 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1179,9 +1179,14 @@ def unicode_Xjust(string, width, fillchar=' '): raise TypingError('The width must be an Integer') if isinstance(fillchar, types.UnicodeCharSeq): - def rjust_impl(string, width, fillchar=' '): - return string.rjust(width, str(fillchar)) - return rjust_impl + if STRING_FIRST: + def ljust_impl(string, width, fillchar=' '): + return string.ljust(width, str(fillchar)) + return ljust_impl + else: + def rjust_impl(string, width, fillchar=' '): + return string.rjust(width, str(fillchar)) + return rjust_impl if not (fillchar == ' ' or isinstance(fillchar, (types.Omitted, types.UnicodeType))): From d2584f6eab4aa43430e9a95a033d33f3a0fe853f Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Thu, 2 Jan 2020 10:20:51 +0000 Subject: [PATCH 61/68] Fix pep8 --- numba/unicode.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index 3eb89de4463..d0cb17181f1 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1994,18 +1994,12 @@ def impl(data): overload_method(types.UnicodeType, 'isdigit')(gen_isX(_PyUnicode_IsDigit)) # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12017-L12045 # noqa: E501 -overload_method( - types.UnicodeType, - 'isdecimal')( - gen_isX(_PyUnicode_IsDecimalDigit)) +overload_method(types.UnicodeType, 'isdecimal')( + gen_isX(_PyUnicode_IsDecimalDigit)) # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12188-L12213 # noqa: E501 -overload_method( - types.UnicodeType, - 'isprintable')( - gen_isX( - _PyUnicode_IsPrintable, - False)) +overload_method(types.UnicodeType, 'isprintable')( + gen_isX(_PyUnicode_IsPrintable, False)) def generate_operation_func(ascii_func, unicode_nres_func): From 5ab03f32a53dd525882216b706e5e469f0582c86 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Thu, 2 Jan 2020 10:44:12 +0000 Subject: [PATCH 62/68] refactor --- numba/unicode.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index d0cb17181f1..e210c69fa51 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -2001,8 +2001,8 @@ def impl(data): overload_method(types.UnicodeType, 'isprintable')( gen_isX(_PyUnicode_IsPrintable, False)) - -def generate_operation_func(ascii_func, unicode_nres_func): +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9863-L9908 # noqa: E501 +def case_operation(ascii_func, unicode_func): """Generate common case operation performer.""" def impl(data): length = len(data) @@ -2012,14 +2012,13 @@ def impl(data): if data._is_ascii: res = _empty_string(data._kind, length, 1) ascii_func(data, res) - return res # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9863-L9908 # noqa: E501 tmp = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * length, data._is_ascii) # maxchar should be inside of a list to be pass as argument by reference maxchars = [0] - newlength = unicode_nres_func(data, length, tmp, maxchars) + newlength = unicode_func(data, length, tmp, maxchars) maxchar = maxchars[0] newkind = _codepoint_to_kind(maxchar) res = _empty_string(newkind, newlength, _codepoint_is_ascii(maxchar)) @@ -2030,9 +2029,9 @@ def impl(data): return impl - +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9819-L9834 # noqa: E501 @register_jitable -def _unicode_casefold_doer(data, length, res, maxchars): +def _unicode_casefold(data, length, res, maxchars): k = 0 mapped = np.zeros(3, dtype=_Py_UCS4) for idx in range(length): @@ -2049,23 +2048,19 @@ def _unicode_casefold_doer(data, length, res, maxchars): @register_jitable -def _ascii_casefold_doer(data, res): +def _ascii_casefold(data, res): for idx in range(len(data)): code_point = _get_code_point(data, idx) _set_code_point(res, idx, _Py_TOLOWER(code_point)) -_do_casefold = register_jitable(generate_operation_func(_ascii_casefold_doer, - _unicode_casefold_doer)) - - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L10782-L10791 # noqa: E501 # mixed with # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9819-L9834 # noqa: E501 @overload_method(types.UnicodeType, 'casefold') def unicode_casefold(data): """Implements str.casefold()""" - return _do_casefold + return case_operation(_ascii_casefold, _unicode_casefold) if sys.version_info[:2] >= (3, 7): From 21a3f70c68e712b90753d7b136cd263a2259494b Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Thu, 2 Jan 2020 11:04:54 +0000 Subject: [PATCH 63/68] Fix title() to use ascii shortcut --- numba/unicode.py | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index e210c69fa51..34948d211d9 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -2001,6 +2001,7 @@ def impl(data): overload_method(types.UnicodeType, 'isprintable')( gen_isX(_PyUnicode_IsPrintable, False)) + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9863-L9908 # noqa: E501 def case_operation(ascii_func, unicode_func): """Generate common case operation performer.""" @@ -2174,11 +2175,11 @@ def _lower_ucs4(code_point, data, length, idx, mapped): # https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L9996-L10021 # noqa: E501 @register_jitable -def _do_title(data, length, res, maxchars): +def _unicode_title(data, length, res, maxchars): """This is a translation of the function that titles a unicode string.""" k = 0 previous_cased = False - mapped = np.zeros(3, dtype=_Py_UCS4) + mapped = np.empty(3, dtype=_Py_UCS4) for idx in range(length): mapped.fill(0) code_point = _get_code_point(data, idx) @@ -2195,25 +2196,32 @@ def _do_title(data, length, res, maxchars): return k +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/bytes_methods.c#L332-L352 # noqa: E501 +@register_jitable +def _ascii_title(data, res): + """ Does .title() on an ASCII string """ + previous_is_cased = False + for idx in range(len(data)): + code_point = _get_code_point(data, idx) + if _Py_ISLOWER(code_point): + if not previous_is_cased: + code_point = _Py_TOUPPER(code_point) + previous_is_cased = True + elif _Py_ISUPPER(code_point): + if previous_is_cased: + code_point = _Py_TOLOWER(code_point) + previous_is_cased = True + else: + previous_is_cased = False + _set_code_point(res, idx, code_point) + + # https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L10023-L10069 # noqa: E501 @overload_method(types.UnicodeType, 'title') def unicode_title(data): """Implements str.title()""" # https://docs.python.org/3/library/stdtypes.html#str.title - def impl(data): - length = len(data) - tmp = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * length, data._is_ascii) - # maxchar should be inside of a list to be pass as argument by reference - maxchar = 0 - maxchars = [maxchar] - newlength = _do_title(data, length, tmp, maxchars) - maxchar, = maxchars - newkind = _codepoint_to_kind(maxchar) - res = _empty_string(newkind, newlength, _codepoint_is_ascii(maxchar)) - for i in range(newlength): - _set_code_point(res, i, _get_code_point(tmp, i)) - return res - return impl + return case_operation(_ascii_title, _unicode_title) # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L13140-L13147 # noqa: E501 From a848a1e40c405370f4b758af85ae33c0529aab14 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Thu, 2 Jan 2020 13:51:18 +0000 Subject: [PATCH 64/68] Refactor unicode.capitalize --- numba/unicode.py | 87 ++++++++++++++++++++---------------------------- 1 file changed, 37 insertions(+), 50 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index 34948d211d9..5fc67c55275 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1763,63 +1763,50 @@ def impl(data): _unicode_is_alnum)) -# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L10765-L10774 # noqa: E501 -@overload_method(types.UnicodeType, 'capitalize') -def unicode_capitalize(data): - """Implements str.capitalize()""" - def impl(data): - length = len(data) - if length == 0: - return _empty_string(data._kind, length, data._is_ascii) - - if data._is_ascii: - # https://github.com/python/cpython/blob/593bb30e82eded7f2ec02f7d1aa49742e6962113/Objects/bytes_methods.c#L361-L368 # noqa: E501 - # mixed with: - # https://github.com/python/cpython/blob/593bb30e82eded7f2ec02f7d1aa49742e6962113/Objects/bytes_methods.c#L299-L307 # noqa: E501 - res = _empty_string(data._kind, length, 1) - code_point = _get_code_point(data, 0) - _set_code_point(res, 0, _Py_TOUPPER(code_point)) - for idx in range(1, length): - code_point = _get_code_point(data, idx) - _set_code_point(res, idx, _Py_TOLOWER(code_point)) - - return res - - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9863-L9908 # noqa: E501 - # mixed with: - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9737-L9759 # noqa: E501 - k = 0 - maxchar = 0 - mapped = np.zeros(3, dtype=_Py_UCS4) - tmp = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * length) - code_point = _get_code_point(data, 0) +@register_jitable +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9737-L9759 # noqa: E501 +def _unicode_capitalize(data, length, res, maxchars): + k = 0 + maxchar = 0 + mapped = np.zeros(3, dtype=_Py_UCS4) + code_point = _get_code_point(data, 0) - # https://github.com/python/cpython/commit/b015fc86f7b1f35283804bfee788cce0a5495df7/Objects/unicodeobject.c#diff-220e5da0d1c8abf508b25c02da6ca16c # noqa: E501 - if _py38_or_later: - n_res = _PyUnicode_ToTitleFull(code_point, mapped) - else: - n_res = _PyUnicode_ToUpperFull(code_point, mapped) + # https://github.com/python/cpython/commit/b015fc86f7b1f35283804bfee788cce0a5495df7/Objects/unicodeobject.c#diff-220e5da0d1c8abf508b25c02da6ca16c # noqa: E501 + if _py38_or_later: + n_res = _PyUnicode_ToTitleFull(code_point, mapped) + else: + n_res = _PyUnicode_ToUpperFull(code_point, mapped) + for m in mapped[:n_res]: + maxchar = max(maxchar, m) + _set_code_point(res, k, m) + k += 1 + for idx in range(1, length): + mapped.fill(0) + code_point = _get_code_point(data, idx) + n_res = _lower_ucs4(code_point, data, length, idx, mapped) for m in mapped[:n_res]: maxchar = max(maxchar, m) - _set_code_point(tmp, k, m) + _set_code_point(res, k, m) k += 1 - for idx in range(1, length): - mapped.fill(0) - code_point = _get_code_point(data, idx) - n_res = _lower_ucs4(code_point, data, length, idx, mapped) - for m in mapped[:n_res]: - maxchar = max(maxchar, m) - _set_code_point(tmp, k, m) - k += 1 - newkind = _codepoint_to_kind(maxchar) - res = _empty_string(newkind, k) - for i in range(k): - _set_code_point(res, i, _get_code_point(tmp, i)) + maxchars[0] = maxchar + return k - return res - return impl +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/bytes_methods.c#L361-L382 # noqa: E501 +@register_jitable +def _ascii_capitalize(data, res): + code_point = _get_code_point(data, 0) + _set_code_point(res, 0, _Py_TOUPPER(code_point)) + for idx in range(1, len(data)): + code_point = _get_code_point(data, idx) + _set_code_point(res, idx, _Py_TOLOWER(code_point)) + + +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L10765-L10774 # noqa: E501 +@overload_method(types.UnicodeType, 'capitalize') +def unicode_capitalize(data): + return case_operation(_ascii_capitalize, _unicode_capitalize) def _is_upper(is_lower, is_upper, is_title): From c785737b1cb9996f5877306c7280e619cd397336 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Thu, 2 Jan 2020 14:08:19 +0000 Subject: [PATCH 65/68] Fix ordering of functions --- numba/unicode.py | 242 +++++++++++++++++++++++------------------------ 1 file changed, 121 insertions(+), 121 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index 5fc67c55275..408e69865f8 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1763,52 +1763,6 @@ def impl(data): _unicode_is_alnum)) -@register_jitable -# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9737-L9759 # noqa: E501 -def _unicode_capitalize(data, length, res, maxchars): - k = 0 - maxchar = 0 - mapped = np.zeros(3, dtype=_Py_UCS4) - code_point = _get_code_point(data, 0) - - # https://github.com/python/cpython/commit/b015fc86f7b1f35283804bfee788cce0a5495df7/Objects/unicodeobject.c#diff-220e5da0d1c8abf508b25c02da6ca16c # noqa: E501 - if _py38_or_later: - n_res = _PyUnicode_ToTitleFull(code_point, mapped) - else: - n_res = _PyUnicode_ToUpperFull(code_point, mapped) - - for m in mapped[:n_res]: - maxchar = max(maxchar, m) - _set_code_point(res, k, m) - k += 1 - for idx in range(1, length): - mapped.fill(0) - code_point = _get_code_point(data, idx) - n_res = _lower_ucs4(code_point, data, length, idx, mapped) - for m in mapped[:n_res]: - maxchar = max(maxchar, m) - _set_code_point(res, k, m) - k += 1 - maxchars[0] = maxchar - return k - - -# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/bytes_methods.c#L361-L382 # noqa: E501 -@register_jitable -def _ascii_capitalize(data, res): - code_point = _get_code_point(data, 0) - _set_code_point(res, 0, _Py_TOUPPER(code_point)) - for idx in range(1, len(data)): - code_point = _get_code_point(data, idx) - _set_code_point(res, idx, _Py_TOLOWER(code_point)) - - -# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L10765-L10774 # noqa: E501 -@overload_method(types.UnicodeType, 'capitalize') -def unicode_capitalize(data): - return case_operation(_ascii_capitalize, _unicode_capitalize) - - def _is_upper(is_lower, is_upper, is_title): # impl is an approximate translation of: # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11794-L11827 # noqa: E501 @@ -2051,81 +2005,6 @@ def unicode_casefold(data): return case_operation(_ascii_casefold, _unicode_casefold) -if sys.version_info[:2] >= (3, 7): - @overload_method(types.UnicodeType, 'isascii') - def unicode_isascii(data): - """Implements UnicodeType.isascii()""" - - def impl(data): - return data._is_ascii - return impl - - -@overload_method(types.UnicodeType, 'istitle') -def unicode_istitle(data): - """ - Implements UnicodeType.istitle() - The algorithm is an approximate translation from CPython: - https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11829-L11885 # noqa: E501 - """ - - def impl(data): - length = len(data) - if length == 1: - char = _get_code_point(data, 0) - return _PyUnicode_IsUppercase(char) or _PyUnicode_IsTitlecase(char) - - if length == 0: - return False - - cased = False - previous_is_cased = False - for idx in range(length): - char = _get_code_point(data, idx) - if _PyUnicode_IsUppercase(char) or _PyUnicode_IsTitlecase(char): - if previous_is_cased: - return False - previous_is_cased = True - cased = True - elif _PyUnicode_IsLowercase(char): - if not previous_is_cased: - return False - previous_is_cased = True - cased = True - else: - previous_is_cased = False - - return cased - return impl - - -@overload_method(types.UnicodeType, 'islower') -def unicode_islower(data): - """ - impl is an approximate translation of: - https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L11900-L11933 # noqa: E501 - mixed with: - https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/bytes_methods.c#L131-L156 # noqa: E501 - """ - - def impl(data): - length = len(data) - if length == 1: - return _PyUnicode_IsLowercase(_get_code_point(data, 0)) - if length == 0: - return False - - cased = False - for idx in range(length): - cp = _get_code_point(data, idx) - if _PyUnicode_IsUppercase(cp) or _PyUnicode_IsTitlecase(cp): - return False - elif not cased and _PyUnicode_IsLowercase(cp): - cased = True - return cased - return impl - - # https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L9856-L9883 # noqa: E501 @register_jitable def _handle_capital_sigma(data, length, idx): @@ -2160,6 +2039,52 @@ def _lower_ucs4(code_point, data, length, idx, mapped): return _PyUnicode_ToLowerFull(code_point, mapped) +@register_jitable +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9737-L9759 # noqa: E501 +def _unicode_capitalize(data, length, res, maxchars): + k = 0 + maxchar = 0 + mapped = np.zeros(3, dtype=_Py_UCS4) + code_point = _get_code_point(data, 0) + + # https://github.com/python/cpython/commit/b015fc86f7b1f35283804bfee788cce0a5495df7/Objects/unicodeobject.c#diff-220e5da0d1c8abf508b25c02da6ca16c # noqa: E501 + if _py38_or_later: + n_res = _PyUnicode_ToTitleFull(code_point, mapped) + else: + n_res = _PyUnicode_ToUpperFull(code_point, mapped) + + for m in mapped[:n_res]: + maxchar = max(maxchar, m) + _set_code_point(res, k, m) + k += 1 + for idx in range(1, length): + mapped.fill(0) + code_point = _get_code_point(data, idx) + n_res = _lower_ucs4(code_point, data, length, idx, mapped) + for m in mapped[:n_res]: + maxchar = max(maxchar, m) + _set_code_point(res, k, m) + k += 1 + maxchars[0] = maxchar + return k + + +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/bytes_methods.c#L361-L382 # noqa: E501 +@register_jitable +def _ascii_capitalize(data, res): + code_point = _get_code_point(data, 0) + _set_code_point(res, 0, _Py_TOUPPER(code_point)) + for idx in range(1, len(data)): + code_point = _get_code_point(data, idx) + _set_code_point(res, idx, _Py_TOLOWER(code_point)) + + +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L10765-L10774 # noqa: E501 +@overload_method(types.UnicodeType, 'capitalize') +def unicode_capitalize(data): + return case_operation(_ascii_capitalize, _unicode_capitalize) + + # https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L9996-L10021 # noqa: E501 @register_jitable def _unicode_title(data, length, res, maxchars): @@ -2211,6 +2136,81 @@ def unicode_title(data): return case_operation(_ascii_title, _unicode_title) +if sys.version_info[:2] >= (3, 7): + @overload_method(types.UnicodeType, 'isascii') + def unicode_isascii(data): + """Implements UnicodeType.isascii()""" + + def impl(data): + return data._is_ascii + return impl + + +@overload_method(types.UnicodeType, 'istitle') +def unicode_istitle(data): + """ + Implements UnicodeType.istitle() + The algorithm is an approximate translation from CPython: + https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11829-L11885 # noqa: E501 + """ + + def impl(data): + length = len(data) + if length == 1: + char = _get_code_point(data, 0) + return _PyUnicode_IsUppercase(char) or _PyUnicode_IsTitlecase(char) + + if length == 0: + return False + + cased = False + previous_is_cased = False + for idx in range(length): + char = _get_code_point(data, idx) + if _PyUnicode_IsUppercase(char) or _PyUnicode_IsTitlecase(char): + if previous_is_cased: + return False + previous_is_cased = True + cased = True + elif _PyUnicode_IsLowercase(char): + if not previous_is_cased: + return False + previous_is_cased = True + cased = True + else: + previous_is_cased = False + + return cased + return impl + + +@overload_method(types.UnicodeType, 'islower') +def unicode_islower(data): + """ + impl is an approximate translation of: + https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L11900-L11933 # noqa: E501 + mixed with: + https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/bytes_methods.c#L131-L156 # noqa: E501 + """ + + def impl(data): + length = len(data) + if length == 1: + return _PyUnicode_IsLowercase(_get_code_point(data, 0)) + if length == 0: + return False + + cased = False + for idx in range(length): + cp = _get_code_point(data, idx) + if _PyUnicode_IsUppercase(cp) or _PyUnicode_IsTitlecase(cp): + return False + elif not cased and _PyUnicode_IsLowercase(cp): + cased = True + return cased + return impl + + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L13140-L13147 # noqa: E501 @overload_method(types.UnicodeType, 'swapcase') def unicode_swapcase(data): From 51f03cf722b02db1bd42174720cde2e0c99f52ae Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Thu, 2 Jan 2020 16:07:38 +0000 Subject: [PATCH 66/68] refactor swapcase --- numba/unicode.py | 93 +++++++++++++++++++++--------------------------- 1 file changed, 41 insertions(+), 52 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index 408e69865f8..348d14e5124 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -2136,6 +2136,47 @@ def unicode_title(data): return case_operation(_ascii_title, _unicode_title) +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/bytes_methods.c#L391-L408 # noqa: E501 +@register_jitable +def _ascii_swapcase(data, res): + for idx in range(len(data)): + code_point = _get_code_point(data, idx) + if _Py_ISUPPER(code_point): + code_point = _Py_TOLOWER(code_point) + elif _Py_ISLOWER(code_point): + code_point = _Py_TOUPPER(code_point) + _set_code_point(res, idx, code_point) + + +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9761-L9784 # noqa: E501 +@register_jitable +def _unicode_swapcase(data, length, res, maxchars): + k = 0 + maxchar = 0 + mapped = np.empty(3, dtype=_Py_UCS4) + for idx in range(length): + mapped.fill(0) + code_point = _get_code_point(data, idx) + if _PyUnicode_IsUppercase(code_point): + n_res = _lower_ucs4(code_point, data, length, idx, mapped) + elif _PyUnicode_IsLowercase(code_point): + n_res = _PyUnicode_ToUpperFull(code_point, mapped) + else: + n_res = 1 + mapped[0] = code_point + for m in mapped[:n_res]: + maxchar = max(maxchar, m) + _set_code_point(res, k, m) + k += 1 + maxchars[0] = maxchar + return k + + +@overload_method(types.UnicodeType, 'swapcase') +def unicode_swapcase(data): + return case_operation(_ascii_swapcase, _unicode_swapcase) + + if sys.version_info[:2] >= (3, 7): @overload_method(types.UnicodeType, 'isascii') def unicode_isascii(data): @@ -2211,58 +2252,6 @@ def impl(data): return impl -# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L13140-L13147 # noqa: E501 -@overload_method(types.UnicodeType, 'swapcase') -def unicode_swapcase(data): - """Implements str.swapcase()""" - def impl(data): - length = len(data) - if length == 0: - return _empty_string(data._kind, length, data._is_ascii) - - if data._is_ascii: - res = _empty_string(data._kind, length, 1) - for idx in range(length): - code_point = _get_code_point(data, idx) - if _Py_ISUPPER(code_point): - code_point = _Py_TOLOWER(code_point) - elif _Py_ISLOWER(code_point): - code_point = _Py_TOUPPER(code_point) - _set_code_point(res, idx, code_point) - - return res - - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9863-L9908 # noqa: E501 - # mixed with: - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9761-L9784 # noqa: E501 - k = 0 - maxchar = 0 - mapped = np.zeros(3, dtype=_Py_UCS4) - tmp = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * length) - for idx in range(length): - mapped.fill(0) - code_point = _get_code_point(data, idx) - if _PyUnicode_IsUppercase(code_point): - n_res = _lower_ucs4(code_point, data, length, idx, mapped) - elif _PyUnicode_IsLowercase(code_point): - n_res = _PyUnicode_ToUpperFull(code_point, mapped) - else: - n_res = 1 - mapped[0] = code_point - for m in mapped[:n_res]: - maxchar = max(maxchar, m) - _set_code_point(tmp, k, m) - k += 1 - newkind = _codepoint_to_kind(maxchar) - res = _empty_string(newkind, k) - for i in range(k): - _set_code_point(res, i, _get_code_point(tmp, i)) - - return res - - return impl - - # https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L9946-L9965 # noqa: E501 @register_jitable def _do_upper_or_lower(data, length, res, maxchars, lower): From 378d87e4be91140b60a28e06332ae0c31d2b9874 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Thu, 2 Jan 2020 16:46:29 +0000 Subject: [PATCH 67/68] Move functions around --- numba/unicode.py | 478 ++++++++++++++++++++++++----------------------- 1 file changed, 246 insertions(+), 232 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index 348d14e5124..f60cd7a0fa9 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -1369,30 +1369,9 @@ def zfill_impl(string, width): return zfill_impl -# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12126-L12161 # noqa: E501 -@overload_method(types.UnicodeType, 'isidentifier') -def unicode_isidentifier(data): - """Implements UnicodeType.isidentifier()""" - - def impl(data): - length = len(data) - if length == 0: - return False - - first_cp = _get_code_point(data, 0) - if not _PyUnicode_IsXidStart(first_cp) and first_cp != 0x5F: - return False - - for i in range(1, length): - code_point = _get_code_point(data, i) - if not _PyUnicode_IsXidContinue(code_point): - return False - - return True - - return impl - - +# ------------------------------------------------------------------------------ +# Strip functions +# ------------------------------------------------------------------------------ @register_jitable def unicode_strip_left_bound(string, chars): chars = ' ' if chars is None else chars @@ -1481,7 +1460,9 @@ def strip_impl(string, chars=None): return strip_impl -# String creation +# ------------------------------------------------------------------------------ +# Slice functions +# ------------------------------------------------------------------------------ @register_jitable def normalize_str_idx(idx, length, is_start=True): @@ -1648,6 +1629,11 @@ def getitem_slice(s, idx): return getitem_slice +# ------------------------------------------------------------------------------ +# String operations +# ------------------------------------------------------------------------------ + + @overload(operator.add) @overload(operator.iadd) def unicode_concat(a, b): @@ -1717,6 +1703,55 @@ def impl(a): return impl +@overload_method(types.UnicodeType, 'replace') +def unicode_replace(s, old_str, new_str, count=-1): + thety = count + if isinstance(count, types.Omitted): + thety = count.value + elif isinstance(count, types.Optional): + thety = count.type + + if not isinstance(thety, (int, types.Integer)): + raise TypingError('Unsupported parameters. The parametrs ' + 'must be Integer. Given count: {}'.format(count)) + + if not isinstance(old_str, (types.UnicodeType, types.NoneType)): + raise TypingError('The object must be a UnicodeType.' + ' Given: {}'.format(old_str)) + + if not isinstance(new_str, types.UnicodeType): + raise TypingError('The object must be a UnicodeType.' + ' Given: {}'.format(new_str)) + + def impl(s, old_str, new_str, count=-1): + if count == 0: + return s + if old_str == '': + schars = list(s) + if count == -1: + return new_str + new_str.join(schars) + new_str + split_result = [new_str] + min_count = min(len(schars), count) + for i in range(min_count): + split_result.append(schars[i]) + if i + 1 != min_count: + split_result.append(new_str) + else: + split_result.append(''.join(schars[(i + 1):])) + if count > len(schars): + split_result.append(new_str) + return ''.join(split_result) + schars = s.split(old_str, count) + result = new_str.join(schars) + return result + + return impl + +# ------------------------------------------------------------------------------ +# String `is*()` methods +# ------------------------------------------------------------------------------ + + # generates isalpha/isalnum def gen_isAlX(ascii_func, unicode_func): def unicode_isAlX(data): @@ -1793,51 +1828,6 @@ def impl(a): _PyUnicode_IsTitlecase)) -@overload_method(types.UnicodeType, 'replace') -def unicode_replace(s, old_str, new_str, count=-1): - thety = count - if isinstance(count, types.Omitted): - thety = count.value - elif isinstance(count, types.Optional): - thety = count.type - - if not isinstance(thety, (int, types.Integer)): - raise TypingError('Unsupported parameters. The parametrs ' - 'must be Integer. Given count: {}'.format(count)) - - if not isinstance(old_str, (types.UnicodeType, types.NoneType)): - raise TypingError('The object must be a UnicodeType.' - ' Given: {}'.format(old_str)) - - if not isinstance(new_str, types.UnicodeType): - raise TypingError('The object must be a UnicodeType.' - ' Given: {}'.format(new_str)) - - def impl(s, old_str, new_str, count=-1): - if count == 0: - return s - if old_str == '': - schars = list(s) - if count == -1: - return new_str + new_str.join(schars) + new_str - split_result = [new_str] - min_count = min(len(schars), count) - for i in range(min_count): - split_result.append(schars[i]) - if i + 1 != min_count: - split_result.append(new_str) - else: - split_result.append(''.join(schars[(i + 1):])) - if count > len(schars): - split_result.append(new_str) - return ''.join(split_result) - schars = s.split(old_str, count) - result = new_str.join(schars) - return result - - return impl - - @overload_method(types.UnicodeType, 'isupper') def unicode_isupper(a): """ @@ -1851,59 +1841,106 @@ def impl(a): return impl -@overload_method(types.UnicodeType, 'upper') -def unicode_upper(a): +if sys.version_info[:2] >= (3, 7): + @overload_method(types.UnicodeType, 'isascii') + def unicode_isascii(data): + """Implements UnicodeType.isascii()""" + + def impl(data): + return data._is_ascii + return impl + + +@overload_method(types.UnicodeType, 'istitle') +def unicode_istitle(data): """ - Implements .upper() + Implements UnicodeType.istitle() + The algorithm is an approximate translation from CPython: + https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11829-L11885 # noqa: E501 """ - def impl(a): - # main structure is a translation of: - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L13308-L13316 # noqa: E501 - # ASCII fast path - l = len(a) - if a._is_ascii: - # This is an approximate translation of: - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/bytes_methods.c#L300 # noqa: E501 - ret = _empty_string(a._kind, l, a._is_ascii) - for idx in range(l): - code_point = _get_code_point(a, idx) - _set_code_point(ret, idx, _Py_TOUPPER(code_point)) - return ret - else: - # This part in an amalgamation of two algorithms: - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9864-L9908 # noqa: E501 - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9787-L9805 # noqa: E501 - # - # The alg walks the string and writes the upper version of the code - # point into a 4byte kind unicode string and at the same time - # tracks the maximum width "upper" character encountered, following - # this the 4byte kind string is reinterpreted as needed into the - # maximum width kind string - tmp = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * l, a._is_ascii) - mapped = np.array((3,), dtype=_Py_UCS4) - maxchar = 0 - k = 0 - for idx in range(l): - mapped[:] = 0 - code_point = _get_code_point(a, idx) - n_res = _PyUnicode_ToUpperFull(_Py_UCS4(code_point), mapped) - for j in range(n_res): - maxchar = max(maxchar, mapped[j]) - _set_code_point(tmp, k, mapped[j]) - k += 1 - newlength = k - newkind = _codepoint_to_kind(maxchar) - ret = _empty_string(newkind, newlength, - _codepoint_is_ascii(maxchar)) - for i in range(newlength): - _set_code_point(ret, i, _get_code_point(tmp, i)) - return ret + def impl(data): + length = len(data) + if length == 1: + char = _get_code_point(data, 0) + return _PyUnicode_IsUppercase(char) or _PyUnicode_IsTitlecase(char) + + if length == 0: + return False + + cased = False + previous_is_cased = False + for idx in range(length): + char = _get_code_point(data, idx) + if _PyUnicode_IsUppercase(char) or _PyUnicode_IsTitlecase(char): + if previous_is_cased: + return False + previous_is_cased = True + cased = True + elif _PyUnicode_IsLowercase(char): + if not previous_is_cased: + return False + previous_is_cased = True + cased = True + else: + previous_is_cased = False + + return cased return impl -# generator for simple unicode "isX" methods +@overload_method(types.UnicodeType, 'islower') +def unicode_islower(data): + """ + impl is an approximate translation of: + https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L11900-L11933 # noqa: E501 + mixed with: + https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/bytes_methods.c#L131-L156 # noqa: E501 + """ + + def impl(data): + length = len(data) + if length == 1: + return _PyUnicode_IsLowercase(_get_code_point(data, 0)) + if length == 0: + return False + + cased = False + for idx in range(length): + cp = _get_code_point(data, idx) + if _PyUnicode_IsUppercase(cp) or _PyUnicode_IsTitlecase(cp): + return False + elif not cased and _PyUnicode_IsLowercase(cp): + cased = True + return cased + return impl + + +# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L12126-L12161 # noqa: E501 +@overload_method(types.UnicodeType, 'isidentifier') +def unicode_isidentifier(data): + """Implements UnicodeType.isidentifier()""" + + def impl(data): + length = len(data) + if length == 0: + return False + + first_cp = _get_code_point(data, 0) + if not _PyUnicode_IsXidStart(first_cp) and first_cp != 0x5F: + return False + for i in range(1, length): + code_point = _get_code_point(data, i) + if not _PyUnicode_IsXidContinue(code_point): + return False + + return True + + return impl + + +# generator for simple unicode "isX" methods def gen_isX(_PyUnicode_IS_func, empty_is_false=True): def unicode_isX(data): def impl(data): @@ -1942,6 +1979,10 @@ def impl(data): overload_method(types.UnicodeType, 'isprintable')( gen_isX(_PyUnicode_IsPrintable, False)) +# ------------------------------------------------------------------------------ +# String methods that apply a transformation to the characters themselves +# ------------------------------------------------------------------------------ + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9863-L9908 # noqa: E501 def case_operation(ascii_func, unicode_func): @@ -1971,6 +2012,94 @@ def impl(data): return impl + +@overload_method(types.UnicodeType, 'upper') +def unicode_upper(a): + """ + Implements .upper() + """ + def impl(a): + # main structure is a translation of: + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L13308-L13316 # noqa: E501 + + # ASCII fast path + l = len(a) + if a._is_ascii: + # This is an approximate translation of: + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/bytes_methods.c#L300 # noqa: E501 + ret = _empty_string(a._kind, l, a._is_ascii) + for idx in range(l): + code_point = _get_code_point(a, idx) + _set_code_point(ret, idx, _Py_TOUPPER(code_point)) + return ret + else: + # This part in an amalgamation of two algorithms: + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9864-L9908 # noqa: E501 + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9787-L9805 # noqa: E501 + # + # The alg walks the string and writes the upper version of the code + # point into a 4byte kind unicode string and at the same time + # tracks the maximum width "upper" character encountered, following + # this the 4byte kind string is reinterpreted as needed into the + # maximum width kind string + tmp = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * l, a._is_ascii) + mapped = np.array((3,), dtype=_Py_UCS4) + maxchar = 0 + k = 0 + for idx in range(l): + mapped[:] = 0 + code_point = _get_code_point(a, idx) + n_res = _PyUnicode_ToUpperFull(_Py_UCS4(code_point), mapped) + for j in range(n_res): + maxchar = max(maxchar, mapped[j]) + _set_code_point(tmp, k, mapped[j]) + k += 1 + newlength = k + newkind = _codepoint_to_kind(maxchar) + ret = _empty_string(newkind, newlength, + _codepoint_is_ascii(maxchar)) + for i in range(newlength): + _set_code_point(ret, i, _get_code_point(tmp, i)) + return ret + return impl + + +@overload_method(types.UnicodeType, 'lower') +def unicode_lower(data): + """Implements .lower()""" + def impl(data): + # main structure is a translation of: + # https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L12380-L12388 # noqa: E501 + + # ASCII fast path + length = len(data) + if data._is_ascii: + # This is an approximate translation of: + # https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/bytes_methods.c#L247-L255 # noqa: E501 + res = _empty_string(data._kind, length, data._is_ascii) + for idx in range(length): + code_point = _get_code_point(data, idx) + _set_code_point(res, idx, _Py_TOLOWER(code_point)) + return res + else: + # This is an approximate translation of: + # https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L10023-L10069 # noqa: E501 + tmp = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * length, + data._is_ascii) + # maxchar is inside of a list to be pass as argument by reference + maxchars = [0] + newlength = _do_upper_or_lower(data, length, tmp, maxchars, + lower=True) + maxchar = maxchars[0] + newkind = _codepoint_to_kind(maxchar) + res = _empty_string(newkind, newlength, + _codepoint_is_ascii(maxchar)) + for i in range(newlength): + _set_code_point(res, i, _get_code_point(tmp, i)) + return res + + return impl + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9819-L9834 # noqa: E501 @register_jitable def _unicode_casefold(data, length, res, maxchars): @@ -1996,9 +2125,6 @@ def _ascii_casefold(data, res): _set_code_point(res, idx, _Py_TOLOWER(code_point)) -# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L10782-L10791 # noqa: E501 -# mixed with -# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9819-L9834 # noqa: E501 @overload_method(types.UnicodeType, 'casefold') def unicode_casefold(data): """Implements str.casefold()""" @@ -2039,8 +2165,8 @@ def _lower_ucs4(code_point, data, length, idx, mapped): return _PyUnicode_ToLowerFull(code_point, mapped) -@register_jitable # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9737-L9759 # noqa: E501 +@register_jitable def _unicode_capitalize(data, length, res, maxchars): k = 0 maxchar = 0 @@ -2177,81 +2303,6 @@ def unicode_swapcase(data): return case_operation(_ascii_swapcase, _unicode_swapcase) -if sys.version_info[:2] >= (3, 7): - @overload_method(types.UnicodeType, 'isascii') - def unicode_isascii(data): - """Implements UnicodeType.isascii()""" - - def impl(data): - return data._is_ascii - return impl - - -@overload_method(types.UnicodeType, 'istitle') -def unicode_istitle(data): - """ - Implements UnicodeType.istitle() - The algorithm is an approximate translation from CPython: - https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11829-L11885 # noqa: E501 - """ - - def impl(data): - length = len(data) - if length == 1: - char = _get_code_point(data, 0) - return _PyUnicode_IsUppercase(char) or _PyUnicode_IsTitlecase(char) - - if length == 0: - return False - - cased = False - previous_is_cased = False - for idx in range(length): - char = _get_code_point(data, idx) - if _PyUnicode_IsUppercase(char) or _PyUnicode_IsTitlecase(char): - if previous_is_cased: - return False - previous_is_cased = True - cased = True - elif _PyUnicode_IsLowercase(char): - if not previous_is_cased: - return False - previous_is_cased = True - cased = True - else: - previous_is_cased = False - - return cased - return impl - - -@overload_method(types.UnicodeType, 'islower') -def unicode_islower(data): - """ - impl is an approximate translation of: - https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L11900-L11933 # noqa: E501 - mixed with: - https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/bytes_methods.c#L131-L156 # noqa: E501 - """ - - def impl(data): - length = len(data) - if length == 1: - return _PyUnicode_IsLowercase(_get_code_point(data, 0)) - if length == 0: - return False - - cased = False - for idx in range(length): - cp = _get_code_point(data, idx) - if _PyUnicode_IsUppercase(cp) or _PyUnicode_IsTitlecase(cp): - return False - elif not cased and _PyUnicode_IsLowercase(cp): - cased = True - return cased - return impl - - # https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L9946-L9965 # noqa: E501 @register_jitable def _do_upper_or_lower(data, length, res, maxchars, lower): @@ -2271,43 +2322,6 @@ def _do_upper_or_lower(data, length, res, maxchars, lower): return k -@overload_method(types.UnicodeType, 'lower') -def unicode_lower(data): - """Implements .lower()""" - def impl(data): - # main structure is a translation of: - # https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L12380-L12388 # noqa: E501 - - # ASCII fast path - length = len(data) - if data._is_ascii: - # This is an approximate translation of: - # https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/bytes_methods.c#L247-L255 # noqa: E501 - res = _empty_string(data._kind, length, data._is_ascii) - for idx in range(length): - code_point = _get_code_point(data, idx) - _set_code_point(res, idx, _Py_TOLOWER(code_point)) - return res - else: - # This is an approximate translation of: - # https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L10023-L10069 # noqa: E501 - tmp = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * length, - data._is_ascii) - # maxchar is inside of a list to be pass as argument by reference - maxchars = [0] - newlength = _do_upper_or_lower(data, length, tmp, maxchars, - lower=True) - maxchar = maxchars[0] - newkind = _codepoint_to_kind(maxchar) - res = _empty_string(newkind, newlength, - _codepoint_is_ascii(maxchar)) - for i in range(newlength): - _set_code_point(res, i, _get_code_point(tmp, i)) - return res - - return impl - - @lower_builtin('getiter', types.UnicodeType) def getiter_unicode(context, builder, sig, args): [ty] = sig.args From a3970cea4b23bbf3d4a0d84c13ff7515929db8f7 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Thu, 2 Jan 2020 17:08:14 +0000 Subject: [PATCH 68/68] Sort out lower/upper --- numba/unicode.py | 206 +++++++++++++++++------------------------------ 1 file changed, 76 insertions(+), 130 deletions(-) diff --git a/numba/unicode.py b/numba/unicode.py index f60cd7a0fa9..f5b113d795e 100644 --- a/numba/unicode.py +++ b/numba/unicode.py @@ -2013,92 +2013,87 @@ def impl(data): return impl -@overload_method(types.UnicodeType, 'upper') -def unicode_upper(a): - """ - Implements .upper() - """ - def impl(a): - # main structure is a translation of: - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L13308-L13316 # noqa: E501 +# https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L9856-L9883 # noqa: E501 +@register_jitable +def _handle_capital_sigma(data, length, idx): + """This is a translation of the function that handles the capital sigma.""" + c = 0 + j = idx - 1 + while j >= 0: + c = _get_code_point(data, j) + if not _PyUnicode_IsCaseIgnorable(c): + break + j -= 1 + final_sigma = (j >= 0 and _PyUnicode_IsCased(c)) + if final_sigma: + j = idx + 1 + while j < length: + c = _get_code_point(data, j) + if not _PyUnicode_IsCaseIgnorable(c): + break + j += 1 + final_sigma = (j == length or (not _PyUnicode_IsCased(c))) + + return 0x3c2 if final_sigma else 0x3c3 + + +# https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L9885-L9895 # noqa: E501 +@register_jitable +def _lower_ucs4(code_point, data, length, idx, mapped): + """This is a translation of the function that lowers a character.""" + if code_point == 0x3A3: + mapped[0] = _handle_capital_sigma(data, length, idx) + return 1 + return _PyUnicode_ToLowerFull(code_point, mapped) - # ASCII fast path - l = len(a) - if a._is_ascii: - # This is an approximate translation of: - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/bytes_methods.c#L300 # noqa: E501 - ret = _empty_string(a._kind, l, a._is_ascii) - for idx in range(l): - code_point = _get_code_point(a, idx) - _set_code_point(ret, idx, _Py_TOUPPER(code_point)) - return ret - else: - # This part in an amalgamation of two algorithms: - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9864-L9908 # noqa: E501 - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9787-L9805 # noqa: E501 - # - # The alg walks the string and writes the upper version of the code - # point into a 4byte kind unicode string and at the same time - # tracks the maximum width "upper" character encountered, following - # this the 4byte kind string is reinterpreted as needed into the - # maximum width kind string - tmp = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * l, a._is_ascii) - mapped = np.array((3,), dtype=_Py_UCS4) - maxchar = 0 - k = 0 - for idx in range(l): - mapped[:] = 0 - code_point = _get_code_point(a, idx) - n_res = _PyUnicode_ToUpperFull(_Py_UCS4(code_point), mapped) - for j in range(n_res): - maxchar = max(maxchar, mapped[j]) - _set_code_point(tmp, k, mapped[j]) - k += 1 - newlength = k - newkind = _codepoint_to_kind(maxchar) - ret = _empty_string(newkind, newlength, - _codepoint_is_ascii(maxchar)) - for i in range(newlength): - _set_code_point(ret, i, _get_code_point(tmp, i)) - return ret - return impl + +# https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L9946-L9965 # noqa: E501 +def _gen_unicode_upper_or_lower(lower): + def _do_upper_or_lower(data, length, res, maxchars): + k = 0 + for idx in range(length): + mapped = np.zeros(3, dtype=_Py_UCS4) + code_point = _get_code_point(data, idx) + if lower: + n_res = _lower_ucs4(code_point, data, length, idx, mapped) + else: + # might be needed if call _do_upper_or_lower in unicode_upper + n_res = _PyUnicode_ToUpperFull(code_point, mapped) + for m in mapped[:n_res]: + maxchars[0] = max(maxchars[0], m) + _set_code_point(res, k, m) + k += 1 + return k + return _do_upper_or_lower + + +_unicode_upper = register_jitable(_gen_unicode_upper_or_lower(False)) +_unicode_lower = register_jitable(_gen_unicode_upper_or_lower(True)) + + +def _gen_ascii_upper_or_lower(func): + def _ascii_upper_or_lower(data, res): + for idx in range(len(data)): + code_point = _get_code_point(data, idx) + _set_code_point(res, idx, func(code_point)) + return _ascii_upper_or_lower + + +_ascii_upper = register_jitable(_gen_ascii_upper_or_lower(_Py_TOUPPER)) +_ascii_lower = register_jitable(_gen_ascii_upper_or_lower(_Py_TOLOWER)) @overload_method(types.UnicodeType, 'lower') def unicode_lower(data): """Implements .lower()""" - def impl(data): - # main structure is a translation of: - # https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L12380-L12388 # noqa: E501 + return case_operation(_ascii_lower, _unicode_lower) - # ASCII fast path - length = len(data) - if data._is_ascii: - # This is an approximate translation of: - # https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/bytes_methods.c#L247-L255 # noqa: E501 - res = _empty_string(data._kind, length, data._is_ascii) - for idx in range(length): - code_point = _get_code_point(data, idx) - _set_code_point(res, idx, _Py_TOLOWER(code_point)) - return res - else: - # This is an approximate translation of: - # https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L10023-L10069 # noqa: E501 - tmp = _empty_string(PY_UNICODE_4BYTE_KIND, 3 * length, - data._is_ascii) - # maxchar is inside of a list to be pass as argument by reference - maxchars = [0] - newlength = _do_upper_or_lower(data, length, tmp, maxchars, - lower=True) - maxchar = maxchars[0] - newkind = _codepoint_to_kind(maxchar) - res = _empty_string(newkind, newlength, - _codepoint_is_ascii(maxchar)) - for i in range(newlength): - _set_code_point(res, i, _get_code_point(tmp, i)) - return res - return impl +@overload_method(types.UnicodeType, 'upper') +def unicode_upper(data): + """Implements .upper()""" + return case_operation(_ascii_upper, _unicode_upper) + # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9819-L9834 # noqa: E501 @register_jitable @@ -2131,40 +2126,6 @@ def unicode_casefold(data): return case_operation(_ascii_casefold, _unicode_casefold) -# https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L9856-L9883 # noqa: E501 -@register_jitable -def _handle_capital_sigma(data, length, idx): - """This is a translation of the function that handles the capital sigma.""" - c = 0 - j = idx - 1 - while j >= 0: - c = _get_code_point(data, j) - if not _PyUnicode_IsCaseIgnorable(c): - break - j -= 1 - final_sigma = (j >= 0 and _PyUnicode_IsCased(c)) - if final_sigma: - j = idx + 1 - while j < length: - c = _get_code_point(data, j) - if not _PyUnicode_IsCaseIgnorable(c): - break - j += 1 - final_sigma = (j == length or (not _PyUnicode_IsCased(c))) - - return 0x3c2 if final_sigma else 0x3c3 - - -# https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L9885-L9895 # noqa: E501 -@register_jitable -def _lower_ucs4(code_point, data, length, idx, mapped): - """This is a translation of the function that lowers a character.""" - if code_point == 0x3A3: - mapped[0] = _handle_capital_sigma(data, length, idx) - return 1 - return _PyUnicode_ToLowerFull(code_point, mapped) - - # https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L9737-L9759 # noqa: E501 @register_jitable def _unicode_capitalize(data, length, res, maxchars): @@ -2302,24 +2263,9 @@ def _unicode_swapcase(data, length, res, maxchars): def unicode_swapcase(data): return case_operation(_ascii_swapcase, _unicode_swapcase) - -# https://github.com/python/cpython/blob/201c8f79450628241574fba940e08107178dc3a5/Objects/unicodeobject.c#L9946-L9965 # noqa: E501 -@register_jitable -def _do_upper_or_lower(data, length, res, maxchars, lower): - k = 0 - for idx in range(length): - mapped = np.zeros(3, dtype=_Py_UCS4) - code_point = _get_code_point(data, idx) - if lower: - n_res = _lower_ucs4(code_point, data, length, idx, mapped) - else: - # might be needed if call _do_upper_or_lower in unicode_upper - n_res = _PyUnicode_ToUpperFull(code_point, mapped) - for m in mapped[:n_res]: - maxchars[0] = max(maxchars[0], m) - _set_code_point(res, k, m) - k += 1 - return k +# ------------------------------------------------------------------------------ +# iteration +# ------------------------------------------------------------------------------ @lower_builtin('getiter', types.UnicodeType)