-
-
Notifications
You must be signed in to change notification settings - Fork 19.2k
CLN: use integer parsing functions from stdlib #62658
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 64 commits
c3cc4a1
2459313
d8a454e
87789e6
2287944
2bea3c2
fb38679
f9ede5c
798c263
2f06f19
280b55e
d85aaf0
9046ecc
a4e2fb8
ef82cf4
5afeb11
0ef47a7
c840ef0
132342b
e6977cc
8120eea
1f5d506
479a2ab
c37c355
7d55283
ffe50ce
af5ad71
9211704
d026b01
d76ff5f
b523a19
6265172
abed6c1
8616f9f
c4e0e25
0b19208
29d74f7
b5b8f3b
803a8bf
31f26cf
171b553
30f6bdc
554675b
7ea4454
82dc037
627df4c
1b3eba0
e1667fa
bee776a
d693345
593c614
ff4d48b
e3a88d3
b135738
ba8c9b3
818921f
3e067f7
c0ed83c
cb60adb
5117e89
e1e327a
ffcb7c2
47b87f9
cd536fb
1ef9259
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -23,10 +23,15 @@ GitHub. See Python Software Foundation License and BSD licenses for these. | |||||
| #include <float.h> | ||||||
| #include <math.h> | ||||||
| #include <stdbool.h> | ||||||
| #include <stdlib.h> | ||||||
|
|
||||||
| #include "pandas/portable.h" | ||||||
| #include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64 | ||||||
|
|
||||||
| // Arrow256 allows up to 76 decimal digits. | ||||||
| // We rounded up to the next power of 2. | ||||||
| #define PROCESSED_WORD_CAPACITY 128 | ||||||
|
|
||||||
| void coliter_setup(coliter_t *self, parser_t *parser, int64_t i, | ||||||
| int64_t start) { | ||||||
| // column i, starting at 0 | ||||||
|
|
@@ -1834,6 +1839,39 @@ int uint64_conflict(uint_state *self) { | |||||
| return self->seen_uint && (self->seen_sint || self->seen_null); | ||||||
| } | ||||||
|
|
||||||
| /* Copy a string without `char_to_remove` into `output`. | ||||||
| */ | ||||||
| static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], | ||||||
WillAyd marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
| const char *str, size_t str_len, | ||||||
| char char_to_remove) { | ||||||
| const char *left = str; | ||||||
| const char *end_ptr = str + str_len; | ||||||
| size_t bytes_written = 0; | ||||||
|
|
||||||
| while (left < end_ptr) { | ||||||
WillAyd marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
| const size_t remaining_bytes_to_read = end_ptr - left; | ||||||
| const char *right = memchr(left, char_to_remove, remaining_bytes_to_read); | ||||||
|
|
||||||
| if (!right) { | ||||||
| // If it doesn't find the char to remove, just copy until EOS. | ||||||
| right = end_ptr; | ||||||
| } | ||||||
|
|
||||||
| const size_t chunk_size = right - left; | ||||||
|
|
||||||
| if (chunk_size + bytes_written >= PROCESSED_WORD_CAPACITY) { | ||||||
| return -1; | ||||||
| } | ||||||
| memcpy(&output[bytes_written], left, chunk_size); | ||||||
| bytes_written += chunk_size; | ||||||
|
|
||||||
| left = right + 1; | ||||||
| } | ||||||
|
|
||||||
| output[bytes_written] = '\0'; | ||||||
| return 0; | ||||||
| } | ||||||
|
|
||||||
| int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, | ||||||
| int *error, char tsep) { | ||||||
| const char *p = p_item; | ||||||
|
|
@@ -1843,105 +1881,45 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, | |||||
| } | ||||||
|
|
||||||
| // Handle sign. | ||||||
| const bool isneg = *p == '-' ? true : false; | ||||||
| const bool has_sign = *p == '-' || *p == '+'; | ||||||
| // Handle sign. | ||||||
| if (isneg || (*p == '+')) { | ||||||
| p++; | ||||||
| } | ||||||
| const char *digit_start = has_sign ? p + 1 : p; | ||||||
|
|
||||||
| // Check that there is a first digit. | ||||||
| if (!isdigit_ascii(*p)) { | ||||||
| if (!isdigit_ascii(*digit_start)) { | ||||||
| // Error... | ||||||
| *error = ERROR_NO_DIGITS; | ||||||
| return 0; | ||||||
| } | ||||||
|
|
||||||
| int64_t number = 0; | ||||||
| if (isneg) { | ||||||
| // If number is greater than pre_min, at least one more digit | ||||||
| // can be processed without overflowing. | ||||||
| int dig_pre_min = -(int_min % 10); | ||||||
| int64_t pre_min = int_min / 10; | ||||||
|
|
||||||
| // Process the digits. | ||||||
| char d = *p; | ||||||
| if (tsep != '\0') { | ||||||
| while (1) { | ||||||
| if (d == tsep) { | ||||||
| d = *++p; | ||||||
| continue; | ||||||
| } else if (!isdigit_ascii(d)) { | ||||||
| break; | ||||||
| } | ||||||
| if ((number > pre_min) || | ||||||
| ((number == pre_min) && (d - '0' <= dig_pre_min))) { | ||||||
| number = number * 10 - (d - '0'); | ||||||
| d = *++p; | ||||||
| } else { | ||||||
| *error = ERROR_OVERFLOW; | ||||||
| return 0; | ||||||
| } | ||||||
| } | ||||||
| } else { | ||||||
| while (isdigit_ascii(d)) { | ||||||
| if ((number > pre_min) || | ||||||
| ((number == pre_min) && (d - '0' <= dig_pre_min))) { | ||||||
| number = number * 10 - (d - '0'); | ||||||
| d = *++p; | ||||||
| } else { | ||||||
| *error = ERROR_OVERFLOW; | ||||||
| return 0; | ||||||
| } | ||||||
| } | ||||||
| char buffer[PROCESSED_WORD_CAPACITY]; | ||||||
| const size_t str_len = strlen(p); | ||||||
| if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) { | ||||||
| const int status = copy_string_without_char(buffer, p, str_len, tsep); | ||||||
| if (status != 0) { | ||||||
| // Word is too big, probably will cause an overflow | ||||||
| *error = ERROR_OVERFLOW; | ||||||
| return 0; | ||||||
| } | ||||||
| } else { | ||||||
| // If number is less than pre_max, at least one more digit | ||||||
| // can be processed without overflowing. | ||||||
| int64_t pre_max = int_max / 10; | ||||||
| int dig_pre_max = int_max % 10; | ||||||
|
|
||||||
| // Process the digits. | ||||||
| char d = *p; | ||||||
| if (tsep != '\0') { | ||||||
| while (1) { | ||||||
| if (d == tsep) { | ||||||
| d = *++p; | ||||||
| continue; | ||||||
| } else if (!isdigit_ascii(d)) { | ||||||
| break; | ||||||
| } | ||||||
| if ((number < pre_max) || | ||||||
| ((number == pre_max) && (d - '0' <= dig_pre_max))) { | ||||||
| number = number * 10 + (d - '0'); | ||||||
| d = *++p; | ||||||
| p = buffer; | ||||||
| } | ||||||
|
|
||||||
| } else { | ||||||
| *error = ERROR_OVERFLOW; | ||||||
| return 0; | ||||||
| } | ||||||
| } | ||||||
| } else { | ||||||
| while (isdigit_ascii(d)) { | ||||||
| if ((number < pre_max) || | ||||||
| ((number == pre_max) && (d - '0' <= dig_pre_max))) { | ||||||
| number = number * 10 + (d - '0'); | ||||||
| d = *++p; | ||||||
| char *endptr; | ||||||
| int64_t number = strtoll(p, &endptr, 10); | ||||||
|
|
||||||
| } else { | ||||||
| *error = ERROR_OVERFLOW; | ||||||
| return 0; | ||||||
| } | ||||||
| } | ||||||
| } | ||||||
| if (errno == ERANGE || number > int_max || number < int_min) { | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Are you doing a follow up to get rid of the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I pretend to. I've left it in the verification so that the function wouldn't contain unused arguments. |
||||||
| *error = ERROR_OVERFLOW; | ||||||
| errno = 0; | ||||||
| return 0; | ||||||
| } | ||||||
|
|
||||||
| // Skip trailing spaces. | ||||||
| while (isspace_ascii(*p)) { | ||||||
| ++p; | ||||||
| while (isspace_ascii(*endptr)) { | ||||||
| ++endptr; | ||||||
| } | ||||||
|
|
||||||
| // Did we use up all the characters? | ||||||
| if (*p) { | ||||||
| if (*endptr) { | ||||||
| *error = ERROR_INVALID_CHARS; | ||||||
| return 0; | ||||||
| } | ||||||
|
|
@@ -1974,53 +1952,34 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, | |||||
| return 0; | ||||||
| } | ||||||
|
|
||||||
| // If number is less than pre_max, at least one more digit | ||||||
| // can be processed without overflowing. | ||||||
| // | ||||||
| // Process the digits. | ||||||
| uint64_t number = 0; | ||||||
| const uint64_t pre_max = uint_max / 10; | ||||||
| const uint64_t dig_pre_max = uint_max % 10; | ||||||
| char d = *p; | ||||||
| if (tsep != '\0') { | ||||||
| while (1) { | ||||||
| if (d == tsep) { | ||||||
| d = *++p; | ||||||
| continue; | ||||||
| } else if (!isdigit_ascii(d)) { | ||||||
| break; | ||||||
| } | ||||||
| if ((number < pre_max) || | ||||||
| ((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) { | ||||||
| number = number * 10 + (d - '0'); | ||||||
| d = *++p; | ||||||
|
|
||||||
| } else { | ||||||
| *error = ERROR_OVERFLOW; | ||||||
| return 0; | ||||||
| } | ||||||
| char buffer[PROCESSED_WORD_CAPACITY]; | ||||||
| const size_t str_len = strlen(p); | ||||||
| if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) { | ||||||
| const int status = copy_string_without_char(buffer, p, str_len, tsep); | ||||||
| if (status != 0) { | ||||||
| // Word is too big, probably will cause an overflow | ||||||
| *error = ERROR_OVERFLOW; | ||||||
| return 0; | ||||||
| } | ||||||
| } else { | ||||||
| while (isdigit_ascii(d)) { | ||||||
| if ((number < pre_max) || | ||||||
| ((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) { | ||||||
| number = number * 10 + (d - '0'); | ||||||
| d = *++p; | ||||||
| p = buffer; | ||||||
| } | ||||||
|
|
||||||
| } else { | ||||||
| *error = ERROR_OVERFLOW; | ||||||
| return 0; | ||||||
| } | ||||||
| } | ||||||
| char *endptr; | ||||||
| uint64_t number = strtoull(p, &endptr, 10); | ||||||
|
|
||||||
| if (errno == ERANGE || number > uint_max) { | ||||||
| *error = ERROR_OVERFLOW; | ||||||
| errno = 0; | ||||||
| return 0; | ||||||
| } | ||||||
|
|
||||||
| // Skip trailing spaces. | ||||||
| while (isspace_ascii(*p)) { | ||||||
| ++p; | ||||||
| while (isspace_ascii(*endptr)) { | ||||||
| ++endptr; | ||||||
| } | ||||||
|
|
||||||
| // Did we use up all the characters? | ||||||
| if (*p) { | ||||||
| if (*endptr) { | ||||||
| *error = ERROR_INVALID_CHARS; | ||||||
| return 0; | ||||||
| } | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -72,19 +72,34 @@ def test_read_csv_local(all_parsers, csv1): | |
| tm.assert_frame_equal(result, expected) | ||
|
|
||
|
|
||
| def test_1000_sep(all_parsers): | ||
| @pytest.mark.parametrize( | ||
| "number_csv, expected_number", | ||
| [ | ||
| ("2,334", 2334), | ||
| ("-2,334", -2334), | ||
| ("-2,334,", -2334), | ||
| ("2,,,,,,,,,,,,,,,5", 25), | ||
WillAyd marked this conversation as resolved.
Show resolved
Hide resolved
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think its good to have this for test coverage but I'm doubtful it was intended even on main. Maybe we can add a comment that this is just to make sure our parser code is sound, but not necessarily to always ensure |
||
| ("2,,3,4,,,,,,,,,,,,5", 2345), | ||
| ], | ||
| ) | ||
| def test_1000_sep(all_parsers, number_csv, expected_number, request): | ||
| parser = all_parsers | ||
| data = """A|B|C | ||
| 1|2,334|5 | ||
| data = f"""A|B|C | ||
| 1|{number_csv}|5 | ||
| 10|13|10. | ||
| """ | ||
| expected = DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]}) | ||
| expected = DataFrame({"A": [1, 10], "B": [expected_number, 13], "C": [5, 10.0]}) | ||
|
|
||
| if parser.engine == "pyarrow": | ||
| msg = "The 'thousands' option is not supported with the 'pyarrow' engine" | ||
| with pytest.raises(ValueError, match=msg): | ||
| parser.read_csv(StringIO(data), sep="|", thousands=",") | ||
| return | ||
| elif parser.engine == "python" and ",," in number_csv: | ||
| mark = pytest.mark.xfail( | ||
| reason="Python engine doesn't allow consecutive thousands separators" | ||
| ) | ||
| request.applymarker(mark) | ||
|
|
||
| result = parser.read_csv(StringIO(data), sep="|", thousands=",") | ||
| tm.assert_frame_equal(result, expected) | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.