Skip to content

Commit 533821c

Browse files
CLN: use integer parsing functions from stdlib (#62658)
Co-authored-by: William Ayd <[email protected]>
1 parent 56f1295 commit 533821c

File tree

2 files changed

+101
-125
lines changed

2 files changed

+101
-125
lines changed

pandas/_libs/src/parser/tokenizer.c

Lines changed: 80 additions & 121 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,15 @@ GitHub. See Python Software Foundation License and BSD licenses for these.
2323
#include <float.h>
2424
#include <math.h>
2525
#include <stdbool.h>
26+
#include <stdlib.h>
2627

2728
#include "pandas/portable.h"
2829
#include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64
2930

31+
// Arrow's Decimal256 type allows up to 76 decimal digits.
32+
// We rounded up to the next power of 2.
33+
#define PROCESSED_WORD_CAPACITY 128
34+
3035
void coliter_setup(coliter_t *self, parser_t *parser, int64_t i,
3136
int64_t start) {
3237
// column i, starting at 0
@@ -1834,6 +1839,39 @@ int uint64_conflict(uint_state *self) {
18341839
return self->seen_uint && (self->seen_sint || self->seen_null);
18351840
}
18361841

1842+
/* Copy a string without `char_to_remove` into `output`.
1843+
*/
1844+
static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
1845+
const char *str, size_t str_len,
1846+
char char_to_remove) {
1847+
const char *left = str;
1848+
const char *end_ptr = str + str_len;
1849+
size_t bytes_written = 0;
1850+
1851+
while (left < end_ptr) {
1852+
const size_t remaining_bytes_to_read = end_ptr - left;
1853+
const char *right = memchr(left, char_to_remove, remaining_bytes_to_read);
1854+
1855+
if (!right) {
1856+
// If it doesn't find the char to remove, just copy until EOS.
1857+
right = end_ptr;
1858+
}
1859+
1860+
const size_t chunk_size = right - left;
1861+
1862+
if (chunk_size + bytes_written >= PROCESSED_WORD_CAPACITY) {
1863+
return -1;
1864+
}
1865+
memcpy(&output[bytes_written], left, chunk_size);
1866+
bytes_written += chunk_size;
1867+
1868+
left = right + 1;
1869+
}
1870+
1871+
output[bytes_written] = '\0';
1872+
return 0;
1873+
}
1874+
18371875
int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
18381876
int *error, char tsep) {
18391877
const char *p = p_item;
@@ -1843,105 +1881,45 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
18431881
}
18441882

18451883
// Handle sign.
1846-
const bool isneg = *p == '-' ? true : false;
1884+
const bool has_sign = *p == '-' || *p == '+';
18471885
// Handle sign.
1848-
if (isneg || (*p == '+')) {
1849-
p++;
1850-
}
1886+
const char *digit_start = has_sign ? p + 1 : p;
18511887

18521888
// Check that there is a first digit.
1853-
if (!isdigit_ascii(*p)) {
1889+
if (!isdigit_ascii(*digit_start)) {
18541890
// Error...
18551891
*error = ERROR_NO_DIGITS;
18561892
return 0;
18571893
}
18581894

1859-
int64_t number = 0;
1860-
if (isneg) {
1861-
// If number is greater than pre_min, at least one more digit
1862-
// can be processed without overflowing.
1863-
int dig_pre_min = -(int_min % 10);
1864-
int64_t pre_min = int_min / 10;
1865-
1866-
// Process the digits.
1867-
char d = *p;
1868-
if (tsep != '\0') {
1869-
while (1) {
1870-
if (d == tsep) {
1871-
d = *++p;
1872-
continue;
1873-
} else if (!isdigit_ascii(d)) {
1874-
break;
1875-
}
1876-
if ((number > pre_min) ||
1877-
((number == pre_min) && (d - '0' <= dig_pre_min))) {
1878-
number = number * 10 - (d - '0');
1879-
d = *++p;
1880-
} else {
1881-
*error = ERROR_OVERFLOW;
1882-
return 0;
1883-
}
1884-
}
1885-
} else {
1886-
while (isdigit_ascii(d)) {
1887-
if ((number > pre_min) ||
1888-
((number == pre_min) && (d - '0' <= dig_pre_min))) {
1889-
number = number * 10 - (d - '0');
1890-
d = *++p;
1891-
} else {
1892-
*error = ERROR_OVERFLOW;
1893-
return 0;
1894-
}
1895-
}
1895+
char buffer[PROCESSED_WORD_CAPACITY];
1896+
const size_t str_len = strlen(p);
1897+
if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) {
1898+
const int status = copy_string_without_char(buffer, p, str_len, tsep);
1899+
if (status != 0) {
1900+
// Word is too big, probably will cause an overflow
1901+
*error = ERROR_OVERFLOW;
1902+
return 0;
18961903
}
1897-
} else {
1898-
// If number is less than pre_max, at least one more digit
1899-
// can be processed without overflowing.
1900-
int64_t pre_max = int_max / 10;
1901-
int dig_pre_max = int_max % 10;
1902-
1903-
// Process the digits.
1904-
char d = *p;
1905-
if (tsep != '\0') {
1906-
while (1) {
1907-
if (d == tsep) {
1908-
d = *++p;
1909-
continue;
1910-
} else if (!isdigit_ascii(d)) {
1911-
break;
1912-
}
1913-
if ((number < pre_max) ||
1914-
((number == pre_max) && (d - '0' <= dig_pre_max))) {
1915-
number = number * 10 + (d - '0');
1916-
d = *++p;
1904+
p = buffer;
1905+
}
19171906

1918-
} else {
1919-
*error = ERROR_OVERFLOW;
1920-
return 0;
1921-
}
1922-
}
1923-
} else {
1924-
while (isdigit_ascii(d)) {
1925-
if ((number < pre_max) ||
1926-
((number == pre_max) && (d - '0' <= dig_pre_max))) {
1927-
number = number * 10 + (d - '0');
1928-
d = *++p;
1907+
char *endptr;
1908+
int64_t number = strtoll(p, &endptr, 10);
19291909

1930-
} else {
1931-
*error = ERROR_OVERFLOW;
1932-
return 0;
1933-
}
1934-
}
1935-
}
1910+
if (errno == ERANGE || number > int_max || number < int_min) {
1911+
*error = ERROR_OVERFLOW;
1912+
errno = 0;
1913+
return 0;
19361914
}
19371915

19381916
// Skip trailing spaces.
1939-
while (isspace_ascii(*p)) {
1940-
++p;
1917+
while (isspace_ascii(*endptr)) {
1918+
++endptr;
19411919
}
19421920

19431921
// Did we use up all the characters?
1944-
if (*p) {
1922+
if (*endptr) {
19451923
*error = ERROR_INVALID_CHARS;
19461924
return 0;
19471925
}
@@ -1974,53 +1952,34 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
19741952
return 0;
19751953
}
19761954

1977-
// If number is less than pre_max, at least one more digit
1978-
// can be processed without overflowing.
1979-
//
1980-
// Process the digits.
1981-
uint64_t number = 0;
1982-
const uint64_t pre_max = uint_max / 10;
1983-
const uint64_t dig_pre_max = uint_max % 10;
1984-
char d = *p;
1985-
if (tsep != '\0') {
1986-
while (1) {
1987-
if (d == tsep) {
1988-
d = *++p;
1989-
continue;
1990-
} else if (!isdigit_ascii(d)) {
1991-
break;
1992-
}
1993-
if ((number < pre_max) ||
1994-
((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) {
1995-
number = number * 10 + (d - '0');
1996-
d = *++p;
1997-
1998-
} else {
1999-
*error = ERROR_OVERFLOW;
2000-
return 0;
2001-
}
1955+
char buffer[PROCESSED_WORD_CAPACITY];
1956+
const size_t str_len = strlen(p);
1957+
if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) {
1958+
const int status = copy_string_without_char(buffer, p, str_len, tsep);
1959+
if (status != 0) {
1960+
// Word is too big, probably will cause an overflow
1961+
*error = ERROR_OVERFLOW;
1962+
return 0;
20021963
}
2003-
} else {
2004-
while (isdigit_ascii(d)) {
2005-
if ((number < pre_max) ||
2006-
((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) {
2007-
number = number * 10 + (d - '0');
2008-
d = *++p;
1964+
p = buffer;
1965+
}
20091966

2010-
} else {
2011-
*error = ERROR_OVERFLOW;
2012-
return 0;
2013-
}
2014-
}
1967+
char *endptr;
1968+
uint64_t number = strtoull(p, &endptr, 10);
1969+
1970+
if (errno == ERANGE || number > uint_max) {
1971+
*error = ERROR_OVERFLOW;
1972+
errno = 0;
1973+
return 0;
20151974
}
20161975

20171976
// Skip trailing spaces.
2018-
while (isspace_ascii(*p)) {
2019-
++p;
1977+
while (isspace_ascii(*endptr)) {
1978+
++endptr;
20201979
}
20211980

20221981
// Did we use up all the characters?
2023-
if (*p) {
1982+
if (*endptr) {
20241983
*error = ERROR_INVALID_CHARS;
20251984
return 0;
20261985
}

pandas/tests/io/parser/common/test_common_basic.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -72,19 +72,36 @@ def test_read_csv_local(all_parsers, csv1):
7272
tm.assert_frame_equal(result, expected)
7373

7474

75-
def test_1000_sep(all_parsers):
75+
@pytest.mark.parametrize(
76+
"number_csv, expected_number",
77+
[
78+
("2,334", 2334),
79+
("-2,334", -2334),
80+
("-2,334,", -2334),
81+
# Multiple consecutive thousand separators are allowed in C engine,
82+
# but it's not necessarily intended behavior and may change in the future.
83+
("2,,,,,,,,,,,,,,,5", 25),
84+
("2,,3,4,,,,,,,,,,,,5", 2345),
85+
],
86+
)
87+
def test_1000_sep(all_parsers, number_csv, expected_number, request):
7688
parser = all_parsers
77-
data = """A|B|C
78-
1|2,334|5
89+
data = f"""A|B|C
90+
1|{number_csv}|5
7991
10|13|10.
8092
"""
81-
expected = DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]})
93+
expected = DataFrame({"A": [1, 10], "B": [expected_number, 13], "C": [5, 10.0]})
8294

8395
if parser.engine == "pyarrow":
8496
msg = "The 'thousands' option is not supported with the 'pyarrow' engine"
8597
with pytest.raises(ValueError, match=msg):
8698
parser.read_csv(StringIO(data), sep="|", thousands=",")
8799
return
100+
elif parser.engine == "python" and ",," in number_csv:
101+
mark = pytest.mark.xfail(
102+
reason="Python engine doesn't allow consecutive thousands separators"
103+
)
104+
request.applymarker(mark)
88105

89106
result = parser.read_csv(StringIO(data), sep="|", thousands=",")
90107
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)