@@ -23,10 +23,15 @@ GitHub. See Python Software Foundation License and BSD licenses for these.
2323#include <float.h>
2424#include <math.h>
2525#include <stdbool.h>
26+ #include <stdlib.h>
2627
2728#include "pandas/portable.h"
2829#include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64
2930
31+ // Arrow256 allows up to 76 decimal digits.
32+ // We rounded up to the next power of 2.
33+ #define PROCESSED_WORD_CAPACITY 128
34+
3035void coliter_setup (coliter_t * self , parser_t * parser , int64_t i ,
3136 int64_t start ) {
3237 // column i, starting at 0
@@ -1834,6 +1839,39 @@ int uint64_conflict(uint_state *self) {
18341839 return self -> seen_uint && (self -> seen_sint || self -> seen_null );
18351840}
18361841
1842+ /* Copy a string without `char_to_remove` into `output`.
1843+ */
1844+ static int copy_string_without_char (char output [PROCESSED_WORD_CAPACITY ],
1845+ const char * str , size_t str_len ,
1846+ char char_to_remove ) {
1847+ const char * left = str ;
1848+ const char * end_ptr = str + str_len ;
1849+ size_t bytes_written = 0 ;
1850+
1851+ while (left < end_ptr ) {
1852+ const size_t remaining_bytes_to_read = end_ptr - left ;
1853+ const char * right = memchr (left , char_to_remove , remaining_bytes_to_read );
1854+
1855+ if (!right ) {
1856+ // If it doesn't find the char to remove, just copy until EOS.
1857+ right = end_ptr ;
1858+ }
1859+
1860+ const size_t chunk_size = right - left ;
1861+
1862+ if (chunk_size + bytes_written >= PROCESSED_WORD_CAPACITY ) {
1863+ return -1 ;
1864+ }
1865+ memcpy (& output [bytes_written ], left , chunk_size );
1866+ bytes_written += chunk_size ;
1867+
1868+ left = right + 1 ;
1869+ }
1870+
1871+ output [bytes_written ] = '\0' ;
1872+ return 0 ;
1873+ }
1874+
18371875int64_t str_to_int64 (const char * p_item , int64_t int_min , int64_t int_max ,
18381876 int * error , char tsep ) {
18391877 const char * p = p_item ;
@@ -1843,105 +1881,45 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
18431881 }
18441882
18451883 // Handle sign.
1846- const bool isneg = * p == '-' ? true : false ;
1884+ const bool has_sign = * p == '-' || * p == '+' ;
18471885 // Handle sign.
1848- if (isneg || (* p == '+' )) {
1849- p ++ ;
1850- }
1886+ const char * digit_start = has_sign ? p + 1 : p ;
18511887
18521888 // Check that there is a first digit.
1853- if (!isdigit_ascii (* p )) {
1889+ if (!isdigit_ascii (* digit_start )) {
18541890 // Error...
18551891 * error = ERROR_NO_DIGITS ;
18561892 return 0 ;
18571893 }
18581894
1859- int64_t number = 0 ;
1860- if (isneg ) {
1861- // If number is greater than pre_min, at least one more digit
1862- // can be processed without overflowing.
1863- int dig_pre_min = - (int_min % 10 );
1864- int64_t pre_min = int_min / 10 ;
1865-
1866- // Process the digits.
1867- char d = * p ;
1868- if (tsep != '\0' ) {
1869- while (1 ) {
1870- if (d == tsep ) {
1871- d = * ++ p ;
1872- continue ;
1873- } else if (!isdigit_ascii (d )) {
1874- break ;
1875- }
1876- if ((number > pre_min ) ||
1877- ((number == pre_min ) && (d - '0' <= dig_pre_min ))) {
1878- number = number * 10 - (d - '0' );
1879- d = * ++ p ;
1880- } else {
1881- * error = ERROR_OVERFLOW ;
1882- return 0 ;
1883- }
1884- }
1885- } else {
1886- while (isdigit_ascii (d )) {
1887- if ((number > pre_min ) ||
1888- ((number == pre_min ) && (d - '0' <= dig_pre_min ))) {
1889- number = number * 10 - (d - '0' );
1890- d = * ++ p ;
1891- } else {
1892- * error = ERROR_OVERFLOW ;
1893- return 0 ;
1894- }
1895- }
1895+ char buffer [PROCESSED_WORD_CAPACITY ];
1896+ const size_t str_len = strlen (p );
1897+ if (tsep != '\0' && memchr (p , tsep , str_len ) != NULL ) {
1898+ const int status = copy_string_without_char (buffer , p , str_len , tsep );
1899+ if (status != 0 ) {
1900+ // Word is too big, probably will cause an overflow
1901+ * error = ERROR_OVERFLOW ;
1902+ return 0 ;
18961903 }
1897- } else {
1898- // If number is less than pre_max, at least one more digit
1899- // can be processed without overflowing.
1900- int64_t pre_max = int_max / 10 ;
1901- int dig_pre_max = int_max % 10 ;
1902-
1903- // Process the digits.
1904- char d = * p ;
1905- if (tsep != '\0' ) {
1906- while (1 ) {
1907- if (d == tsep ) {
1908- d = * ++ p ;
1909- continue ;
1910- } else if (!isdigit_ascii (d )) {
1911- break ;
1912- }
1913- if ((number < pre_max ) ||
1914- ((number == pre_max ) && (d - '0' <= dig_pre_max ))) {
1915- number = number * 10 + (d - '0' );
1916- d = * ++ p ;
1904+ p = buffer ;
1905+ }
19171906
1918- } else {
1919- * error = ERROR_OVERFLOW ;
1920- return 0 ;
1921- }
1922- }
1923- } else {
1924- while (isdigit_ascii (d )) {
1925- if ((number < pre_max ) ||
1926- ((number == pre_max ) && (d - '0' <= dig_pre_max ))) {
1927- number = number * 10 + (d - '0' );
1928- d = * ++ p ;
1907+ char * endptr ;
1908+ int64_t number = strtoll (p , & endptr , 10 );
19291909
1930- } else {
1931- * error = ERROR_OVERFLOW ;
1932- return 0 ;
1933- }
1934- }
1935- }
1910+ if (errno == ERANGE || number > int_max || number < int_min ) {
1911+ * error = ERROR_OVERFLOW ;
1912+ errno = 0 ;
1913+ return 0 ;
19361914 }
19371915
19381916 // Skip trailing spaces.
1939- while (isspace_ascii (* p )) {
1940- ++ p ;
1917+ while (isspace_ascii (* endptr )) {
1918+ ++ endptr ;
19411919 }
19421920
19431921 // Did we use up all the characters?
1944- if (* p ) {
1922+ if (* endptr ) {
19451923 * error = ERROR_INVALID_CHARS ;
19461924 return 0 ;
19471925 }
@@ -1974,53 +1952,34 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
19741952 return 0 ;
19751953 }
19761954
1977- // If number is less than pre_max, at least one more digit
1978- // can be processed without overflowing.
1979- //
1980- // Process the digits.
1981- uint64_t number = 0 ;
1982- const uint64_t pre_max = uint_max / 10 ;
1983- const uint64_t dig_pre_max = uint_max % 10 ;
1984- char d = * p ;
1985- if (tsep != '\0' ) {
1986- while (1 ) {
1987- if (d == tsep ) {
1988- d = * ++ p ;
1989- continue ;
1990- } else if (!isdigit_ascii (d )) {
1991- break ;
1992- }
1993- if ((number < pre_max ) ||
1994- ((number == pre_max ) && ((uint64_t )(d - '0' ) <= dig_pre_max ))) {
1995- number = number * 10 + (d - '0' );
1996- d = * ++ p ;
1997-
1998- } else {
1999- * error = ERROR_OVERFLOW ;
2000- return 0 ;
2001- }
1955+ char buffer [PROCESSED_WORD_CAPACITY ];
1956+ const size_t str_len = strlen (p );
1957+ if (tsep != '\0' && memchr (p , tsep , str_len ) != NULL ) {
1958+ const int status = copy_string_without_char (buffer , p , str_len , tsep );
1959+ if (status != 0 ) {
1960+ // Word is too big, probably will cause an overflow
1961+ * error = ERROR_OVERFLOW ;
1962+ return 0 ;
20021963 }
2003- } else {
2004- while (isdigit_ascii (d )) {
2005- if ((number < pre_max ) ||
2006- ((number == pre_max ) && ((uint64_t )(d - '0' ) <= dig_pre_max ))) {
2007- number = number * 10 + (d - '0' );
2008- d = * ++ p ;
1964+ p = buffer ;
1965+ }
20091966
2010- } else {
2011- * error = ERROR_OVERFLOW ;
2012- return 0 ;
2013- }
2014- }
1967+ char * endptr ;
1968+ uint64_t number = strtoull (p , & endptr , 10 );
1969+
1970+ if (errno == ERANGE || number > uint_max ) {
1971+ * error = ERROR_OVERFLOW ;
1972+ errno = 0 ;
1973+ return 0 ;
20151974 }
20161975
20171976 // Skip trailing spaces.
2018- while (isspace_ascii (* p )) {
2019- ++ p ;
1977+ while (isspace_ascii (* endptr )) {
1978+ ++ endptr ;
20201979 }
20211980
20221981 // Did we use up all the characters?
2023- if (* p ) {
1982+ if (* endptr ) {
20241983 * error = ERROR_INVALID_CHARS ;
20251984 return 0 ;
20261985 }
0 commit comments