CLN: use integer parsing functions from stdlib (pandas-dev#62658)

Alvaro-Kothe · WillAyd · eicchen · commit 4405a177962f · 2025-10-17T20:13:50.000-05:00
Co-authored-by: William Ayd <william.ayd@icloud.com>
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
@@ -23,10 +23,15 @@ GitHub. See Python Software Foundation License and BSD licenses for these.
 #include <float.h>
 #include <math.h>
 #include <stdbool.h>
+#include <stdlib.h>
 
 #include "pandas/portable.h"
 #include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64
 
+// Arrow's Decimal256 holds up to 76 decimal digits; the buffer capacity is
+// that rounded up to the next power of 2 (128), leaving room for sign and NUL.
+#define PROCESSED_WORD_CAPACITY 128
+
 void coliter_setup(coliter_t *self, parser_t *parser, int64_t i,
                    int64_t start) {
   // column i, starting at 0
@@ -1834,6 +1839,39 @@ int uint64_conflict(uint_state *self) {
   return self->seen_uint && (self->seen_sint || self->seen_null);
 }
 
+/* Copy `str[0..str_len)` into `output`, dropping every `char_to_remove`, and
+ * NUL-terminate it. Returns 0, or -1 if the result does not fit. */
+static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
+                                    const char *str, size_t str_len,
+                                    char char_to_remove) {
+  const char *left = str;
+  const char *const end_ptr = str + str_len;
+  size_t bytes_written = 0;
+
+  while (left < end_ptr) {
+    // Each pass copies the run of kept bytes in front of the next separator.
+    const char *sep = memchr(left, char_to_remove, (size_t)(end_ptr - left));
+    const char *right = sep ? sep : end_ptr;
+    const size_t chunk_size = (size_t)(right - left);
+
+    // `>=` (not `>`) keeps one byte free for the terminating NUL.
+    if (chunk_size + bytes_written >= PROCESSED_WORD_CAPACITY) {
+      return -1;
+    }
+    memcpy(&output[bytes_written], left, chunk_size);
+    bytes_written += chunk_size;
+
+    if (!sep) {
+      // Nothing left to skip; stepping past `end_ptr` could leave the buffer.
+      break;
+    }
+    left = sep + 1;
+  }
+
+  output[bytes_written] = '\0';
+  return 0;
+}
+
 int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
                      int *error, char tsep) {
   const char *p = p_item;
@@ -1843,105 +1881,45 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
   }
 
   // Handle sign.
-  const bool isneg = *p == '-' ? true : false;
+  const bool has_sign = *p == '-' || *p == '+';
   // Handle sign.
-  if (isneg || (*p == '+')) {
-    p++;
-  }
+  const char *digit_start = has_sign ? p + 1 : p;
 
   // Check that there is a first digit.
-  if (!isdigit_ascii(*p)) {
+  if (!isdigit_ascii(*digit_start)) {
     // Error...
     *error = ERROR_NO_DIGITS;
     return 0;
   }
 
-  int64_t number = 0;
-  if (isneg) {
-    // If number is greater than pre_min, at least one more digit
-    // can be processed without overflowing.
-    int dig_pre_min = -(int_min % 10);
-    int64_t pre_min = int_min / 10;
-
-    // Process the digits.
-    char d = *p;
-    if (tsep != '\0') {
-      while (1) {
-        if (d == tsep) {
-          d = *++p;
-          continue;
-        } else if (!isdigit_ascii(d)) {
-          break;
-        }
-        if ((number > pre_min) ||
-            ((number == pre_min) && (d - '0' <= dig_pre_min))) {
-          number = number * 10 - (d - '0');
-          d = *++p;
-        } else {
-          *error = ERROR_OVERFLOW;
-          return 0;
-        }
-      }
-    } else {
-      while (isdigit_ascii(d)) {
-        if ((number > pre_min) ||
-            ((number == pre_min) && (d - '0' <= dig_pre_min))) {
-          number = number * 10 - (d - '0');
-          d = *++p;
-        } else {
-          *error = ERROR_OVERFLOW;
-          return 0;
-        }
-      }
+  char buffer[PROCESSED_WORD_CAPACITY];
+  const size_t str_len = strlen(p);
+  if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) {
+    const int status = copy_string_without_char(buffer, p, str_len, tsep);
+    if (status != 0) {
+      // Word is too big, probably will cause an overflow
+      *error = ERROR_OVERFLOW;
+      return 0;
     }
-  } else {
-    // If number is less than pre_max, at least one more digit
-    // can be processed without overflowing.
-    int64_t pre_max = int_max / 10;
-    int dig_pre_max = int_max % 10;
-
-    // Process the digits.
-    char d = *p;
-    if (tsep != '\0') {
-      while (1) {
-        if (d == tsep) {
-          d = *++p;
-          continue;
-        } else if (!isdigit_ascii(d)) {
-          break;
-        }
-        if ((number < pre_max) ||
-            ((number == pre_max) && (d - '0' <= dig_pre_max))) {
-          number = number * 10 + (d - '0');
-          d = *++p;
+    p = buffer;
+  }
 
-        } else {
-          *error = ERROR_OVERFLOW;
-          return 0;
-        }
-      }
-    } else {
-      while (isdigit_ascii(d)) {
-        if ((number < pre_max) ||
-            ((number == pre_max) && (d - '0' <= dig_pre_max))) {
-          number = number * 10 + (d - '0');
-          d = *++p;
+  char *endptr;
+  int64_t number = strtoll(p, &endptr, 10);
 
-        } else {
-          *error = ERROR_OVERFLOW;
-          return 0;
-        }
-      }
-    }
+  if (errno == ERANGE || number > int_max || number < int_min) {
+    *error = ERROR_OVERFLOW;
+    errno = 0;
+    return 0;
   }
 
   // Skip trailing spaces.
-  while (isspace_ascii(*p)) {
-    ++p;
+  while (isspace_ascii(*endptr)) {
+    ++endptr;
   }
 
   // Did we use up all the characters?
-  if (*p) {
+  if (*endptr) {
     *error = ERROR_INVALID_CHARS;
     return 0;
   }
@@ -1974,53 +1952,34 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
     return 0;
   }
 
-  // If number is less than pre_max, at least one more digit
-  // can be processed without overflowing.
-  //
-  // Process the digits.
-  uint64_t number = 0;
-  const uint64_t pre_max = uint_max / 10;
-  const uint64_t dig_pre_max = uint_max % 10;
-  char d = *p;
-  if (tsep != '\0') {
-    while (1) {
-      if (d == tsep) {
-        d = *++p;
-        continue;
-      } else if (!isdigit_ascii(d)) {
-        break;
-      }
-      if ((number < pre_max) ||
-          ((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) {
-        number = number * 10 + (d - '0');
-        d = *++p;
-
-      } else {
-        *error = ERROR_OVERFLOW;
-        return 0;
-      }
+  char buffer[PROCESSED_WORD_CAPACITY];
+  const size_t str_len = strlen(p);
+  if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) {
+    const int status = copy_string_without_char(buffer, p, str_len, tsep);
+    if (status != 0) {
+      // Word is too big, probably will cause an overflow
+      *error = ERROR_OVERFLOW;
+      return 0;
     }
-  } else {
-    while (isdigit_ascii(d)) {
-      if ((number < pre_max) ||
-          ((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) {
-        number = number * 10 + (d - '0');
-        d = *++p;
+    p = buffer;
+  }
 
-      } else {
-        *error = ERROR_OVERFLOW;
-        return 0;
-      }
-    }
+  char *endptr;
+  uint64_t number = strtoull(p, &endptr, 10);
+
+  if (errno == ERANGE || number > uint_max) {
+    *error = ERROR_OVERFLOW;
+    errno = 0;
+    return 0;
   }
 
   // Skip trailing spaces.
-  while (isspace_ascii(*p)) {
-    ++p;
+  while (isspace_ascii(*endptr)) {
+    ++endptr;
   }
 
   // Did we use up all the characters?
-  if (*p) {
+  if (*endptr) {
     *error = ERROR_INVALID_CHARS;
     return 0;
   }
diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py
@@ -72,19 +72,36 @@ def test_read_csv_local(all_parsers, csv1):
     tm.assert_frame_equal(result, expected)
 
 
-def test_1000_sep(all_parsers):
+@pytest.mark.parametrize(
+    "number_csv, expected_number",
+    [
+        ("2,334", 2334),
+        ("-2,334", -2334),
+        ("-2,334,", -2334),
+        # Multiple consecutive thousand separators are allowed in C engine,
+        # but it's not necessarily intended behavior and may change in the future.
+        ("2,,,,,,,,,,,,,,,5", 25),
+        ("2,,3,4,,,,,,,,,,,,5", 2345),
+    ],
+)
+def test_1000_sep(all_parsers, number_csv, expected_number, request):
     parser = all_parsers
-    data = """A|B|C
-1|2,334|5
+    data = f"""A|B|C
+1|{number_csv}|5
 10|13|10.
 """
-    expected = DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]})
+    expected = DataFrame({"A": [1, 10], "B": [expected_number, 13], "C": [5, 10.0]})
 
     if parser.engine == "pyarrow":
         msg = "The 'thousands' option is not supported with the 'pyarrow' engine"
         with pytest.raises(ValueError, match=msg):
             parser.read_csv(StringIO(data), sep="|", thousands=",")
         return
+    elif parser.engine == "python" and ",," in number_csv:
+        mark = pytest.mark.xfail(
+            reason="Python engine doesn't allow consecutive thousands separators"
+        )
+        request.applymarker(mark)
 
     result = parser.read_csv(StringIO(data), sep="|", thousands=",")
     tm.assert_frame_equal(result, expected)