Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,7 @@ Other enhancements
- :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`)
- :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`)
- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
- :func:`to_numeric` now converts big integers to ``object`` dtype containing Python integers when not coercing. (:issue:`51295`)
- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
- :meth:`DataFrame.apply` supports using third-party execution engines like the Bodo.ai JIT compiler (:issue:`60668`)
- :meth:`DataFrame.iloc` and :meth:`Series.iloc` now support boolean masks in ``__getitem__`` for more consistent indexing behavior (:issue:`60994`)
Expand Down Expand Up @@ -1091,7 +1092,7 @@ I/O
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
- Bug in :meth:`read_csv` where the order of the ``na_values`` caused an inconsistency when ``na_values`` is a list of non-string values. (:issue:`59303`)
- Bug in :meth:`read_csv` with ``engine="c"`` reading big integers as strings. Now reads them as python integers. (:issue:`51295`)
- Bug in :meth:`read_csv` with the ``c`` and ``python`` engines reading big integers as strings; they are now read as Python integers. (:issue:`51295`)
- Bug in :meth:`read_csv` with ``engine="c"`` reading large float numbers with preceding integers as strings. Now reads them as floats. (:issue:`51295`)
- Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`)
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
Expand Down
26 changes: 21 additions & 5 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1386,6 +1386,7 @@ cdef class Seen:
bint nan_ # seen_np.nan
bint uint_ # seen_uint (unsigned integer)
bint sint_ # seen_sint (signed integer)
bint overflow_ # seen_overflow
bint float_ # seen_float
bint object_ # seen_object
bint complex_ # seen_complex
Expand Down Expand Up @@ -1414,6 +1415,7 @@ cdef class Seen:
self.nan_ = False
self.uint_ = False
self.sint_ = False
self.overflow_ = False
self.float_ = False
self.object_ = False
self.complex_ = False
Expand Down Expand Up @@ -2379,6 +2381,9 @@ def maybe_convert_numeric(
ndarray[uint64_t, ndim=1] uints = cnp.PyArray_EMPTY(
1, values.shape, cnp.NPY_UINT64, 0
)
ndarray[object, ndim=1] pyints = cnp.PyArray_EMPTY(
1, values.shape, cnp.NPY_OBJECT, 0
)
ndarray[uint8_t, ndim=1] bools = cnp.PyArray_EMPTY(
1, values.shape, cnp.NPY_UINT8, 0
)
Expand Down Expand Up @@ -2421,18 +2426,24 @@ def maybe_convert_numeric(

val = int(val)
seen.saw_int(val)
pyints[i] = val

if val >= 0:
if val <= oUINT64_MAX:
uints[i] = val
else:
elif seen.coerce_numeric:
seen.float_ = True
else:
seen.overflow_ = True

if oINT64_MIN <= val <= oINT64_MAX:
ints[i] = val

if val < oINT64_MIN or (seen.sint_ and seen.uint_):
seen.float_ = True
if seen.coerce_numeric:
seen.float_ = True
else:
seen.overflow_ = True

elif util.is_bool_object(val):
floats[i] = uints[i] = ints[i] = bools[i] = val
Expand Down Expand Up @@ -2476,6 +2487,7 @@ def maybe_convert_numeric(

if maybe_int:
as_int = int(val)
pyints[i] = as_int

if as_int in na_values:
mask[i] = 1
Expand All @@ -2490,7 +2502,7 @@ def maybe_convert_numeric(
if seen.coerce_numeric:
seen.float_ = True
else:
raise ValueError("Integer out of range.")
seen.overflow_ = True
else:
if as_int >= 0:
uints[i] = as_int
Expand Down Expand Up @@ -2529,11 +2541,15 @@ def maybe_convert_numeric(
return (floats, None)
elif seen.int_:
if seen.null_ and convert_to_masked_nullable:
if seen.uint_:
if seen.overflow_:
return (pyints, mask.view(np.bool_))
elif seen.uint_:
return (uints, mask.view(np.bool_))
else:
return (ints, mask.view(np.bool_))
if seen.uint_:
if seen.overflow_:
return (pyints, None)
elif seen.uint_:
return (uints, None)
else:
return (ints, None)
Expand Down
6 changes: 5 additions & 1 deletion pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,11 @@ def _infer_types(
if values.dtype == np.object_:
na_count = parsers.sanitize_objects(values, na_values)

if result.dtype == np.object_ and try_num_bool:
if (
result.dtype == np.object_
and try_num_bool
and (len(result) == 0 or not isinstance(result[0], int))
):
result, bool_mask = libops.maybe_convert_bool(
np.asarray(values),
true_values=self.true_values,
Expand Down
20 changes: 20 additions & 0 deletions pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -729,6 +729,26 @@ def test_convert_int_overflow(self, value):
result = lib.maybe_convert_objects(arr)
tm.assert_numpy_array_equal(arr, result)

@pytest.mark.parametrize(
"value, expected_value",
[
(-(1 << 65), -(1 << 65)),
(1 << 65, 1 << 65),
(str(1 << 65), 1 << 65),
(f"-{1 << 65}", -(1 << 65)),
],
)
@pytest.mark.parametrize("coerce_numeric", [False, True])
def test_convert_numeric_overflow(self, value, expected_value, coerce_numeric):
arr = np.array([value], dtype=object)
expected = np.array([expected_value], dtype=float if coerce_numeric else object)
result, _ = lib.maybe_convert_numeric(
arr,
set(),
coerce_numeric=coerce_numeric,
)
tm.assert_numpy_array_equal(result, expected)

@pytest.mark.parametrize("val", [None, np.nan, float("nan")])
@pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"])
def test_maybe_convert_objects_nat_inference(self, val, dtype):
Expand Down
8 changes: 0 additions & 8 deletions pandas/tests/io/parser/common/test_ints.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,11 +144,6 @@ def test_int64_overflow(all_parsers, conv, request):
if parser.engine == "pyarrow":
mark = pytest.mark.xfail(reason="parses to float64")
request.applymarker(mark)
elif parser.engine == "python":
mark = pytest.mark.xfail(
reason="TODO: Python engine reads bigint as string"
)
request.applymarker(mark)

result = parser.read_csv(StringIO(data))
expected = DataFrame(
Expand Down Expand Up @@ -206,9 +201,6 @@ def test_outside_int64_uint64_range(all_parsers, val, request):
# These numbers fall just outside the int64-uint64
# range, so they should be parsed as object.
parser = all_parsers
if parser.engine == "python":
mark = pytest.mark.xfail(reason="TODO: Python engine reads bigint as string")
request.applymarker(mark)

result = parser.read_csv(StringIO(str(val)), header=None)

Expand Down
37 changes: 10 additions & 27 deletions pandas/tests/tools/test_to_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,15 +250,9 @@ def test_really_large_scalar(large_val, signed, transform, errors):
val = -large_val if signed else large_val

val = transform(val)
val_is_string = isinstance(val, str)

if val_is_string and errors in (None, "raise"):
msg = "Integer out of range. at position 0"
with pytest.raises(ValueError, match=msg):
to_numeric(val, **kwargs)
else:
expected = float(val) if (errors == "coerce" and val_is_string) else val
tm.assert_almost_equal(to_numeric(val, **kwargs), expected)
expected = float(val) if errors == "coerce" else int(val)
tm.assert_almost_equal(to_numeric(val, **kwargs), expected)


def test_really_large_in_arr(large_val, signed, transform, multiple_elts, errors):
Expand All @@ -270,21 +264,17 @@ def test_really_large_in_arr(large_val, signed, transform, multiple_elts, errors
extra_elt = "string"
arr = [val] + multiple_elts * [extra_elt]

val_is_string = isinstance(val, str)
coercing = errors == "coerce"

if errors in (None, "raise") and (val_is_string or multiple_elts):
if val_is_string:
msg = "Integer out of range. at position 0"
else:
msg = 'Unable to parse string "string" at position 1'
if errors in (None, "raise") and multiple_elts:
msg = 'Unable to parse string "string" at position 1'

with pytest.raises(ValueError, match=msg):
to_numeric(arr, **kwargs)
else:
result = to_numeric(arr, **kwargs)

exp_val = float(val) if (coercing and val_is_string) else val
exp_val = float(val) if (coercing) else int(val)
expected = [exp_val]

if multiple_elts:
Expand All @@ -295,7 +285,7 @@ def test_really_large_in_arr(large_val, signed, transform, multiple_elts, errors
expected.append(extra_elt)
exp_dtype = object
else:
exp_dtype = float if isinstance(exp_val, (int, float)) else object
exp_dtype = float if isinstance(exp_val, float) else object

tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))

Expand All @@ -311,18 +301,11 @@ def test_really_large_in_arr_consistent(large_val, signed, multiple_elts, errors
if multiple_elts:
arr.insert(0, large_val)

if errors in (None, "raise"):
index = int(multiple_elts)
msg = f"Integer out of range. at position {index}"
result = to_numeric(arr, **kwargs)
expected = [float(i) if errors == "coerce" else int(i) for i in arr]
exp_dtype = float if errors == "coerce" else object

with pytest.raises(ValueError, match=msg):
to_numeric(arr, **kwargs)
else:
result = to_numeric(arr, **kwargs)
expected = [float(i) for i in arr]
exp_dtype = float

tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))
tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))


@pytest.mark.parametrize(
Expand Down
Loading