Skip to content

Commit 9974aad

Browse files
committed
Addresses #4514 - adds columnwise fillna.
1 parent 22bae73 commit 9974aad

File tree

3 files changed

+98
-45
lines changed

3 files changed

+98
-45
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,7 @@ Other enhancements
214214
- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
215215
- :py:class:`frozenset` elements in pandas objects are now natively printed (:issue:`60690`)
216216
- Add ``"delete_rows"`` option to ``if_exists`` argument in :meth:`DataFrame.to_sql` deleting all records of the table before inserting data (:issue:`37210`).
217+
- :meth:`DataFrame.fillna` now supports filling with a :class:`Series` ``value`` when ``axis=1``, provided all columns share the same dtype (:issue:`4514`)
217218
- Added half-year offset classes :class:`HalfYearBegin`, :class:`HalfYearEnd`, :class:`BHalfYearBegin` and :class:`BHalfYearEnd` (:issue:`60928`)
218219
- Added support to read and write from and to Apache Iceberg tables with the new :func:`read_iceberg` and :meth:`DataFrame.to_iceberg` functions (:issue:`61383`)
219220
- Errors occurring during SQL I/O will now throw a generic :class:`.DatabaseError` instead of the raw Exception type from the underlying driver manager library (:issue:`60748`)

pandas/core/generic.py

Lines changed: 56 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -7117,53 +7117,67 @@ def fillna(
71177117
new_data = self._mgr.fillna(value=value, limit=limit, inplace=inplace)
71187118

71197119
elif isinstance(value, (dict, ABCSeries)):
7120-
if axis == 1:
7121-
raise NotImplementedError(
7122-
"Currently only can fill with dict/Series column by column"
7123-
)
71247120
result = self if inplace else self.copy(deep=False)
7125-
for k, v in value.items():
7126-
if k not in result:
7127-
continue
7121+
if axis == 1:
7122+
# The axis=1 fill below goes through a transpose, so require that all
7123+
# columns share a single dtype; otherwise accurate dtypes would be lost.
7124+
dtypes = [result[col].dtype for col in result.columns]
7125+
if len(set(dtypes)) > 1:
7126+
raise ValueError(
7127+
"All columns must have the same dtype, but got dtypes: "
7128+
f"{dict(zip(result.columns, dtypes))}"
7129+
)
7130+
if (value_dtype := np.asarray(value).dtype) != dtypes[0]:
7131+
raise ValueError(
7132+
"Dtype mismatch for value "
7133+
f"(value.dtype={value_dtype} vs {dtypes[0]})"
7134+
)
7135+
result = result.T.fillna(value=value).T
7136+
else:
7137+
for k, v in value.items():
7138+
if k not in result:
7139+
continue
71287140

7129-
res_k = result[k].fillna(v, limit=limit)
7141+
res_k = result[k].fillna(v, limit=limit)
71307142

7131-
if not inplace:
7132-
result[k] = res_k
7133-
else:
7134-
# We can write into our existing column(s) iff dtype
7135-
# was preserved.
7136-
if isinstance(res_k, ABCSeries):
7137-
# i.e. 'k' only shows up once in self.columns
7138-
if res_k.dtype == result[k].dtype:
7139-
result.loc[:, k] = res_k
7140-
else:
7141-
# Different dtype -> no way to do inplace.
7142-
result[k] = res_k
7143+
if not inplace:
7144+
result[k] = res_k
71437145
else:
7144-
# see test_fillna_dict_inplace_nonunique_columns
7145-
locs = result.columns.get_loc(k)
7146-
if isinstance(locs, slice):
7147-
locs = range(self.shape[1])[locs]
7148-
elif isinstance(locs, np.ndarray) and locs.dtype.kind == "b":
7149-
locs = locs.nonzero()[0]
7150-
elif not (
7151-
isinstance(locs, np.ndarray) and locs.dtype.kind == "i"
7152-
):
7153-
# Should never be reached, but let's cover our bases
7154-
raise NotImplementedError(
7155-
"Unexpected get_loc result, please report a bug at "
7156-
"https://github.com/pandas-dev/pandas"
7157-
)
7158-
7159-
for i, loc in enumerate(locs):
7160-
res_loc = res_k.iloc[:, i]
7161-
target = self.iloc[:, loc]
7162-
7163-
if res_loc.dtype == target.dtype:
7164-
result.iloc[:, loc] = res_loc
7146+
# We can write into our existing column(s) iff dtype
7147+
# was preserved.
7148+
if isinstance(res_k, ABCSeries):
7149+
# i.e. 'k' only shows up once in self.columns
7150+
if res_k.dtype == result[k].dtype:
7151+
result.loc[:, k] = res_k
71657152
else:
7166-
result.isetitem(loc, res_loc)
7153+
# Different dtype -> no way to do inplace.
7154+
result[k] = res_k
7155+
else:
7156+
# see test_fillna_dict_inplace_nonunique_columns
7157+
locs = result.columns.get_loc(k)
7158+
if isinstance(locs, slice):
7159+
locs = range(self.shape[1])[locs]
7160+
elif (
7161+
isinstance(locs, np.ndarray) and locs.dtype.kind == "b"
7162+
):
7163+
locs = locs.nonzero()[0]
7164+
elif not (
7165+
isinstance(locs, np.ndarray) and locs.dtype.kind == "i"
7166+
):
7167+
# Should never be reached, but let's cover our bases
7168+
raise NotImplementedError(
7169+
"Unexpected get_loc result, please report a bug at "
7170+
"https://github.com/pandas-dev/pandas"
7171+
)
7172+
7173+
for i, loc in enumerate(locs):
7174+
res_loc = res_k.iloc[:, i]
7175+
target = self.iloc[:, loc]
7176+
7177+
if res_loc.dtype == target.dtype:
7178+
result.iloc[:, loc] = res_loc
7179+
else:
7180+
result.isetitem(loc, res_loc)
71677181
if inplace:
71687182
return self._update_inplace(result)
71697183
else:

pandas/tests/frame/methods/test_fillna.py

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -461,9 +461,47 @@ def test_fillna_dict_series(self):
461461
expected = df.fillna(df.max().to_dict())
462462
tm.assert_frame_equal(result, expected)
463463

464-
# disable this for now
465-
with pytest.raises(NotImplementedError, match="column by column"):
466-
df.fillna(df.max(axis=1), axis=1)
464+
def test_fillna_dict_series_axis_1(self):
465+
df = DataFrame(
466+
{
467+
"a": [np.nan, 1, 2, np.nan, np.nan],
468+
"b": [1, 2, 3, np.nan, np.nan],
469+
"c": [np.nan, 1, 2, 3, 4],
470+
}
471+
)
472+
result = df.fillna(df.max(axis=1), axis=1)
473+
df.fillna(df.max(axis=1), axis=1, inplace=True)
474+
expected = DataFrame(
475+
{
476+
"a": [1.0, 1, 2, 3, 4],
477+
"b": [1.0, 2, 3, 3, 4],
478+
"c": [1.0, 1, 2, 3, 4],
479+
}
480+
)
481+
tm.assert_frame_equal(result, expected)
482+
tm.assert_frame_equal(df, expected)
483+
484+
def test_fillna_dict_series_axis_1_mismatch_cols(self):
485+
df = DataFrame(
486+
{
487+
"a": ["abc", "def", np.nan, "ghi", "jkl"],
488+
"b": [1, 2, 3, np.nan, np.nan],
489+
"c": [np.nan, 1, 2, 3, 4],
490+
}
491+
)
492+
with pytest.raises(ValueError, match="All columns must have the same dtype"):
493+
df.fillna(Series({"a": "abc", "b": "def", "c": "hij"}), axis=1)
494+
495+
def test_fillna_dict_series_axis_1_value_mismatch_with_cols(self):
496+
df = DataFrame(
497+
{
498+
"a": [np.nan, 1, 2, np.nan, np.nan],
499+
"b": [1, 2, 3, np.nan, np.nan],
500+
"c": [np.nan, 1, 2, 3, 4],
501+
}
502+
)
503+
with pytest.raises(ValueError, match="Dtype mismatch for value"):
504+
df.fillna(Series({"a": "abc", "b": "def", "c": "hij"}), axis=1)
467505

468506
def test_fillna_dataframe(self):
469507
# GH#8377

0 commit comments

Comments
 (0)