# Patch summary: adds `drop_nans` to the Ruby Polars bindings.
#   - ext/polars/src/lazyframe/general.rs: new `RbLazyFrame::drop_nans`, which clones
#     the value borrowed from `self.ldf` and calls its `drop_nans` with the optional
#     selector (`None` when no subset is given).
#   - ext/polars/src/lib.rs: registers it as the Ruby method "drop_nans" (arity 1).
#   - lib/polars/lazy_frame.rb: `LazyFrame#drop_nans(subset: nil)` converts a non-nil
#     `subset` with `Utils.parse_list_into_selector` before calling the binding.
#   - lib/polars/data_frame.rb: `DataFrame#drop_nans(subset: nil)` delegates to the
#     lazy implementation and collects with `_eager: true`.
# NOTE(review): line breaks, indentation and table padding below were reconstructed
# from a whitespace-collapsed copy of this patch; confirm context-line indentation
# against the target files before applying.
diff --git a/ext/polars/src/lazyframe/general.rs b/ext/polars/src/lazyframe/general.rs
index 99f95c4a37..a11ffe6cd0 100644
--- a/ext/polars/src/lazyframe/general.rs
+++ b/ext/polars/src/lazyframe/general.rs
@@ -816,6 +816,14 @@ impl RbLazyFrame {
         .into())
     }
 
+    pub fn drop_nans(&self, subset: Option<&RbSelector>) -> Self {
+        self.ldf
+            .borrow()
+            .clone()
+            .drop_nans(subset.map(|e| e.inner.clone()))
+            .into()
+    }
+
     pub fn drop_nulls(&self, subset: Option<&RbSelector>) -> Self {
         self.ldf
             .borrow()
diff --git a/ext/polars/src/lib.rs b/ext/polars/src/lib.rs
index 90290be1a6..5e06207ba1 100644
--- a/ext/polars/src/lib.rs
+++ b/ext/polars/src/lib.rs
@@ -781,6 +781,7 @@ fn init(ruby: &Ruby) -> RbResult<()> {
     class.define_method("explode", method!(RbLazyFrame::explode, 1))?;
     class.define_method("null_count", method!(RbLazyFrame::null_count, 0))?;
     class.define_method("unique", method!(RbLazyFrame::unique, 3))?;
+    class.define_method("drop_nans", method!(RbLazyFrame::drop_nans, 1))?;
     class.define_method("drop_nulls", method!(RbLazyFrame::drop_nulls, 1))?;
     class.define_method("slice", method!(RbLazyFrame::slice, 2))?;
     class.define_method("tail", method!(RbLazyFrame::tail, 1))?;
diff --git a/lib/polars/data_frame.rb b/lib/polars/data_frame.rb
index 8cc461ccc0..a9aea0d115 100644
--- a/lib/polars/data_frame.rb
+++ b/lib/polars/data_frame.rb
@@ -1848,6 +1848,51 @@ def tail(n = 5)
       _from_rbdf(_df.tail(n))
     end
 
+    # Drop all rows that contain one or more NaN values.
+    #
+    # The original order of the remaining rows is preserved.
+    #
+    # @param subset [Object]
+    #   Column name(s) for which NaN values are considered; if set to `nil`
+    #   (default), use all columns (note that only floating-point columns
+    #   can contain NaNs).
+    #
+    # @return [DataFrame]
+    #
+    # @example
+    #   df = Polars::DataFrame.new(
+    #     {
+    #       "foo" => [-20.5, Float::NAN, 80.0],
+    #       "bar" => [Float::NAN, 110.0, 25.5],
+    #       "ham" => ["xxx", "yyy", nil],
+    #     }
+    #   )
+    #   df.drop_nans
+    #   # =>
+    #   # shape: (1, 3)
+    #   # ┌──────┬──────┬──────┐
+    #   # │ foo  ┆ bar  ┆ ham  │
+    #   # │ ---  ┆ ---  ┆ ---  │
+    #   # │ f64  ┆ f64  ┆ str  │
+    #   # ╞══════╪══════╪══════╡
+    #   # │ 80.0 ┆ 25.5 ┆ null │
+    #   # └──────┴──────┴──────┘
+    # @example
+    #   df.drop_nans(subset: ["bar"])
+    #   # =>
+    #   # shape: (2, 3)
+    #   # ┌──────┬───────┬──────┐
+    #   # │ foo  ┆ bar   ┆ ham  │
+    #   # │ ---  ┆ ---   ┆ ---  │
+    #   # │ f64  ┆ f64   ┆ str  │
+    #   # ╞══════╪═══════╪══════╡
+    #   # │ NaN  ┆ 110.0 ┆ yyy  │
+    #   # │ 80.0 ┆ 25.5  ┆ null │
+    #   # └──────┴───────┴──────┘
+    def drop_nans(subset: nil)
+      lazy.drop_nans(subset: subset).collect(_eager: true)
+    end
+
     # Drop all rows that contain one or more null values.
     #
     # The original order of the remaining rows is preserved.
diff --git a/lib/polars/lazy_frame.rb b/lib/polars/lazy_frame.rb
index 4ce995bf0f..35515aedaf 100644
--- a/lib/polars/lazy_frame.rb
+++ b/lib/polars/lazy_frame.rb
@@ -3307,6 +3307,55 @@ def unique(maintain_order: true, subset: nil, keep: "first")
       _from_rbldf(_ldf.unique(maintain_order, selector_subset, keep))
     end
 
+    # Drop all rows that contain one or more NaN values.
+    #
+    # The original order of the remaining rows is preserved.
+    #
+    # @param subset [Object]
+    #   Column name(s) for which NaN values are considered; if set to `nil`
+    #   (default), use all columns (note that only floating-point columns
+    #   can contain NaNs).
+    #
+    # @return [LazyFrame]
+    #
+    # @example
+    #   lf = Polars::LazyFrame.new(
+    #     {
+    #       "foo" => [-20.5, Float::NAN, 80.0],
+    #       "bar" => [Float::NAN, 110.0, 25.5],
+    #       "ham" => ["xxx", "yyy", nil],
+    #     }
+    #   )
+    #   lf.drop_nans.collect
+    #   # =>
+    #   # shape: (1, 3)
+    #   # ┌──────┬──────┬──────┐
+    #   # │ foo  ┆ bar  ┆ ham  │
+    #   # │ ---  ┆ ---  ┆ ---  │
+    #   # │ f64  ┆ f64  ┆ str  │
+    #   # ╞══════╪══════╪══════╡
+    #   # │ 80.0 ┆ 25.5 ┆ null │
+    #   # └──────┴──────┴──────┘
+    # @example
+    #   lf.drop_nans(subset: ["bar"]).collect
+    #   # =>
+    #   # shape: (2, 3)
+    #   # ┌──────┬───────┬──────┐
+    #   # │ foo  ┆ bar   ┆ ham  │
+    #   # │ ---  ┆ ---   ┆ ---  │
+    #   # │ f64  ┆ f64   ┆ str  │
+    #   # ╞══════╪═══════╪══════╡
+    #   # │ NaN  ┆ 110.0 ┆ yyy  │
+    #   # │ 80.0 ┆ 25.5  ┆ null │
+    #   # └──────┴───────┴──────┘
+    def drop_nans(subset: nil)
+      selector_subset = nil
+      if !subset.nil?
+        selector_subset = Utils.parse_list_into_selector(subset)._rbselector
+      end
+      _from_rbldf(_ldf.drop_nans(selector_subset))
+    end
+
     # Drop all rows that contain one or more null values.
     #
     # The original order of the remaining rows is preserved.