Bump versions #38

Merged
6 commits merged on Aug 14, 2024
951 changes: 790 additions & 161 deletions Cargo.lock

Large diffs are not rendered by default.

14 changes: 9 additions & 5 deletions Cargo.toml
@@ -12,14 +12,18 @@ name = "minimal_plugin"
crate-type= ["cdylib"]

[dependencies]
pyo3 = { version = "0.21.2", features = ["extension-module"] }
pyo3-polars = { version = "0.15.0", features = ["derive"] }
pyo3 = { version = "0.22.2", features = ["extension-module", "abi3-py38"] }
pyo3-polars = { version = "0.16.0", features = ["derive", "dtype-struct", "dtype-decimal"] }
serde = { version = "1", features = ["derive"] }
polars = { version = "0.41.3", features=["dtype-struct"], default-features = false }
polars-arrow = { version = "0.41.3", default-features = false }
polars-core = { version = "0.41.3", default-features = false }
polars = { version = "0.42.0", features = ["dtype-struct"], default-features = false }
polars-arrow = { version = "0.42.0", default-features = false }
polars-core = { version = "0.42.0", default-features = false }
polars-sql = { version = "0.42.0", default-features = false }
reverse_geocoder = "4.1.1"
# rust-stemmers = "1.2.0"

[patch.crates-io]
pyo3-polars = { git = 'https://github.com/MarcoGorelli/pyo3-polars.git', rev='a0327ec121986711aec32dc1e52234838bf3b25b' }
Comment on lines +25 to +26 (Owner Author):

temporary; I've opened a PR: pola-rs/pyo3-polars#96


[target.'cfg(target_os = "linux")'.dependencies]
jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
2 changes: 1 addition & 1 deletion docs/arguments.md
@@ -49,7 +49,7 @@ which is going to be very similar to the good version of `pig_latinnify`:
fn add_suffix(inputs: &[Series], kwargs: AddSuffixKwargs) -> PolarsResult<Series> {
let s = &inputs[0];
let ca = s.str()?;
let out = ca.apply_to_buffer(|value, output| {
let out = ca.apply_into_string_amortized(|value, output| {
write!(output, "{}{}", value, kwargs.suffix).unwrap();
});
Ok(out.into_series())
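
For context (not part of this diff): the `kwargs.suffix` field above comes from a plain struct that pyo3-polars deserializes with serde. A rough sketch of that struct, assuming the `AddSuffixKwargs` name from the function signature and a definition that presumably lives elsewhere in the chapter:

```rust
use serde::Deserialize;

// Sketch of the kwargs struct deserialized by pyo3-polars; field names must
// match the keyword arguments passed from the Python side.
#[derive(Deserialize)]
struct AddSuffixKwargs {
    suffix: String,
}
```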
2 changes: 1 addition & 1 deletion docs/lists_in_lists_out.md
@@ -59,7 +59,7 @@ fn non_zero_indices(inputs: &[Series]) -> PolarsResult<Series> {
Ok(out.into_series())
}
```
`apply_amortized` is a bit like the `apply_to_buffer` function we used in [How to STRING something together],
`apply_amortized` is a bit like the `apply_into_string_amortized` function we used in [How to STRING something together],
in that it makes a big allocation upfront to amortize the allocation costs. Think of it as a list version
of `apply_values`, where each element is itself a `Series`.
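
A hedged sketch of `apply_amortized` in use (assumed helper name; the exact closure argument type varies between Polars versions):

```rust
use polars::prelude::*;

// Sketch only: keep the indices of the non-zero elements of each inner list.
// `apply_amortized` hands the closure a reusable, amortized view of each row's Series.
fn non_zero_indices_sketch(s: &Series) -> PolarsResult<Series> {
    let ca = s.list()?;
    let out: ListChunked = ca.apply_amortized(|row| {
        let row: &Series = row.as_ref();
        let inner: &Int64Chunked = row.i64().unwrap();
        let idxs: IdxCa = inner
            .iter()
            .enumerate()
            .filter(|(_i, v)| *v != Some(0))
            .map(|(i, _)| Some(i as IdxSize))
            .collect();
        idxs.into_series()
    });
    Ok(out.into_series())
}
```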

2 changes: 1 addition & 1 deletion docs/lost_in_space.md
@@ -35,7 +35,7 @@ for longitude) and producing a String output column.

## Binary elementwise apply to buffer

In [How to STRING something together], we learned how to use `StringChunked.apply_to_buffer`
In [How to STRING something together], we learned how to use `StringChunked.apply_into_string_amortized`
to run an elementwise function on a String column. Does Polars have a binary version of that one
which allows us to start from any data type?
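
To make the problem concrete, here is a naive baseline sketch (assumed names, and not the approach the chapter settles on): it zips two Float64 columns and formats a String per row, allocating a fresh String each time, which is exactly the cost an amortized helper would avoid.

```rust
use polars::prelude::*;
use std::fmt::Write;

// Naive baseline: format "lat, lon" per row by zipping two Float64 columns.
// The per-row `buf.clone()` allocation is what an amortized, binary helper removes.
fn format_coords(lat: &Float64Chunked, lon: &Float64Chunked) -> StringChunked {
    let mut buf = String::new();
    lat.iter()
        .zip(lon.iter())
        .map(|(lat, lon)| match (lat, lon) {
            (Some(lat), Some(lon)) => {
                buf.clear();
                write!(buf, "{lat}, {lon}").unwrap();
                Some(buf.clone())
            }
            _ => None,
        })
        .collect()
}
```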

2 changes: 1 addition & 1 deletion docs/stem.md
@@ -53,7 +53,7 @@ use rust_stemmers::{Algorithm, Stemmer};
fn snowball_stem(inputs: &[Series]) -> PolarsResult<Series> {
let ca: &StringChunked = inputs[0].str()?;
let en_stemmer = Stemmer::create(Algorithm::English);
let out: StringChunked = ca.apply_to_buffer(|value: &str, output: &mut String| {
let out: StringChunked = ca.apply_into_string_amortized(|value: &str, output: &mut String| {
write!(output, "{}", en_stemmer.stem(value)).unwrap()
});
Ok(out.into_series())
4 changes: 2 additions & 2 deletions docs/stringify.md
@@ -89,7 +89,7 @@ Can we do better?

## Pig-latinnify - take 2

Yes! `StringChunked` has a utility `apply_to_buffer` method which amortises
Yes! `StringChunked` has a utility `apply_into_string_amortized` method which amortises
the cost of creating new strings for each row by creating a string upfront,
clearing it, and repeatedly writing to it.
This gives a 4x speedup! All you need to do is change `pig_latinnify` to:
@@ -98,7 +98,7 @@ This gives a 4x speedup! All you need to do is change `pig_latinnify` to:
#[polars_expr(output_type=String)]
fn pig_latinnify(inputs: &[Series]) -> PolarsResult<Series> {
let ca: &StringChunked = inputs[0].str()?;
let out: StringChunked = ca.apply_to_buffer(|value: &str, output: &mut String| {
let out: StringChunked = ca.apply_into_string_amortized(|value: &str, output: &mut String| {
if let Some(first_char) = value.chars().next() {
write!(output, "{}{}ay", &value[1..], first_char).unwrap()
}
4 changes: 2 additions & 2 deletions docs/struct.md
@@ -24,8 +24,8 @@ def shift_struct(expr: IntoExpr) -> pl.Expr:
On the Rust side, we need to start by activating the necessary
feature - in `Cargo.toml`, please make this change:
```diff
-polars = { version = "0.41.3", default-features = false }
+polars = { version = "0.41.3", features=["dtype-struct"], default-features = false }
-polars = { version = "0.42.0", default-features = false }
+polars = { version = "0.42.0", features=["dtype-struct"], default-features = false }
```

Then, we need to get the schema right.
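
Getting the schema right means giving `#[polars_expr]` an `output_type_func` that maps the input `Field`s to the output `Field`. A minimal sketch under assumed names (the real `shift_struct` in this PR's `src/expressions.rs` builds a shifted Struct dtype rather than passing the input through):

```rust
use polars::prelude::*;
use pyo3_polars::derive::polars_expr;

// Simplest possible schema function: reuse the input Field unchanged.
fn passthrough_dtype(input_fields: &[Field]) -> PolarsResult<Field> {
    Ok(input_fields[0].clone())
}

#[polars_expr(output_type_func=passthrough_dtype)]
fn noop_struct(inputs: &[Series]) -> PolarsResult<Series> {
    // Values are untouched; only the schema plumbing is illustrated here.
    Ok(inputs[0].clone())
}
```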
10 changes: 8 additions & 2 deletions minimal_plugin/__init__.py
@@ -1,15 +1,20 @@
from __future__ import annotations
from typing import TYPE_CHECKING

import polars as pl
from pathlib import Path
from polars.type_aliases import IntoExpr
from minimal_plugin.utils import register_plugin, parse_version

if parse_version(pl.__version__) < parse_version("0.20.16"):
from polars.utils.udfs import _get_shared_lib_location
from polars.utils.udfs import _get_shared_lib_location # type: ignore[missing-import]

lib: str | Path = _get_shared_lib_location(__file__)
else:
lib = Path(__file__).parent

if TYPE_CHECKING:
from minimal_plugin.typing import IntoExpr


def noop(expr: IntoExpr) -> pl.Expr:
return register_plugin(
@@ -141,6 +146,7 @@ def interpolate(expr: IntoExpr) -> pl.Expr:
is_elementwise=False,
)


def life_step(left: IntoExpr, mid: IntoExpr, right: IntoExpr) -> pl.Expr:
return register_plugin(
args=[left, mid, right],
14 changes: 14 additions & 0 deletions minimal_plugin/typing.py
@@ -0,0 +1,14 @@
from typing import TYPE_CHECKING, Union

if TYPE_CHECKING:
import sys
import polars as pl

if sys.version_info >= (3, 10):
from typing import TypeAlias
else:
from typing_extensions import TypeAlias
from polars.datatypes import DataType, DataTypeClass

IntoExpr: TypeAlias = Union[pl.Expr, str, pl.Series]
PolarsDataType: TypeAlias = Union[DataType, DataTypeClass]
10 changes: 5 additions & 5 deletions minimal_plugin/utils.py
@@ -6,7 +6,7 @@
import polars as pl

if TYPE_CHECKING:
from polars.type_aliases import IntoExpr, PolarsDataType
from my_plugin.typing import IntoExpr, PolarsDataType
from pathlib import Path


@@ -54,18 +54,18 @@ def register_plugin(
*,
symbol: str,
is_elementwise: bool,
kwargs: dict[str, Any] | None = None,
args: list[IntoExpr],
lib: str | Path,
kwargs: dict[str, Any] | None = None,
returns_scalar: bool = False,
) -> pl.Expr:
if parse_version(pl.__version__) < parse_version("0.20.16"):
assert isinstance(args[0], pl.Expr)
expr = parse_into_expr(args[0])
assert isinstance(lib, str)
return args[0].register_plugin(
return expr.register_plugin(
lib=lib,
symbol=symbol,
args=args[1:],
args=args[1:], # type: ignore[arg-type]
kwargs=kwargs,
is_elementwise=is_elementwise,
returns_scalar=returns_scalar,
57 changes: 33 additions & 24 deletions perf.py
@@ -16,34 +16,43 @@
)
"""

results = np.array(timeit.Timer(
stmt="df.select(pl.col('a').mp.abs_i64_fast())",
setup=setup,
results = (
np.array(
timeit.Timer(
stmt="df.select(pl.col('a').mp.abs_i64_fast())",
setup=setup,
).repeat(7, 3)
)
.repeat(7, 3)
)/3
print(f'min: {min(results)}')
print(f'max: {max(results)}')
print(f'{np.mean(results)} +/- {np.std(results)/np.sqrt(len(results))}')
/ 3
)
print(f"min: {min(results)}")
print(f"max: {max(results)}")
print(f"{np.mean(results)} +/- {np.std(results)/np.sqrt(len(results))}")

results = np.array(timeit.Timer(
stmt="df.select(pl.col('a').mp.abs_i64())",
setup=setup,
results = (
np.array(
timeit.Timer(
stmt="df.select(pl.col('a').mp.abs_i64())",
setup=setup,
).repeat(7, 3)
)
.repeat(7, 3)
)/3
print(f'min: {min(results)}')
print(f'max: {max(results)}')
print(f'{np.mean(results)} +/- {np.std(results)/np.sqrt(len(results))}')
/ 3
)
print(f"min: {min(results)}")
print(f"max: {max(results)}")
print(f"{np.mean(results)} +/- {np.std(results)/np.sqrt(len(results))}")

with warnings.catch_warnings():
warnings.simplefilter("ignore")
results = np.array(timeit.Timer(
stmt="df.select(pl.col('a').map_elements(lambda x: abs(x)))",
setup=setup,
results = (
np.array(
timeit.Timer(
stmt="df.select(pl.col('a').map_elements(lambda x: abs(x)))",
setup=setup,
).repeat(7, 3)
)
.repeat(7, 3)
)/3
print(f'min: {min(results)}')
print(f'max: {max(results)}')
print(f'{np.mean(results)} +/- {np.std(results)/np.sqrt(len(results))}')
/ 3
)
print(f"min: {min(results)}")
print(f"max: {max(results)}")
print(f"{np.mean(results)} +/- {np.std(results)/np.sqrt(len(results))}")
38 changes: 22 additions & 16 deletions perf_list.py
@@ -12,22 +12,28 @@
df = pl.DataFrame({'a': [rng.integers(low=-100, high=100, size=5) for _ in range(N)]})
"""

results = np.array(timeit.Timer(
stmt="df.select(mp.non_zero_indices('a'))",
setup=setup,
results = (
np.array(
timeit.Timer(
stmt="df.select(mp.non_zero_indices('a'))",
setup=setup,
).repeat(7, 3)
)
.repeat(7, 3)
)/3
print(f'min: {min(results)}')
print(f'max: {max(results)}')
print(f'{np.mean(results)} +/- {np.std(results)/np.sqrt(len(results))}')
/ 3
)
print(f"min: {min(results)}")
print(f"max: {max(results)}")
print(f"{np.mean(results)} +/- {np.std(results)/np.sqrt(len(results))}")

results = np.array(timeit.Timer(
stmt="df.select(pl.col('a').list.eval(pl.arg_where(pl.element() != 0)))",
setup=setup,
results = (
np.array(
timeit.Timer(
stmt="df.select(pl.col('a').list.eval(pl.arg_where(pl.element() != 0)))",
setup=setup,
).repeat(7, 3)
)
.repeat(7, 3)
)/3
print(f'min: {min(results)}')
print(f'max: {max(results)}')
print(f'{np.mean(results)} +/- {np.std(results)/np.sqrt(len(results))}')
/ 3
)
print(f"min: {min(results)}")
print(f"max: {max(results)}")
print(f"{np.mean(results)} +/- {np.std(results)/np.sqrt(len(results))}")
47 changes: 28 additions & 19 deletions run.py
@@ -5,26 +5,35 @@
import polars as pl
import minimal_plugin as mp

df = pl.DataFrame({
'values': [[1, 3, 2], [5, 7], []],
'weights': [[.5, .3, .2], [.1, .9], []]
})
print(df.with_columns(weighted_mean = mp.weighted_mean('values', 'weights')))
df = pl.DataFrame(
{"values": [[1, 3, 2], [5, 7], []], "weights": [[0.5, 0.3, 0.2], [0.1, 0.9], []]}
)
print(df.with_columns(weighted_mean=mp.weighted_mean("values", "weights")))

df = pl.DataFrame({
'english': ['foo', 'bar', ''],
})
print(df.with_columns(pig_latin = mp.pig_latinnify('english')))
df = pl.DataFrame(
{
"english": ["foo", "bar", ""],
}
)
print(df.with_columns(pig_latin=mp.pig_latinnify("english")))

df = pl.DataFrame({
'values': [1., 3, 2, 5, 7],
'weights': [.5, .3, .2, .1, .9],
'group': ['a', 'a', 'a', 'b', 'b'],
})
print(df.group_by('group').agg(weighted_mean = mp.vertical_weighted_mean('values', 'weights')))
df = pl.DataFrame(
{
"values": [1.0, 3, 2, 5, 7],
"weights": [0.5, 0.3, 0.2, 0.1, 0.9],
"group": ["a", "a", "a", "b", "b"],
}
)
print(
df.group_by("group").agg(
weighted_mean=mp.vertical_weighted_mean("values", "weights")
)
)

df = pl.DataFrame({
'a': [None, None, 3, None, None, 9, 11, None],
})
result = df.with_columns(interpolate=mp.interpolate('a'))
df = pl.DataFrame(
{
"a": [None, None, 3, None, None, 9, 11, None],
}
)
result = df.with_columns(interpolate=mp.interpolate("a"))
print(result)
10 changes: 5 additions & 5 deletions src/expressions.rs
@@ -169,7 +169,7 @@ struct AddSuffixKwargs {
fn add_suffix(inputs: &[Series], kwargs: AddSuffixKwargs) -> PolarsResult<Series> {
let s = &inputs[0];
let ca = s.str()?;
let out = ca.apply_to_buffer(|value, output| {
let out = ca.apply_into_string_amortized(|value, output| {
write!(output, "{}{}", value, kwargs.suffix).unwrap();
});
Ok(out.into_series())
@@ -181,7 +181,7 @@ fn add_suffix(inputs: &[Series], kwargs: AddSuffixKwargs) -> PolarsResult<Series> {
// fn snowball_stem(inputs: &[Series]) -> PolarsResult<Series> {
// let ca: &StringChunked = inputs[0].str()?;
// let en_stemmer = Stemmer::create(Algorithm::English);
// let out: StringChunked = ca.apply_to_buffer(|value: &str, output: &mut String| {
// let out: StringChunked = ca.apply_into_string_amortized(|value: &str, output: &mut String| {
// write!(output, "{}", en_stemmer.stem(value)).unwrap()
// });
// Ok(out.into_series())
@@ -241,7 +241,7 @@ fn shifted_struct(input_fields: &[Field]) -> PolarsResult<Field> {
#[polars_expr(output_type_func=shifted_struct)]
fn shift_struct(inputs: &[Series]) -> PolarsResult<Series> {
let struct_ = inputs[0].struct_()?;
let fields = struct_.fields();
let fields = struct_.fields_as_series();
if fields.is_empty() {
return Ok(inputs[0].clone());
}
@@ -257,7 +257,7 @@ fn shift_struct(inputs: &[Series]) -> PolarsResult<Series> {
})
.collect::<Vec<_>>();
fields.push(field_0);
StructChunked::new(struct_.name(), &fields).map(|ca| ca.into_series())
StructChunked::from_series(struct_.name(), &fields).map(|ca| ca.into_series())
}

use polars_arrow::array::MutablePlString;
@@ -414,7 +414,7 @@ where
}

let array = PrimitiveArray::new(
T::get_dtype().to_arrow(true),
T::get_dtype().to_arrow(CompatLevel::newest()),
out.into(),
Some(validity.into()),
);