diff --git a/Cargo.lock b/Cargo.lock index 3dc276d7c231..918f026c0be2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -204,7 +204,7 @@ dependencies = [ "snap", "strum", "strum_macros", - "thiserror", + "thiserror 2.0.18", "uuid", "zstd", ] @@ -1101,7 +1101,7 @@ dependencies = [ "serde_json", "serde_repr", "serde_urlencoded", - "thiserror", + "thiserror 2.0.18", "tokio", "tokio-stream", "tokio-util", @@ -1252,6 +1252,12 @@ dependencies = [ "shlex", ] +[[package]] +name = "cesu8" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" + [[package]] name = "cfg-if" version = "1.0.4" @@ -1379,6 +1385,16 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +[[package]] +name = "combine" +version = "4.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba5a308b75df32fe02788e748662718f03fde005016435c444eea572398219fd" +dependencies = [ + "bytes", + "memchr", +] + [[package]] name = "comfy-table" version = "7.2.1" @@ -2469,6 +2485,7 @@ dependencies = [ "indexmap 2.13.0", "insta", "itertools 0.14.0", + "jni", "log", "num-traits", "parking_lot", @@ -2633,7 +2650,7 @@ dependencies = [ "sqlparser", "tempfile", "testcontainers-modules", - "thiserror", + "thiserror 2.0.18", "tokio", "tokio-postgres", ] @@ -3744,6 +3761,15 @@ version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" +[[package]] +name = "java-locator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09c46c1fe465c59b1474e665e85e1256c3893dd00927b8d55f63b09044c1e64f" +dependencies = [ + "glob", +] + [[package]] name = "jiff" version = "0.2.18" @@ -3768,6 +3794,30 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = 
"jni" +version = "0.21.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a87aa2bb7d2af34197c04845522473242e1aa17c12f4935d5856491a7fb8c97" +dependencies = [ + "cesu8", + "cfg-if", + "combine", + "java-locator", + "jni-sys", + "libloading", + "log", + "thiserror 1.0.69", + "walkdir", + "windows-sys 0.45.0", +] + +[[package]] +name = "jni-sys" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130" + [[package]] name = "jobserver" version = "0.1.34" @@ -4225,7 +4275,7 @@ dependencies = [ "serde", "serde_json", "serde_urlencoded", - "thiserror", + "thiserror 2.0.18", "tokio", "tracing", "url", @@ -4787,7 +4837,7 @@ dependencies = [ "rustc-hash", "rustls", "socket2", - "thiserror", + "thiserror 2.0.18", "tokio", "tracing", "web-time", @@ -4808,7 +4858,7 @@ dependencies = [ "rustls", "rustls-pki-types", "slab", - "thiserror", + "thiserror 2.0.18", "tinyvec", "tracing", "web-time", @@ -4988,7 +5038,7 @@ checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac" dependencies = [ "getrandom 0.2.16", "libredox", - "thiserror", + "thiserror 2.0.18", ] [[package]] @@ -5680,7 +5730,7 @@ dependencies = [ "similar", "subst", "tempfile", - "thiserror", + "thiserror 2.0.18", "tracing", ] @@ -5928,7 +5978,7 @@ dependencies = [ "serde", "serde_json", "serde_with", - "thiserror", + "thiserror 2.0.18", "tokio", "tokio-stream", "tokio-util", @@ -5944,13 +5994,33 @@ dependencies = [ "testcontainers", ] +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + [[package]] name = "thiserror" version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" dependencies = [ - "thiserror-impl", + "thiserror-impl 2.0.18", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", ] [[package]] @@ -6394,7 +6464,7 @@ dependencies = [ "serde", "serde_json", "syn 2.0.114", - "thiserror", + "thiserror 2.0.18", "unicode-ident", ] @@ -6915,6 +6985,15 @@ dependencies = [ "windows-link 0.1.3", ] +[[package]] +name = "windows-sys" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" +dependencies = [ + "windows-targets 0.42.2", +] + [[package]] name = "windows-sys" version = "0.52.0" @@ -6951,6 +7030,21 @@ dependencies = [ "windows-link 0.2.1", ] +[[package]] +name = "windows-targets" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" +dependencies = [ + "windows_aarch64_gnullvm 0.42.2", + "windows_aarch64_msvc 0.42.2", + "windows_i686_gnu 0.42.2", + "windows_i686_msvc 0.42.2", + "windows_x86_64_gnu 0.42.2", + "windows_x86_64_gnullvm 0.42.2", + "windows_x86_64_msvc 0.42.2", +] + [[package]] name = "windows-targets" version = "0.52.6" @@ -6993,6 +7087,12 @@ dependencies = [ "windows-link 0.1.3", ] +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" + [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" @@ -7005,6 +7105,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" +[[package]] +name = "windows_aarch64_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" @@ -7017,6 +7123,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" +[[package]] +name = "windows_i686_gnu" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" + [[package]] name = "windows_i686_gnu" version = "0.52.6" @@ -7041,6 +7153,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" +[[package]] +name = "windows_i686_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" + [[package]] name = "windows_i686_msvc" version = "0.52.6" @@ -7053,6 +7171,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" +[[package]] +name = "windows_x86_64_gnu" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" @@ -7065,6 +7189,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.42.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" @@ -7077,6 +7207,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" +[[package]] +name = "windows_x86_64_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml index 13f91fd7d4ea..d4f0b15cfa82 100644 --- a/datafusion/physical-plan/Cargo.toml +++ b/datafusion/physical-plan/Cargo.toml @@ -73,10 +73,12 @@ pin-project-lite = "^0.2.7" tokio = { workspace = true } [dev-dependencies] +arrow = { workspace = true, features = ["ffi"] } criterion = { workspace = true, features = ["async_futures"] } datafusion-functions-aggregate = { workspace = true } datafusion-functions-window = { workspace = true } insta = { workspace = true } +jni = { version = "0.21.1", features = ["invocation"] } rand = { workspace = true } rstest = { workspace = true } rstest_reuse = "0.7.0" @@ -102,3 +104,39 @@ name = "sort_preserving_merge" harness = false name = "aggregate_vectorized" required-features = ["test_utils"] + +[[bench]] +harness = false +name = "filter_bench" + +[[bench]] +harness = false +name = "sort_bench" + +[[bench]] +harness = false +name = "deser" + +[[bench]] +harness = false +name = "serde" + +[[bench]] +harness = false +name = "count_group_by_bench" + +[[bench]] +harness = false +name = "hash_join_bench" + +[[bench]] +harness = false +name = "hash_join_by_type" + +[[bench]] +harness = false +name = "distinct_group_by_bench" + +[[bench]] +harness = false +name = "filter_jni_benchmark" diff --git 
a/datafusion/physical-plan/benches/Untitled b/datafusion/physical-plan/benches/Untitled new file mode 100644 index 000000000000..36ea8ed8754a --- /dev/null +++ b/datafusion/physical-plan/benches/Untitled @@ -0,0 +1 @@ +create_schema \ No newline at end of file diff --git a/datafusion/physical-plan/benches/bench_utils.rs b/datafusion/physical-plan/benches/bench_utils.rs new file mode 100644 index 000000000000..2e9feff889ac --- /dev/null +++ b/datafusion/physical-plan/benches/bench_utils.rs @@ -0,0 +1,957 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Shared utilities for deserialization + execution benchmarks. +//! +//! This module provides common functionality for benchmarks that measure +//! Arrow IPC deserialization combined with DataFusion execution plans. 
+ +use std::io::Write; +use std::sync::Arc; + +use arrow::array::{ + ArrayRef, BinaryArray, DictionaryArray, Float32Array, Float64Array, Int16Array, + Int32Array, Int64Array, RecordBatch, StringArray, StringViewArray, +}; +use arrow::buffer::Buffer; +use arrow::datatypes::{DataType, Field, Int16Type, Schema, SchemaRef}; +use arrow::ipc::convert::fb_to_schema; +use arrow::ipc::reader::{FileDecoder, read_footer_length}; +use arrow::ipc::writer::FileWriter; +use arrow::ipc::root_as_footer; +use datafusion_execution::TaskContext; +use datafusion_physical_expr::EquivalenceProperties; +use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType}; +use datafusion_physical_plan::memory::MemoryStream; +use datafusion_physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, + SendableRecordBatchStream, +}; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; + +// ============================================================================ +// Schema Definition +// ============================================================================ + +/// String column type for benchmarks. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum StringColumnType { + /// Regular Utf8 strings + Utf8, + /// Utf8View strings (optimized for strings ≤12 bytes) + Utf8View, + /// Dictionary-encoded Utf8 strings with Int16 keys + DictionaryUtf8, + /// Dictionary-encoded Utf8View strings with Int16 keys + DictionaryUtf8View, +} + +/// Creates the benchmark schema with the following columns: +/// - colint: Int32 - integer values with modulo pattern +/// - collong: Int64 - long values with modulo pattern +/// - colfloat: Float32 - floating point values +/// - coldouble: Float64 - double precision values +/// - colstring: Utf8 - string values with limited cardinality +/// - colbinary: Binary - random binary data of configurable size +pub fn create_schema() -> SchemaRef { + create_schema_with_string_type(StringColumnType::Utf8) +} + +/// Creates the benchmark schema with a specific string column type. +pub fn create_schema_with_string_type(string_type: StringColumnType) -> SchemaRef { + let string_data_type = match string_type { + StringColumnType::Utf8 => DataType::Utf8, + StringColumnType::Utf8View => DataType::Utf8View, + StringColumnType::DictionaryUtf8 => { + DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)) + } + StringColumnType::DictionaryUtf8View => { + DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8View)) + } + }; + + Arc::new(Schema::new(vec![ + Field::new("colint", DataType::Int32, false), + Field::new("collong", DataType::Int64, false), + Field::new("colfloat", DataType::Float32, false), + Field::new("coldouble", DataType::Float64, false), + Field::new("colstring", string_data_type, false), + Field::new("colbinary", DataType::Binary, false), + ])) +} + +/// Creates a single-column schema with an Int32 column. +/// +/// Used for aggregation benchmarks where we want to isolate +/// the grouping column performance without other columns. 
+pub fn create_int_column_schema() -> SchemaRef { + Arc::new(Schema::new(vec![Field::new( + "groupCol", + DataType::Int32, + false, + )])) +} + +/// Creates a single-column schema with a Binary column. +/// +/// Used for aggregation benchmarks to test grouping performance +/// with variable-length binary keys. +pub fn create_binary_column_schema() -> SchemaRef { + Arc::new(Schema::new(vec![Field::new( + "groupCol", + DataType::Binary, + false, + )])) +} + +/// Creates a single-column schema with an Int32 column named "colInt". +/// +/// Used for join benchmarks where the build side has only the join key column. +pub fn create_join_build_schema() -> SchemaRef { + Arc::new(Schema::new(vec![Field::new( + "colint", + DataType::Int32, + false, + )])) +} + +/// Creates a single-column schema with a Utf8 column named "colString". +pub fn create_join_build_schema_string() -> SchemaRef { + Arc::new(Schema::new(vec![Field::new( + "colstring", + DataType::Utf8, + false, + )])) +} + +/// Creates a single-column schema with a Utf8View column named "colString". +pub fn create_join_build_schema_string_view() -> SchemaRef { + Arc::new(Schema::new(vec![Field::new( + "colstring", + DataType::Utf8View, + false, + )])) +} + +/// Creates a single-column schema with a Dictionary(Int16, Utf8) column named "colString". +pub fn create_join_build_schema_dictionary_string() -> SchemaRef { + Arc::new(Schema::new(vec![Field::new( + "colstring", + DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)), + false, + )])) +} + +/// Creates a single-column schema with a Dictionary(Int16, Utf8View) column named "colString". 
+pub fn create_join_build_schema_dictionary_string_view() -> SchemaRef { + Arc::new(Schema::new(vec![Field::new( + "colstring", + DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8View)), + false, + )])) +} + +// ============================================================================ +// Data Generation +// ============================================================================ + +/// Generates record batches with deterministic data for benchmarking. +/// +/// Each cell value is computed as a function of the row index, ensuring +/// reproducible benchmark results. The data patterns are designed to be +/// realistic for query processing benchmarks: +/// +/// - `colInt`: `i % 5000` - creates 5000 distinct values +/// - `colLong`: `i % 5000` - same pattern as colInt +/// - `colFloat`: `i / 2.0` - monotonically increasing +/// - `colDouble`: `i / 3.0` - monotonically increasing +/// - `colString`: `format!("str_{:04}", (start_row + i) % 5000)` - 100 distinct string values +/// - `colBinary`: random bytes of configurable size (seeded for reproducibility) +pub struct FunctionalBatchGenerator { + /// Schema for generated batches + schema: SchemaRef, + /// Number of rows in each batch + rows_per_batch: usize, + /// Total number of batches to generate + num_batches: usize, + /// Size in bytes for the binary column + binary_size: usize, + /// Type of string column to generate + string_column_type: StringColumnType, + /// Random number generator for binary data (seeded for reproducibility) + rng: StdRng, +} + +impl FunctionalBatchGenerator { + /// Creates a new batch generator. 
+ /// + /// # Arguments + /// * `schema` - Arrow schema for the generated batches + /// * `rows_per_batch` - Number of rows per batch + /// * `num_batches` - Total number of batches to generate + /// * `binary_size` - Size in bytes for the binary column values + pub fn new( + schema: SchemaRef, + rows_per_batch: usize, + num_batches: usize, + binary_size: usize, + ) -> Self { + Self::new_with_string_type(schema, rows_per_batch, num_batches, binary_size, StringColumnType::Utf8) + } + + /// Creates a new batch generator with a specific string column type. + /// + /// # Arguments + /// * `schema` - Arrow schema for the generated batches + /// * `rows_per_batch` - Number of rows per batch + /// * `num_batches` - Total number of batches to generate + /// * `binary_size` - Size in bytes for the binary column values + /// * `string_column_type` - Type of string column to generate + pub fn new_with_string_type( + schema: SchemaRef, + rows_per_batch: usize, + num_batches: usize, + binary_size: usize, + string_column_type: StringColumnType, + ) -> Self { + // Use a fixed seed for reproducible benchmarks + let rng = StdRng::seed_from_u64(42); + Self { + schema, + rows_per_batch, + num_batches, + binary_size, + string_column_type, + rng, + } + } + + /// Generates a single record batch for the given batch index. 
+ /// + /// Row indices are calculated as: `batch_index * rows_per_batch + local_row_index` + fn generate_batch(&mut self, batch_index: usize) -> RecordBatch { + let start_row = batch_index * self.rows_per_batch; + let num_rows = self.rows_per_batch; + + // Clone field names to avoid borrowing self while calling generate_column + let field_names: Vec = self + .schema + .fields() + .iter() + .map(|f| f.name().clone()) + .collect(); + + let columns: Vec = field_names + .iter() + .map(|name| self.generate_column(name, start_row, num_rows)) + .collect(); + + RecordBatch::try_new(Arc::clone(&self.schema), columns) + .expect("Failed to create record batch") + } + + /// Generates a single column array based on field name. + /// + /// Values are deterministic functions of the global row index `i`: + /// - colint: `i % 5000` (5000 distinct values) + /// - collong: `i % 5000` (5000 distinct values) + /// - colfloat: `i / 2.0` (monotonically increasing) + /// - coldouble: `i / 3.0` (monotonically increasing) + /// - colstring: format!("str_{:04}", (start_row + i) % 5000) (5000 distinct strings) + /// - colbinary: random bytes of `binary_size` length + fn generate_column(&mut self, field_name: &str, start_row: usize, num_rows: usize) -> ArrayRef { + match field_name { + "colint" => { + // Integer values with modulo 5000 pattern for reasonable cardinality + let values: Vec = (0..num_rows) + .map(|i| ((start_row + i) % 5000) as i32) + .collect(); + Arc::new(Int32Array::from(values)) + } + "collong" => { + // Long values with same modulo pattern as colint + let values: Vec = (0..num_rows) + .map(|i| ((start_row + i) % 5000) as i64) + .collect(); + Arc::new(Int64Array::from(values)) + } + "colfloat" => { + // Monotonically increasing float values + let values: Vec = (0..num_rows) + .map(|i| ((start_row + i) as f32) / 2.0) + .collect(); + Arc::new(Float32Array::from(values)) + } + "coldouble" => { + // Monotonically increasing double values + let values: Vec = (0..num_rows) + 
.map(|i| ((start_row + i) as f64) / 3.0) + .collect(); + Arc::new(Float64Array::from(values)) + } + "colstring" => { + // String values with 5000 distinct values (9 bytes each: "str_0000" to "str_4999") + let string_values: Vec = (0..num_rows) + .map(|i| format!("str_{:04}", (start_row + i) % 5000)) + .collect(); + + match self.string_column_type { + StringColumnType::Utf8 => { + Arc::new(StringArray::from(string_values)) + } + StringColumnType::Utf8View => { + Arc::new(StringViewArray::from(string_values)) + } + StringColumnType::DictionaryUtf8 => { + // Create dictionary from unique values, then create keys array + let keys: Int16Array = (0..num_rows) + .map(|i| ((start_row + i) % 5000) as i16) + .collect(); + + // Build dictionary values (unique strings) + let dict_values_vec: Vec = (0..5000) + .map(|i| format!("str_{:04}", i)) + .collect(); + let dict_values = StringArray::from(dict_values_vec); + + Arc::new(DictionaryArray::::try_new(keys, Arc::new(dict_values)).unwrap()) + } + StringColumnType::DictionaryUtf8View => { + let keys: Int16Array = (0..num_rows) + .map(|i| ((start_row + i) % 5000) as i16) + .collect(); + + // Build dictionary values (unique strings as StringView) + let dict_values_vec: Vec = (0..5000) + .map(|i| format!("str_{:04}", i)) + .collect(); + let dict_values = StringViewArray::from(dict_values_vec); + + Arc::new(DictionaryArray::::try_new(keys, Arc::new(dict_values)).unwrap()) + } + } + } + "colbinary" => { + // Random binary data of configurable size + let values: Vec> = (0..num_rows) + .map(|_| { + let mut buf = vec![0u8; self.binary_size]; + self.rng.fill(&mut buf[..]); + buf + }) + .collect(); + let values: Vec<&[u8]> = values.iter().map(|v| v.as_slice()).collect(); + Arc::new(BinaryArray::from(values)) + } + _ => panic!("Unknown column: {}", field_name), + } + } + + /// Generates all batches. + /// + /// Returns a vector of `num_batches` record batches, each containing + /// `rows_per_batch` rows. 
+ pub fn generate_batches(&mut self) -> Vec { + (0..self.num_batches) + .map(|i| self.generate_batch(i)) + .collect() + } +} + +// ============================================================================ +// Single Column Data Generation (for aggregation benchmarks) +// ============================================================================ + +/// Type of single-column data to generate. +#[derive(Debug, Clone, Copy)] +pub enum SingleColumnType { + /// Int32 column with uniform distribution over [0, distinct_count) + Int, + /// Binary column with configurable size and distinct count + Binary { + /// Size of each binary value in bytes + binary_size: usize, + }, +} + +/// Generates single-column record batches for aggregation benchmarks. +/// +/// This generator creates data with a configurable number of distinct values, +/// useful for testing GROUP BY or JOIN performance at different cardinalities. +/// +/// Data patterns: +/// - **Int column**: Random values with uniform distribution over [0, distinct_count). +/// Uses a seeded RNG for reproducibility. +/// - **Binary column**: Pre-generates `distinct_count` distinct random byte +/// arrays of `binary_size` each, then selects randomly from them. +/// This ensures exactly `distinct_count` unique binary values with uniform distribution. +pub struct SingleColumnBatchGenerator { + /// Schema for generated batches (single column) + schema: SchemaRef, + /// Number of rows in each batch + rows_per_batch: usize, + /// Total number of batches to generate + num_batches: usize, + /// Number of distinct values to generate + distinct_count: usize, + /// Type of column to generate + column_type: SingleColumnType, + /// Pre-generated distinct binary values (only used for Binary column type) + distinct_binary_values: Vec>, + /// Random number generator for value selection (seeded for reproducibility) + rng: StdRng, +} + +impl SingleColumnBatchGenerator { + /// Creates a new single-column batch generator. 
+ /// + /// # Arguments + /// * `column_type` - Type of column to generate (Int or Binary) + /// * `rows_per_batch` - Number of rows per batch + /// * `num_batches` - Total number of batches to generate + /// * `distinct_count` - Number of distinct values in the column + /// + /// # Returns + /// A new generator configured for the specified column type and cardinality. + pub fn new( + column_type: SingleColumnType, + rows_per_batch: usize, + num_batches: usize, + distinct_count: usize, + ) -> Self { + // Create appropriate schema based on column type + let schema = match column_type { + SingleColumnType::Int => create_int_column_schema(), + SingleColumnType::Binary { .. } => create_binary_column_schema(), + }; + + // Use a fixed seed for reproducible benchmarks + let mut rng = StdRng::seed_from_u64(42); + + // Pre-generate distinct binary values if needed + let distinct_binary_values = match column_type { + SingleColumnType::Binary { binary_size } => { + (0..distinct_count) + .map(|_| { + let mut buf = vec![0u8; binary_size]; + rng.fill(&mut buf[..]); + buf + }) + .collect() + } + SingleColumnType::Int => Vec::new(), + }; + + Self { + schema, + rows_per_batch, + num_batches, + distinct_count, + column_type, + distinct_binary_values, + rng, + } + } + + /// Returns the schema of the generated batches. + pub fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + /// Generates a single record batch. + /// + /// Values are randomly selected from [0, distinct_count) with uniform distribution. + fn generate_batch(&mut self) -> RecordBatch { + let num_rows = self.rows_per_batch; + + let column: ArrayRef = match self.column_type { + SingleColumnType::Int => { + // Generate random int values with uniform distribution over [0, distinct_count) + let values: Vec = (0..num_rows) + .map(|_| self.rng.random_range(0..self.distinct_count) as i32) + .collect(); + Arc::new(Int32Array::from(values)) + } + SingleColumnType::Binary { .. 
} => { + // Randomly select from pre-generated distinct binary values + let values: Vec<&[u8]> = (0..num_rows) + .map(|_| { + let idx = self.rng.random_range(0..self.distinct_count); + self.distinct_binary_values[idx].as_slice() + }) + .collect(); + Arc::new(BinaryArray::from(values)) + } + }; + + RecordBatch::try_new(Arc::clone(&self.schema), vec![column]) + .expect("Failed to create record batch") + } + + /// Generates all batches. + /// + /// Returns a vector of `num_batches` record batches, each containing + /// `rows_per_batch` rows with values randomly selected from `distinct_count` unique values. + pub fn generate_batches(&mut self) -> Vec { + (0..self.num_batches).map(|_| self.generate_batch()).collect() + } +} + +// ============================================================================ +// Join Build Side Data Generation +// ============================================================================ + +/// Type of join column to generate. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum JoinColumnType { + /// Int32 column + Int, + /// Utf8 string column + String, + /// Utf8View string column + StringView, + /// Dictionary(Int16, Utf8) column + DictionaryString, + /// Dictionary(Int16, Utf8View) column + DictionaryStringView, +} + +/// Generates single-column record batches for the build side of join benchmarks. +/// +/// This generator creates data with controlled match rates and key repetition patterns, +/// useful for testing JOIN performance at different selectivities. The generated data +/// is typically used as the build side (hash table) in hash joins. +/// +/// Data patterns: +/// - Generates integers from 0 to `(match_rate * 5000) - 1` +/// - Each distinct key is repeated `repeated_keys` times +/// - Total rows = `(match_rate * 5000) * repeated_keys` +/// - Sequential/deterministic: 0, 0, ..., 0, 1, 1, ..., 1, etc. 
+pub struct JoinBuildSideGenerator { + /// Schema for generated batches + schema: SchemaRef, + /// Match rate (0.5 or 1.0) - determines the range of keys + match_rate: f64, + /// Number of times each key is repeated + repeated_keys: usize, + /// Type of column to generate + column_type: JoinColumnType, +} + +impl JoinBuildSideGenerator { + /// Creates a new join build side generator. + /// + /// # Arguments + /// * `match_rate` - Fraction of probe-side keys that will match (0.5 or 1.0) + /// * `repeated_keys` - Number of times each distinct key appears + /// + /// # Returns + /// A new generator configured for the specified match rate and repetition. + pub fn new(match_rate: f64, repeated_keys: usize) -> Self { + Self::new_with_column_type(match_rate, repeated_keys, JoinColumnType::Int) + } + + /// Creates a new join build side generator with a specific column type. + /// + /// # Arguments + /// * `match_rate` - Fraction of probe-side keys that will match (0.5 or 1.0) + /// * `repeated_keys` - Number of times each distinct key appears + /// * `column_type` - Type of column to generate + /// + /// # Returns + /// A new generator configured for the specified match rate, repetition, and column type. + pub fn new_with_column_type( + match_rate: f64, + repeated_keys: usize, + column_type: JoinColumnType, + ) -> Self { + let schema = match column_type { + JoinColumnType::Int => create_join_build_schema(), + JoinColumnType::String => create_join_build_schema_string(), + JoinColumnType::StringView => create_join_build_schema_string_view(), + JoinColumnType::DictionaryString => create_join_build_schema_dictionary_string(), + JoinColumnType::DictionaryStringView => create_join_build_schema_dictionary_string_view(), + }; + Self { + schema, + match_rate, + repeated_keys, + column_type, + } + } + + /// Returns the schema of the generated batches. 
+ pub fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + /// Returns the number of distinct keys that will be generated. + pub fn distinct_keys(&self) -> usize { + (self.match_rate * 5000.0) as usize + } + + /// Returns the total number of rows that will be generated. + pub fn total_rows(&self) -> usize { + self.distinct_keys() * self.repeated_keys + } + + /// Generates all batches for the build side of the join. + /// + /// Creates a single batch containing all rows. Each key from 0 to + /// `(match_rate * 5000) - 1` appears `repeated_keys` times consecutively. + /// + /// Example with match_rate=0.5 (2500 keys) and repeated_keys=2: + /// - Int: `[0, 0, 1, 1, 2, 2, ..., 2499, 2499]` + /// - String: `["str_0000", "str_0000", "str_0001", "str_0001", ..., "str_2499", "str_2499"]` + pub fn generate_batches(&self) -> Vec { + let distinct_keys = self.distinct_keys(); + let total_rows = self.total_rows(); + + let column: ArrayRef = match self.column_type { + JoinColumnType::Int => { + // Generate integer values: each key from 0 to distinct_keys-1 repeated repeated_keys times + let values: Vec = (0..distinct_keys) + .flat_map(|key| std::iter::repeat(key as i32).take(self.repeated_keys)) + .collect(); + assert_eq!(values.len(), total_rows); + Arc::new(Int32Array::from(values)) + } + JoinColumnType::String => { + // Generate string values: each key as "str_{key:04}" repeated repeated_keys times + let values: Vec = (0..distinct_keys) + .flat_map(|key| std::iter::repeat(format!("str_{:04}", key)).take(self.repeated_keys)) + .collect(); + assert_eq!(values.len(), total_rows); + Arc::new(StringArray::from(values)) + } + JoinColumnType::StringView => { + // Generate string view values + let values: Vec = (0..distinct_keys) + .flat_map(|key| std::iter::repeat(format!("str_{:04}", key)).take(self.repeated_keys)) + .collect(); + assert_eq!(values.len(), total_rows); + Arc::new(StringViewArray::from(values)) + } + JoinColumnType::DictionaryString => { + // Generate 
dictionary-encoded string values + // Dictionary contains distinct_keys unique strings + // Keys array contains indices that repeat according to repeated_keys + let dict_values_vec: Vec = (0..distinct_keys) + .map(|key| format!("str_{:04}", key)) + .collect(); + let dict_values = StringArray::from(dict_values_vec); + + let keys: Int16Array = (0..distinct_keys) + .flat_map(|key| std::iter::repeat(key as i16).take(self.repeated_keys)) + .collect(); + + assert_eq!(keys.len(), total_rows); + Arc::new(DictionaryArray::::try_new(keys, Arc::new(dict_values)).unwrap()) + } + JoinColumnType::DictionaryStringView => { + // Generate dictionary-encoded string view values + let dict_values_vec: Vec = (0..distinct_keys) + .map(|key| format!("str_{:04}", key)) + .collect(); + let dict_values = StringViewArray::from(dict_values_vec); + + let keys: Int16Array = (0..distinct_keys) + .flat_map(|key| std::iter::repeat(key as i16).take(self.repeated_keys)) + .collect(); + + assert_eq!(keys.len(), total_rows); + Arc::new(DictionaryArray::::try_new(keys, Arc::new(dict_values)).unwrap()) + } + }; + + let batch = RecordBatch::try_new(Arc::clone(&self.schema), vec![column]) + .expect("Failed to create record batch"); + + vec![batch] + } +} + +// ============================================================================ +// Arrow IPC Serialization / Deserialization +// ============================================================================ + +/// Serializes record batches to Arrow IPC file format (in-memory). +/// +/// This simulates receiving Arrow data over a network or reading from storage. +/// Uses the IPC file format (with footer) to enable zero-copy deserialization. 
+/// +/// # Arguments +/// * `batches` - Record batches to serialize +/// * `schema` - Arrow schema (must match batches) +/// +/// # Returns +/// Serialized IPC data as a byte vector +pub fn serialize_to_ipc(batches: &[RecordBatch], schema: &SchemaRef) -> Vec { + let mut buffer = Vec::new(); + { + let mut writer = FileWriter::try_new(&mut buffer, schema).unwrap(); + for batch in batches { + writer.write(batch).unwrap(); + } + writer.finish().unwrap(); + } + buffer +} + +/// Deserializes record batches from Arrow IPC file format using zero-copy. +/// +/// This is the operation being benchmarked - converting serialized Arrow IPC +/// data back into in-memory record batches that can be processed by DataFusion. +/// +/// Zero-copy means the Arrow arrays refer directly to the provided buffer, +/// avoiding memory copying during deserialization. +/// +/// # Arguments +/// * `buffer` - Serialized IPC data +/// +/// # Returns +/// Tuple of (schema, batches) extracted from the IPC data +pub fn deserialize_zero_copy(buffer: &Buffer) -> (SchemaRef, Vec) { + // Read the footer to get schema and batch locations + let trailer_start = buffer.len() - 10; + let footer_len = read_footer_length(buffer[trailer_start..].try_into().unwrap()).unwrap(); + let footer = root_as_footer(&buffer[trailer_start - footer_len..trailer_start]).unwrap(); + + let schema = Arc::new(fb_to_schema(footer.schema().unwrap())); + let mut decoder = FileDecoder::new(Arc::clone(&schema), footer.version()); + + // Read dictionaries if present + for block in footer.dictionaries().iter().flatten() { + let block_len = block.bodyLength() as usize + block.metaDataLength() as usize; + let data = buffer.slice_with_length(block.offset() as _, block_len); + decoder.read_dictionary(block, &data).unwrap(); + } + + // Read all record batches + let mut batches = Vec::new(); + if let Some(batch_blocks) = footer.recordBatches() { + for block in batch_blocks { + let block_len = block.bodyLength() as usize + 
block.metaDataLength() as usize; + let data = buffer.slice_with_length(block.offset() as _, block_len); + if let Some(batch) = decoder.read_record_batch(&block, &data).unwrap() { + batches.push(batch); + } + } + } + + (schema, batches) +} + +/// Serializes execution results (record batches) back to Arrow IPC format. +/// +/// This measures the overhead of serializing query results, which is relevant +/// for scenarios where results need to be sent over a network or stored. +/// +/// # Arguments +/// * `batches` - Result record batches from query execution +/// +/// # Returns +/// Serialized IPC data as a byte vector +pub fn serialize_results_to_ipc(batches: &[RecordBatch]) -> Vec { + if batches.is_empty() { + return Vec::new(); + } + let schema = batches[0].schema(); + serialize_to_ipc(batches, &schema) +} + +/// A writer that discards all data written to it. +/// +/// This is useful for benchmarking serialization overhead without +/// including actual I/O or memory allocation costs. +struct SinkWriter { + bytes_written: usize, +} + +impl SinkWriter { + fn new() -> Self { + Self { bytes_written: 0 } + } + + /// Returns the total number of bytes that would have been written. + fn bytes_written(&self) -> usize { + self.bytes_written + } +} + +impl Write for SinkWriter { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.bytes_written += buf.len(); + Ok(buf.len()) + } + + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } +} + +/// Serializes a record batch to a sink that drops all data. +/// +/// This function measures pure serialization overhead by writing to a sink +/// that discards data instead of allocating memory or performing I/O. +/// Useful for benchmarking the CPU cost of serialization alone. 
+/// +/// # Arguments +/// * `batch` - The record batch to serialize +/// +/// # Returns +/// The number of bytes that would have been written +/// +/// # Example +/// ```ignore +/// let batch = create_test_batch(); +/// let bytes_written = serialize_to_sink(&batch); +/// println!("Serialization would write {} bytes", bytes_written); +/// ``` +pub fn serialize_to_sink(batch: &RecordBatch) -> usize { + let schema = batch.schema(); + let mut sink = SinkWriter::new(); + { + let mut writer = FileWriter::try_new(&mut sink, &schema).unwrap(); + writer.write(batch).unwrap(); + writer.finish().unwrap(); + } + sink.bytes_written() +} + +/// Serializes multiple record batches to a sink that drops all data. +/// +/// This function measures pure serialization overhead by writing to a sink +/// that discards data instead of allocating memory or performing I/O. +/// Useful for benchmarking the CPU cost of serialization alone. +/// +/// # Arguments +/// * `batches` - The record batches to serialize +/// * `schema` - The schema for the batches +/// +/// # Returns +/// The number of bytes that would have been written +/// +/// # Example +/// ```ignore +/// let batches = create_test_batches(); +/// let schema = batches[0].schema(); +/// let bytes_written = serialize_batches_to_sink(&batches, &schema); +/// println!("Serialization would write {} bytes", bytes_written); +/// ``` +pub fn serialize_batches_to_sink(batches: &[RecordBatch], schema: &SchemaRef) -> usize { + let mut sink = SinkWriter::new(); + { + let mut writer = FileWriter::try_new(&mut sink, schema).unwrap(); + for batch in batches { + writer.write(batch).unwrap(); + } + writer.finish().unwrap(); + } + sink.bytes_written() +} + +// ============================================================================ +// Execution Plan Source +// ============================================================================ + +/// A simple execution plan that serves pre-loaded record batches. 
+/// +/// This is used as the leaf node in benchmark execution plans. It wraps +/// already-deserialized batches and makes them available to downstream +/// operators (Filter, Sort, etc.) via the standard ExecutionPlan interface. +/// +/// Unlike file-based sources, this has no I/O overhead - it simply streams +/// the in-memory batches, allowing us to isolate operator performance. +#[derive(Debug)] +pub struct BatchSourceExec { + /// Schema of the batches + schema: SchemaRef, + /// Pre-loaded record batches to serve + batches: Vec, + /// Cached plan properties (partitioning, ordering, etc.) + cache: PlanProperties, +} + +impl BatchSourceExec { + /// Creates a new batch source execution plan. + /// + /// # Arguments + /// * `schema` - Schema for the batches + /// * `batches` - Pre-loaded record batches to serve + pub fn new(schema: SchemaRef, batches: Vec) -> Self { + let cache = PlanProperties::new( + EquivalenceProperties::new(Arc::clone(&schema)), + Partitioning::UnknownPartitioning(1), + EmissionType::Incremental, + Boundedness::Bounded, + ); + Self { + schema, + batches, + cache, + } + } +} + +impl DisplayAs for BatchSourceExec { + fn fmt_as(&self, _t: DisplayFormatType, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "BatchSourceExec: batches={}", self.batches.len()) + } +} + +impl ExecutionPlan for BatchSourceExec { + fn name(&self) -> &'static str { + "BatchSourceExec" + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn properties(&self) -> &PlanProperties { + &self.cache + } + + fn children(&self) -> Vec<&Arc> { + vec![] + } + + fn with_new_children( + self: Arc, + _children: Vec>, + ) -> datafusion_common::Result> { + Ok(self) + } + + fn execute( + &self, + _partition: usize, + _context: Arc, + ) -> datafusion_common::Result { + Ok(Box::pin(MemoryStream::try_new( + self.batches.clone(), + Arc::clone(&self.schema), + None, + )?)) + } +} diff --git 
a/datafusion/physical-plan/benches/count_group_by_bench.rs b/datafusion/physical-plan/benches/count_group_by_bench.rs new file mode 100644 index 000000000000..afe99c31515d --- /dev/null +++ b/datafusion/physical-plan/benches/count_group_by_bench.rs @@ -0,0 +1,279 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmark for DataFusion AggregateExec (COUNT(*) GROUP BY). +//! +//! This benchmark measures the performance of hash-based aggregation with +//! varying group cardinalities and column types. +//! +//! ## Test configurations: +//! +//! - **Column types**: Int32, Binary (10B), Binary (1024B) +//! - **Distinct counts**: 4096, 16384, 65536 +//! - **Data size**: 1M rows total (100 batches × 10K rows) +//! +//! The benchmark helps understand: +//! - How grouping performance scales with cardinality +//! - Impact of key type (fixed-width int vs variable-length binary) +//! - Impact of key size (10B vs 1024B binary) +//! +//! ## Running the benchmark +//! +//! ```bash +//! # Run all configurations +//! cargo bench --bench count_group_by_bench -p datafusion-physical-plan +//! +//! # Run with fewer samples for quick testing +//! 
cargo bench --bench count_group_by_bench -p datafusion-physical-plan -- --sample-size 10 +//! +//! # Run specific configuration (e.g., only int column benchmarks) +//! cargo bench --bench count_group_by_bench -p datafusion-physical-plan -- "int_col" +//! +//! # Run specific cardinality +//! cargo bench --bench count_group_by_bench -p datafusion-physical-plan -- "distinct_4096" +//! ``` + +// Include shared benchmark utilities +#[path = "bench_utils.rs"] +mod bench_utils; + +use std::hint::black_box; +use std::sync::Arc; + +use arrow::buffer::Buffer; +use criterion::{ + BatchSize, BenchmarkId, Criterion, SamplingMode, Throughput, criterion_group, criterion_main, +}; +use datafusion_execution::TaskContext; +use datafusion_functions_aggregate::count::count_udaf; +use datafusion_physical_expr::aggregate::AggregateExprBuilder; +use datafusion_physical_expr::expressions::Column; +use datafusion_physical_expr::PhysicalExpr; +use datafusion_physical_plan::aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy}; +use datafusion_physical_plan::{ExecutionPlan, collect}; + +use bench_utils::{ + BatchSourceExec, SingleColumnBatchGenerator, SingleColumnType, deserialize_zero_copy, + serialize_results_to_ipc, serialize_to_ipc, +}; + +// ============================================================================ +// Aggregate Plan Creation +// ============================================================================ + +/// Creates an AggregateExec that performs COUNT(*) GROUP BY groupCol. +/// +/// This simulates a common aggregation pattern where we count the number +/// of rows for each distinct value in the grouping column. 
+/// +/// # Arguments +/// * `input` - The input execution plan (typically BatchSourceExec) +/// +/// # Returns +/// An AggregateExec wrapped in Arc +fn create_count_groupby_plan(input: Arc) -> Arc { + let schema = input.schema(); + + // Build GROUP BY expression: GROUP BY groupCol + let group_col = + Arc::new(Column::new_with_schema("groupCol", &schema).unwrap()) as Arc; + let group_expr = vec![(Arc::clone(&group_col), "groupCol".to_string())]; + let group_by = PhysicalGroupBy::new_single(group_expr); + + // Build aggregate expression: COUNT(groupCol) + // We use groupCol as the argument since it's non-null; effectively COUNT(*) + let aggr_expr = vec![Arc::new( + AggregateExprBuilder::new(count_udaf(), vec![group_col]) + .schema(Arc::clone(&schema)) + .alias("count") + .build() + .unwrap(), + )]; + + // Create the aggregate execution plan + // Using Single mode (not partial/final) for simplicity in benchmarks + Arc::new( + AggregateExec::try_new( + AggregateMode::Single, + group_by, + aggr_expr, + vec![None], // No filter expressions + input, + schema, + ) + .unwrap(), + ) +} + +// ============================================================================ +// Benchmark Implementation +// ============================================================================ + +/// Benchmark configurations for different column types. +#[derive(Debug, Clone)] +struct BenchConfig { + /// Human-readable name for the configuration + name: &'static str, + /// Column type to generate + column_type: SingleColumnType, +} + +/// Main benchmark function for COUNT(*) GROUP BY execution. 
+/// +/// This benchmark measures COUNT(*) GROUP BY performance across: +/// - Different column types (Int32, Binary 10B, Binary 1024B) +/// - Different cardinalities (4096, 16384, 65536 distinct values) +/// +/// For each configuration, we measure: +/// - **agg_only**: Pure aggregation execution using pre-generated batches +/// This isolates the AggregateExec performance from serialization overhead +/// - **full_pipeline**: Complete deser + aggregation + output serialization +/// Real-world end-to-end latency including IPC serde +fn bench_count_group_by(c: &mut Criterion) { + // Create a single-threaded Tokio runtime for async execution. + // We use current_thread to ensure all async work runs on the benchmark thread, + // making results comparable to single-threaded Java benchmarks. + let rt = tokio::runtime::Builder::new_current_thread() + .build() + .unwrap(); + let mut group = c.benchmark_group("count_group_by_bench"); + + // Use flat sampling to collect exactly the requested samples without time constraints + group.sampling_mode(SamplingMode::Flat); + + // Configuration: 1M rows total (10K rows × 100 batches) + let rows_per_batch = 10_000; + let num_batches = 100; + let total_rows = rows_per_batch * num_batches; + + // Column type configurations + let configs = vec![ + BenchConfig { + name: "int_col", + column_type: SingleColumnType::Int, + }, + BenchConfig { + name: "binary_10B", + column_type: SingleColumnType::Binary { binary_size: 10 }, + }, + BenchConfig { + name: "binary_1024B", + column_type: SingleColumnType::Binary { binary_size: 1024 }, + }, + ]; + + // Cardinality configurations (number of distinct values) + let distinct_counts = vec![4096, 16384, 65536]; + + for config in &configs { + for &distinct_count in &distinct_counts { + let label = format!("{}/distinct_{}", config.name, distinct_count); + + // Generate test data + let mut generator = SingleColumnBatchGenerator::new( + config.column_type, + rows_per_batch, + num_batches, + distinct_count, + 
); + let schema = generator.schema(); + let batches = generator.generate_batches(); + + // Serialize batches to IPC format for full pipeline benchmark + let ipc_data = serialize_to_ipc(&batches, &schema); + let ipc_size = ipc_data.len(); + + // Calculate approximate data size for throughput metric + let data_size: usize = batches + .iter() + .map(|b| b.get_array_memory_size()) + .sum(); + + // Log configuration for visibility in benchmark output + println!( + "Config: {} rows, {}, distinct={}, data size={:.2} MB, IPC size={:.2} MB", + total_rows, + config.name, + distinct_count, + data_size as f64 / (1024.0 * 1024.0), + ipc_size as f64 / (1024.0 * 1024.0) + ); + + // Set throughput metric for bytes/second calculations + group.throughput(Throughput::Bytes(ipc_size as u64)); + + // Benchmark 1: Aggregation execution only + // Uses pre-generated batches directly, isolating AggregateExec performance + group.bench_with_input( + BenchmarkId::new("agg_only", &label), + &batches, + |b, batches| { + b.iter_batched( + // Setup: clone batches (NOT timed) - needed because execution consumes them + || batches.clone(), + // Benchmark: execute aggregation (TIMED) + |batches| { + rt.block_on(async { + let source = Arc::new(BatchSourceExec::new( + Arc::clone(&schema), + batches, + )) as Arc; + let plan = create_count_groupby_plan(source); + let task_ctx = Arc::new(TaskContext::default()); + let results = collect(plan, task_ctx).await.unwrap(); + black_box(results) + }) + }, + BatchSize::SmallInput, + ) + }, + ); + + let data_buffer = Buffer::from_vec(ipc_data); + + // Benchmark 2: Full pipeline (deser + aggregation + output serialization) + // Measures complete round-trip: IPC in -> aggregate -> IPC out + // Relevant for scenarios where results are sent over network or stored + group.bench_with_input( + BenchmarkId::new("full_pipeline", &label), + &data_buffer, + |b, data_buffer| { + b.iter(|| { + rt.block_on(async { + let (schema, batches) = deserialize_zero_copy(data_buffer); + 
let source = Arc::new(BatchSourceExec::new( + Arc::clone(&schema), + batches, + )) as Arc; + let plan = create_count_groupby_plan(source); + let task_ctx = Arc::new(TaskContext::default()); + let results = collect(plan, task_ctx).await.unwrap(); + // Serialize results back to IPC format + let output_ipc = serialize_results_to_ipc(&results); + black_box(output_ipc) + }) + }) + }, + ); + } + } + + group.finish(); +} + +criterion_group!(benches, bench_count_group_by); +criterion_main!(benches); diff --git a/datafusion/physical-plan/benches/deser.rs b/datafusion/physical-plan/benches/deser.rs new file mode 100644 index 000000000000..e40616377b6a --- /dev/null +++ b/datafusion/physical-plan/benches/deser.rs @@ -0,0 +1,141 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmark for Arrow IPC deserialization performance. +//! +//! ## Running the benchmark +//! +//! ```bash +//! # Run all configurations +//! cargo bench --bench deser -p datafusion-physical-plan +//! +//! # Run only the standard deserialization benchmark +//! cargo bench --bench deser -p datafusion-physical-plan -- deserialize_standard +//! +//! # Run only the zero-copy deserialization benchmark +//! 
cargo bench --bench deser -p datafusion-physical-plan -- deserialize_zero_copy +//! +//! # Change measurement time (per benchmark, default is 5 seconds) +//! cargo bench --bench deser -p datafusion-physical-plan -- --measurement-time 10 +//! +//! # Run specific configuration +//! cargo bench --bench deser -p datafusion-physical-plan -- "1M_rows_binary_10B" +//! ``` +//! +//! ## Baseline Management +//! +//! ```bash +//! # Save current results as a named baseline +//! cargo bench --bench deser -p datafusion-physical-plan -- --save-baseline my-baseline +//! +//! # Compare against a specific baseline +//! cargo bench --bench deser -p datafusion-physical-plan -- --baseline my-baseline +//! +//! # Delete all benchmark history and start fresh +//! rm -rf target/criterion +//! ``` + +// Include shared benchmark utilities +#[path = "bench_utils.rs"] +mod bench_utils; + +use std::hint::black_box; +use std::sync::Arc; + +use arrow::buffer::Buffer; +use criterion::{ + BenchmarkId, Criterion, SamplingMode, Throughput, criterion_group, criterion_main, +}; + +use bench_utils::{ + FunctionalBatchGenerator, create_schema, deserialize_zero_copy, serialize_to_ipc, +}; + +// ============================================================================ +// Benchmark Implementation +// ============================================================================ + +/// Benchmarks zero-copy IPC deserialization. +/// +/// This measures the cost of Arrow IPC deserialization using zero-copy +/// techniques where Arrow arrays reference the original buffer directly +/// via Buffer slicing. This avoids copying the actual data and only +/// creates lightweight views into the existing buffer. +/// +/// This is the most efficient deserialization approach when you have +/// a contiguous buffer (e.g., mmap'd file or received network buffer). 
+fn bench_deserialize(c: &mut Criterion) { + let mut group = c.benchmark_group("deserialize_standard"); + + // Use flat sampling to collect exactly the requested samples without time constraints + group.sampling_mode(SamplingMode::Flat); + + // Configuration: 1M rows total (10K rows × 100 batches) + let rows_per_batch = 10_000; + let num_batches_vec = vec![1, 100]; + + // Test different binary column sizes to understand deserialization overhead + for num_batches in num_batches_vec { + let total_rows = rows_per_batch * num_batches; + let binary_sizes = vec![10, 1024, 2048]; + for binary_size in binary_sizes { + let label = format!("{num_batches}_batches/rows_binary_{binary_size}B"); + + // Generate test data and serialize to IPC format + let schema = create_schema(); + let mut generator = FunctionalBatchGenerator::new( + Arc::clone(&schema), + rows_per_batch, + num_batches, + binary_size, + ); + let batches = generator.generate_batches(); + let ipc_data = serialize_to_ipc(&batches, &schema); + + // Convert to Buffer for zero-copy deserialization + let buffer = Buffer::from_vec(ipc_data); + + // Set throughput metric for bytes/second calculations + group.throughput(Throughput::Bytes(buffer.len() as u64)); + + // Log configuration + println!( + "Config (zero-copy): {} rows, binary_size={} bytes, IPC size={:.2} MB", + total_rows, + binary_size, + buffer.len() as f64 / (1024.0 * 1024.0) + ); + + group.bench_with_input( + BenchmarkId::from_parameter(&label), + &buffer, + |b, buffer| { + b.iter(|| { + let (schema, batches) = deserialize_zero_copy(buffer); + // black_box prevents compiler from optimizing away unused results + black_box((schema, batches)) + }) + }, + ); + } + } + + group.finish(); +} + +criterion_group!(benches, bench_deserialize); +criterion_main!(benches); diff --git a/datafusion/physical-plan/benches/distinct_group_by_bench.rs b/datafusion/physical-plan/benches/distinct_group_by_bench.rs new file mode 100644 index 000000000000..03728c4459a9 --- /dev/null 
+++ b/datafusion/physical-plan/benches/distinct_group_by_bench.rs @@ -0,0 +1,325 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmark for DataFusion AggregateExec (COUNT(DISTINCT bytes) GROUP BY colInt). +//! +//! This benchmark measures the performance of hash-based aggregation with +//! distinct counting over binary columns, varying binary sizes and group cardinalities. +//! +//! ## Test configurations: +//! +//! - **Binary sizes**: 10B, 1000B +//! - **Distinct keys (group cardinalities)**: 4096, 16384, 65536 +//! - **Data size**: 1M rows total (100 batches × 10K rows) +//! +//! The benchmark helps understand: +//! - How COUNT(DISTINCT) performance scales with group cardinality +//! - Impact of binary value size on distinct counting +//! - Performance characteristics of two-column aggregation (GROUP BY colInt, COUNT(DISTINCT bytes)) +//! +//! ## Running the benchmark +//! +//! ```bash +//! # Run all configurations +//! cargo bench --bench distinct_group_by_bench -p datafusion-physical-plan +//! +//! # Run with fewer samples for quick testing +//! cargo bench --bench distinct_group_by_bench -p datafusion-physical-plan -- --sample-size 10 +//! +//! # Run specific configuration +//! 
cargo bench --bench distinct_group_by_bench -p datafusion-physical-plan -- "INT_BYTES_10" +//! +//! # Run specific cardinality +//! cargo bench --bench distinct_group_by_bench -p datafusion-physical-plan -- "distinct_4096" +//! ``` + +// Include shared benchmark utilities +#[path = "bench_utils.rs"] +mod bench_utils; + +use std::fs::File; +use std::hint::black_box; +use std::path::PathBuf; +use std::sync::Arc; + +use arrow::array::{RecordBatch, ArrayRef, BinaryArray, BinaryViewArray}; +use arrow::datatypes::SchemaRef; +use arrow::ipc::reader::FileReader; +use criterion::{BenchmarkId, Criterion, SamplingMode, Throughput, criterion_group, criterion_main}; +use datafusion_execution::TaskContext; +use datafusion_functions_aggregate::count::count_udaf; +use datafusion_physical_expr::aggregate::AggregateExprBuilder; +use datafusion_physical_expr::expressions::Column; +use datafusion_physical_expr::PhysicalExpr; +use datafusion_physical_plan::aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy}; +use datafusion_physical_plan::{ExecutionPlan, collect}; + +use bench_utils::{BatchSourceExec, serialize_results_to_ipc}; + +// ============================================================================ +// Aggregate Plan Creation +// ============================================================================ + +/// Creates an AggregateExec that performs COUNT(DISTINCT bytes) GROUP BY colInt. +/// +/// This benchmark doesn't generate data. Instead, we have to run the equivalent JMH benchmark in +/// Apache Pinot and then copy the generated Arrow IPC files into the `benches/` folder, keeping +/// the name conventions used in the JMH benchmark. +/// +/// This simulates a common aggregation pattern where we count the number of +/// distinct binary values for each integer group value. 
+/// +/// # Arguments +/// * `input` - The input execution plan (typically BatchSourceExec) +/// +/// # Returns +/// An AggregateExec wrapped in Arc +fn create_distinct_count_groupby_plan(input: Arc) -> Arc { + let schema = input.schema(); + + // Build GROUP BY expression: GROUP BY colInt + let group_col = + Arc::new(Column::new_with_schema("colint", &schema).unwrap()) as Arc; + let group_expr = vec![(Arc::clone(&group_col), "colint".to_string())]; + let group_by = PhysicalGroupBy::new_single(group_expr); + + // Build aggregate expression: COUNT(DISTINCT bytes) + let bytes_col = + Arc::new(Column::new_with_schema("bytes", &schema).unwrap()) as Arc; + let aggr_expr = vec![Arc::new( + AggregateExprBuilder::new(count_udaf(), vec![bytes_col]) + .schema(Arc::clone(&schema)) + .alias("count_distinct") + .distinct() // Enable DISTINCT counting + .build() + .unwrap(), + )]; + + // Create the aggregate execution plan + // Using Single mode (not partial/final) for simplicity in benchmarks + Arc::new( + AggregateExec::try_new( + AggregateMode::Single, + group_by, + aggr_expr, + vec![None], // No filter expressions + input, + schema, + ) + .unwrap(), + ) +} + +// ============================================================================ +// Benchmark Implementation +// ============================================================================ + +/// Benchmark configurations for different binary sizes. 
+#[derive(Debug, Clone, Copy)] +struct BenchConfig { + /// Human-readable name for the configuration + name: &'static str, + /// Size of binary values in bytes (not used, but kept for compatibility) + #[allow(dead_code)] + bytes_length: usize, +} + +/// Enum to select which binary array type to use +#[derive(Debug, Clone, Copy)] +enum BinaryType { + Binary, + BinaryView, +} + +fn get_arrow_file_path( + folder: &str, + config_name: &str, + num_groups: usize, + distinct_values_per_group: usize, +) -> PathBuf { + let file_name = format!( + "group_distinct_by_{}_groups_{}_distinctPerGroup_{}.arrow", + config_name, + num_groups, + distinct_values_per_group + ); + PathBuf::from(folder).join(file_name) +} + +fn load_batches_from_arrow_file(path: &PathBuf, binary_type: BinaryType) -> (SchemaRef, Vec) { + let file = File::open(path).unwrap_or_else(|_| panic!("Arrow file not found: {}", path.display())); + let mut reader = FileReader::try_new(file, None).expect("Failed to open Arrow IPC file"); + let orig_schema = reader.schema(); + let orig_batches = reader.collect::>>().expect("Failed to read batches from Arrow file"); + + match binary_type { + BinaryType::Binary => (orig_schema, orig_batches), + BinaryType::BinaryView => { + // Find the index of the "bytes" column + let bytes_idx = orig_schema.fields().iter().position(|f| f.name() == "bytes").expect("No 'bytes' column"); + // Create new schema with bytes as BinaryView + let mut new_fields: Vec> = orig_schema.fields().to_vec(); + new_fields[bytes_idx] = Arc::new(arrow::datatypes::Field::new("bytes", arrow::datatypes::DataType::BinaryView, false)); + let new_schema = Arc::new(arrow::datatypes::Schema::new( + new_fields.iter().map(|f| f.as_ref().clone()).collect::>() + )); + // Convert each batch + let new_batches = orig_batches.into_iter().map(|batch| { + let mut columns: Vec = batch.columns().to_vec(); + let binary_array = batch.column(bytes_idx).as_any().downcast_ref::().expect("'bytes' column is not BinaryArray"); + let 
binaryview_vec: Vec<&[u8]> = (0..batch.num_rows()).map(|i| binary_array.value(i)).collect(); + let binaryview_array = BinaryViewArray::from(binaryview_vec); + columns[bytes_idx] = Arc::new(binaryview_array); + RecordBatch::try_new(Arc::clone(&new_schema), columns).expect("Failed to create BinaryView batch") + }).collect(); + (new_schema, new_batches) + } + } +} + +fn bench_distinct_group_by(c: &mut Criterion) { + // Create a single-threaded Tokio runtime for async execution. + // We use current_thread to ensure all async work runs on the benchmark thread, + // making results comparable to single-threaded Java benchmarks. + let rt = tokio::runtime::Builder::new_current_thread() + .build() + .unwrap(); + + let mut group = c.benchmark_group("distinct_group_by_bench"); + + // Use flat sampling to collect exactly the requested samples without time constraints + group.sampling_mode(SamplingMode::Flat); + + // Binary size configurations + let configs = vec![ + BenchConfig { + name: "INT_BYTES_10", + bytes_length: 10, + }, + BenchConfig { + name: "INT_BYTES_1000", + bytes_length: 1000, + }, + ]; + + // Distinct values per group (JMH param) + let distinct_values_per_group_list = vec![1, 4, 16, 64, 256, 1024]; + + let binary_types = vec![BinaryType::Binary, BinaryType::BinaryView]; + + let arrow_folder = std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("benches"); + + for config in &configs { + for &binary_type in &binary_types { + let binary_type_label = match binary_type { + BinaryType::Binary => "Binary", + BinaryType::BinaryView => "BinaryView", + }; + for &distinct_values_per_group in &distinct_values_per_group_list { + let num_groups = 512; + let num_batches = 1; + let rows_per_batch = num_groups * distinct_values_per_group; + let total_rows = rows_per_batch * num_batches; + + let label = format!("{}/{}/dvg_{}", config.name, binary_type_label, distinct_values_per_group); + + // Load test data from Arrow file + let arrow_file_path = get_arrow_file_path( + 
arrow_folder.to_str().unwrap(), + config.name, // Use config.name for the file name + num_groups, + distinct_values_per_group, + ); + println!("Reading Arrow file: {}", arrow_file_path.display()); + let (schema, batches) = load_batches_from_arrow_file(&arrow_file_path, binary_type); + + // Calculate approximate data size for throughput metric + let data_size: usize = batches + .iter() + .map(|b| b.get_array_memory_size()) + .sum(); + + // Log configuration for visibility in benchmark output + println!( + "Config: {} rows, {}, {}, dvg={}, data size={:.2} MB, Arrow file: {}", + total_rows, + config.name, + binary_type_label, + distinct_values_per_group, + data_size as f64 / (1024.0 * 1024.0), + arrow_file_path.display() + ); + + // Set throughput metric for bytes/second calculations + group.throughput(Throughput::Bytes(data_size as u64)); + + // Validation (NOT timed - run once before benchmarking) + { + let validation_batches = batches.clone(); + let validation_result = rt.block_on(async { + let source = Arc::new(BatchSourceExec::new( + Arc::clone(&schema), + validation_batches, + )) as Arc; + let plan = create_distinct_count_groupby_plan(source); + let task_ctx = Arc::new(TaskContext::default()); + collect(plan, task_ctx).await.unwrap() + }); + let total_result_rows: usize = validation_result.iter() + .map(|batch| batch.num_rows()) + .sum(); + assert_eq!( + total_result_rows, + num_groups, + "Expected {} distinct groups, got {}", + num_groups, + total_result_rows + ); + } + + // Benchmark 2: Full pipeline (deser + aggregation + output serialization) + // Measures complete round-trip: IPC in -> aggregate -> IPC out + // Relevant for scenarios where results are sent over network or stored + group.bench_with_input( + BenchmarkId::new("full_pipeline", &label), + &batches, + |b, batches| { + b.iter(|| { + rt.block_on(async { + let source = Arc::new(BatchSourceExec::new( + Arc::clone(&schema), + batches.clone(), + )) as Arc; + let plan = 
create_distinct_count_groupby_plan(source); + let task_ctx = Arc::new(TaskContext::default()); + let results = collect(plan, task_ctx).await.unwrap(); + // Serialize results back to IPC format + let output_ipc = serialize_results_to_ipc(&results); + black_box(output_ipc) + }) + }) + }, + ); + } + } + } + group.finish(); +} + +criterion_group!(benches, bench_distinct_group_by); +criterion_main!(benches); + diff --git a/datafusion/physical-plan/benches/filter_bench.rs b/datafusion/physical-plan/benches/filter_bench.rs new file mode 100644 index 000000000000..9da637195e7f --- /dev/null +++ b/datafusion/physical-plan/benches/filter_bench.rs @@ -0,0 +1,264 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmark for DataFusion FilterExec with Arrow IPC serialization. +//! +//! This benchmark measures the end-to-end latency of: +//! 1. Deserializing Arrow IPC data into RecordBatches +//! 2. Executing a FilterExec operator (predicate: colInt > 2500) +//! 3. Serializing the output back to Arrow IPC format +//! +//! The benchmark helps understand the overhead of IPC deserialization +//! and serialization relative to actual query execution, and how filter +//! performance scales with data size. +//! +//! 
## Running the benchmark +//! +//! ```bash +//! # Run all configurations +//! cargo bench --bench filter_bench -p datafusion-physical-plan +//! +//! # Run with fewer samples for quick testing +//! cargo bench --bench filter_bench -p datafusion-physical-plan -- --sample-size 10 +//! +//! # Run only the deser_only benchmark +//! cargo bench --bench filter_bench -p datafusion-physical-plan -- deser_only +//! +//! # Change measurement time (per benchmark, default is 5 seconds) +//! cargo bench --bench filter_bench -p datafusion-physical-plan -- --measurement-time 10 +//! +//! # Run specific configuration +//! cargo bench --bench filter_bench -p datafusion-physical-plan -- "1M_rows_binary_10B" +//! ``` +//! +//! ## Baseline Management +//! +//! Criterion stores benchmark results in `target/criterion/` and automatically compares +//! new runs against previous results. Each benchmark has three states: +//! - **base/**: The baseline for comparison (saved with --save-baseline) +//! - **new/**: The most recent benchmark run +//! - **change/**: Statistics about the change from base to new +//! +//! ```bash +//! # Save current results as a named baseline (e.g., "main" or "before-optimization") +//! cargo bench --bench filter_bench -p datafusion-physical-plan -- --save-baseline my-baseline +//! +//! # Compare against a specific baseline +//! cargo bench --bench filter_bench -p datafusion-physical-plan -- --baseline my-baseline +//! +//! # List all saved baselines (stored in target/criterion///) +//! ls target/criterion/filter_bench/deser_only/1M_rows_binary_10B/ +//! +//! # Delete all benchmark history and start fresh +//! rm -rf target/criterion +//! +//! # Run without saving results (useful for quick checks) +//! cargo bench --bench filter_bench -p datafusion-physical-plan -- --profile-time 1 +//! ``` +//! +//! **Typical workflow for tracking performance:** +//! 1. Before making changes: `cargo bench --bench filter_bench -- --save-baseline before` +//! 2. 
Make your code changes +//! 3. Compare: `cargo bench --bench filter_bench -- --baseline before` +//! 4. Criterion will show % change from the "before" baseline + +// Include shared benchmark utilities +#[path = "bench_utils.rs"] +mod bench_utils; + +use std::hint::black_box; +use std::sync::Arc; +use arrow::buffer::Buffer; +use arrow::datatypes::SchemaRef; +use criterion::{ + BatchSize, BenchmarkId, Criterion, SamplingMode, Throughput, criterion_group, criterion_main, +}; +use datafusion_common::ScalarValue; +use datafusion_execution::TaskContext; +use datafusion_expr::Operator; +use datafusion_physical_expr::expressions::{BinaryExpr, Column, Literal}; +use datafusion_physical_expr::PhysicalExpr; +use datafusion_physical_plan::filter::FilterExecBuilder; +use datafusion_physical_plan::{ExecutionPlan, collect}; + +use bench_utils::{ + BatchSourceExec, FunctionalBatchGenerator, create_schema, deserialize_zero_copy, + serialize_batches_to_sink, serialize_to_ipc, +}; + +// ============================================================================ +// Filter Plan Creation +// ============================================================================ + +/// Creates a FilterExec that evaluates `colInt > 2500`. +/// +/// With the data generation pattern `colInt = i % 5000`, this predicate +/// has approximately 50% selectivity (values 2501-4999 pass, 0-2500 don't). 
+/// +/// # Arguments +/// * `input` - The input execution plan (typically BatchSourceExec) +/// * `schema` - Schema of the input data +/// +/// # Returns +/// A FilterExec wrapped in Arc +fn create_filter_plan( + input: Arc, + schema: &SchemaRef, +) -> Arc { + // Build the predicate: colInt > 2500 + let col_int = Arc::new(Column::new_with_schema("colint", schema).unwrap()) + as Arc; + let threshold = + Arc::new(Literal::new(ScalarValue::Int32(Some(2500)))) as Arc; + let predicate = + Arc::new(BinaryExpr::new(col_int, Operator::Gt, threshold)) as Arc; + + Arc::new(FilterExecBuilder::new(predicate, input).build().unwrap()) +} + +// ============================================================================ +// Benchmark Implementation +// ============================================================================ + +/// Main benchmark function for filter execution. +/// +/// This benchmark measures four scenarios for each binary column size: +/// +/// 1. **deser_only**: Just IPC deserialization, no execution +/// - Establishes baseline deserialization cost +/// - Useful for understanding I/O vs compute ratio +/// +/// 2. **ser_only**: Just IPC serialization, no execution +/// - Establishes baseline serialization cost +/// - Uses pre-generated batches directly +/// +/// 3. **filter_only**: Filter execution only +/// - Isolates the FilterExec performance +/// - Uses pre-generated batches directly (no deserialization) +/// - Timed phase runs only the filter operator +/// +/// 4. **full_pipeline**: Complete deser + filter + output serialization +/// - Real-world end-to-end latency including result serialization +/// - Relevant for scenarios where results are sent over network +fn bench_filter(c: &mut Criterion) { + // Create a single-threaded Tokio runtime for async execution. + // We use current_thread to ensure all async work runs on the benchmark thread, + // making results comparable to single-threaded Java benchmarks. 
+ let rt = tokio::runtime::Builder::new_current_thread() + .build() + .unwrap(); + let mut group = c.benchmark_group("filter_bench"); + + // Use flat sampling to collect exactly the requested samples without time constraints + group.sampling_mode(SamplingMode::Flat); + + // Set measurement time (default is 5 seconds) + // Uncomment and adjust the duration as needed: + // group.measurement_time(std::time::Duration::from_secs(10)); + + // Configuration: 1M rows total (10K rows × 100 batches) + let rows_per_batch = 10_000; + let num_batches = 100; + let total_rows = rows_per_batch * num_batches; + + // Test different binary column sizes to understand serialization overhead + let binary_sizes = vec![10, 1024, 2048]; + + for binary_size in binary_sizes { + let label = format!("1M_rows_binary_{binary_size}B"); + + // Generate test data and serialize to IPC format + let schema = create_schema(); + let mut generator = FunctionalBatchGenerator::new( + Arc::clone(&schema), + rows_per_batch, + num_batches, + binary_size, + ); + let batches = generator.generate_batches(); + let ipc_data = serialize_to_ipc(&batches, &schema); + let ipc_size = ipc_data.len(); + let ipc_buffer = Buffer::from_vec(ipc_data); + + // Log configuration for visibility in benchmark output + println!( + "Config: {} rows, binary_size={} bytes, IPC size={:.2} MB", + total_rows, + binary_size, + ipc_size as f64 / (1024.0 * 1024.0) + ); + + // Set throughput metric for bytes/second calculations + group.throughput(Throughput::Bytes(ipc_size as u64)); + + // Benchmark 3: Filter execution only + // Uses pre-generated batches directly, isolating FilterExec performance + group.bench_with_input( + BenchmarkId::new("filter_only", &label), + &batches, + |b, batches| { + b.iter_batched( + // Setup: clone batches (NOT timed) - needed because execution consumes them + || batches.clone(), + // Benchmark: execute filter (TIMED) + |batches| { + rt.block_on(async { + let source = Arc::new(BatchSourceExec::new( + 
Arc::clone(&schema), + batches, + )) as Arc; + let plan = create_filter_plan(source, &schema); + let task_ctx = Arc::new(TaskContext::default()); + let results = collect(plan, task_ctx).await.unwrap(); + black_box(results) + }) + }, + BatchSize::SmallInput, + ) + }, + ); + + // Benchmark 4: Full pipeline (deser + filter + output serialization) + // Measures complete round-trip: IPC in -> filter -> IPC out + // Relevant for scenarios where results are sent over network or stored + group.bench_with_input( + BenchmarkId::new("full_pipeline", &label), + &ipc_buffer, + |b, ipc_buffer| { + b.iter(|| { + rt.block_on(async { + let (schema, batches) = deserialize_zero_copy(ipc_buffer); + let source = Arc::new(BatchSourceExec::new( + Arc::clone(&schema), + batches, + )) as Arc; + let plan = create_filter_plan(source, &schema); + let task_ctx = Arc::new(TaskContext::default()); + let results = collect(plan, task_ctx).await.unwrap(); + // Serialize results back to IPC format + black_box(serialize_batches_to_sink(&results, &schema)) + }) + }) + }, + ); + } + + group.finish(); +} + +criterion_group!(benches, bench_filter); +criterion_main!(benches); diff --git a/datafusion/physical-plan/benches/filter_jni_benchmark.rs b/datafusion/physical-plan/benches/filter_jni_benchmark.rs new file mode 100644 index 000000000000..f7e3e1e760d1 --- /dev/null +++ b/datafusion/physical-plan/benches/filter_jni_benchmark.rs @@ -0,0 +1,590 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Benchmark for DataFusion FilterExec with Java UDF via JNI and Arrow FFI.
+//!
+//! This benchmark measures the overhead of calling a Java UDF for filtering
+//! using JNI and zero-copy Arrow C Data Interface, compared to native Rust filtering.
+//!
+//! ## Prerequisites
+//!
+//! Before running this benchmark, you must build and package the Java code
+//! (the benchmark loads the JAR produced by `mvn package`):
+//!
+//! ```bash
+//! cd datafusion/physical-plan/benches/jvm
+//! mvn package
+//! cd ../../../../
+//! ```
+//!
+//! ## Running the benchmark
+//!
+//! ```bash
+//! # Run all JNI filter benchmarks
+//! cargo bench --bench filter_jni_benchmark -p datafusion-physical-plan
+//!
+//! # Run with fewer samples for quick testing
+//! cargo bench --bench filter_jni_benchmark -p datafusion-physical-plan -- --sample-size 10
+//!
+//! # Compare with native filter benchmark
+//! cargo bench --bench filter_bench -p datafusion-physical-plan -- --save-baseline native
+//! cargo bench --bench filter_jni_benchmark -p datafusion-physical-plan -- --baseline native
+//! ```
+//!
+//! ## What it measures
+//!
+//! - **JNI call overhead**: Cost of calling Java methods from Rust
+//! - **FFI conversion overhead**: Cost of converting between Rust and C Data Interface
+//! - **Java Arrow operations**: Cost of import/filter/export in Java
+//! - **Total overhead**: End-to-end comparison with native Rust filtering
+//!
+//! ## Architecture
+//!
+//! ```text
+//! Rust (DataFusion)            JNI Boundary         Java (Arrow)
+//! ─────────────────────────────────────────────────────────────────
+//! RecordBatch
+//!     ↓
+//!
arrow::ffi::to_ffi() +//! ↓ +//! FFI_ArrowSchema* ────────→ long schemaPtr +//! FFI_ArrowArray* ────────→ long arrayPtr +//! ↓ +//! Data.importRecordBatch() +//! ↓ +//! Apply filter: colInt > 2500 +//! ↓ +//! Data.exportRecordBatch() +//! ↓ +//! FFI_ArrowSchema* ←──────── long[] [schemaPtr, arrayPtr] +//! FFI_ArrowArray* ←──────── +//! ↓ +//! arrow::ffi::from_ffi() +//! ↓ +//! RecordBatch (filtered) +//! ``` + +// Note: JVM library loading is handled at runtime by the jni crate (with invocation feature) +// via java-locator. No manual #[link] directive is needed. Ensure JAVA_HOME is set before running. + +// Include shared benchmark utilities +#[path = "bench_utils.rs"] +mod bench_utils; + +use std::any::Any; + + + +use std::hint::black_box; +use std::sync::Arc; + +use arrow::array::{Array, BooleanArray, Int32Array, StructArray}; +use arrow::buffer::Buffer; +use arrow::datatypes::DataType; +use arrow::ffi::{from_ffi, to_ffi, FFI_ArrowArray, FFI_ArrowSchema}; +use criterion::{ + criterion_group, criterion_main, BatchSize, BenchmarkId, Criterion, SamplingMode, + Throughput, +}; +use datafusion_common::{Result, ScalarValue}; +use datafusion_execution::TaskContext; +use datafusion_expr::{col, ColumnarValue, ScalarFunctionImplementation, Volatility}; +use datafusion_physical_plan::filter::FilterExec; +use datafusion_physical_plan::{collect, ExecutionPlan}; +use jni::objects::{JLongArray, JValue}; +use jni::sys::jlong; +use jni::JavaVM; + +use bench_utils::{ + create_schema, deserialize_zero_copy, serialize_batches_to_sink, serialize_to_ipc, + BatchSourceExec, FunctionalBatchGenerator, +}; + +// ============================================================================ +// JVM Initialization +// ============================================================================ + +/// Global JVM instance initialized once before benchmarks +static mut JVM: Option = None; + +/// Initialize JVM with classpath pointing to compiled Java classes. 
+/// +/// This function should be called once before running any benchmarks. +/// The classpath is hardcoded to point to the Maven JAR file. +/// +/// The JVM library is dynamically loaded by the jni crate via java-locator, +/// which uses JAVA_HOME environment variable to locate the JVM installation. +fn init_jvm() { + use jni::InitArgsBuilder; + + unsafe { + let jvm_ptr = std::ptr::addr_of_mut!(JVM); + if (*jvm_ptr).is_some() { + return; // Already initialized + } + + // Set up JVM arguments with classpath pointing to the compiled JAR + // Notice you need to run `mvn package` in the jvm directory to compile and package the Java code first + let classpath = "benches/jvm/target/datafusion-jni-benchmark-1.0-SNAPSHOT.jar"; + + // Check if the JAR file exists + let classpath_path = std::path::Path::new(classpath); + if !classpath_path.exists() { + let absolute_path = std::env::current_dir() + .map(|p| p.join(classpath)) + .unwrap_or_else(|_| classpath_path.to_path_buf()); + + panic!( + "JAR file not found at: {}\n\ + Absolute path: {}\n\ + Please compile the Java code first by running:\n\ + cd datafusion/physical-plan/benches/jvm\n\ + mvn package\n\ + cd ../../../../", + classpath, + absolute_path.display() + ); + } + + let classpath_option = format!("-Djava.class.path={}", classpath); + + let jvm_args = InitArgsBuilder::new() + .option(&classpath_option) + .option("--add-opens=java.base/java.nio=ALL-UNNAMED") + .build() + .expect("Failed to build JVM arguments"); + + let jvm = JavaVM::new(jvm_args) + .expect("Failed to create JVM. Ensure JAVA_HOME is set and Java is installed."); + + *jvm_ptr = Some(jvm); + } +} + +/// Get JVM instance (panics if not initialized) +fn get_jvm() -> &'static JavaVM { + unsafe { + let jvm_ptr = std::ptr::addr_of!(JVM); + (*jvm_ptr) + .as_ref() + .expect("JVM not initialized. 
Call init_jvm() first.") + } +} + +// ============================================================================ +// Java UDF Wrapper - Scalar UDF Implementation +// ============================================================================ + +/// Calls Java evaluatePredicate method with Arrow FFI pointers for a single column. +/// +/// This function: +/// 1. Wraps Int32Array in a RecordBatch and converts to FFI pointers +/// 2. Calls Java Udf.evaluatePredicate(schemaPtr, arrayPtr) via JNI +/// 3. Receives result pointers from Java (boolean array in a RecordBatch) +/// 4. Converts result back to BooleanArray using arrow::ffi::from_ffi() +/// +/// # Memory Management +/// - Input FFI pointers are released after Java imports the data +/// - Output FFI pointers are managed by Arrow's Drop implementation +/// - Java is responsible for releasing exported data after Rust imports it +fn call_java_predicate(int_array: &Int32Array) -> Result { + use arrow::datatypes::{Schema, Field}; + use arrow::record_batch::RecordBatch; + + // Create a schema for the input (single Int32 column) + let input_schema = Schema::new(vec![Field::new("colint", DataType::Int32, true)]); + + // Create a RecordBatch with the Int32Array + let record_batch = RecordBatch::try_new( + Arc::new(input_schema), + vec![Arc::new(int_array.clone()) as Arc], + )?; + + // Convert RecordBatch to FFI pointers via StructArray + let struct_array: StructArray = record_batch.into(); + let (ffi_array, ffi_schema) = to_ffi(&struct_array.to_data())?; + + // Get raw pointers for JNI call + let schema_ptr = &ffi_schema as *const FFI_ArrowSchema as jlong; + let array_ptr = &ffi_array as *const FFI_ArrowArray as jlong; + + // Call Java UDF via JNI + let jvm = get_jvm(); + let mut env = jvm.attach_current_thread() + .map_err(|e| datafusion_common::DataFusionError::Execution( + format!("Failed to attach JVM thread: {}", e) + ))?; + + // Call static method: Udf.evaluatePredicate(long, long) -> long[] + let result_ptrs = 
env.call_static_method( + "org/apache/datafusion/benchmark/Udf", + "evaluatePredicate", + "(JJ)[J", + &[JValue::Long(schema_ptr), JValue::Long(array_ptr)], + ).map_err(|e| datafusion_common::DataFusionError::Execution( + format!("Java UDF call failed: {}", e) + ))?; + + // Extract the long[] result containing [schemaPtr, arrayPtr] + let result_array = result_ptrs.l() + .map_err(|e| datafusion_common::DataFusionError::Execution( + format!("Failed to extract result array: {}", e) + ))?; + + let result_array = JLongArray::from(result_array); + + // Get the two pointers from the result array + let mut ptrs = [0i64; 2]; + env.get_long_array_region(&result_array, 0, &mut ptrs) + .map_err(|e| datafusion_common::DataFusionError::Execution( + format!("Failed to read result pointers: {}", e) + ))?; + + let result_schema_ptr = ptrs[0] as *mut FFI_ArrowSchema; + let result_array_ptr = ptrs[1] as *mut FFI_ArrowArray; + + // Safety: We trust that Java has allocated valid FFI structures + // The from_ffi call will take ownership and handle cleanup via release callbacks + let result_array_data = unsafe { + let result_ffi_schema = FFI_ArrowSchema::from_raw(result_schema_ptr); + let result_ffi_array = FFI_ArrowArray::from_raw(result_array_ptr); + from_ffi(result_ffi_array, &result_ffi_schema)? 
+ }; + + // Java returns a VectorSchemaRoot (struct type) with one boolean column + // We need to extract the child array directly from the ArrayData + if !matches!(result_array_data.data_type(), DataType::Struct(_)) { + return Err(datafusion_common::DataFusionError::Execution( + format!("Expected Struct type from Java, got {:?}", result_array_data.data_type()) + )); + } + + // Get the first child array data (the boolean column) directly + // We don't use StructArray::from() because it tries to slice child arrays + // based on the struct's offset/length, which can cause issues + let child_data = result_array_data.child_data().get(0) + .ok_or_else(|| datafusion_common::DataFusionError::Execution( + "Expected at least one child in result struct".to_string() + ))?; + + // Construct the BooleanArray directly from the child ArrayData + let boolean_array = BooleanArray::from(child_data.clone()); + + Ok(boolean_array) +} + +/// Create a scalar UDF that wraps the Java predicate function +fn create_java_predicate_udf() -> ScalarFunctionImplementation { + Arc::new(move |args: &[ColumnarValue]| -> Result { + // Extract the Int32Array from the input + let int_array = match &args[0] { + ColumnarValue::Array(arr) => arr + .as_any() + .downcast_ref::() + .expect("Expected Int32Array") + .clone(), + ColumnarValue::Scalar(ScalarValue::Int32(Some(val))) => { + // Single value - create array with one element + Int32Array::from(vec![*val]) + } + ColumnarValue::Scalar(ScalarValue::Int32(None)) => { + // Null value - create array with one null + Int32Array::from(vec![None as Option]) + } + _ => { + return Err(datafusion_common::DataFusionError::Execution( + "Expected Int32 input".to_string(), + )) + } + }; + + // Call Java predicate + let result_array = call_java_predicate(&int_array)?; + + Ok(ColumnarValue::Array(Arc::new(result_array))) + }) +} + +// ============================================================================ +// Filter Plan Creation with Java UDF +// 
============================================================================ + +/// Creates a FilterExec plan that uses a Java UDF for the predicate evaluation. +/// +/// This function creates a standard DataFusion FilterExec that applies the +/// Java UDF predicate (colInt > 2500) via JNI. +fn create_java_filter_plan(input: Arc) -> Result> { + use datafusion_expr::create_udf; + use datafusion_physical_expr::create_physical_expr; + use datafusion_expr::Expr; + use datafusion_common::DFSchema; + use datafusion_expr::execution_props::ExecutionProps; + + let schema = input.schema(); + + // Create the Java predicate UDF + let java_udf_impl = create_java_predicate_udf(); + + // Create UDF with signature + let java_udf = create_udf( + "java_gt_2500", + vec![DataType::Int32], + DataType::Boolean, + Volatility::Immutable, + java_udf_impl, + ); + + // Create the expression: java_gt_2500(colint) + let col_expr = col("colint"); + let udf_expr = Expr::ScalarFunction(datafusion_expr::expr::ScalarFunction::new_udf( + Arc::new(java_udf), + vec![col_expr], + )); + + // Convert logical expression to physical expression + let df_schema = DFSchema::try_from(schema.as_ref().clone())?; + let execution_props = ExecutionProps::new(); + let physical_expr = create_physical_expr( + &udf_expr, + &df_schema, + &execution_props, + )?; + + // Create FilterExec with the Java UDF predicate + Ok(Arc::new(FilterExec::try_new(physical_expr, input)?)) +} + +// ============================================================================ +// Benchmark Implementation +// ============================================================================ + +/// Main benchmark function for JNI filter execution. +/// +/// This benchmark measures two scenarios: +/// +/// 1. **filter_only**: Filter execution only (using Java UDF) +/// - Isolates the JNI/FFI overhead and Java filter performance +/// - Uses pre-generated batches directly (no deserialization) +/// +/// 2. 
**full_pipeline**: Complete deser + Java filter + output serialization +/// - Real-world end-to-end latency including JNI overhead +/// - Relevant for understanding total cost of Java UDF integration +fn bench_filter(c: &mut Criterion) { + // Initialize JVM once before all benchmarks + init_jvm(); + + // Create a single-threaded Tokio runtime for async execution + let rt = tokio::runtime::Builder::new_current_thread() + .build() + .unwrap(); + + let mut group = c.benchmark_group("filter_jni_benchmark"); + group.sampling_mode(SamplingMode::Flat); + + // Configuration: 1M rows total (10K rows × 100 batches) + let rows_per_batch = 10_000; + let num_batches = 100; + let total_rows = rows_per_batch * num_batches; + + // Test different binary column sizes to understand serialization overhead + let binary_sizes = vec![10, 1024, 2048]; + + for binary_size in binary_sizes { + let label = format!("1M_rows_binary_{binary_size}B"); + + // Generate test data and serialize to IPC format + let schema = create_schema(); + let mut generator = + FunctionalBatchGenerator::new(Arc::clone(&schema), rows_per_batch, num_batches, binary_size); + let batches = generator.generate_batches(); + let ipc_data = serialize_to_ipc(&batches, &schema); + let ipc_size = ipc_data.len(); + let ipc_buffer = Buffer::from_vec(ipc_data); + + // Log configuration + println!( + "Config: {} rows, binary_size={} bytes, IPC size={:.2} MB", + total_rows, + binary_size, + ipc_size as f64 / (1024.0 * 1024.0) + ); + + group.throughput(Throughput::Bytes(ipc_size as u64)); + + // Benchmark 2: Full pipeline (deser + Java filter + output serialization) + group.bench_with_input( + BenchmarkId::new("full_pipeline", &label), + &ipc_buffer, + |b, ipc_buffer| { + b.iter(|| { + rt.block_on(async { + let (schema, batches) = deserialize_zero_copy(ipc_buffer); + let source = Arc::new(BatchSourceExec::new(Arc::clone(&schema), batches)) + as Arc; + let plan = create_java_filter_plan(source).unwrap(); + let task_ctx = 
Arc::new(TaskContext::default()); + let results = collect(plan, task_ctx).await.unwrap(); + black_box(serialize_batches_to_sink(&results, &schema)) + }) + }) + }, + ); + } + + group.finish(); +} + +criterion_group!(benches, bench_filter); +criterion_main!(benches); + +// ============================================================================ +// Tests +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + + /// Test that the Java filter returns the correct number of rows. + /// + /// The filter predicate is `colInt > 2500`, and colInt values follow the pattern `i % 5000`. + /// This means: + /// - Values range from 0 to 4999 + /// - Values > 2500 are: 2501, 2502, ..., 4999 (2499 values) + /// - Expected selectivity: 2499/5000 = 49.98% + /// + /// For 1M total rows (100 batches × 10K rows), we expect: + /// - Filtered rows: 1,000,000 × 0.4998 = 499,800 rows + #[test] + fn test_java_filter_row_count() { + // Initialize JVM + init_jvm(); + + // Create Tokio runtime + let rt = tokio::runtime::Builder::new_current_thread() + .build() + .unwrap(); + + // Configuration matching the benchmark + let rows_per_batch = 10_000; + let num_batches = 100; + let total_rows = rows_per_batch * num_batches; + let binary_size = 10; + + // Generate test data + let schema = create_schema(); + let mut generator = + FunctionalBatchGenerator::new(Arc::clone(&schema), rows_per_batch, num_batches, binary_size); + let batches = generator.generate_batches(); + + // Create execution plan with Java filter + let source = Arc::new(BatchSourceExec::new(Arc::clone(&schema), batches)) + as Arc; + let plan = create_java_filter_plan(source).unwrap(); + + // Execute the plan + let task_ctx = Arc::new(TaskContext::default()); + let results = rt.block_on(async { + collect(plan, task_ctx).await.unwrap() + }); + + // Count total rows in results + let filtered_row_count: usize = results.iter().map(|batch| batch.num_rows()).sum(); + 
+
+        // Calculate expected count
+        // colInt values: i % 5000, so values are 0..4999
+        // Filter: colInt > 2500, so we keep 2501..4999 = 2499 values per 5000
+        // Expected: (total_rows / 5000) * 2499
+        let expected_count = (total_rows / 5000) * 2499;
+
+        assert_eq!(
+            filtered_row_count, expected_count,
+            "Java filter returned {} rows, expected {} rows ({}% selectivity)",
+            filtered_row_count,
+            expected_count,
+            (expected_count as f64 / total_rows as f64) * 100.0
+        );
+
+        println!(
+            "✓ Java filter correctness test passed: {} rows filtered from {} total rows ({:.2}% selectivity)",
+            filtered_row_count,
+            total_rows,
+            (filtered_row_count as f64 / total_rows as f64) * 100.0
+        );
+    }
+
+    /// Test that the Java filter produces the same results as the expected filter logic
+    /// by verifying that all returned values actually satisfy the predicate.
+    #[test]
+    fn test_java_filter_correctness() {
+        // Initialize JVM
+        init_jvm();
+
+        // Create Tokio runtime
+        let rt = tokio::runtime::Builder::new_current_thread()
+            .build()
+            .unwrap();
+
+        // Use smaller dataset for detailed validation
+        let rows_per_batch = 1_000;
+        let num_batches = 10;
+        let binary_size = 10;
+
+        // Generate test data
+        let schema = create_schema();
+        let mut generator =
+            FunctionalBatchGenerator::new(Arc::clone(&schema), rows_per_batch, num_batches, binary_size);
+        let batches = generator.generate_batches();
+
+        // Create execution plan with Java filter
+        let source = Arc::new(BatchSourceExec::new(Arc::clone(&schema), batches))
+            as Arc<dyn ExecutionPlan>;
+        let plan = create_java_filter_plan(source).unwrap();
+
+        // Execute the plan
+        let task_ctx = Arc::new(TaskContext::default());
+        let results = rt.block_on(async {
+            collect(plan, task_ctx).await.unwrap()
+        });
+
+        // Verify all returned rows satisfy the predicate: colInt > 2500
+        for (batch_idx, batch) in results.iter().enumerate() {
+            let colint_array = batch
+                .column(0)
+                .as_any()
+                .downcast_ref::<Int32Array>()
+                .expect("Expected Int32Array for colint");
+
+            for row_idx
in 0..colint_array.len() { + let value = colint_array.value(row_idx); + assert!( + value > 2500, + "Batch {}, row {}: expected value > 2500, got {}", + batch_idx, + row_idx, + value + ); + } + } + + let total_filtered_rows: usize = results.iter().map(|batch| batch.num_rows()).sum(); + println!( + "✓ Java filter correctness test passed: all {} filtered rows satisfy colInt > 2500", + total_filtered_rows + ); + } +} + diff --git a/datafusion/physical-plan/benches/hash_join_bench.rs b/datafusion/physical-plan/benches/hash_join_bench.rs new file mode 100644 index 000000000000..c3d3491626ba --- /dev/null +++ b/datafusion/physical-plan/benches/hash_join_bench.rs @@ -0,0 +1,314 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmark for DataFusion HashJoinExec (inner equi-join). +//! +//! This benchmark measures the performance of hash-based inner joins with +//! varying match rates, key repetition patterns, and data sizes. +//! +//! ## Test configurations: +//! +//! - **Binary sizes**: 10B, 4096B (affects left side row size) +//! - **Match rate**: 0.5, 1.0 (fraction of left keys that match right keys) +//! - **Repeated right keys**: 1, 10, 100 (join fan-out factor) +//! +//! ## Data setup: +//! +//! 
- **Left side (probe)**: 1M rows (100 batches × 10K rows) +//! - Schema: colInt, colLong, colFloat, colDouble, colString, colBinary +//! - colInt values: 0-4999 (using `i % 5000` pattern) +//! +//! - **Right side (build)**: Variable size based on parameters +//! - Schema: colInt (single column) +//! - colInt values: 0 to (matchRate × 5000 - 1) +//! - Each key repeated `repeatedRightKeys` times +//! - This smaller side is used to build the hash table +//! +//! ## Running the benchmark +//! +//! ```bash +//! # Run all configurations +//! cargo bench --bench hash_join_bench -p datafusion-physical-plan +//! +//! # Run with fewer samples for quick testing +//! cargo bench --bench hash_join_bench -p datafusion-physical-plan -- --sample-size 10 +//! +//! # Run specific configuration +//! cargo bench --bench hash_join_bench -p datafusion-physical-plan -- "binary_10B" +//! cargo bench --bench hash_join_bench -p datafusion-physical-plan -- "match_1.0" +//! ``` + +// Include shared benchmark utilities +#[path = "bench_utils.rs"] +mod bench_utils; + +use std::hint::black_box; +use std::sync::Arc; + +use arrow::buffer::Buffer; +use criterion::{ + BatchSize, BenchmarkId, Criterion, SamplingMode, Throughput, criterion_group, criterion_main, +}; +use datafusion_common::{JoinType, NullEquality}; +use datafusion_execution::TaskContext; +use datafusion_physical_expr::expressions::Column; +use datafusion_physical_expr::PhysicalExpr; +use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode}; +use datafusion_physical_plan::{ExecutionPlan, collect}; + +use bench_utils::{ + BatchSourceExec, FunctionalBatchGenerator, JoinBuildSideGenerator, create_schema, + deserialize_zero_copy, serialize_results_to_ipc, serialize_to_ipc, +}; + +// ============================================================================ +// Hash Join Plan Creation +// ============================================================================ + +/// Creates a HashJoinExec that performs an inner equi-join on 
colInt. +/// +/// The join is: `probe.colInt = build.colInt` +/// +/// In DataFusion's HashJoinExec: +/// - First argument (left) is the BUILD side (gets hashed into hash table) +/// - Second argument (right) is the PROBE side (scans and probes hash table) +/// +/// # Arguments +/// * `probe` - Probe side execution plan (larger, 1M rows) +/// * `build` - Build side execution plan (smaller, variable rows) +/// +/// # Returns +/// A HashJoinExec wrapped in Arc +fn create_hash_join_plan( + probe: Arc, + build: Arc, +) -> Arc { + let probe_schema = probe.schema(); + let build_schema = build.schema(); + + // Build join condition: build.colInt = probe.colInt + // Note: In HashJoinExec, the "on" condition is (left_col, right_col) = (build_col, probe_col) + let build_col = Arc::new(Column::new_with_schema("colInt", &build_schema).unwrap()) + as Arc; + let probe_col = Arc::new(Column::new_with_schema("colInt", &probe_schema).unwrap()) + as Arc; + + let on = vec![(build_col, probe_col)]; + + Arc::new( + HashJoinExec::try_new( + build, // Left = build side (gets hashed) + probe, // Right = probe side (probes hash table) + on, + None, // No additional filter + &JoinType::Inner, // Inner join + None, // No projection + PartitionMode::CollectLeft, // Build hash table from left (build) side + NullEquality::NullEqualsNothing, // NULLs don't match + false, // Not null-aware + ) + .unwrap(), + ) +} + +// ============================================================================ +// Benchmark Implementation +// ============================================================================ + +/// Benchmark configurations for different scenarios. 
+#[derive(Debug, Clone)] +struct BenchConfig { + /// Binary column size for left side (affects row size) + binary_size: usize, + /// Match rate: fraction of left keys that match right keys + match_rate: f64, + /// Number of times each right key is repeated (fan-out factor) + repeated_right_keys: usize, +} + +impl BenchConfig { + fn label(&self) -> String { + format!( + "binary_{}B/match_{}/repeat_{}", + self.binary_size, self.match_rate, self.repeated_right_keys + ) + } +} + +/// Main benchmark function for hash join execution. +/// +/// This benchmark measures inner equi-join performance across: +/// - Different left side row sizes (binary column 10B vs 4096B) +/// - Different match rates (0.5 vs 1.0) +/// - Different fan-out factors (repeated right keys: 1, 10, 100) +/// +/// For each configuration, we measure: +/// - **join_only**: Pure HashJoinExec performance using pre-generated batches +/// - **full_pipeline**: Complete deser + join + output serialization +fn bench_hash_join(c: &mut Criterion) { + // Create a single-threaded Tokio runtime for async execution. + // We use current_thread to ensure all async work runs on the benchmark thread, + // making results comparable to single-threaded Java benchmarks. 
+ let rt = tokio::runtime::Builder::new_current_thread() + .build() + .unwrap(); + let mut group = c.benchmark_group("hash_join_bench"); + + // Use flat sampling to collect exactly the requested samples without time constraints + group.sampling_mode(SamplingMode::Flat); + + // Probe side configuration: 1M rows total (10K rows × 100 batches) + let rows_per_batch = 10_000; + let num_batches = 1; + let total_probe_rows = rows_per_batch * num_batches; + + // Generate all benchmark configurations + let configs: Vec = vec![ + // Binary size 10B configurations + BenchConfig { binary_size: 10, match_rate: 0.5, repeated_right_keys: 1 }, + BenchConfig { binary_size: 10, match_rate: 0.5, repeated_right_keys: 10 }, + BenchConfig { binary_size: 10, match_rate: 0.5, repeated_right_keys: 100 }, + BenchConfig { binary_size: 10, match_rate: 1.0, repeated_right_keys: 1 }, + BenchConfig { binary_size: 10, match_rate: 1.0, repeated_right_keys: 10 }, + BenchConfig { binary_size: 10, match_rate: 1.0, repeated_right_keys: 100 }, + // Binary size 4096B configurations + BenchConfig { binary_size: 4096, match_rate: 0.5, repeated_right_keys: 1 }, + BenchConfig { binary_size: 4096, match_rate: 0.5, repeated_right_keys: 10 }, + BenchConfig { binary_size: 4096, match_rate: 0.5, repeated_right_keys: 100 }, + BenchConfig { binary_size: 4096, match_rate: 1.0, repeated_right_keys: 1 }, + BenchConfig { binary_size: 4096, match_rate: 1.0, repeated_right_keys: 10 }, + BenchConfig { binary_size: 4096, match_rate: 1.0, repeated_right_keys: 100 }, + ]; + + for config in &configs { + let label = config.label(); + + // Generate probe side data (left side - larger, 1M rows) + let probe_schema = create_schema(); + let mut probe_generator = FunctionalBatchGenerator::new( + Arc::clone(&probe_schema), + rows_per_batch, + num_batches, + config.binary_size, + ); + let probe_batches = probe_generator.generate_batches(); + + // Generate build side data (smaller, variable rows - used for hash table) + let 
build_generator = + JoinBuildSideGenerator::new(config.match_rate, config.repeated_right_keys); + let build_schema = build_generator.schema(); + let build_batches = build_generator.generate_batches(); + let total_build_rows = build_generator.total_rows(); + + // Serialize batches to IPC format for full pipeline benchmark + let probe_ipc_data = serialize_to_ipc(&probe_batches, &probe_schema); + let build_ipc_data = serialize_to_ipc(&build_batches, &build_schema); + let total_ipc_size = probe_ipc_data.len() + build_ipc_data.len(); + + // Calculate approximate data sizes + let probe_data_size: usize = + probe_batches.iter().map(|b| b.get_array_memory_size()).sum(); + let build_data_size: usize = + build_batches.iter().map(|b| b.get_array_memory_size()).sum(); + + // Log configuration for visibility in benchmark output + println!( + "Config: {}, probe={} rows ({:.2} MB), build={} rows ({:.2} MB), IPC={:.2} MB", + label, + total_probe_rows, + probe_data_size as f64 / (1024.0 * 1024.0), + total_build_rows, + build_data_size as f64 / (1024.0 * 1024.0), + total_ipc_size as f64 / (1024.0 * 1024.0) + ); + + // Set throughput metric for bytes/second calculations (based on input size) + group.throughput(Throughput::Bytes(total_ipc_size as u64)); + + // Benchmark 1: Join execution only + // Uses pre-generated batches directly, isolating HashJoinExec performance + // group.bench_with_input( + // BenchmarkId::new("join_only", &label), + // &(&probe_batches, &build_batches), + // |b, (probe_batches, build_batches)| { + // b.iter_batched( + // // Setup: clone batches (NOT timed) - needed because execution consumes them + // || ((*probe_batches).clone(), (*build_batches).clone()), + // // Benchmark: execute join (TIMED) + // |(probe_batches, build_batches)| { + // rt.block_on(async { + // let probe_source = Arc::new(BatchSourceExec::new( + // Arc::clone(&probe_schema), + // probe_batches, + // )) as Arc; + // let build_source = Arc::new(BatchSourceExec::new( + // 
Arc::clone(&build_schema), + // build_batches, + // )) as Arc; + // let plan = create_hash_join_plan(probe_source, build_source); + // let task_ctx = Arc::new(TaskContext::default()); + // let results = collect(plan, task_ctx).await.unwrap(); + // black_box(results) + // }) + // }, + // BatchSize::SmallInput, + // ) + // }, + // ); + + // Convert to Buffer for zero-copy deserialization + let probe_buffer = Buffer::from_vec(probe_ipc_data.clone()); + let build_buffer = Buffer::from_vec(build_ipc_data.clone()); + + // Benchmark 2: Full pipeline (deser + join + output serialization) + // Measures complete round-trip: IPC in -> join -> IPC out + group.bench_with_input( + BenchmarkId::new("full_pipeline", &label), + &(&probe_buffer, &build_buffer), + |b, (probe_buffer, build_buffer)| { + b.iter(|| { + rt.block_on(async { + let (probe_schema, probe_batches) = deserialize_zero_copy(probe_buffer); + let (build_schema, build_batches) = deserialize_zero_copy(build_buffer); + + let probe_source = Arc::new(BatchSourceExec::new( + Arc::clone(&probe_schema), + probe_batches, + )) as Arc; + let build_source = Arc::new(BatchSourceExec::new( + Arc::clone(&build_schema), + build_batches, + )) as Arc; + + let plan = create_hash_join_plan(probe_source, build_source); + let task_ctx = Arc::new(TaskContext::default()); + let results = collect(plan, task_ctx).await.unwrap(); + + // Serialize results back to IPC format + let output_ipc = serialize_results_to_ipc(&results); + black_box(output_ipc) + }) + }) + }, + ); + } + + group.finish(); +} + +criterion_group!(benches, bench_hash_join); +criterion_main!(benches); diff --git a/datafusion/physical-plan/benches/hash_join_by_type.rs b/datafusion/physical-plan/benches/hash_join_by_type.rs new file mode 100644 index 000000000000..d0c4a7e96791 --- /dev/null +++ b/datafusion/physical-plan/benches/hash_join_by_type.rs @@ -0,0 +1,508 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmark for DataFusion HashJoinExec comparing different column types. +//! +//! This benchmark measures the performance of hash-based inner joins across +//! different column types (Int32, Utf8, Utf8View, Dictionary-encoded strings). +//! +//! ## Test configurations: +//! +//! - **Column types**: Int32, Utf8, Utf8View, Dictionary(Int16, Utf8), Dictionary(Int16, Utf8View) +//! - **Match rate**: 0.5, 1.0 (fraction of left keys that match right keys) +//! - **Repeated right keys**: 1, 10 (join fan-out factor) +//! +//! ## Data setup: +//! +//! - **Left side (probe)**: 10K rows (1 batch × 10K rows) +//! - Schema: colInt, colLong, colFloat, colDouble, colString, colBinary +//! - Binary column: 10 bytes (small rows) +//! - colInt values: 0-4999 (using `i % 5000` pattern) +//! - colString values: "str_0000" to "str_4999" (9 bytes each, 5000 distinct values) +//! +//! - **Right side (build)**: Variable size based on parameters +//! - Schema: Single column (colInt or colString depending on join type) +//! - Values: 0 to (matchRate × 5000 - 1) for Int, "str_0000" to "str_{matchRate × 5000 - 1}" for String +//! - Each key repeated `repeatedRightKeys` times +//! - This smaller side is used to build the hash table +//! +//! ## Column type details: +//! +//! 
- **Int**: Standard Int32 join on colInt +//! - **String**: Utf8 join on colString (9-byte strings) +//! - **StringView**: Utf8View join on colString (optimized for strings ≤12 bytes) +//! - **DictionaryString**: Dictionary(Int16, Utf8) join on colString +//! - **DictionaryStringView**: Dictionary(Int16, Utf8View) join on colString +//! +//! ## Running the benchmark +//! +//! ```bash +//! # Run all configurations +//! cargo bench --bench hash_join_by_type -p datafusion-physical-plan +//! +//! # Run with fewer samples for quick testing +//! cargo bench --bench hash_join_by_type -p datafusion-physical-plan -- --sample-size 10 +//! +//! # Run specific column type +//! cargo bench --bench hash_join_by_type -p datafusion-physical-plan -- "joinInt" +//! cargo bench --bench hash_join_by_type -p datafusion-physical-plan -- "joinStr" +//! cargo bench --bench hash_join_by_type -p datafusion-physical-plan -- "joinStrView" +//! +//! # Run specific configuration +//! cargo bench --bench hash_join_by_type -p datafusion-physical-plan -- "match_1.0" +//! 
``` + +// Include shared benchmark utilities +#[path = "bench_utils.rs"] +mod bench_utils; + +use std::hint::black_box; +use std::sync::Arc; + +use arrow::buffer::Buffer; +use criterion::{ + BenchmarkId, Criterion, SamplingMode, Throughput, criterion_group, criterion_main, +}; +use datafusion_common::{JoinType, NullEquality}; +use datafusion_execution::TaskContext; +use datafusion_physical_expr::expressions::Column; +use datafusion_physical_expr::PhysicalExpr; +use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode}; +use datafusion_physical_plan::{ExecutionPlan, collect}; + +use bench_utils::{ + BatchSourceExec, FunctionalBatchGenerator, JoinBuildSideGenerator, JoinColumnType, + StringColumnType, create_schema_with_string_type, deserialize_zero_copy, + serialize_results_to_ipc, serialize_to_ipc, +}; + +// ============================================================================ +// Hash Join Plan Creation +// ============================================================================ + +/// Creates a HashJoinExec that performs an inner equi-join. 
+/// +/// In DataFusion's HashJoinExec: +/// - First argument (left) is the BUILD side (gets hashed into hash table) +/// - Second argument (right) is the PROBE side (scans and probes hash table) +/// +/// # Arguments +/// * `probe` - Probe side execution plan (larger, 10K rows) +/// * `build` - Build side execution plan (smaller, variable rows) +/// * `join_col` - Name of the column to join on ("colInt" or "colString") +/// +/// # Returns +/// A HashJoinExec wrapped in Arc +fn create_hash_join_plan( + probe: Arc, + build: Arc, + join_col: &str, +) -> Arc { + let probe_schema = probe.schema(); + let build_schema = build.schema(); + + // Build join condition: build.{join_col} = probe.{join_col} + // Note: In HashJoinExec, the "on" condition is (left_col, right_col) = (build_col, probe_col) + let build_col = Arc::new(Column::new_with_schema(join_col, &build_schema).unwrap()) + as Arc; + let probe_col = Arc::new(Column::new_with_schema(join_col, &probe_schema).unwrap()) + as Arc; + + let on = vec![(build_col, probe_col)]; + + Arc::new( + HashJoinExec::try_new( + build, // Left = build side (gets hashed) + probe, // Right = probe side (probes hash table) + on, + None, // No additional filter + &JoinType::Inner, // Inner join + None, // No projection + PartitionMode::CollectLeft, // Build hash table from left (build) side + NullEquality::NullEqualsNothing, // NULLs don't match + false, // Not null-aware + ) + .unwrap(), + ) +} + +// ============================================================================ +// Benchmark Implementation +// ============================================================================ + +/// Benchmark configurations for different scenarios. 
+#[derive(Debug, Clone)] +struct BenchConfig { + /// Match rate: fraction of left keys that match right keys + match_rate: f64, + /// Number of times each right key is repeated (fan-out factor) + repeated_right_keys: usize, + /// Type of column to join on + join_column_type: JoinColumnType, +} + +impl BenchConfig { + fn label(&self) -> String { + format!( + "match_{}/repeat_{}", + self.match_rate, self.repeated_right_keys + ) + } + + fn benchmark_name(&self) -> &'static str { + match self.join_column_type { + JoinColumnType::Int => "joinInt", + JoinColumnType::String => "joinStr", + JoinColumnType::StringView => "joinStrView", + JoinColumnType::DictionaryString => "joinDictStr", + JoinColumnType::DictionaryStringView => "joinDictStrView", + } + } + + fn join_column_name(&self) -> &'static str { + match self.join_column_type { + JoinColumnType::Int => "colint", + JoinColumnType::String + | JoinColumnType::StringView + | JoinColumnType::DictionaryString + | JoinColumnType::DictionaryStringView => "colstring", + } + } + + fn string_column_type(&self) -> StringColumnType { + match self.join_column_type { + JoinColumnType::Int => StringColumnType::Utf8, // Not used for Int joins + JoinColumnType::String => StringColumnType::Utf8, + JoinColumnType::StringView => StringColumnType::Utf8View, + JoinColumnType::DictionaryString => StringColumnType::DictionaryUtf8, + JoinColumnType::DictionaryStringView => StringColumnType::DictionaryUtf8View, + } + } +} + +/// Main benchmark function for hash join execution across different column types. 
+/// +/// This benchmark measures inner equi-join performance across: +/// - Different column types (Int32, Utf8, Utf8View, Dictionary variants) +/// - Different match rates (0.5 vs 1.0) +/// - Different fan-out factors (repeated right keys: 1, 10) +/// +/// For each configuration, we measure the complete pipeline: +/// deser + join + output serialization +fn bench_hash_join_by_type(c: &mut Criterion) { + // Create a single-threaded Tokio runtime for async execution. + // We use current_thread to ensure all async work runs on the benchmark thread, + // making results comparable to single-threaded Java benchmarks. + let rt = tokio::runtime::Builder::new_current_thread() + .build() + .unwrap(); + let mut group = c.benchmark_group("hash_join_by_type"); + + // Use flat sampling to collect exactly the requested samples without time constraints + group.sampling_mode(SamplingMode::Flat); + + // Probe side configuration: 10K rows total (10K rows × 1 batch) + let rows_per_batch = 10_000; + let num_batches = 1; + let total_probe_rows = rows_per_batch * num_batches; + let binary_size = 10; // Small binary column (10 bytes) + + // Generate all benchmark configurations + let mut configs: Vec = Vec::new(); + + // For each base configuration (match_rate × repeated_keys) + for match_rate in &[0.5, 1.0] { + for repeated_right_keys in &[1, 10] { + // Create a config for each column type + for column_type in &[ + JoinColumnType::Int, + JoinColumnType::String, + JoinColumnType::StringView, + JoinColumnType::DictionaryString, + JoinColumnType::DictionaryStringView, + ] { + configs.push(BenchConfig { + match_rate: *match_rate, + repeated_right_keys: *repeated_right_keys, + join_column_type: *column_type, + }); + } + } + } + + for config in &configs { + let label = config.label(); + let benchmark_name = config.benchmark_name(); + let join_column_name = config.join_column_name(); + + // Generate probe side data + let probe_schema = 
create_schema_with_string_type(config.string_column_type()); + let mut probe_generator = FunctionalBatchGenerator::new_with_string_type( + Arc::clone(&probe_schema), + rows_per_batch, + num_batches, + binary_size, + config.string_column_type(), + ); + let probe_batches = probe_generator.generate_batches(); + + // Generate build side data + let build_generator = JoinBuildSideGenerator::new_with_column_type( + config.match_rate, + config.repeated_right_keys, + config.join_column_type, + ); + let build_schema = build_generator.schema(); + let build_batches = build_generator.generate_batches(); + let total_build_rows = build_generator.total_rows(); + + // Serialize batches to IPC format for full pipeline benchmark + let probe_ipc_data = serialize_to_ipc(&probe_batches, &probe_schema); + let build_ipc_data = serialize_to_ipc(&build_batches, &build_schema); + let total_ipc_size = probe_ipc_data.len() + build_ipc_data.len(); + + // Calculate approximate data sizes + let probe_data_size: usize = + probe_batches.iter().map(|b| b.get_array_memory_size()).sum(); + let build_data_size: usize = + build_batches.iter().map(|b| b.get_array_memory_size()).sum(); + + // Log configuration for visibility in benchmark output + println!( + "Config: {} ({}), probe={} rows ({:.2} MB), build={} rows ({:.2} MB), IPC={:.2} MB", + benchmark_name, + label, + total_probe_rows, + probe_data_size as f64 / (1024.0 * 1024.0), + total_build_rows, + build_data_size as f64 / (1024.0 * 1024.0), + total_ipc_size as f64 / (1024.0 * 1024.0) + ); + + // Set throughput metric for bytes/second calculations (based on input size) + group.throughput(Throughput::Bytes(total_ipc_size as u64)); + + // Convert to Buffer for zero-copy deserialization + let probe_buffer = Buffer::from_vec(probe_ipc_data.clone()); + let build_buffer = Buffer::from_vec(build_ipc_data.clone()); + + // Benchmark: Full pipeline (deser + join + output serialization) + // Measures complete round-trip: IPC in -> join -> IPC out + 
group.bench_with_input( + BenchmarkId::new(benchmark_name, &label), + &(&probe_buffer, &build_buffer), + |b, (probe_buffer, build_buffer)| { + b.iter(|| { + rt.block_on(async { + let (probe_schema, probe_batches) = deserialize_zero_copy(probe_buffer); + let (build_schema, build_batches) = deserialize_zero_copy(build_buffer); + + let probe_source = Arc::new(BatchSourceExec::new( + Arc::clone(&probe_schema), + probe_batches, + )) as Arc; + let build_source = Arc::new(BatchSourceExec::new( + Arc::clone(&build_schema), + build_batches, + )) as Arc; + + let plan = create_hash_join_plan(probe_source, build_source, join_column_name); + let task_ctx = Arc::new(TaskContext::default()); + let results = collect(plan, task_ctx).await.unwrap(); + + // Serialize results back to IPC format + let output_ipc = serialize_results_to_ipc(&results); + black_box(output_ipc) + }) + }) + }, + ); + } + + group.finish(); +} + +criterion_group!(benches, bench_hash_join_by_type); +criterion_main!(benches); + +// ============================================================================ +// Tests +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + + /// Calculates the expected number of rows from a hash join. 
+ /// + /// For an inner join: + /// - Probe side: 10K rows with 5000 distinct keys (each key appears 2 times) + /// - Build side: (match_rate * 5000) distinct keys, each repeated `repeated_keys` times + /// + /// Expected output rows: + /// - For each matching key, output = probe_occurrences * build_occurrences + /// - Total = matching_keys * 2 * repeated_keys + fn expected_row_count(match_rate: f64, repeated_keys: usize) -> usize { + let total_probe_rows = 10_000; + let probe_distinct_keys = 5_000; + let probe_key_occurrences = total_probe_rows / probe_distinct_keys; // = 2 + + let build_distinct_keys = (match_rate * 5000.0) as usize; + + // Each matching key produces: probe_occurrences * build_occurrences rows + build_distinct_keys * probe_key_occurrences * repeated_keys + } + + /// Test helper to execute a join and verify row count + async fn test_join_row_count( + match_rate: f64, + repeated_keys: usize, + column_type: JoinColumnType, + ) { + let rows_per_batch = 10_000; + let num_batches = 1; + let binary_size = 10; + + let string_column_type = match column_type { + JoinColumnType::Int => StringColumnType::Utf8, + JoinColumnType::String => StringColumnType::Utf8, + JoinColumnType::StringView => StringColumnType::Utf8View, + JoinColumnType::DictionaryString => StringColumnType::DictionaryUtf8, + JoinColumnType::DictionaryStringView => StringColumnType::DictionaryUtf8View, + }; + + let join_column_name = match column_type { + JoinColumnType::Int => "colint", + _ => "colstring", + }; + + // Generate probe side data + let probe_schema = create_schema_with_string_type(string_column_type); + let mut probe_generator = FunctionalBatchGenerator::new_with_string_type( + Arc::clone(&probe_schema), + rows_per_batch, + num_batches, + binary_size, + string_column_type, + ); + let probe_batches = probe_generator.generate_batches(); + + // Generate build side data + let build_generator = JoinBuildSideGenerator::new_with_column_type( + match_rate, + repeated_keys, + 
column_type, + ); + let build_schema = build_generator.schema(); + let build_batches = build_generator.generate_batches(); + + // Execute join + let probe_source = Arc::new(BatchSourceExec::new( + Arc::clone(&probe_schema), + probe_batches, + )) as Arc; + let build_source = Arc::new(BatchSourceExec::new( + Arc::clone(&build_schema), + build_batches, + )) as Arc; + + let plan = create_hash_join_plan(probe_source, build_source, join_column_name); + let task_ctx = Arc::new(TaskContext::default()); + let results = collect(plan, task_ctx).await.unwrap(); + + // Calculate actual row count + let actual_row_count: usize = results.iter().map(|batch| batch.num_rows()).sum(); + let expected = expected_row_count(match_rate, repeated_keys); + + assert_eq!( + actual_row_count, expected, + "Row count mismatch for match_rate={}, repeated_keys={}, column_type={:?}. Expected {}, got {}", + match_rate, repeated_keys, column_type, expected, actual_row_count + ); + } + + #[tokio::test] + async fn test_join_row_counts_int() { + // Test Int column type with different configurations + test_join_row_count(0.5, 1, JoinColumnType::Int).await; + test_join_row_count(0.5, 10, JoinColumnType::Int).await; + test_join_row_count(1.0, 1, JoinColumnType::Int).await; + test_join_row_count(1.0, 10, JoinColumnType::Int).await; + } + + #[tokio::test] + async fn test_join_row_counts_string() { + // Test String column type with different configurations + test_join_row_count(0.5, 1, JoinColumnType::String).await; + test_join_row_count(0.5, 10, JoinColumnType::String).await; + test_join_row_count(1.0, 1, JoinColumnType::String).await; + test_join_row_count(1.0, 10, JoinColumnType::String).await; + } + + #[tokio::test] + async fn test_join_row_counts_string_view() { + // Test StringView column type with different configurations + test_join_row_count(0.5, 1, JoinColumnType::StringView).await; + test_join_row_count(0.5, 10, JoinColumnType::StringView).await; + test_join_row_count(1.0, 1, 
JoinColumnType::StringView).await; + test_join_row_count(1.0, 10, JoinColumnType::StringView).await; + } + + #[tokio::test] + async fn test_join_row_counts_dictionary_string() { + // Test DictionaryString column type with different configurations + test_join_row_count(0.5, 1, JoinColumnType::DictionaryString).await; + test_join_row_count(0.5, 10, JoinColumnType::DictionaryString).await; + test_join_row_count(1.0, 1, JoinColumnType::DictionaryString).await; + test_join_row_count(1.0, 10, JoinColumnType::DictionaryString).await; + } + + #[tokio::test] + async fn test_join_row_counts_dictionary_string_view() { + // Test DictionaryStringView column type with different configurations + test_join_row_count(0.5, 1, JoinColumnType::DictionaryStringView).await; + test_join_row_count(0.5, 10, JoinColumnType::DictionaryStringView).await; + test_join_row_count(1.0, 1, JoinColumnType::DictionaryStringView).await; + test_join_row_count(1.0, 10, JoinColumnType::DictionaryStringView).await; + } + + #[test] + fn test_expected_row_count_calculation() { + // Verify the expected row count formula + + // match_rate=0.5, repeated_keys=1 + // - Matching keys: 2500 + // - Probe occurrences per key: 2 + // - Build occurrences per key: 1 + // - Total: 2500 * 2 * 1 = 5000 + assert_eq!(expected_row_count(0.5, 1), 5_000); + + // match_rate=0.5, repeated_keys=10 + // - Total: 2500 * 2 * 10 = 50000 + assert_eq!(expected_row_count(0.5, 10), 50_000); + + // match_rate=1.0, repeated_keys=1 + // - Matching keys: 5000 + // - Total: 5000 * 2 * 1 = 10000 + assert_eq!(expected_row_count(1.0, 1), 10_000); + + // match_rate=1.0, repeated_keys=10 + // - Total: 5000 * 2 * 10 = 100000 + assert_eq!(expected_row_count(1.0, 10), 100_000); + } +} + diff --git a/datafusion/physical-plan/benches/jvm/mvnw b/datafusion/physical-plan/benches/jvm/mvnw new file mode 100755 index 000000000000..19529ddf8c6e --- /dev/null +++ b/datafusion/physical-plan/benches/jvm/mvnw @@ -0,0 +1,259 @@ +#!/bin/sh +# 
---------------------------------------------------------------------------- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# ---------------------------------------------------------------------------- + +# ---------------------------------------------------------------------------- +# Apache Maven Wrapper startup batch script, version 3.3.2 +# +# Optional ENV vars +# ----------------- +# JAVA_HOME - location of a JDK home dir, required when download maven via java source +# MVNW_REPOURL - repo url base for downloading maven distribution +# MVNW_USERNAME/MVNW_PASSWORD - user and password for downloading maven +# MVNW_VERBOSE - true: enable verbose log; debug: trace the mvnw script; others: silence the output +# ---------------------------------------------------------------------------- + +set -euf +[ "${MVNW_VERBOSE-}" != debug ] || set -x + +# OS specific support. 
+native_path() { printf %s\\n "$1"; } +case "$(uname)" in +CYGWIN* | MINGW*) + [ -z "${JAVA_HOME-}" ] || JAVA_HOME="$(cygpath --unix "$JAVA_HOME")" + native_path() { cygpath --path --windows "$1"; } + ;; +esac + +# set JAVACMD and JAVACCMD +set_java_home() { + # For Cygwin and MinGW, ensure paths are in Unix format before anything is touched + if [ -n "${JAVA_HOME-}" ]; then + if [ -x "$JAVA_HOME/jre/sh/java" ]; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD="$JAVA_HOME/jre/sh/java" + JAVACCMD="$JAVA_HOME/jre/sh/javac" + else + JAVACMD="$JAVA_HOME/bin/java" + JAVACCMD="$JAVA_HOME/bin/javac" + + if [ ! -x "$JAVACMD" ] || [ ! -x "$JAVACCMD" ]; then + echo "The JAVA_HOME environment variable is not defined correctly, so mvnw cannot run." >&2 + echo "JAVA_HOME is set to \"$JAVA_HOME\", but \"\$JAVA_HOME/bin/java\" or \"\$JAVA_HOME/bin/javac\" does not exist." >&2 + return 1 + fi + fi + else + JAVACMD="$( + 'set' +e + 'unset' -f command 2>/dev/null + 'command' -v java + )" || : + JAVACCMD="$( + 'set' +e + 'unset' -f command 2>/dev/null + 'command' -v javac + )" || : + + if [ ! -x "${JAVACMD-}" ] || [ ! -x "${JAVACCMD-}" ]; then + echo "The java/javac command does not exist in PATH nor is JAVA_HOME set, so mvnw cannot run." >&2 + return 1 + fi + fi +} + +# hash string like Java String::hashCode +hash_string() { + str="${1:-}" h=0 + while [ -n "$str" ]; do + char="${str%"${str#?}"}" + h=$(((h * 31 + $(LC_CTYPE=C printf %d "'$char")) % 4294967296)) + str="${str#?}" + done + printf %x\\n $h +} + +verbose() { :; } +[ "${MVNW_VERBOSE-}" != true ] || verbose() { printf %s\\n "${1-}"; } + +die() { + printf %s\\n "$1" >&2 + exit 1 +} + +trim() { + # MWRAPPER-139: + # Trims trailing and leading whitespace, carriage returns, tabs, and linefeeds. + # Needed for removing poorly interpreted newline sequences when running in more + # exotic environments such as mingw bash on Windows. 
+ printf "%s" "${1}" | tr -d '[:space:]' +} + +# parse distributionUrl and optional distributionSha256Sum, requires .mvn/wrapper/maven-wrapper.properties +while IFS="=" read -r key value; do + case "${key-}" in + distributionUrl) distributionUrl=$(trim "${value-}") ;; + distributionSha256Sum) distributionSha256Sum=$(trim "${value-}") ;; + esac +done <"${0%/*}/.mvn/wrapper/maven-wrapper.properties" +[ -n "${distributionUrl-}" ] || die "cannot read distributionUrl property in ${0%/*}/.mvn/wrapper/maven-wrapper.properties" + +case "${distributionUrl##*/}" in +maven-mvnd-*bin.*) + MVN_CMD=mvnd.sh _MVNW_REPO_PATTERN=/maven/mvnd/ + case "${PROCESSOR_ARCHITECTURE-}${PROCESSOR_ARCHITEW6432-}:$(uname -a)" in + *AMD64:CYGWIN* | *AMD64:MINGW*) distributionPlatform=windows-amd64 ;; + :Darwin*x86_64) distributionPlatform=darwin-amd64 ;; + :Darwin*arm64) distributionPlatform=darwin-aarch64 ;; + :Linux*x86_64*) distributionPlatform=linux-amd64 ;; + *) + echo "Cannot detect native platform for mvnd on $(uname)-$(uname -m), use pure java version" >&2 + distributionPlatform=linux-amd64 + ;; + esac + distributionUrl="${distributionUrl%-bin.*}-$distributionPlatform.zip" + ;; +maven-mvnd-*) MVN_CMD=mvnd.sh _MVNW_REPO_PATTERN=/maven/mvnd/ ;; +*) MVN_CMD="mvn${0##*/mvnw}" _MVNW_REPO_PATTERN=/org/apache/maven/ ;; +esac + +# apply MVNW_REPOURL and calculate MAVEN_HOME +# maven home pattern: ~/.m2/wrapper/dists/{apache-maven-,maven-mvnd--}/ +[ -z "${MVNW_REPOURL-}" ] || distributionUrl="$MVNW_REPOURL$_MVNW_REPO_PATTERN${distributionUrl#*"$_MVNW_REPO_PATTERN"}" +distributionUrlName="${distributionUrl##*/}" +distributionUrlNameMain="${distributionUrlName%.*}" +distributionUrlNameMain="${distributionUrlNameMain%-bin}" +MAVEN_USER_HOME="${MAVEN_USER_HOME:-${HOME}/.m2}" +MAVEN_HOME="${MAVEN_USER_HOME}/wrapper/dists/${distributionUrlNameMain-}/$(hash_string "$distributionUrl")" + +exec_maven() { + unset MVNW_VERBOSE MVNW_USERNAME MVNW_PASSWORD MVNW_REPOURL || : + exec "$MAVEN_HOME/bin/$MVN_CMD" 
"$@" || die "cannot exec $MAVEN_HOME/bin/$MVN_CMD" +} + +if [ -d "$MAVEN_HOME" ]; then + verbose "found existing MAVEN_HOME at $MAVEN_HOME" + exec_maven "$@" +fi + +case "${distributionUrl-}" in +*?-bin.zip | *?maven-mvnd-?*-?*.zip) ;; +*) die "distributionUrl is not valid, must match *-bin.zip or maven-mvnd-*.zip, but found '${distributionUrl-}'" ;; +esac + +# prepare tmp dir +if TMP_DOWNLOAD_DIR="$(mktemp -d)" && [ -d "$TMP_DOWNLOAD_DIR" ]; then + clean() { rm -rf -- "$TMP_DOWNLOAD_DIR"; } + trap clean HUP INT TERM EXIT +else + die "cannot create temp dir" +fi + +mkdir -p -- "${MAVEN_HOME%/*}" + +# Download and Install Apache Maven +verbose "Couldn't find MAVEN_HOME, downloading and installing it ..." +verbose "Downloading from: $distributionUrl" +verbose "Downloading to: $TMP_DOWNLOAD_DIR/$distributionUrlName" + +# select .zip or .tar.gz +if ! command -v unzip >/dev/null; then + distributionUrl="${distributionUrl%.zip}.tar.gz" + distributionUrlName="${distributionUrl##*/}" +fi + +# verbose opt +__MVNW_QUIET_WGET=--quiet __MVNW_QUIET_CURL=--silent __MVNW_QUIET_UNZIP=-q __MVNW_QUIET_TAR='' +[ "${MVNW_VERBOSE-}" != true ] || __MVNW_QUIET_WGET='' __MVNW_QUIET_CURL='' __MVNW_QUIET_UNZIP='' __MVNW_QUIET_TAR=v + +# normalize http auth +case "${MVNW_PASSWORD:+has-password}" in +'') MVNW_USERNAME='' MVNW_PASSWORD='' ;; +has-password) [ -n "${MVNW_USERNAME-}" ] || MVNW_USERNAME='' MVNW_PASSWORD='' ;; +esac + +if [ -z "${MVNW_USERNAME-}" ] && command -v wget >/dev/null; then + verbose "Found wget ... using wget" + wget ${__MVNW_QUIET_WGET:+"$__MVNW_QUIET_WGET"} "$distributionUrl" -O "$TMP_DOWNLOAD_DIR/$distributionUrlName" || die "wget: Failed to fetch $distributionUrl" +elif [ -z "${MVNW_USERNAME-}" ] && command -v curl >/dev/null; then + verbose "Found curl ... 
using curl" + curl ${__MVNW_QUIET_CURL:+"$__MVNW_QUIET_CURL"} -f -L -o "$TMP_DOWNLOAD_DIR/$distributionUrlName" "$distributionUrl" || die "curl: Failed to fetch $distributionUrl" +elif set_java_home; then + verbose "Falling back to use Java to download" + javaSource="$TMP_DOWNLOAD_DIR/Downloader.java" + targetZip="$TMP_DOWNLOAD_DIR/$distributionUrlName" + cat >"$javaSource" <<-END + public class Downloader extends java.net.Authenticator + { + protected java.net.PasswordAuthentication getPasswordAuthentication() + { + return new java.net.PasswordAuthentication( System.getenv( "MVNW_USERNAME" ), System.getenv( "MVNW_PASSWORD" ).toCharArray() ); + } + public static void main( String[] args ) throws Exception + { + setDefault( new Downloader() ); + java.nio.file.Files.copy( java.net.URI.create( args[0] ).toURL().openStream(), java.nio.file.Paths.get( args[1] ).toAbsolutePath().normalize() ); + } + } + END + # For Cygwin/MinGW, switch paths to Windows format before running javac and java + verbose " - Compiling Downloader.java ..." + "$(native_path "$JAVACCMD")" "$(native_path "$javaSource")" || die "Failed to compile Downloader.java" + verbose " - Running Downloader.java ..." + "$(native_path "$JAVACMD")" -cp "$(native_path "$TMP_DOWNLOAD_DIR")" Downloader "$distributionUrl" "$(native_path "$targetZip")" +fi + +# If specified, validate the SHA-256 sum of the Maven distribution zip file +if [ -n "${distributionSha256Sum-}" ]; then + distributionSha256Result=false + if [ "$MVN_CMD" = mvnd.sh ]; then + echo "Checksum validation is not supported for maven-mvnd." >&2 + echo "Please disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties." 
>&2 + exit 1 + elif command -v sha256sum >/dev/null; then + if echo "$distributionSha256Sum $TMP_DOWNLOAD_DIR/$distributionUrlName" | sha256sum -c >/dev/null 2>&1; then + distributionSha256Result=true + fi + elif command -v shasum >/dev/null; then + if echo "$distributionSha256Sum $TMP_DOWNLOAD_DIR/$distributionUrlName" | shasum -a 256 -c >/dev/null 2>&1; then + distributionSha256Result=true + fi + else + echo "Checksum validation was requested but neither 'sha256sum' or 'shasum' are available." >&2 + echo "Please install either command, or disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties." >&2 + exit 1 + fi + if [ $distributionSha256Result = false ]; then + echo "Error: Failed to validate Maven distribution SHA-256, your Maven distribution might be compromised." >&2 + echo "If you updated your Maven version, you need to update the specified distributionSha256Sum property." >&2 + exit 1 + fi +fi + +# unzip and move +if command -v unzip >/dev/null; then + unzip ${__MVNW_QUIET_UNZIP:+"$__MVNW_QUIET_UNZIP"} "$TMP_DOWNLOAD_DIR/$distributionUrlName" -d "$TMP_DOWNLOAD_DIR" || die "failed to unzip" +else + tar xzf${__MVNW_QUIET_TAR:+"$__MVNW_QUIET_TAR"} "$TMP_DOWNLOAD_DIR/$distributionUrlName" -C "$TMP_DOWNLOAD_DIR" || die "failed to untar" +fi +printf %s\\n "$distributionUrl" >"$TMP_DOWNLOAD_DIR/$distributionUrlNameMain/mvnw.url" +mv -- "$TMP_DOWNLOAD_DIR/$distributionUrlNameMain" "$MAVEN_HOME" || [ -d "$MAVEN_HOME" ] || die "fail to move MAVEN_HOME" + +clean || : +exec_maven "$@" diff --git a/datafusion/physical-plan/benches/jvm/pom.xml b/datafusion/physical-plan/benches/jvm/pom.xml new file mode 100644 index 000000000000..86b399ef58ab --- /dev/null +++ b/datafusion/physical-plan/benches/jvm/pom.xml @@ -0,0 +1,125 @@ + + + + 4.0.0 + org.apache.datafusion + datafusion-jni-benchmark + 1.0-SNAPSHOT + DataFusion JNI Benchmark + + + 21 + 21 + UTF-8 + 15.0.2 + + + + + + org.apache.arrow + arrow-c-data + ${arrow.version} + + + + + 
org.apache.arrow + arrow-memory-netty + ${arrow.version} + + + + + org.apache.arrow + arrow-vector + ${arrow.version} + + + + + org.junit.jupiter + junit-jupiter + 5.10.1 + test + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 3.2.3 + + --add-opens=java.base/java.nio=ALL-UNNAMED + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.11.0 + + 21 + 21 + + + + org.apache.maven.plugins + maven-shade-plugin + 3.5.1 + + + package + + shade + + + false + + + + ${project.name} + ${project.version} + + + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + false + + + + + + + + diff --git a/datafusion/physical-plan/benches/jvm/src/main/java/org/apache/datafusion/benchmark/Udf.java b/datafusion/physical-plan/benches/jvm/src/main/java/org/apache/datafusion/benchmark/Udf.java new file mode 100644 index 000000000000..f0eb5f76ac31 --- /dev/null +++ b/datafusion/physical-plan/benches/jvm/src/main/java/org/apache/datafusion/benchmark/Udf.java @@ -0,0 +1,116 @@ +package org.apache.datafusion.benchmark; + + +import org.apache.arrow.c.ArrowArray; +import org.apache.arrow.c.ArrowSchema; +import org.apache.arrow.c.CDataDictionaryProvider; +import org.apache.arrow.c.Data; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.types.pojo.Schema; + + +/** + * Java UDF for filtering Arrow RecordBatches via JNI with zero-copy FFI. + * + * This class receives Arrow data from Rust via the C Data Interface, + * applies a filter predicate (colInt > 2500), and returns a boolean array + * indicating which rows pass the filter. 
+ */ +public class Udf { + + // Static allocator initialized once for all operations + private static final BufferAllocator allocator = new RootAllocator(); + + /** + * Evaluates the predicate: value > 2500 + * + * Returns a boolean array indicating which rows pass the filter. For each filter, + * there is a corresponding boolean value: true if the row passes, false otherwise. + * + * @param schemaPtr Pointer to FFI_ArrowSchema (C Data Interface) + * @param arrayPtr Pointer to FFI_ArrowArray (C Data Interface) + * @return Array of [newSchemaPtr, newArrayPtr] for the boolean result array + */ + public static long[] evaluatePredicate(long schemaPtr, long arrayPtr) { + try { + // Import Array from FFI pointers + ArrowSchema arrowSchema = ArrowSchema.wrap(schemaPtr); + ArrowArray arrowArray = ArrowArray.wrap(arrayPtr); + + VectorSchemaRoot root = Data.importVectorSchemaRoot( + allocator, + arrowArray, + arrowSchema, + new CDataDictionaryProvider() + ); + + // Get the integer column (assuming single column input) + IntVector intVector = (IntVector) root.getVector(0); + if (intVector == null) { + throw new RuntimeException("Expected integer vector as input"); + } + + int rowCount = root.getRowCount(); + + // Create result schema root with single boolean column + Field field = new Field("result", FieldType.nullable(new ArrowType.Bool()), null); + Schema resultSchema = new Schema(java.util.Collections.singletonList(field)); + + VectorSchemaRoot resultRoot = VectorSchemaRoot.create(resultSchema, allocator); + resultRoot.setRowCount(rowCount); + + // Get the BitVector from the result root + BitVector resultVector = (BitVector) resultRoot.getVector(0); + resultVector.allocateNew(rowCount); + + // Evaluate predicate for each row + for (int i = 0; i < rowCount; i++) { + boolean passes = !intVector.isNull(i) && intVector.get(i) > 2500; + if (passes) { + resultVector.set(i, 1); + } + // Note: BitVector bits are initialized to 0 by allocateNew(), so no need to explicitly set false 
values + } + resultVector.setValueCount(rowCount); + + // Export result to FFI pointers + ArrowArray resultArray = ArrowArray.allocateNew(allocator); + ArrowSchema resultArrowSchema = ArrowSchema.allocateNew(allocator); + + Data.exportVectorSchemaRoot( + allocator, + resultRoot, + new CDataDictionaryProvider(), + resultArray, + resultArrowSchema + ); + + // Clean up input root (can be closed after export since we've consumed it) + root.close(); + + // NOTE: resultRoot is NOT closed here because the exported FFI pointers + // reference its memory. Arrow's C Data Interface handles cleanup via + // release callbacks when Rust calls from_ffi() to import the data. + // The release callback will eventually free the resultRoot memory. + + return new long[] { resultArrowSchema.memoryAddress(), resultArray.memoryAddress() }; + } catch (Exception e) { + throw new RuntimeException("Error in Java UDF evaluation", e); + } + } + + /** + * Legacy method for backward compatibility - filters entire batches + * For use with standard DataFusion filter, use evaluatePredicate instead + */ + public static long[] filterBatch(long schemaPtr, long arrayPtr) { + return evaluatePredicate(schemaPtr, arrayPtr); + } +} diff --git a/datafusion/physical-plan/benches/jvm/src/test/java/org/apache/datafusion/benchmark/UdfTest.java b/datafusion/physical-plan/benches/jvm/src/test/java/org/apache/datafusion/benchmark/UdfTest.java new file mode 100644 index 000000000000..6021c5234032 --- /dev/null +++ b/datafusion/physical-plan/benches/jvm/src/test/java/org/apache/datafusion/benchmark/UdfTest.java @@ -0,0 +1,284 @@ +package org.apache.datafusion.benchmark; + +import org.apache.arrow.c.ArrowArray; +import org.apache.arrow.c.ArrowSchema; +import org.apache.arrow.c.CDataDictionaryProvider; +import org.apache.arrow.c.Data; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.IntVector; 
+import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.types.pojo.Schema; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.util.Collections; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Test for the Udf.evaluatePredicate method to verify correct behavior + * when returning boolean arrays via Arrow FFI. + */ +public class UdfTest { + + private BufferAllocator allocator; + + @BeforeEach + public void setup() { + allocator = new RootAllocator(Long.MAX_VALUE); + } + + @AfterEach + public void tearDown() { + allocator.close(); + } + + @Test + public void testEvaluatePredicate_BasicFunctionality() { + // Create input with 10 rows + int rowCount = 10; + + // Create input schema and data + Field intField = new Field("colint", FieldType.nullable(new ArrowType.Int(32, true)), null); + Schema inputSchema = new Schema(Collections.singletonList(intField)); + + VectorSchemaRoot inputRoot = VectorSchemaRoot.create(inputSchema, allocator); + inputRoot.setRowCount(rowCount); + + IntVector intVector = (IntVector) inputRoot.getVector(0); + + // Fill with test data: [1000, 2000, 2500, 2501, 3000, 3500, 4000, null, 1500, 2600] + int[] testValues = {1000, 2000, 2500, 2501, 3000, 3500, 4000, -1, 1500, 2600}; + boolean[] expectedResults = {false, false, false, true, true, true, true, false, false, true}; + + for (int i = 0; i < rowCount; i++) { + if (testValues[i] == -1) { + intVector.setNull(i); + } else { + intVector.set(i, testValues[i]); + } + } + intVector.setValueCount(rowCount); + + // Export input to FFI + ArrowArray inputArray = ArrowArray.allocateNew(allocator); + ArrowSchema inputArrowSchema = ArrowSchema.allocateNew(allocator); + + Data.exportVectorSchemaRoot( + allocator, + inputRoot, + new 
CDataDictionaryProvider(), + inputArray, + inputArrowSchema + ); + + long inputSchemaPtr = inputArrowSchema.memoryAddress(); + long inputArrayPtr = inputArray.memoryAddress(); + + // Call the UDF + long[] resultPtrs = Udf.evaluatePredicate(inputSchemaPtr, inputArrayPtr); + + assertNotNull(resultPtrs, "Result pointers should not be null"); + assertEquals(2, resultPtrs.length, "Should return array with 2 pointers [schemaPtr, arrayPtr]"); + + // Import result from FFI + ArrowSchema resultArrowSchema = ArrowSchema.wrap(resultPtrs[0]); + ArrowArray resultArray = ArrowArray.wrap(resultPtrs[1]); + + VectorSchemaRoot resultRoot = Data.importVectorSchemaRoot( + allocator, + resultArray, + resultArrowSchema, + new CDataDictionaryProvider() + ); + + // Clean up input resources after UDF call + inputArray.close(); + inputArrowSchema.close(); + inputRoot.close(); + + // Verify result structure + assertEquals(1, resultRoot.getFieldVectors().size(), "Result should have 1 column"); + assertEquals(rowCount, resultRoot.getRowCount(), "Result row count should match input"); + + // Verify result is a BitVector (boolean) + assertTrue(resultRoot.getVector(0) instanceof BitVector, + "Result column should be BitVector, got: " + resultRoot.getVector(0).getClass().getName()); + + BitVector resultVector = (BitVector) resultRoot.getVector(0); + assertEquals(rowCount, resultVector.getValueCount(), + "Result vector value count should be " + rowCount + ", got: " + resultVector.getValueCount()); + + // Verify predicate results + for (int i = 0; i < rowCount; i++) { + boolean actual = resultVector.isSet(i) != 0; + assertEquals(expectedResults[i], actual, + String.format("Row %d: value=%s, expected=%s, got=%s", + i, + testValues[i] == -1 ? 
"null" : testValues[i], + expectedResults[i], + actual)); + } + + // Cleanup + resultRoot.close(); + resultArrowSchema.close(); + resultArray.close(); + } + + @Test + public void testEvaluatePredicate_LargeDataset() { + // Test with 10,000 rows (same as benchmark) + int rowCount = 10000; + + Field intField = new Field("colint", FieldType.nullable(new ArrowType.Int(32, true)), null); + Schema inputSchema = new Schema(Collections.singletonList(intField)); + + VectorSchemaRoot inputRoot = VectorSchemaRoot.create(inputSchema, allocator); + inputRoot.setRowCount(rowCount); + + IntVector intVector = (IntVector) inputRoot.getVector(0); + + // Fill with values 0 to 9999 + int expectedPassCount = 0; + for (int i = 0; i < rowCount; i++) { + intVector.set(i, i); + if (i > 2500) { + expectedPassCount++; + } + } + intVector.setValueCount(rowCount); + + // Export input to FFI + ArrowArray inputArray = ArrowArray.allocateNew(allocator); + ArrowSchema inputArrowSchema = ArrowSchema.allocateNew(allocator); + + Data.exportVectorSchemaRoot( + allocator, + inputRoot, + new CDataDictionaryProvider(), + inputArray, + inputArrowSchema + ); + + long inputSchemaPtr = inputArrowSchema.memoryAddress(); + long inputArrayPtr = inputArray.memoryAddress(); + + // Call the UDF + long[] resultPtrs = Udf.evaluatePredicate(inputSchemaPtr, inputArrayPtr); + + // Import result + ArrowSchema resultArrowSchema = ArrowSchema.wrap(resultPtrs[0]); + ArrowArray resultArray = ArrowArray.wrap(resultPtrs[1]); + + VectorSchemaRoot resultRoot = Data.importVectorSchemaRoot( + allocator, + resultArray, + resultArrowSchema, + new CDataDictionaryProvider() + ); + + // Clean up input resources after UDF call + inputArray.close(); + inputArrowSchema.close(); + inputRoot.close(); + + // Verify structure + assertEquals(rowCount, resultRoot.getRowCount(), + "Result row count should be " + rowCount + ", got: " + resultRoot.getRowCount()); + + BitVector resultVector = (BitVector) resultRoot.getVector(0); + 
assertEquals(rowCount, resultVector.getValueCount(), + "Result vector value count should be " + rowCount + ", got: " + resultVector.getValueCount()); + + // Count how many pass the predicate + int actualPassCount = 0; + for (int i = 0; i < rowCount; i++) { + if (resultVector.isSet(i) != 0) { + actualPassCount++; + } + } + + assertEquals(expectedPassCount, actualPassCount, + String.format("Expected %d rows to pass predicate (> 2500), but got %d", + expectedPassCount, actualPassCount)); + + // Verify specific values + assertFalse(resultVector.isSet(0) != 0, "Value 0 should not pass (0 <= 2500)"); + assertFalse(resultVector.isSet(2500) != 0, "Value 2500 should not pass (2500 <= 2500)"); + assertTrue(resultVector.isSet(2501) != 0, "Value 2501 should pass (2501 > 2500)"); + assertTrue(resultVector.isSet(9999) != 0, "Value 9999 should pass (9999 > 2500)"); + + // Cleanup + resultRoot.close(); + resultArrowSchema.close(); + resultArray.close(); + } + + @Test + public void testEvaluatePredicate_AllNull() { + int rowCount = 100; + + Field intField = new Field("colint", FieldType.nullable(new ArrowType.Int(32, true)), null); + Schema inputSchema = new Schema(Collections.singletonList(intField)); + + VectorSchemaRoot inputRoot = VectorSchemaRoot.create(inputSchema, allocator); + inputRoot.setRowCount(rowCount); + + IntVector intVector = (IntVector) inputRoot.getVector(0); + + // All null values + for (int i = 0; i < rowCount; i++) { + intVector.setNull(i); + } + intVector.setValueCount(rowCount); + + // Export and call UDF + ArrowArray inputArray = ArrowArray.allocateNew(allocator); + ArrowSchema inputArrowSchema = ArrowSchema.allocateNew(allocator); + + Data.exportVectorSchemaRoot( + allocator, + inputRoot, + new CDataDictionaryProvider(), + inputArray, + inputArrowSchema + ); + + long[] resultPtrs = Udf.evaluatePredicate(inputArrowSchema.memoryAddress(), inputArray.memoryAddress()); + + // Import result + ArrowSchema resultArrowSchema = ArrowSchema.wrap(resultPtrs[0]); + 
ArrowArray resultArray = ArrowArray.wrap(resultPtrs[1]); + + VectorSchemaRoot resultRoot = Data.importVectorSchemaRoot( + allocator, + resultArray, + resultArrowSchema, + new CDataDictionaryProvider() + ); + + // Clean up input resources after UDF call + inputArray.close(); + inputArrowSchema.close(); + inputRoot.close(); + + // All nulls should result in false (not passing predicate) + BitVector resultVector = (BitVector) resultRoot.getVector(0); + for (int i = 0; i < rowCount; i++) { + assertFalse(resultVector.isSet(i) != 0, + "Null values should not pass predicate"); + } + + // Cleanup + resultRoot.close(); + resultArrowSchema.close(); + resultArray.close(); + } +} diff --git a/datafusion/physical-plan/benches/serde.rs b/datafusion/physical-plan/benches/serde.rs new file mode 100644 index 000000000000..ed598d724a18 --- /dev/null +++ b/datafusion/physical-plan/benches/serde.rs @@ -0,0 +1,166 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmark for Arrow IPC serialization performance. +//! +//! ## Running the benchmark +//! +//! ```bash +//! # Run all configurations +//! cargo bench --bench serde -p datafusion-physical-plan +//! +//! # Run with fewer samples for quick testing +//! 
cargo bench --bench serde -p datafusion-physical-plan -- --sample-size 10 +//! +//! # Run only the sink benchmark +//! cargo bench --bench serde -p datafusion-physical-plan -- serialize_to_sink +//! +//! # Run only the memory benchmark +//! cargo bench --bench serde -p datafusion-physical-plan -- serialize_to_memory +//! +//! # Change measurement time (per benchmark, default is 5 seconds) +//! cargo bench --bench serde -p datafusion-physical-plan -- --measurement-time 10 +//! +//! # Run specific configuration +//! cargo bench --bench serde -p datafusion-physical-plan -- "1M_rows_binary_10B" +//! ``` +//! +//! ## Baseline Management +//! +//! ```bash +//! # Save current results as a named baseline +//! cargo bench --bench serde -p datafusion-physical-plan -- --save-baseline my-baseline +//! +//! # Compare against a specific baseline +//! cargo bench --bench serde -p datafusion-physical-plan -- --baseline my-baseline +//! +//! # Delete all benchmark history and start fresh +//! rm -rf target/criterion +//! ``` + +// Include shared benchmark utilities +#[path = "bench_utils.rs"] +mod bench_utils; + +use std::hint::black_box; +use std::sync::Arc; + +use criterion::{ + BenchmarkId, Criterion, SamplingMode, Throughput, criterion_group, criterion_main, +}; + +use bench_utils::{ + FunctionalBatchGenerator, create_schema, serialize_batches_to_sink +}; + +// ============================================================================ +// Benchmark Implementation +// ============================================================================ + +/// Benchmarks serialization to a sink that drops all data. +/// +/// This measures the pure CPU cost of Arrow IPC serialization without +/// including memory allocation or I/O overhead. Useful for understanding +/// the baseline serialization cost. 
+fn bench_serialize(c: &mut Criterion) { + let mut group = c.benchmark_group("serialize"); + + // Use flat sampling to collect exactly the requested samples without time constraints + group.sampling_mode(SamplingMode::Flat); + + // Configuration: 1M rows total (10K rows × 100 batches) + let rows_per_batch = 10_000; + let num_batches_vec = vec![1, 100]; + for num_batches in num_batches_vec { + let total_rows = rows_per_batch * num_batches; + + // Test different binary column sizes to understand serialization overhead + let binary_sizes = vec![10, 1024, 2048]; + + for binary_size in binary_sizes { + let label = format!("{num_batches}_batches/rows_binary_{binary_size}B"); + + // Generate test data + let schema = create_schema(); + let mut generator = FunctionalBatchGenerator::new( + Arc::clone(&schema), + rows_per_batch, + num_batches, + binary_size, + ); + let batches = generator.generate_batches(); + + // Calculate expected output size for throughput metric + let expected_size = estimate_serialized_size(&batches, binary_size, total_rows); + + // Set throughput metric for bytes/second calculations + group.throughput(Throughput::Bytes(expected_size as u64)); + + // Log configuration + println!( + "Config (sink): {} rows, binary_size={} bytes, estimated output={:.2} MB", + total_rows, + binary_size, + expected_size as f64 / (1024.0 * 1024.0) + ); + + group.bench_with_input( + BenchmarkId::from_parameter(&label), + &batches, + |b, batches| { + b.iter(|| { + let bytes_written = serialize_batches_to_sink(batches, &schema); + // black_box prevents compiler from optimizing away unused results + black_box(bytes_written) + }) + }, + ); + } + } + + group.finish(); +} + +/// Estimates the serialized size of batches for throughput calculations. +/// +/// This is an approximation based on the data types and sizes. For accurate +/// measurements, we could do one actual serialization, but this is good enough +/// for throughput reporting. 
+fn estimate_serialized_size(batches: &[arrow::array::RecordBatch], binary_size: usize, total_rows: usize) -> usize { + // Rough estimate of IPC overhead: + // - File header/footer: ~1KB + // - Per-batch metadata: ~200 bytes per batch + // - Per-column data: actual column size + alignment padding + + let num_batches = batches.len(); + let overhead = 1024 + (num_batches * 200); + + // Data size estimation per row: + // - Int32: 4 bytes + // - Int64: 8 bytes + // - Float32: 4 bytes + // - Float64: 8 bytes + // - StringView: ~16 bytes (view) + actual string data (~6 bytes for "str_XX") + // - BinaryView: ~16 bytes (view) + binary_size bytes + let per_row_size = 4 + 8 + 4 + 8 + 16 + 6 + 16 + binary_size; + + overhead + (total_rows * per_row_size) +} + +criterion_group!(benches, bench_serialize); +criterion_main!(benches); + diff --git a/datafusion/physical-plan/benches/sort_bench.rs b/datafusion/physical-plan/benches/sort_bench.rs new file mode 100644 index 000000000000..27bddd59a07f --- /dev/null +++ b/datafusion/physical-plan/benches/sort_bench.rs @@ -0,0 +1,291 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmark for DataFusion SortExec with Arrow IPC serialization. +//! +//! 
This benchmark measures the end-to-end latency of: +//! 1. Deserializing Arrow IPC data into RecordBatches +//! 2. Executing a SortExec operator (ORDER BY colInt ASC) +//! 3. Serializing the output back to Arrow IPC format +//! +//! Two sort variants are tested: +//! - **Full sort**: Sort all 1M rows (no limit) +//! - **TopK sort**: Sort with LIMIT 10,000 (uses heap-based TopK algorithm) +//! +//! The benchmark helps understand sort performance characteristics, +//! how the TopK optimization affects latency, and the overhead of +//! serializing sorted results. +//! +//! ## Running the benchmark +//! +//! ```bash +//! # Run all configurations +//! cargo bench --bench sort_bench -p datafusion-physical-plan +//! +//! # Run with fewer samples for quick testing +//! cargo bench --bench sort_bench -p datafusion-physical-plan -- --sample-size 10 +//! +//! # Run specific configuration +//! cargo bench --bench sort_bench -p datafusion-physical-plan -- "sort_no_limit" +//! ``` + +// Include shared benchmark utilities +#[path = "bench_utils.rs"] +mod bench_utils; + +use std::hint::black_box; +use std::sync::Arc; +use arrow::buffer::Buffer; +use arrow::compute::SortOptions; +use arrow::datatypes::SchemaRef; +use criterion::{ + BatchSize, BenchmarkId, Criterion, SamplingMode, Throughput, criterion_group, criterion_main, +}; +use datafusion_execution::TaskContext; +use datafusion_physical_expr::expressions::Column; +use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; +use datafusion_physical_plan::sorts::sort::SortExec; +use datafusion_physical_plan::{ExecutionPlan, collect}; + +use bench_utils::{ + BatchSourceExec, FunctionalBatchGenerator, create_schema, deserialize_zero_copy, + serialize_batches_to_sink, serialize_to_ipc, +}; + +// ============================================================================ +// Sort Plan Creation +// ============================================================================ + +/// Creates a SortExec that sorts 
by `colInt ASC`.
+///
+/// With the data generation pattern `colInt = i % 5000`, the sort will
+/// group all rows with the same colInt value together, with values
+/// ranging from 0 to 4999.
+///
+/// # Arguments
+/// * `input` - The input execution plan (typically BatchSourceExec)
+/// * `schema` - Schema of the input data
+/// * `fetch` - Optional limit for TopK optimization:
+///   - `None`: Full sort of all rows
+///   - `Some(n)`: TopK sort returning only top n rows
+///
+/// # Returns
+/// A SortExec wrapped in Arc
+fn create_sort_plan(
+    input: Arc<dyn ExecutionPlan>,
+    schema: &SchemaRef,
+    fetch: Option<usize>,
+) -> Arc<dyn ExecutionPlan> {
+    // Build sort expression: ORDER BY colInt ASC
+    let col_int = Arc::new(Column::new_with_schema("colInt", schema).unwrap());
+    let sort_expr = PhysicalSortExpr::new(col_int, SortOptions::default());
+    let sort_exprs = LexOrdering::new(vec![sort_expr]).unwrap();
+
+    // Create SortExec with optional fetch limit
+    let sort = SortExec::new(sort_exprs, input);
+    let sort = if let Some(limit) = fetch {
+        // TopK optimization: uses a heap to track only the top `limit` rows
+        sort.with_fetch(Some(limit))
+    } else {
+        sort
+    };
+
+    Arc::new(sort)
+}
+
+// ============================================================================
+// Benchmark Implementation
+// ============================================================================
+
+/// Main benchmark function for sort execution.
+///
+/// This benchmark measures six scenarios for each binary column size:
+///
+/// 1. **deser_only**: Just IPC deserialization, no execution
+///    - Establishes baseline deserialization cost
+///
+/// 2. **ser_only**: Just IPC serialization, no execution
+///    - Establishes baseline serialization cost
+///    - Uses pre-generated batches directly
+///
+/// 3. **sort_no_limit**: Full sort execution only
+///    - Sorts all 1M rows by colInt
+///    - Uses pre-generated batches directly (no deserialization)
+///    - Isolates SortExec performance
+///
+/// 4.
**sort_limit_10k**: TopK sort execution only +/// - Uses TopK algorithm to find top 10,000 rows +/// - Uses pre-generated batches directly (no deserialization) +/// - Should be faster than full sort for large datasets +/// +/// 5. **full_pipeline_no_limit**: Complete deser + full sort + output serialization +/// - Real-world latency for ORDER BY queries including result serialization +/// +/// 6. **full_pipeline_limit_10k**: Complete deser + TopK sort + output serialization +/// - Real-world latency for LIMIT queries including result serialization +fn bench_sort(c: &mut Criterion) { + // Create a single-threaded Tokio runtime for async execution. + // We use current_thread to ensure all async work runs on the benchmark thread, + // making results comparable to single-threaded Java benchmarks. + let rt = tokio::runtime::Builder::new_current_thread() + .build() + .unwrap(); + let mut group = c.benchmark_group("sort_bench"); + + // Use flat sampling to collect exactly the requested samples without time constraints + group.sampling_mode(SamplingMode::Flat); + + // Configuration: 1M rows total (10K rows × 100 batches) + let rows_per_batch = 10_000; + let num_batches = 100; + let total_rows = rows_per_batch * num_batches; + + // Test different binary column sizes to understand serialization overhead + let binary_sizes = vec![10, 1024, 2048]; + + for binary_size in binary_sizes { + let label = format!("1M_rows_binary_{binary_size}B"); + + // Generate test data and serialize to IPC format + let schema = create_schema(); + let mut generator = FunctionalBatchGenerator::new( + Arc::clone(&schema), + rows_per_batch, + num_batches, + binary_size, + ); + let batches = generator.generate_batches(); + let ipc_data = serialize_to_ipc(&batches, &schema); + let ipc_size = ipc_data.len(); + let ipc_buffer = Buffer::from_vec(ipc_data); + + // Log configuration for visibility in benchmark output + println!( + "Config: {} rows, binary_size={} bytes, IPC size={:.2} MB", + total_rows, + 
binary_size,
+            ipc_size as f64 / (1024.0 * 1024.0)
+        );
+
+        // Set throughput metric for bytes/second calculations
+        group.throughput(Throughput::Bytes(ipc_size as u64));
+
+        // Benchmark 3: Full sort (no limit) - execution only
+        // Uses pre-generated batches directly, isolating SortExec performance
+        group.bench_with_input(
+            BenchmarkId::new("sort_no_limit", &label),
+            &batches,
+            |b, batches| {
+                b.iter_batched(
+                    // Setup: clone batches (NOT timed) - needed because execution consumes them
+                    || batches.clone(),
+                    // Benchmark: execute sort (TIMED)
+                    |batches| {
+                        rt.block_on(async {
+                            let source = Arc::new(BatchSourceExec::new(
+                                Arc::clone(&schema),
+                                batches,
+                            )) as Arc<dyn ExecutionPlan>;
+                            let plan = create_sort_plan(source, &schema, None);
+                            let task_ctx = Arc::new(TaskContext::default());
+                            let results = collect(plan, task_ctx).await.unwrap();
+                            black_box(results)
+                        })
+                    },
+                    BatchSize::SmallInput,
+                )
+            },
+        );
+
+        // Benchmark 4: TopK sort (LIMIT 10,000) - execution only
+        // Uses pre-generated batches directly; should be faster than full sort
+        group.bench_with_input(
+            BenchmarkId::new("sort_limit_10k", &label),
+            &batches,
+            |b, batches| {
+                b.iter_batched(
+                    // Setup: clone batches (NOT timed) - needed because execution consumes them
+                    || batches.clone(),
+                    // Benchmark: execute TopK sort (TIMED)
+                    |batches| {
+                        rt.block_on(async {
+                            let source = Arc::new(BatchSourceExec::new(
+                                Arc::clone(&schema),
+                                batches,
+                            )) as Arc<dyn ExecutionPlan>;
+                            let plan = create_sort_plan(source, &schema, Some(10_000));
+                            let task_ctx = Arc::new(TaskContext::default());
+                            let results = collect(plan, task_ctx).await.unwrap();
+                            black_box(results)
+                        })
+                    },
+                    BatchSize::SmallInput,
+                )
+            },
+        );
+
+        // Benchmark 5: Full pipeline with full sort + output serialization
+        // Measures complete round-trip: IPC in -> sort all rows -> IPC out
+        group.bench_with_input(
+            BenchmarkId::new("full_pipeline_no_limit", &label),
+            &ipc_buffer,
+            |b, ipc_buffer| {
+                b.iter(|| {
+                    rt.block_on(async {
+                        let (schema,
batches) = deserialize_zero_copy(ipc_buffer);
+                        let source = Arc::new(BatchSourceExec::new(
+                            Arc::clone(&schema),
+                            batches,
+                        )) as Arc<dyn ExecutionPlan>;
+                        let plan = create_sort_plan(source, &schema, None);
+                        let task_ctx = Arc::new(TaskContext::default());
+                        let results = collect(plan, task_ctx).await.unwrap();
+                        black_box(serialize_batches_to_sink(&results, &schema))
+                    })
+                })
+            },
+        );
+
+        // Benchmark 6: Full pipeline with TopK sort + output serialization
+        // Measures complete round-trip: IPC in -> TopK sort -> IPC out
+        // Output size is limited to 10K rows, so serialization should be faster
+        group.bench_with_input(
+            BenchmarkId::new("full_pipeline_limit_10k", &label),
+            &ipc_buffer,
+            |b, ipc_buffer| {
+                b.iter(|| {
+                    rt.block_on(async {
+                        let (schema, batches) = deserialize_zero_copy(ipc_buffer);
+                        let source = Arc::new(BatchSourceExec::new(
+                            Arc::clone(&schema),
+                            batches,
+                        )) as Arc<dyn ExecutionPlan>;
+                        let plan = create_sort_plan(source, &schema, Some(10_000));
+                        let task_ctx = Arc::new(TaskContext::default());
+                        let results = collect(plan, task_ctx).await.unwrap();
+                        black_box(serialize_batches_to_sink(&results, &schema))
+                    })
+                })
+            },
+        );
+    }
+
+    group.finish();
+}
+
+criterion_group!(benches, bench_sort);
+criterion_main!(benches);