diff --git a/Cargo.lock b/Cargo.lock index 3dc276d7c231..918f026c0be2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -204,7 +204,7 @@ dependencies = [ "snap", "strum", "strum_macros", - "thiserror", + "thiserror 2.0.18", "uuid", "zstd", ] @@ -1101,7 +1101,7 @@ dependencies = [ "serde_json", "serde_repr", "serde_urlencoded", - "thiserror", + "thiserror 2.0.18", "tokio", "tokio-stream", "tokio-util", @@ -1252,6 +1252,12 @@ dependencies = [ "shlex", ] +[[package]] +name = "cesu8" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" + [[package]] name = "cfg-if" version = "1.0.4" @@ -1379,6 +1385,16 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +[[package]] +name = "combine" +version = "4.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba5a308b75df32fe02788e748662718f03fde005016435c444eea572398219fd" +dependencies = [ + "bytes", + "memchr", +] + [[package]] name = "comfy-table" version = "7.2.1" @@ -2469,6 +2485,7 @@ dependencies = [ "indexmap 2.13.0", "insta", "itertools 0.14.0", + "jni", "log", "num-traits", "parking_lot", @@ -2633,7 +2650,7 @@ dependencies = [ "sqlparser", "tempfile", "testcontainers-modules", - "thiserror", + "thiserror 2.0.18", "tokio", "tokio-postgres", ] @@ -3744,6 +3761,15 @@ version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" +[[package]] +name = "java-locator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09c46c1fe465c59b1474e665e85e1256c3893dd00927b8d55f63b09044c1e64f" +dependencies = [ + "glob", +] + [[package]] name = "jiff" version = "0.2.18" @@ -3768,6 +3794,30 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = 
"jni" +version = "0.21.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a87aa2bb7d2af34197c04845522473242e1aa17c12f4935d5856491a7fb8c97" +dependencies = [ + "cesu8", + "cfg-if", + "combine", + "java-locator", + "jni-sys", + "libloading", + "log", + "thiserror 1.0.69", + "walkdir", + "windows-sys 0.45.0", +] + +[[package]] +name = "jni-sys" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130" + [[package]] name = "jobserver" version = "0.1.34" @@ -4225,7 +4275,7 @@ dependencies = [ "serde", "serde_json", "serde_urlencoded", - "thiserror", + "thiserror 2.0.18", "tokio", "tracing", "url", @@ -4787,7 +4837,7 @@ dependencies = [ "rustc-hash", "rustls", "socket2", - "thiserror", + "thiserror 2.0.18", "tokio", "tracing", "web-time", @@ -4808,7 +4858,7 @@ dependencies = [ "rustls", "rustls-pki-types", "slab", - "thiserror", + "thiserror 2.0.18", "tinyvec", "tracing", "web-time", @@ -4988,7 +5038,7 @@ checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac" dependencies = [ "getrandom 0.2.16", "libredox", - "thiserror", + "thiserror 2.0.18", ] [[package]] @@ -5680,7 +5730,7 @@ dependencies = [ "similar", "subst", "tempfile", - "thiserror", + "thiserror 2.0.18", "tracing", ] @@ -5928,7 +5978,7 @@ dependencies = [ "serde", "serde_json", "serde_with", - "thiserror", + "thiserror 2.0.18", "tokio", "tokio-stream", "tokio-util", @@ -5944,13 +5994,33 @@ dependencies = [ "testcontainers", ] +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + [[package]] name = "thiserror" version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" dependencies = [ - "thiserror-impl", + "thiserror-impl 2.0.18", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", ] [[package]] @@ -6394,7 +6464,7 @@ dependencies = [ "serde", "serde_json", "syn 2.0.114", - "thiserror", + "thiserror 2.0.18", "unicode-ident", ] @@ -6915,6 +6985,15 @@ dependencies = [ "windows-link 0.1.3", ] +[[package]] +name = "windows-sys" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" +dependencies = [ + "windows-targets 0.42.2", +] + [[package]] name = "windows-sys" version = "0.52.0" @@ -6951,6 +7030,21 @@ dependencies = [ "windows-link 0.2.1", ] +[[package]] +name = "windows-targets" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" +dependencies = [ + "windows_aarch64_gnullvm 0.42.2", + "windows_aarch64_msvc 0.42.2", + "windows_i686_gnu 0.42.2", + "windows_i686_msvc 0.42.2", + "windows_x86_64_gnu 0.42.2", + "windows_x86_64_gnullvm 0.42.2", + "windows_x86_64_msvc 0.42.2", +] + [[package]] name = "windows-targets" version = "0.52.6" @@ -6993,6 +7087,12 @@ dependencies = [ "windows-link 0.1.3", ] +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" + [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" @@ -7005,6 +7105,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" +[[package]] +name = "windows_aarch64_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" @@ -7017,6 +7123,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" +[[package]] +name = "windows_i686_gnu" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" + [[package]] name = "windows_i686_gnu" version = "0.52.6" @@ -7041,6 +7153,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" +[[package]] +name = "windows_i686_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" + [[package]] name = "windows_i686_msvc" version = "0.52.6" @@ -7053,6 +7171,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" +[[package]] +name = "windows_x86_64_gnu" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" @@ -7065,6 +7189,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.42.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" @@ -7077,6 +7207,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" +[[package]] +name = "windows_x86_64_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml index 13f91fd7d4ea..d4f0b15cfa82 100644 --- a/datafusion/physical-plan/Cargo.toml +++ b/datafusion/physical-plan/Cargo.toml @@ -73,10 +73,12 @@ pin-project-lite = "^0.2.7" tokio = { workspace = true } [dev-dependencies] +arrow = { workspace = true, features = ["ffi"] } criterion = { workspace = true, features = ["async_futures"] } datafusion-functions-aggregate = { workspace = true } datafusion-functions-window = { workspace = true } insta = { workspace = true } +jni = { version = "0.21.1", features = ["invocation"] } rand = { workspace = true } rstest = { workspace = true } rstest_reuse = "0.7.0" @@ -102,3 +104,39 @@ name = "sort_preserving_merge" harness = false name = "aggregate_vectorized" required-features = ["test_utils"] + +[[bench]] +harness = false +name = "filter_bench" + +[[bench]] +harness = false +name = "sort_bench" + +[[bench]] +harness = false +name = "deser" + +[[bench]] +harness = false +name = "serde" + +[[bench]] +harness = false +name = "count_group_by_bench" + +[[bench]] +harness = false +name = "hash_join_bench" + +[[bench]] +harness = false +name = "hash_join_by_type" + +[[bench]] +harness = false +name = "distinct_group_by_bench" + +[[bench]] +harness = false +name = "filter_jni_benchmark" diff --git 
a/datafusion/physical-plan/benches/Untitled b/datafusion/physical-plan/benches/Untitled new file mode 100644 index 000000000000..36ea8ed8754a --- /dev/null +++ b/datafusion/physical-plan/benches/Untitled @@ -0,0 +1 @@ +create_schema \ No newline at end of file diff --git a/datafusion/physical-plan/benches/bench_utils.rs b/datafusion/physical-plan/benches/bench_utils.rs new file mode 100644 index 000000000000..2e9feff889ac --- /dev/null +++ b/datafusion/physical-plan/benches/bench_utils.rs @@ -0,0 +1,957 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Shared utilities for deserialization + execution benchmarks. +//! +//! This module provides common functionality for benchmarks that measure +//! Arrow IPC deserialization combined with DataFusion execution plans. 
+ +use std::io::Write; +use std::sync::Arc; + +use arrow::array::{ + ArrayRef, BinaryArray, DictionaryArray, Float32Array, Float64Array, Int16Array, + Int32Array, Int64Array, RecordBatch, StringArray, StringViewArray, +}; +use arrow::buffer::Buffer; +use arrow::datatypes::{DataType, Field, Int16Type, Schema, SchemaRef}; +use arrow::ipc::convert::fb_to_schema; +use arrow::ipc::reader::{FileDecoder, read_footer_length}; +use arrow::ipc::writer::FileWriter; +use arrow::ipc::root_as_footer; +use datafusion_execution::TaskContext; +use datafusion_physical_expr::EquivalenceProperties; +use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType}; +use datafusion_physical_plan::memory::MemoryStream; +use datafusion_physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, + SendableRecordBatchStream, +}; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; + +// ============================================================================ +// Schema Definition +// ============================================================================ + +/// String column type for benchmarks. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum StringColumnType { + /// Regular Utf8 strings + Utf8, + /// Utf8View strings (optimized for strings ≤12 bytes) + Utf8View, + /// Dictionary-encoded Utf8 strings with Int16 keys + DictionaryUtf8, + /// Dictionary-encoded Utf8View strings with Int16 keys + DictionaryUtf8View, +} + +/// Creates the benchmark schema with the following columns: +/// - colint: Int32 - integer values with modulo pattern +/// - collong: Int64 - long values with modulo pattern +/// - colfloat: Float32 - floating point values +/// - coldouble: Float64 - double precision values +/// - colstring: Utf8 - string values with limited cardinality +/// - colbinary: Binary - random binary data of configurable size +pub fn create_schema() -> SchemaRef { + create_schema_with_string_type(StringColumnType::Utf8) +} + +/// Creates the benchmark schema with a specific string column type. +pub fn create_schema_with_string_type(string_type: StringColumnType) -> SchemaRef { + let string_data_type = match string_type { + StringColumnType::Utf8 => DataType::Utf8, + StringColumnType::Utf8View => DataType::Utf8View, + StringColumnType::DictionaryUtf8 => { + DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)) + } + StringColumnType::DictionaryUtf8View => { + DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8View)) + } + }; + + Arc::new(Schema::new(vec![ + Field::new("colint", DataType::Int32, false), + Field::new("collong", DataType::Int64, false), + Field::new("colfloat", DataType::Float32, false), + Field::new("coldouble", DataType::Float64, false), + Field::new("colstring", string_data_type, false), + Field::new("colbinary", DataType::Binary, false), + ])) +} + +/// Creates a single-column schema with an Int32 column. +/// +/// Used for aggregation benchmarks where we want to isolate +/// the grouping column performance without other columns. 
+pub fn create_int_column_schema() -> SchemaRef { + Arc::new(Schema::new(vec![Field::new( + "groupCol", + DataType::Int32, + false, + )])) +} + +/// Creates a single-column schema with a Binary column. +/// +/// Used for aggregation benchmarks to test grouping performance +/// with variable-length binary keys. +pub fn create_binary_column_schema() -> SchemaRef { + Arc::new(Schema::new(vec![Field::new( + "groupCol", + DataType::Binary, + false, + )])) +} + +/// Creates a single-column schema with an Int32 column named "colInt". +/// +/// Used for join benchmarks where the build side has only the join key column. +pub fn create_join_build_schema() -> SchemaRef { + Arc::new(Schema::new(vec![Field::new( + "colint", + DataType::Int32, + false, + )])) +} + +/// Creates a single-column schema with a Utf8 column named "colString". +pub fn create_join_build_schema_string() -> SchemaRef { + Arc::new(Schema::new(vec![Field::new( + "colstring", + DataType::Utf8, + false, + )])) +} + +/// Creates a single-column schema with a Utf8View column named "colString". +pub fn create_join_build_schema_string_view() -> SchemaRef { + Arc::new(Schema::new(vec![Field::new( + "colstring", + DataType::Utf8View, + false, + )])) +} + +/// Creates a single-column schema with a Dictionary(Int16, Utf8) column named "colString". +pub fn create_join_build_schema_dictionary_string() -> SchemaRef { + Arc::new(Schema::new(vec![Field::new( + "colstring", + DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)), + false, + )])) +} + +/// Creates a single-column schema with a Dictionary(Int16, Utf8View) column named "colString". 
+pub fn create_join_build_schema_dictionary_string_view() -> SchemaRef { + Arc::new(Schema::new(vec![Field::new( + "colstring", + DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8View)), + false, + )])) +} + +// ============================================================================ +// Data Generation +// ============================================================================ + +/// Generates record batches with deterministic data for benchmarking. +/// +/// Each cell value is computed as a function of the row index, ensuring +/// reproducible benchmark results. The data patterns are designed to be +/// realistic for query processing benchmarks: +/// +/// - `colInt`: `i % 5000` - creates 5000 distinct values +/// - `colLong`: `i % 5000` - same pattern as colInt +/// - `colFloat`: `i / 2.0` - monotonically increasing +/// - `colDouble`: `i / 3.0` - monotonically increasing +/// - `colString`: `format!("str_{:04}", (start_row + i) % 5000)` - 100 distinct string values +/// - `colBinary`: random bytes of configurable size (seeded for reproducibility) +pub struct FunctionalBatchGenerator { + /// Schema for generated batches + schema: SchemaRef, + /// Number of rows in each batch + rows_per_batch: usize, + /// Total number of batches to generate + num_batches: usize, + /// Size in bytes for the binary column + binary_size: usize, + /// Type of string column to generate + string_column_type: StringColumnType, + /// Random number generator for binary data (seeded for reproducibility) + rng: StdRng, +} + +impl FunctionalBatchGenerator { + /// Creates a new batch generator. 
+ /// + /// # Arguments + /// * `schema` - Arrow schema for the generated batches + /// * `rows_per_batch` - Number of rows per batch + /// * `num_batches` - Total number of batches to generate + /// * `binary_size` - Size in bytes for the binary column values + pub fn new( + schema: SchemaRef, + rows_per_batch: usize, + num_batches: usize, + binary_size: usize, + ) -> Self { + Self::new_with_string_type(schema, rows_per_batch, num_batches, binary_size, StringColumnType::Utf8) + } + + /// Creates a new batch generator with a specific string column type. + /// + /// # Arguments + /// * `schema` - Arrow schema for the generated batches + /// * `rows_per_batch` - Number of rows per batch + /// * `num_batches` - Total number of batches to generate + /// * `binary_size` - Size in bytes for the binary column values + /// * `string_column_type` - Type of string column to generate + pub fn new_with_string_type( + schema: SchemaRef, + rows_per_batch: usize, + num_batches: usize, + binary_size: usize, + string_column_type: StringColumnType, + ) -> Self { + // Use a fixed seed for reproducible benchmarks + let rng = StdRng::seed_from_u64(42); + Self { + schema, + rows_per_batch, + num_batches, + binary_size, + string_column_type, + rng, + } + } + + /// Generates a single record batch for the given batch index. 
+ /// + /// Row indices are calculated as: `batch_index * rows_per_batch + local_row_index` + fn generate_batch(&mut self, batch_index: usize) -> RecordBatch { + let start_row = batch_index * self.rows_per_batch; + let num_rows = self.rows_per_batch; + + // Clone field names to avoid borrowing self while calling generate_column + let field_names: Vec = self + .schema + .fields() + .iter() + .map(|f| f.name().clone()) + .collect(); + + let columns: Vec = field_names + .iter() + .map(|name| self.generate_column(name, start_row, num_rows)) + .collect(); + + RecordBatch::try_new(Arc::clone(&self.schema), columns) + .expect("Failed to create record batch") + } + + /// Generates a single column array based on field name. + /// + /// Values are deterministic functions of the global row index `i`: + /// - colint: `i % 5000` (5000 distinct values) + /// - collong: `i % 5000` (5000 distinct values) + /// - colfloat: `i / 2.0` (monotonically increasing) + /// - coldouble: `i / 3.0` (monotonically increasing) + /// - colstring: format!("str_{:04}", (start_row + i) % 5000) (5000 distinct strings) + /// - colbinary: random bytes of `binary_size` length + fn generate_column(&mut self, field_name: &str, start_row: usize, num_rows: usize) -> ArrayRef { + match field_name { + "colint" => { + // Integer values with modulo 5000 pattern for reasonable cardinality + let values: Vec = (0..num_rows) + .map(|i| ((start_row + i) % 5000) as i32) + .collect(); + Arc::new(Int32Array::from(values)) + } + "collong" => { + // Long values with same modulo pattern as colint + let values: Vec = (0..num_rows) + .map(|i| ((start_row + i) % 5000) as i64) + .collect(); + Arc::new(Int64Array::from(values)) + } + "colfloat" => { + // Monotonically increasing float values + let values: Vec = (0..num_rows) + .map(|i| ((start_row + i) as f32) / 2.0) + .collect(); + Arc::new(Float32Array::from(values)) + } + "coldouble" => { + // Monotonically increasing double values + let values: Vec = (0..num_rows) + 
.map(|i| ((start_row + i) as f64) / 3.0) + .collect(); + Arc::new(Float64Array::from(values)) + } + "colstring" => { + // String values with 5000 distinct values (9 bytes each: "str_0000" to "str_4999") + let string_values: Vec = (0..num_rows) + .map(|i| format!("str_{:04}", (start_row + i) % 5000)) + .collect(); + + match self.string_column_type { + StringColumnType::Utf8 => { + Arc::new(StringArray::from(string_values)) + } + StringColumnType::Utf8View => { + Arc::new(StringViewArray::from(string_values)) + } + StringColumnType::DictionaryUtf8 => { + // Create dictionary from unique values, then create keys array + let keys: Int16Array = (0..num_rows) + .map(|i| ((start_row + i) % 5000) as i16) + .collect(); + + // Build dictionary values (unique strings) + let dict_values_vec: Vec = (0..5000) + .map(|i| format!("str_{:04}", i)) + .collect(); + let dict_values = StringArray::from(dict_values_vec); + + Arc::new(DictionaryArray::::try_new(keys, Arc::new(dict_values)).unwrap()) + } + StringColumnType::DictionaryUtf8View => { + let keys: Int16Array = (0..num_rows) + .map(|i| ((start_row + i) % 5000) as i16) + .collect(); + + // Build dictionary values (unique strings as StringView) + let dict_values_vec: Vec = (0..5000) + .map(|i| format!("str_{:04}", i)) + .collect(); + let dict_values = StringViewArray::from(dict_values_vec); + + Arc::new(DictionaryArray::::try_new(keys, Arc::new(dict_values)).unwrap()) + } + } + } + "colbinary" => { + // Random binary data of configurable size + let values: Vec> = (0..num_rows) + .map(|_| { + let mut buf = vec![0u8; self.binary_size]; + self.rng.fill(&mut buf[..]); + buf + }) + .collect(); + let values: Vec<&[u8]> = values.iter().map(|v| v.as_slice()).collect(); + Arc::new(BinaryArray::from(values)) + } + _ => panic!("Unknown column: {}", field_name), + } + } + + /// Generates all batches. + /// + /// Returns a vector of `num_batches` record batches, each containing + /// `rows_per_batch` rows. 
+ pub fn generate_batches(&mut self) -> Vec { + (0..self.num_batches) + .map(|i| self.generate_batch(i)) + .collect() + } +} + +// ============================================================================ +// Single Column Data Generation (for aggregation benchmarks) +// ============================================================================ + +/// Type of single-column data to generate. +#[derive(Debug, Clone, Copy)] +pub enum SingleColumnType { + /// Int32 column with uniform distribution over [0, distinct_count) + Int, + /// Binary column with configurable size and distinct count + Binary { + /// Size of each binary value in bytes + binary_size: usize, + }, +} + +/// Generates single-column record batches for aggregation benchmarks. +/// +/// This generator creates data with a configurable number of distinct values, +/// useful for testing GROUP BY or JOIN performance at different cardinalities. +/// +/// Data patterns: +/// - **Int column**: Random values with uniform distribution over [0, distinct_count). +/// Uses a seeded RNG for reproducibility. +/// - **Binary column**: Pre-generates `distinct_count` distinct random byte +/// arrays of `binary_size` each, then selects randomly from them. +/// This ensures exactly `distinct_count` unique binary values with uniform distribution. +pub struct SingleColumnBatchGenerator { + /// Schema for generated batches (single column) + schema: SchemaRef, + /// Number of rows in each batch + rows_per_batch: usize, + /// Total number of batches to generate + num_batches: usize, + /// Number of distinct values to generate + distinct_count: usize, + /// Type of column to generate + column_type: SingleColumnType, + /// Pre-generated distinct binary values (only used for Binary column type) + distinct_binary_values: Vec>, + /// Random number generator for value selection (seeded for reproducibility) + rng: StdRng, +} + +impl SingleColumnBatchGenerator { + /// Creates a new single-column batch generator. 
+ /// + /// # Arguments + /// * `column_type` - Type of column to generate (Int or Binary) + /// * `rows_per_batch` - Number of rows per batch + /// * `num_batches` - Total number of batches to generate + /// * `distinct_count` - Number of distinct values in the column + /// + /// # Returns + /// A new generator configured for the specified column type and cardinality. + pub fn new( + column_type: SingleColumnType, + rows_per_batch: usize, + num_batches: usize, + distinct_count: usize, + ) -> Self { + // Create appropriate schema based on column type + let schema = match column_type { + SingleColumnType::Int => create_int_column_schema(), + SingleColumnType::Binary { .. } => create_binary_column_schema(), + }; + + // Use a fixed seed for reproducible benchmarks + let mut rng = StdRng::seed_from_u64(42); + + // Pre-generate distinct binary values if needed + let distinct_binary_values = match column_type { + SingleColumnType::Binary { binary_size } => { + (0..distinct_count) + .map(|_| { + let mut buf = vec![0u8; binary_size]; + rng.fill(&mut buf[..]); + buf + }) + .collect() + } + SingleColumnType::Int => Vec::new(), + }; + + Self { + schema, + rows_per_batch, + num_batches, + distinct_count, + column_type, + distinct_binary_values, + rng, + } + } + + /// Returns the schema of the generated batches. + pub fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + /// Generates a single record batch. + /// + /// Values are randomly selected from [0, distinct_count) with uniform distribution. + fn generate_batch(&mut self) -> RecordBatch { + let num_rows = self.rows_per_batch; + + let column: ArrayRef = match self.column_type { + SingleColumnType::Int => { + // Generate random int values with uniform distribution over [0, distinct_count) + let values: Vec = (0..num_rows) + .map(|_| self.rng.random_range(0..self.distinct_count) as i32) + .collect(); + Arc::new(Int32Array::from(values)) + } + SingleColumnType::Binary { .. 
} => { + // Randomly select from pre-generated distinct binary values + let values: Vec<&[u8]> = (0..num_rows) + .map(|_| { + let idx = self.rng.random_range(0..self.distinct_count); + self.distinct_binary_values[idx].as_slice() + }) + .collect(); + Arc::new(BinaryArray::from(values)) + } + }; + + RecordBatch::try_new(Arc::clone(&self.schema), vec![column]) + .expect("Failed to create record batch") + } + + /// Generates all batches. + /// + /// Returns a vector of `num_batches` record batches, each containing + /// `rows_per_batch` rows with values randomly selected from `distinct_count` unique values. + pub fn generate_batches(&mut self) -> Vec { + (0..self.num_batches).map(|_| self.generate_batch()).collect() + } +} + +// ============================================================================ +// Join Build Side Data Generation +// ============================================================================ + +/// Type of join column to generate. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum JoinColumnType { + /// Int32 column + Int, + /// Utf8 string column + String, + /// Utf8View string column + StringView, + /// Dictionary(Int16, Utf8) column + DictionaryString, + /// Dictionary(Int16, Utf8View) column + DictionaryStringView, +} + +/// Generates single-column record batches for the build side of join benchmarks. +/// +/// This generator creates data with controlled match rates and key repetition patterns, +/// useful for testing JOIN performance at different selectivities. The generated data +/// is typically used as the build side (hash table) in hash joins. +/// +/// Data patterns: +/// - Generates integers from 0 to `(match_rate * 5000) - 1` +/// - Each distinct key is repeated `repeated_keys` times +/// - Total rows = `(match_rate * 5000) * repeated_keys` +/// - Sequential/deterministic: 0, 0, ..., 0, 1, 1, ..., 1, etc. 
+pub struct JoinBuildSideGenerator { + /// Schema for generated batches + schema: SchemaRef, + /// Match rate (0.5 or 1.0) - determines the range of keys + match_rate: f64, + /// Number of times each key is repeated + repeated_keys: usize, + /// Type of column to generate + column_type: JoinColumnType, +} + +impl JoinBuildSideGenerator { + /// Creates a new join build side generator. + /// + /// # Arguments + /// * `match_rate` - Fraction of probe-side keys that will match (0.5 or 1.0) + /// * `repeated_keys` - Number of times each distinct key appears + /// + /// # Returns + /// A new generator configured for the specified match rate and repetition. + pub fn new(match_rate: f64, repeated_keys: usize) -> Self { + Self::new_with_column_type(match_rate, repeated_keys, JoinColumnType::Int) + } + + /// Creates a new join build side generator with a specific column type. + /// + /// # Arguments + /// * `match_rate` - Fraction of probe-side keys that will match (0.5 or 1.0) + /// * `repeated_keys` - Number of times each distinct key appears + /// * `column_type` - Type of column to generate + /// + /// # Returns + /// A new generator configured for the specified match rate, repetition, and column type. + pub fn new_with_column_type( + match_rate: f64, + repeated_keys: usize, + column_type: JoinColumnType, + ) -> Self { + let schema = match column_type { + JoinColumnType::Int => create_join_build_schema(), + JoinColumnType::String => create_join_build_schema_string(), + JoinColumnType::StringView => create_join_build_schema_string_view(), + JoinColumnType::DictionaryString => create_join_build_schema_dictionary_string(), + JoinColumnType::DictionaryStringView => create_join_build_schema_dictionary_string_view(), + }; + Self { + schema, + match_rate, + repeated_keys, + column_type, + } + } + + /// Returns the schema of the generated batches. 
+ pub fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + /// Returns the number of distinct keys that will be generated. + pub fn distinct_keys(&self) -> usize { + (self.match_rate * 5000.0) as usize + } + + /// Returns the total number of rows that will be generated. + pub fn total_rows(&self) -> usize { + self.distinct_keys() * self.repeated_keys + } + + /// Generates all batches for the build side of the join. + /// + /// Creates a single batch containing all rows. Each key from 0 to + /// `(match_rate * 5000) - 1` appears `repeated_keys` times consecutively. + /// + /// Example with match_rate=0.5 (2500 keys) and repeated_keys=2: + /// - Int: `[0, 0, 1, 1, 2, 2, ..., 2499, 2499]` + /// - String: `["str_0000", "str_0000", "str_0001", "str_0001", ..., "str_2499", "str_2499"]` + pub fn generate_batches(&self) -> Vec { + let distinct_keys = self.distinct_keys(); + let total_rows = self.total_rows(); + + let column: ArrayRef = match self.column_type { + JoinColumnType::Int => { + // Generate integer values: each key from 0 to distinct_keys-1 repeated repeated_keys times + let values: Vec = (0..distinct_keys) + .flat_map(|key| std::iter::repeat(key as i32).take(self.repeated_keys)) + .collect(); + assert_eq!(values.len(), total_rows); + Arc::new(Int32Array::from(values)) + } + JoinColumnType::String => { + // Generate string values: each key as "str_{key:04}" repeated repeated_keys times + let values: Vec = (0..distinct_keys) + .flat_map(|key| std::iter::repeat(format!("str_{:04}", key)).take(self.repeated_keys)) + .collect(); + assert_eq!(values.len(), total_rows); + Arc::new(StringArray::from(values)) + } + JoinColumnType::StringView => { + // Generate string view values + let values: Vec = (0..distinct_keys) + .flat_map(|key| std::iter::repeat(format!("str_{:04}", key)).take(self.repeated_keys)) + .collect(); + assert_eq!(values.len(), total_rows); + Arc::new(StringViewArray::from(values)) + } + JoinColumnType::DictionaryString => { + // Generate 
dictionary-encoded string values + // Dictionary contains distinct_keys unique strings + // Keys array contains indices that repeat according to repeated_keys + let dict_values_vec: Vec = (0..distinct_keys) + .map(|key| format!("str_{:04}", key)) + .collect(); + let dict_values = StringArray::from(dict_values_vec); + + let keys: Int16Array = (0..distinct_keys) + .flat_map(|key| std::iter::repeat(key as i16).take(self.repeated_keys)) + .collect(); + + assert_eq!(keys.len(), total_rows); + Arc::new(DictionaryArray::::try_new(keys, Arc::new(dict_values)).unwrap()) + } + JoinColumnType::DictionaryStringView => { + // Generate dictionary-encoded string view values + let dict_values_vec: Vec = (0..distinct_keys) + .map(|key| format!("str_{:04}", key)) + .collect(); + let dict_values = StringViewArray::from(dict_values_vec); + + let keys: Int16Array = (0..distinct_keys) + .flat_map(|key| std::iter::repeat(key as i16).take(self.repeated_keys)) + .collect(); + + assert_eq!(keys.len(), total_rows); + Arc::new(DictionaryArray::::try_new(keys, Arc::new(dict_values)).unwrap()) + } + }; + + let batch = RecordBatch::try_new(Arc::clone(&self.schema), vec![column]) + .expect("Failed to create record batch"); + + vec![batch] + } +} + +// ============================================================================ +// Arrow IPC Serialization / Deserialization +// ============================================================================ + +/// Serializes record batches to Arrow IPC file format (in-memory). +/// +/// This simulates receiving Arrow data over a network or reading from storage. +/// Uses the IPC file format (with footer) to enable zero-copy deserialization. 
+/// +/// # Arguments +/// * `batches` - Record batches to serialize +/// * `schema` - Arrow schema (must match batches) +/// +/// # Returns +/// Serialized IPC data as a byte vector +pub fn serialize_to_ipc(batches: &[RecordBatch], schema: &SchemaRef) -> Vec { + let mut buffer = Vec::new(); + { + let mut writer = FileWriter::try_new(&mut buffer, schema).unwrap(); + for batch in batches { + writer.write(batch).unwrap(); + } + writer.finish().unwrap(); + } + buffer +} + +/// Deserializes record batches from Arrow IPC file format using zero-copy. +/// +/// This is the operation being benchmarked - converting serialized Arrow IPC +/// data back into in-memory record batches that can be processed by DataFusion. +/// +/// Zero-copy means the Arrow arrays refer directly to the provided buffer, +/// avoiding memory copying during deserialization. +/// +/// # Arguments +/// * `buffer` - Serialized IPC data +/// +/// # Returns +/// Tuple of (schema, batches) extracted from the IPC data +pub fn deserialize_zero_copy(buffer: &Buffer) -> (SchemaRef, Vec) { + // Read the footer to get schema and batch locations + let trailer_start = buffer.len() - 10; + let footer_len = read_footer_length(buffer[trailer_start..].try_into().unwrap()).unwrap(); + let footer = root_as_footer(&buffer[trailer_start - footer_len..trailer_start]).unwrap(); + + let schema = Arc::new(fb_to_schema(footer.schema().unwrap())); + let mut decoder = FileDecoder::new(Arc::clone(&schema), footer.version()); + + // Read dictionaries if present + for block in footer.dictionaries().iter().flatten() { + let block_len = block.bodyLength() as usize + block.metaDataLength() as usize; + let data = buffer.slice_with_length(block.offset() as _, block_len); + decoder.read_dictionary(block, &data).unwrap(); + } + + // Read all record batches + let mut batches = Vec::new(); + if let Some(batch_blocks) = footer.recordBatches() { + for block in batch_blocks { + let block_len = block.bodyLength() as usize + 
block.metaDataLength() as usize; + let data = buffer.slice_with_length(block.offset() as _, block_len); + if let Some(batch) = decoder.read_record_batch(&block, &data).unwrap() { + batches.push(batch); + } + } + } + + (schema, batches) +} + +/// Serializes execution results (record batches) back to Arrow IPC format. +/// +/// This measures the overhead of serializing query results, which is relevant +/// for scenarios where results need to be sent over a network or stored. +/// +/// # Arguments +/// * `batches` - Result record batches from query execution +/// +/// # Returns +/// Serialized IPC data as a byte vector +pub fn serialize_results_to_ipc(batches: &[RecordBatch]) -> Vec { + if batches.is_empty() { + return Vec::new(); + } + let schema = batches[0].schema(); + serialize_to_ipc(batches, &schema) +} + +/// A writer that discards all data written to it. +/// +/// This is useful for benchmarking serialization overhead without +/// including actual I/O or memory allocation costs. +struct SinkWriter { + bytes_written: usize, +} + +impl SinkWriter { + fn new() -> Self { + Self { bytes_written: 0 } + } + + /// Returns the total number of bytes that would have been written. + fn bytes_written(&self) -> usize { + self.bytes_written + } +} + +impl Write for SinkWriter { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.bytes_written += buf.len(); + Ok(buf.len()) + } + + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } +} + +/// Serializes a record batch to a sink that drops all data. +/// +/// This function measures pure serialization overhead by writing to a sink +/// that discards data instead of allocating memory or performing I/O. +/// Useful for benchmarking the CPU cost of serialization alone. 
+/// +/// # Arguments +/// * `batch` - The record batch to serialize +/// +/// # Returns +/// The number of bytes that would have been written +/// +/// # Example +/// ```ignore +/// let batch = create_test_batch(); +/// let bytes_written = serialize_to_sink(&batch); +/// println!("Serialization would write {} bytes", bytes_written); +/// ``` +pub fn serialize_to_sink(batch: &RecordBatch) -> usize { + let schema = batch.schema(); + let mut sink = SinkWriter::new(); + { + let mut writer = FileWriter::try_new(&mut sink, &schema).unwrap(); + writer.write(batch).unwrap(); + writer.finish().unwrap(); + } + sink.bytes_written() +} + +/// Serializes multiple record batches to a sink that drops all data. +/// +/// This function measures pure serialization overhead by writing to a sink +/// that discards data instead of allocating memory or performing I/O. +/// Useful for benchmarking the CPU cost of serialization alone. +/// +/// # Arguments +/// * `batches` - The record batches to serialize +/// * `schema` - The schema for the batches +/// +/// # Returns +/// The number of bytes that would have been written +/// +/// # Example +/// ```ignore +/// let batches = create_test_batches(); +/// let schema = batches[0].schema(); +/// let bytes_written = serialize_batches_to_sink(&batches, &schema); +/// println!("Serialization would write {} bytes", bytes_written); +/// ``` +pub fn serialize_batches_to_sink(batches: &[RecordBatch], schema: &SchemaRef) -> usize { + let mut sink = SinkWriter::new(); + { + let mut writer = FileWriter::try_new(&mut sink, schema).unwrap(); + for batch in batches { + writer.write(batch).unwrap(); + } + writer.finish().unwrap(); + } + sink.bytes_written() +} + +// ============================================================================ +// Execution Plan Source +// ============================================================================ + +/// A simple execution plan that serves pre-loaded record batches. 
+/// +/// This is used as the leaf node in benchmark execution plans. It wraps +/// already-deserialized batches and makes them available to downstream +/// operators (Filter, Sort, etc.) via the standard ExecutionPlan interface. +/// +/// Unlike file-based sources, this has no I/O overhead - it simply streams +/// the in-memory batches, allowing us to isolate operator performance. +#[derive(Debug)] +pub struct BatchSourceExec { + /// Schema of the batches + schema: SchemaRef, + /// Pre-loaded record batches to serve + batches: Vec, + /// Cached plan properties (partitioning, ordering, etc.) + cache: PlanProperties, +} + +impl BatchSourceExec { + /// Creates a new batch source execution plan. + /// + /// # Arguments + /// * `schema` - Schema for the batches + /// * `batches` - Pre-loaded record batches to serve + pub fn new(schema: SchemaRef, batches: Vec) -> Self { + let cache = PlanProperties::new( + EquivalenceProperties::new(Arc::clone(&schema)), + Partitioning::UnknownPartitioning(1), + EmissionType::Incremental, + Boundedness::Bounded, + ); + Self { + schema, + batches, + cache, + } + } +} + +impl DisplayAs for BatchSourceExec { + fn fmt_as(&self, _t: DisplayFormatType, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "BatchSourceExec: batches={}", self.batches.len()) + } +} + +impl ExecutionPlan for BatchSourceExec { + fn name(&self) -> &'static str { + "BatchSourceExec" + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn properties(&self) -> &PlanProperties { + &self.cache + } + + fn children(&self) -> Vec<&Arc> { + vec![] + } + + fn with_new_children( + self: Arc, + _children: Vec>, + ) -> datafusion_common::Result> { + Ok(self) + } + + fn execute( + &self, + _partition: usize, + _context: Arc, + ) -> datafusion_common::Result { + Ok(Box::pin(MemoryStream::try_new( + self.batches.clone(), + Arc::clone(&self.schema), + None, + )?)) + } +} diff --git 
a/datafusion/physical-plan/benches/count_group_by_bench.rs b/datafusion/physical-plan/benches/count_group_by_bench.rs new file mode 100644 index 000000000000..afe99c31515d --- /dev/null +++ b/datafusion/physical-plan/benches/count_group_by_bench.rs @@ -0,0 +1,279 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmark for DataFusion AggregateExec (COUNT(*) GROUP BY). +//! +//! This benchmark measures the performance of hash-based aggregation with +//! varying group cardinalities and column types. +//! +//! ## Test configurations: +//! +//! - **Column types**: Int32, Binary (10B), Binary (1024B) +//! - **Distinct counts**: 4096, 16384, 65536 +//! - **Data size**: 1M rows total (100 batches × 10K rows) +//! +//! The benchmark helps understand: +//! - How grouping performance scales with cardinality +//! - Impact of key type (fixed-width int vs variable-length binary) +//! - Impact of key size (10B vs 1024B binary) +//! +//! ## Running the benchmark +//! +//! ```bash +//! # Run all configurations +//! cargo bench --bench count_group_by_bench -p datafusion-physical-plan +//! +//! # Run with fewer samples for quick testing +//! 
cargo bench --bench count_group_by_bench -p datafusion-physical-plan -- --sample-size 10 +//! +//! # Run specific configuration (e.g., only int column benchmarks) +//! cargo bench --bench count_group_by_bench -p datafusion-physical-plan -- "int_col" +//! +//! # Run specific cardinality +//! cargo bench --bench count_group_by_bench -p datafusion-physical-plan -- "distinct_4096" +//! ``` + +// Include shared benchmark utilities +#[path = "bench_utils.rs"] +mod bench_utils; + +use std::hint::black_box; +use std::sync::Arc; + +use arrow::buffer::Buffer; +use criterion::{ + BatchSize, BenchmarkId, Criterion, SamplingMode, Throughput, criterion_group, criterion_main, +}; +use datafusion_execution::TaskContext; +use datafusion_functions_aggregate::count::count_udaf; +use datafusion_physical_expr::aggregate::AggregateExprBuilder; +use datafusion_physical_expr::expressions::Column; +use datafusion_physical_expr::PhysicalExpr; +use datafusion_physical_plan::aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy}; +use datafusion_physical_plan::{ExecutionPlan, collect}; + +use bench_utils::{ + BatchSourceExec, SingleColumnBatchGenerator, SingleColumnType, deserialize_zero_copy, + serialize_results_to_ipc, serialize_to_ipc, +}; + +// ============================================================================ +// Aggregate Plan Creation +// ============================================================================ + +/// Creates an AggregateExec that performs COUNT(*) GROUP BY groupCol. +/// +/// This simulates a common aggregation pattern where we count the number +/// of rows for each distinct value in the grouping column. 
+/// +/// # Arguments +/// * `input` - The input execution plan (typically BatchSourceExec) +/// +/// # Returns +/// An AggregateExec wrapped in Arc +fn create_count_groupby_plan(input: Arc) -> Arc { + let schema = input.schema(); + + // Build GROUP BY expression: GROUP BY groupCol + let group_col = + Arc::new(Column::new_with_schema("groupCol", &schema).unwrap()) as Arc; + let group_expr = vec![(Arc::clone(&group_col), "groupCol".to_string())]; + let group_by = PhysicalGroupBy::new_single(group_expr); + + // Build aggregate expression: COUNT(groupCol) + // We use groupCol as the argument since it's non-null; effectively COUNT(*) + let aggr_expr = vec![Arc::new( + AggregateExprBuilder::new(count_udaf(), vec![group_col]) + .schema(Arc::clone(&schema)) + .alias("count") + .build() + .unwrap(), + )]; + + // Create the aggregate execution plan + // Using Single mode (not partial/final) for simplicity in benchmarks + Arc::new( + AggregateExec::try_new( + AggregateMode::Single, + group_by, + aggr_expr, + vec![None], // No filter expressions + input, + schema, + ) + .unwrap(), + ) +} + +// ============================================================================ +// Benchmark Implementation +// ============================================================================ + +/// Benchmark configurations for different column types. +#[derive(Debug, Clone)] +struct BenchConfig { + /// Human-readable name for the configuration + name: &'static str, + /// Column type to generate + column_type: SingleColumnType, +} + +/// Main benchmark function for COUNT(*) GROUP BY execution. 
+/// +/// This benchmark measures COUNT(*) GROUP BY performance across: +/// - Different column types (Int32, Binary 10B, Binary 1024B) +/// - Different cardinalities (4096, 16384, 65536 distinct values) +/// +/// For each configuration, we measure: +/// - **agg_only**: Pure aggregation execution using pre-generated batches +/// This isolates the AggregateExec performance from serialization overhead +/// - **full_pipeline**: Complete deser + aggregation + output serialization +/// Real-world end-to-end latency including IPC serde +fn bench_count_group_by(c: &mut Criterion) { + // Create a single-threaded Tokio runtime for async execution. + // We use current_thread to ensure all async work runs on the benchmark thread, + // making results comparable to single-threaded Java benchmarks. + let rt = tokio::runtime::Builder::new_current_thread() + .build() + .unwrap(); + let mut group = c.benchmark_group("count_group_by_bench"); + + // Use flat sampling to collect exactly the requested samples without time constraints + group.sampling_mode(SamplingMode::Flat); + + // Configuration: 1M rows total (10K rows × 100 batches) + let rows_per_batch = 10_000; + let num_batches = 100; + let total_rows = rows_per_batch * num_batches; + + // Column type configurations + let configs = vec![ + BenchConfig { + name: "int_col", + column_type: SingleColumnType::Int, + }, + BenchConfig { + name: "binary_10B", + column_type: SingleColumnType::Binary { binary_size: 10 }, + }, + BenchConfig { + name: "binary_1024B", + column_type: SingleColumnType::Binary { binary_size: 1024 }, + }, + ]; + + // Cardinality configurations (number of distinct values) + let distinct_counts = vec![4096, 16384, 65536]; + + for config in &configs { + for &distinct_count in &distinct_counts { + let label = format!("{}/distinct_{}", config.name, distinct_count); + + // Generate test data + let mut generator = SingleColumnBatchGenerator::new( + config.column_type, + rows_per_batch, + num_batches, + distinct_count, + 
); + let schema = generator.schema(); + let batches = generator.generate_batches(); + + // Serialize batches to IPC format for full pipeline benchmark + let ipc_data = serialize_to_ipc(&batches, &schema); + let ipc_size = ipc_data.len(); + + // Calculate approximate data size for throughput metric + let data_size: usize = batches + .iter() + .map(|b| b.get_array_memory_size()) + .sum(); + + // Log configuration for visibility in benchmark output + println!( + "Config: {} rows, {}, distinct={}, data size={:.2} MB, IPC size={:.2} MB", + total_rows, + config.name, + distinct_count, + data_size as f64 / (1024.0 * 1024.0), + ipc_size as f64 / (1024.0 * 1024.0) + ); + + // Set throughput metric for bytes/second calculations + group.throughput(Throughput::Bytes(ipc_size as u64)); + + // Benchmark 1: Aggregation execution only + // Uses pre-generated batches directly, isolating AggregateExec performance + group.bench_with_input( + BenchmarkId::new("agg_only", &label), + &batches, + |b, batches| { + b.iter_batched( + // Setup: clone batches (NOT timed) - needed because execution consumes them + || batches.clone(), + // Benchmark: execute aggregation (TIMED) + |batches| { + rt.block_on(async { + let source = Arc::new(BatchSourceExec::new( + Arc::clone(&schema), + batches, + )) as Arc; + let plan = create_count_groupby_plan(source); + let task_ctx = Arc::new(TaskContext::default()); + let results = collect(plan, task_ctx).await.unwrap(); + black_box(results) + }) + }, + BatchSize::SmallInput, + ) + }, + ); + + let data_buffer = Buffer::from_vec(ipc_data); + + // Benchmark 2: Full pipeline (deser + aggregation + output serialization) + // Measures complete round-trip: IPC in -> aggregate -> IPC out + // Relevant for scenarios where results are sent over network or stored + group.bench_with_input( + BenchmarkId::new("full_pipeline", &label), + &data_buffer, + |b, data_buffer| { + b.iter(|| { + rt.block_on(async { + let (schema, batches) = deserialize_zero_copy(data_buffer); + 
let source = Arc::new(BatchSourceExec::new( + Arc::clone(&schema), + batches, + )) as Arc; + let plan = create_count_groupby_plan(source); + let task_ctx = Arc::new(TaskContext::default()); + let results = collect(plan, task_ctx).await.unwrap(); + // Serialize results back to IPC format + let output_ipc = serialize_results_to_ipc(&results); + black_box(output_ipc) + }) + }) + }, + ); + } + } + + group.finish(); +} + +criterion_group!(benches, bench_count_group_by); +criterion_main!(benches); diff --git a/datafusion/physical-plan/benches/deser.rs b/datafusion/physical-plan/benches/deser.rs new file mode 100644 index 000000000000..e40616377b6a --- /dev/null +++ b/datafusion/physical-plan/benches/deser.rs @@ -0,0 +1,141 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmark for Arrow IPC deserialization performance. +//! +//! ## Running the benchmark +//! +//! ```bash +//! # Run all configurations +//! cargo bench --bench deser -p datafusion-physical-plan +//! +//! # Run only the standard deserialization benchmark +//! cargo bench --bench deser -p datafusion-physical-plan -- deserialize_standard +//! +//! # Run only the zero-copy deserialization benchmark +//! 
cargo bench --bench deser -p datafusion-physical-plan -- deserialize_zero_copy +//! +//! # Change measurement time (per benchmark, default is 5 seconds) +//! cargo bench --bench deser -p datafusion-physical-plan -- --measurement-time 10 +//! +//! # Run specific configuration +//! cargo bench --bench deser -p datafusion-physical-plan -- "1M_rows_binary_10B" +//! ``` +//! +//! ## Baseline Management +//! +//! ```bash +//! # Save current results as a named baseline +//! cargo bench --bench deser -p datafusion-physical-plan -- --save-baseline my-baseline +//! +//! # Compare against a specific baseline +//! cargo bench --bench deser -p datafusion-physical-plan -- --baseline my-baseline +//! +//! # Delete all benchmark history and start fresh +//! rm -rf target/criterion +//! ``` + +// Include shared benchmark utilities +#[path = "bench_utils.rs"] +mod bench_utils; + +use std::hint::black_box; +use std::sync::Arc; + +use arrow::buffer::Buffer; +use criterion::{ + BenchmarkId, Criterion, SamplingMode, Throughput, criterion_group, criterion_main, +}; + +use bench_utils::{ + FunctionalBatchGenerator, create_schema, deserialize_zero_copy, serialize_to_ipc, +}; + +// ============================================================================ +// Benchmark Implementation +// ============================================================================ + +/// Benchmarks zero-copy IPC deserialization. +/// +/// This measures the cost of Arrow IPC deserialization using zero-copy +/// techniques where Arrow arrays reference the original buffer directly +/// via Buffer slicing. This avoids copying the actual data and only +/// creates lightweight views into the existing buffer. +/// +/// This is the most efficient deserialization approach when you have +/// a contiguous buffer (e.g., mmap'd file or received network buffer). 
+fn bench_deserialize(c: &mut Criterion) { + let mut group = c.benchmark_group("deserialize_standard"); + + // Use flat sampling to collect exactly the requested samples without time constraints + group.sampling_mode(SamplingMode::Flat); + + // Configuration: 1M rows total (10K rows × 100 batches) + let rows_per_batch = 10_000; + let num_batches_vec = vec![1, 100]; + + // Test different binary column sizes to understand deserialization overhead + for num_batches in num_batches_vec { + let total_rows = rows_per_batch * num_batches; + let binary_sizes = vec![10, 1024, 2048]; + for binary_size in binary_sizes { + let label = format!("{num_batches}_batches/rows_binary_{binary_size}B"); + + // Generate test data and serialize to IPC format + let schema = create_schema(); + let mut generator = FunctionalBatchGenerator::new( + Arc::clone(&schema), + rows_per_batch, + num_batches, + binary_size, + ); + let batches = generator.generate_batches(); + let ipc_data = serialize_to_ipc(&batches, &schema); + + // Convert to Buffer for zero-copy deserialization + let buffer = Buffer::from_vec(ipc_data); + + // Set throughput metric for bytes/second calculations + group.throughput(Throughput::Bytes(buffer.len() as u64)); + + // Log configuration + println!( + "Config (zero-copy): {} rows, binary_size={} bytes, IPC size={:.2} MB", + total_rows, + binary_size, + buffer.len() as f64 / (1024.0 * 1024.0) + ); + + group.bench_with_input( + BenchmarkId::from_parameter(&label), + &buffer, + |b, buffer| { + b.iter(|| { + let (schema, batches) = deserialize_zero_copy(buffer); + // black_box prevents compiler from optimizing away unused results + black_box((schema, batches)) + }) + }, + ); + } + } + + group.finish(); +} + +criterion_group!(benches, bench_deserialize); +criterion_main!(benches); diff --git a/datafusion/physical-plan/benches/distinct_group_by_bench.rs b/datafusion/physical-plan/benches/distinct_group_by_bench.rs new file mode 100644 index 000000000000..03728c4459a9 --- /dev/null 
+++ b/datafusion/physical-plan/benches/distinct_group_by_bench.rs @@ -0,0 +1,325 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmark for DataFusion AggregateExec (COUNT(DISTINCT bytes) GROUP BY colInt). +//! +//! This benchmark measures the performance of hash-based aggregation with +//! distinct counting over binary columns, varying binary sizes and group cardinalities. +//! +//! ## Test configurations: +//! +//! - **Binary sizes**: 10B, 1000B +//! - **Distinct keys (group cardinalities)**: 4096, 16384, 65536 +//! - **Data size**: 1M rows total (100 batches × 10K rows) +//! +//! The benchmark helps understand: +//! - How COUNT(DISTINCT) performance scales with group cardinality +//! - Impact of binary value size on distinct counting +//! - Performance characteristics of two-column aggregation (GROUP BY colInt, COUNT(DISTINCT bytes)) +//! +//! ## Running the benchmark +//! +//! ```bash +//! # Run all configurations +//! cargo bench --bench distinct_group_by_bench -p datafusion-physical-plan +//! +//! # Run with fewer samples for quick testing +//! cargo bench --bench distinct_group_by_bench -p datafusion-physical-plan -- --sample-size 10 +//! +//! # Run specific configuration +//! 
cargo bench --bench distinct_group_by_bench -p datafusion-physical-plan -- "INT_BYTES_10" +//! +//! # Run specific cardinality +//! cargo bench --bench distinct_group_by_bench -p datafusion-physical-plan -- "distinct_4096" +//! ``` + +// Include shared benchmark utilities +#[path = "bench_utils.rs"] +mod bench_utils; + +use std::fs::File; +use std::hint::black_box; +use std::path::PathBuf; +use std::sync::Arc; + +use arrow::array::{RecordBatch, ArrayRef, BinaryArray, BinaryViewArray}; +use arrow::datatypes::SchemaRef; +use arrow::ipc::reader::FileReader; +use criterion::{BenchmarkId, Criterion, SamplingMode, Throughput, criterion_group, criterion_main}; +use datafusion_execution::TaskContext; +use datafusion_functions_aggregate::count::count_udaf; +use datafusion_physical_expr::aggregate::AggregateExprBuilder; +use datafusion_physical_expr::expressions::Column; +use datafusion_physical_expr::PhysicalExpr; +use datafusion_physical_plan::aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy}; +use datafusion_physical_plan::{ExecutionPlan, collect}; + +use bench_utils::{BatchSourceExec, serialize_results_to_ipc}; + +// ============================================================================ +// Aggregate Plan Creation +// ============================================================================ + +/// Creates an AggregateExec that performs COUNT(DISTINCT bytes) GROUP BY colInt. +/// +/// This benchmark doesn't generate data. Instead, we have to run the equivalent JMH benchmark in +/// Apache Pinot and then copy the generated Arrow IPC files into the `benches/` folder, keeping +/// the name conventions used in the JMH benchmark. +/// +/// This simulates a common aggregation pattern where we count the number of +/// distinct binary values for each integer group value. 
+/// +/// # Arguments +/// * `input` - The input execution plan (typically BatchSourceExec) +/// +/// # Returns +/// An AggregateExec wrapped in Arc +fn create_distinct_count_groupby_plan(input: Arc) -> Arc { + let schema = input.schema(); + + // Build GROUP BY expression: GROUP BY colInt + let group_col = + Arc::new(Column::new_with_schema("colint", &schema).unwrap()) as Arc; + let group_expr = vec![(Arc::clone(&group_col), "colint".to_string())]; + let group_by = PhysicalGroupBy::new_single(group_expr); + + // Build aggregate expression: COUNT(DISTINCT bytes) + let bytes_col = + Arc::new(Column::new_with_schema("bytes", &schema).unwrap()) as Arc; + let aggr_expr = vec![Arc::new( + AggregateExprBuilder::new(count_udaf(), vec![bytes_col]) + .schema(Arc::clone(&schema)) + .alias("count_distinct") + .distinct() // Enable DISTINCT counting + .build() + .unwrap(), + )]; + + // Create the aggregate execution plan + // Using Single mode (not partial/final) for simplicity in benchmarks + Arc::new( + AggregateExec::try_new( + AggregateMode::Single, + group_by, + aggr_expr, + vec![None], // No filter expressions + input, + schema, + ) + .unwrap(), + ) +} + +// ============================================================================ +// Benchmark Implementation +// ============================================================================ + +/// Benchmark configurations for different binary sizes. 
+#[derive(Debug, Clone, Copy)] +struct BenchConfig { + /// Human-readable name for the configuration + name: &'static str, + /// Size of binary values in bytes (not used, but kept for compatibility) + #[allow(dead_code)] + bytes_length: usize, +} + +/// Enum to select which binary array type to use +#[derive(Debug, Clone, Copy)] +enum BinaryType { + Binary, + BinaryView, +} + +fn get_arrow_file_path( + folder: &str, + config_name: &str, + num_groups: usize, + distinct_values_per_group: usize, +) -> PathBuf { + let file_name = format!( + "group_distinct_by_{}_groups_{}_distinctPerGroup_{}.arrow", + config_name, + num_groups, + distinct_values_per_group + ); + PathBuf::from(folder).join(file_name) +} + +fn load_batches_from_arrow_file(path: &PathBuf, binary_type: BinaryType) -> (SchemaRef, Vec) { + let file = File::open(path).unwrap_or_else(|_| panic!("Arrow file not found: {}", path.display())); + let mut reader = FileReader::try_new(file, None).expect("Failed to open Arrow IPC file"); + let orig_schema = reader.schema(); + let orig_batches = reader.collect::>>().expect("Failed to read batches from Arrow file"); + + match binary_type { + BinaryType::Binary => (orig_schema, orig_batches), + BinaryType::BinaryView => { + // Find the index of the "bytes" column + let bytes_idx = orig_schema.fields().iter().position(|f| f.name() == "bytes").expect("No 'bytes' column"); + // Create new schema with bytes as BinaryView + let mut new_fields: Vec> = orig_schema.fields().to_vec(); + new_fields[bytes_idx] = Arc::new(arrow::datatypes::Field::new("bytes", arrow::datatypes::DataType::BinaryView, false)); + let new_schema = Arc::new(arrow::datatypes::Schema::new( + new_fields.iter().map(|f| f.as_ref().clone()).collect::>() + )); + // Convert each batch + let new_batches = orig_batches.into_iter().map(|batch| { + let mut columns: Vec = batch.columns().to_vec(); + let binary_array = batch.column(bytes_idx).as_any().downcast_ref::().expect("'bytes' column is not BinaryArray"); + let 
binaryview_vec: Vec<&[u8]> = (0..batch.num_rows()).map(|i| binary_array.value(i)).collect(); + let binaryview_array = BinaryViewArray::from(binaryview_vec); + columns[bytes_idx] = Arc::new(binaryview_array); + RecordBatch::try_new(Arc::clone(&new_schema), columns).expect("Failed to create BinaryView batch") + }).collect(); + (new_schema, new_batches) + } + } +} + +fn bench_distinct_group_by(c: &mut Criterion) { + // Create a single-threaded Tokio runtime for async execution. + // We use current_thread to ensure all async work runs on the benchmark thread, + // making results comparable to single-threaded Java benchmarks. + let rt = tokio::runtime::Builder::new_current_thread() + .build() + .unwrap(); + + let mut group = c.benchmark_group("distinct_group_by_bench"); + + // Use flat sampling to collect exactly the requested samples without time constraints + group.sampling_mode(SamplingMode::Flat); + + // Binary size configurations + let configs = vec![ + BenchConfig { + name: "INT_BYTES_10", + bytes_length: 10, + }, + BenchConfig { + name: "INT_BYTES_1000", + bytes_length: 1000, + }, + ]; + + // Distinct values per group (JMH param) + let distinct_values_per_group_list = vec![1, 4, 16, 64, 256, 1024]; + + let binary_types = vec![BinaryType::Binary, BinaryType::BinaryView]; + + let arrow_folder = std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("benches"); + + for config in &configs { + for &binary_type in &binary_types { + let binary_type_label = match binary_type { + BinaryType::Binary => "Binary", + BinaryType::BinaryView => "BinaryView", + }; + for &distinct_values_per_group in &distinct_values_per_group_list { + let num_groups = 512; + let num_batches = 1; + let rows_per_batch = num_groups * distinct_values_per_group; + let total_rows = rows_per_batch * num_batches; + + let label = format!("{}/{}/dvg_{}", config.name, binary_type_label, distinct_values_per_group); + + // Load test data from Arrow file + let arrow_file_path = get_arrow_file_path( + 
arrow_folder.to_str().unwrap(), + config.name, // Use config.name for the file name + num_groups, + distinct_values_per_group, + ); + println!("Reading Arrow file: {}", arrow_file_path.display()); + let (schema, batches) = load_batches_from_arrow_file(&arrow_file_path, binary_type); + + // Calculate approximate data size for throughput metric + let data_size: usize = batches + .iter() + .map(|b| b.get_array_memory_size()) + .sum(); + + // Log configuration for visibility in benchmark output + println!( + "Config: {} rows, {}, {}, dvg={}, data size={:.2} MB, Arrow file: {}", + total_rows, + config.name, + binary_type_label, + distinct_values_per_group, + data_size as f64 / (1024.0 * 1024.0), + arrow_file_path.display() + ); + + // Set throughput metric for bytes/second calculations + group.throughput(Throughput::Bytes(data_size as u64)); + + // Validation (NOT timed - run once before benchmarking) + { + let validation_batches = batches.clone(); + let validation_result = rt.block_on(async { + let source = Arc::new(BatchSourceExec::new( + Arc::clone(&schema), + validation_batches, + )) as Arc; + let plan = create_distinct_count_groupby_plan(source); + let task_ctx = Arc::new(TaskContext::default()); + collect(plan, task_ctx).await.unwrap() + }); + let total_result_rows: usize = validation_result.iter() + .map(|batch| batch.num_rows()) + .sum(); + assert_eq!( + total_result_rows, + num_groups, + "Expected {} distinct groups, got {}", + num_groups, + total_result_rows + ); + } + + // Benchmark 2: Full pipeline (deser + aggregation + output serialization) + // Measures complete round-trip: IPC in -> aggregate -> IPC out + // Relevant for scenarios where results are sent over network or stored + group.bench_with_input( + BenchmarkId::new("full_pipeline", &label), + &batches, + |b, batches| { + b.iter(|| { + rt.block_on(async { + let source = Arc::new(BatchSourceExec::new( + Arc::clone(&schema), + batches.clone(), + )) as Arc; + let plan = 
create_distinct_count_groupby_plan(source); + let task_ctx = Arc::new(TaskContext::default()); + let results = collect(plan, task_ctx).await.unwrap(); + // Serialize results back to IPC format + let output_ipc = serialize_results_to_ipc(&results); + black_box(output_ipc) + }) + }) + }, + ); + } + } + } + group.finish(); +} + +criterion_group!(benches, bench_distinct_group_by); +criterion_main!(benches); + diff --git a/datafusion/physical-plan/benches/filter_bench.rs b/datafusion/physical-plan/benches/filter_bench.rs new file mode 100644 index 000000000000..9da637195e7f --- /dev/null +++ b/datafusion/physical-plan/benches/filter_bench.rs @@ -0,0 +1,264 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmark for DataFusion FilterExec with Arrow IPC serialization. +//! +//! This benchmark measures the end-to-end latency of: +//! 1. Deserializing Arrow IPC data into RecordBatches +//! 2. Executing a FilterExec operator (predicate: colInt > 2500) +//! 3. Serializing the output back to Arrow IPC format +//! +//! The benchmark helps understand the overhead of IPC deserialization +//! and serialization relative to actual query execution, and how filter +//! performance scales with data size. +//! +//! 
## Running the benchmark +//! +//! ```bash +//! # Run all configurations +//! cargo bench --bench filter_bench -p datafusion-physical-plan +//! +//! # Run with fewer samples for quick testing +//! cargo bench --bench filter_bench -p datafusion-physical-plan -- --sample-size 10 +//! +//! # Run only the deser_only benchmark +//! cargo bench --bench filter_bench -p datafusion-physical-plan -- deser_only +//! +//! # Change measurement time (per benchmark, default is 5 seconds) +//! cargo bench --bench filter_bench -p datafusion-physical-plan -- --measurement-time 10 +//! +//! # Run specific configuration +//! cargo bench --bench filter_bench -p datafusion-physical-plan -- "1M_rows_binary_10B" +//! ``` +//! +//! ## Baseline Management +//! +//! Criterion stores benchmark results in `target/criterion/` and automatically compares +//! new runs against previous results. Each benchmark has three states: +//! - **base/**: The baseline for comparison (saved with --save-baseline) +//! - **new/**: The most recent benchmark run +//! - **change/**: Statistics about the change from base to new +//! +//! ```bash +//! # Save current results as a named baseline (e.g., "main" or "before-optimization") +//! cargo bench --bench filter_bench -p datafusion-physical-plan -- --save-baseline my-baseline +//! +//! # Compare against a specific baseline +//! cargo bench --bench filter_bench -p datafusion-physical-plan -- --baseline my-baseline +//! +//! # List all saved baselines (stored in target/criterion///) +//! ls target/criterion/filter_bench/deser_only/1M_rows_binary_10B/ +//! +//! # Delete all benchmark history and start fresh +//! rm -rf target/criterion +//! +//! # Run without saving results (useful for quick checks) +//! cargo bench --bench filter_bench -p datafusion-physical-plan -- --profile-time 1 +//! ``` +//! +//! **Typical workflow for tracking performance:** +//! 1. Before making changes: `cargo bench --bench filter_bench -- --save-baseline before` +//! 2. 
Make your code changes +//! 3. Compare: `cargo bench --bench filter_bench -- --baseline before` +//! 4. Criterion will show % change from the "before" baseline + +// Include shared benchmark utilities +#[path = "bench_utils.rs"] +mod bench_utils; + +use std::hint::black_box; +use std::sync::Arc; +use arrow::buffer::Buffer; +use arrow::datatypes::SchemaRef; +use criterion::{ + BatchSize, BenchmarkId, Criterion, SamplingMode, Throughput, criterion_group, criterion_main, +}; +use datafusion_common::ScalarValue; +use datafusion_execution::TaskContext; +use datafusion_expr::Operator; +use datafusion_physical_expr::expressions::{BinaryExpr, Column, Literal}; +use datafusion_physical_expr::PhysicalExpr; +use datafusion_physical_plan::filter::FilterExecBuilder; +use datafusion_physical_plan::{ExecutionPlan, collect}; + +use bench_utils::{ + BatchSourceExec, FunctionalBatchGenerator, create_schema, deserialize_zero_copy, + serialize_batches_to_sink, serialize_to_ipc, +}; + +// ============================================================================ +// Filter Plan Creation +// ============================================================================ + +/// Creates a FilterExec that evaluates `colInt > 2500`. +/// +/// With the data generation pattern `colInt = i % 5000`, this predicate +/// has approximately 50% selectivity (values 2501-4999 pass, 0-2500 don't). 
+/// +/// # Arguments +/// * `input` - The input execution plan (typically BatchSourceExec) +/// * `schema` - Schema of the input data +/// +/// # Returns +/// A FilterExec wrapped in Arc +fn create_filter_plan( + input: Arc, + schema: &SchemaRef, +) -> Arc { + // Build the predicate: colInt > 2500 + let col_int = Arc::new(Column::new_with_schema("colint", schema).unwrap()) + as Arc; + let threshold = + Arc::new(Literal::new(ScalarValue::Int32(Some(2500)))) as Arc; + let predicate = + Arc::new(BinaryExpr::new(col_int, Operator::Gt, threshold)) as Arc; + + Arc::new(FilterExecBuilder::new(predicate, input).build().unwrap()) +} + +// ============================================================================ +// Benchmark Implementation +// ============================================================================ + +/// Main benchmark function for filter execution. +/// +/// This benchmark measures four scenarios for each binary column size: +/// +/// 1. **deser_only**: Just IPC deserialization, no execution +/// - Establishes baseline deserialization cost +/// - Useful for understanding I/O vs compute ratio +/// +/// 2. **ser_only**: Just IPC serialization, no execution +/// - Establishes baseline serialization cost +/// - Uses pre-generated batches directly +/// +/// 3. **filter_only**: Filter execution only +/// - Isolates the FilterExec performance +/// - Uses pre-generated batches directly (no deserialization) +/// - Timed phase runs only the filter operator +/// +/// 4. **full_pipeline**: Complete deser + filter + output serialization +/// - Real-world end-to-end latency including result serialization +/// - Relevant for scenarios where results are sent over network +fn bench_filter(c: &mut Criterion) { + // Create a single-threaded Tokio runtime for async execution. + // We use current_thread to ensure all async work runs on the benchmark thread, + // making results comparable to single-threaded Java benchmarks. 
+ let rt = tokio::runtime::Builder::new_current_thread() + .build() + .unwrap(); + let mut group = c.benchmark_group("filter_bench"); + + // Use flat sampling to collect exactly the requested samples without time constraints + group.sampling_mode(SamplingMode::Flat); + + // Set measurement time (default is 5 seconds) + // Uncomment and adjust the duration as needed: + // group.measurement_time(std::time::Duration::from_secs(10)); + + // Configuration: 1M rows total (10K rows × 100 batches) + let rows_per_batch = 10_000; + let num_batches = 100; + let total_rows = rows_per_batch * num_batches; + + // Test different binary column sizes to understand serialization overhead + let binary_sizes = vec![10, 1024, 2048]; + + for binary_size in binary_sizes { + let label = format!("1M_rows_binary_{binary_size}B"); + + // Generate test data and serialize to IPC format + let schema = create_schema(); + let mut generator = FunctionalBatchGenerator::new( + Arc::clone(&schema), + rows_per_batch, + num_batches, + binary_size, + ); + let batches = generator.generate_batches(); + let ipc_data = serialize_to_ipc(&batches, &schema); + let ipc_size = ipc_data.len(); + let ipc_buffer = Buffer::from_vec(ipc_data); + + // Log configuration for visibility in benchmark output + println!( + "Config: {} rows, binary_size={} bytes, IPC size={:.2} MB", + total_rows, + binary_size, + ipc_size as f64 / (1024.0 * 1024.0) + ); + + // Set throughput metric for bytes/second calculations + group.throughput(Throughput::Bytes(ipc_size as u64)); + + // Benchmark 3: Filter execution only + // Uses pre-generated batches directly, isolating FilterExec performance + group.bench_with_input( + BenchmarkId::new("filter_only", &label), + &batches, + |b, batches| { + b.iter_batched( + // Setup: clone batches (NOT timed) - needed because execution consumes them + || batches.clone(), + // Benchmark: execute filter (TIMED) + |batches| { + rt.block_on(async { + let source = Arc::new(BatchSourceExec::new( + 
Arc::clone(&schema), + batches, + )) as Arc; + let plan = create_filter_plan(source, &schema); + let task_ctx = Arc::new(TaskContext::default()); + let results = collect(plan, task_ctx).await.unwrap(); + black_box(results) + }) + }, + BatchSize::SmallInput, + ) + }, + ); + + // Benchmark 4: Full pipeline (deser + filter + output serialization) + // Measures complete round-trip: IPC in -> filter -> IPC out + // Relevant for scenarios where results are sent over network or stored + group.bench_with_input( + BenchmarkId::new("full_pipeline", &label), + &ipc_buffer, + |b, ipc_buffer| { + b.iter(|| { + rt.block_on(async { + let (schema, batches) = deserialize_zero_copy(ipc_buffer); + let source = Arc::new(BatchSourceExec::new( + Arc::clone(&schema), + batches, + )) as Arc; + let plan = create_filter_plan(source, &schema); + let task_ctx = Arc::new(TaskContext::default()); + let results = collect(plan, task_ctx).await.unwrap(); + // Serialize results back to IPC format + black_box(serialize_batches_to_sink(&results, &schema)) + }) + }) + }, + ); + } + + group.finish(); +} + +criterion_group!(benches, bench_filter); +criterion_main!(benches); diff --git a/datafusion/physical-plan/benches/filter_jni_benchmark.rs b/datafusion/physical-plan/benches/filter_jni_benchmark.rs new file mode 100644 index 000000000000..f7e3e1e760d1 --- /dev/null +++ b/datafusion/physical-plan/benches/filter_jni_benchmark.rs @@ -0,0 +1,590 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Benchmark for DataFusion FilterExec with Java UDF via JNI and Arrow FFI.
+//!
+//! This benchmark measures the overhead of calling a Java UDF for filtering
+//! using JNI and zero-copy Arrow C Data Interface, compared to native Rust filtering.
+//!
+//! ## Prerequisites
+//!
+//! Before running this benchmark, you must build and package the Java code
+//! (the benchmark loads the JAR produced by `mvn package`):
+//!
+//! ```bash
+//! cd datafusion/physical-plan/benches/jvm
+//! mvn package
+//! cd ../../../../
+//! ```
+//!
+//! ## Running the benchmark
+//!
+//! ```bash
+//! # Run all JNI filter benchmarks
+//! cargo bench --bench filter_jni_benchmark -p datafusion-physical-plan
+//!
+//! # Run with fewer samples for quick testing
+//! cargo bench --bench filter_jni_benchmark -p datafusion-physical-plan -- --sample-size 10
+//!
+//! # Compare with native filter benchmark
+//! cargo bench --bench filter_bench -p datafusion-physical-plan -- --save-baseline native
+//! cargo bench --bench filter_jni_benchmark -p datafusion-physical-plan -- --baseline native
+//! ```
+//!
+//! ## What it measures
+//!
+//! - **JNI call overhead**: Cost of calling Java methods from Rust
+//! - **FFI conversion overhead**: Cost of converting between Rust and C Data Interface
+//! - **Java Arrow operations**: Cost of import/filter/export in Java
+//! - **Total overhead**: End-to-end comparison with native Rust filtering
+//!
+//! ## Architecture
+//!
+//! ```text
+//! Rust (DataFusion)            JNI Boundary         Java (Arrow)
+//! ─────────────────────────────────────────────────────────────────
+//! RecordBatch
+//!     ↓
+//!
arrow::ffi::to_ffi() +//! ↓ +//! FFI_ArrowSchema* ────────→ long schemaPtr +//! FFI_ArrowArray* ────────→ long arrayPtr +//! ↓ +//! Data.importRecordBatch() +//! ↓ +//! Apply filter: colInt > 2500 +//! ↓ +//! Data.exportRecordBatch() +//! ↓ +//! FFI_ArrowSchema* ←──────── long[] [schemaPtr, arrayPtr] +//! FFI_ArrowArray* ←──────── +//! ↓ +//! arrow::ffi::from_ffi() +//! ↓ +//! RecordBatch (filtered) +//! ``` + +// Note: JVM library loading is handled at runtime by the jni crate (with invocation feature) +// via java-locator. No manual #[link] directive is needed. Ensure JAVA_HOME is set before running. + +// Include shared benchmark utilities +#[path = "bench_utils.rs"] +mod bench_utils; + +use std::any::Any; + + + +use std::hint::black_box; +use std::sync::Arc; + +use arrow::array::{Array, BooleanArray, Int32Array, StructArray}; +use arrow::buffer::Buffer; +use arrow::datatypes::DataType; +use arrow::ffi::{from_ffi, to_ffi, FFI_ArrowArray, FFI_ArrowSchema}; +use criterion::{ + criterion_group, criterion_main, BatchSize, BenchmarkId, Criterion, SamplingMode, + Throughput, +}; +use datafusion_common::{Result, ScalarValue}; +use datafusion_execution::TaskContext; +use datafusion_expr::{col, ColumnarValue, ScalarFunctionImplementation, Volatility}; +use datafusion_physical_plan::filter::FilterExec; +use datafusion_physical_plan::{collect, ExecutionPlan}; +use jni::objects::{JLongArray, JValue}; +use jni::sys::jlong; +use jni::JavaVM; + +use bench_utils::{ + create_schema, deserialize_zero_copy, serialize_batches_to_sink, serialize_to_ipc, + BatchSourceExec, FunctionalBatchGenerator, +}; + +// ============================================================================ +// JVM Initialization +// ============================================================================ + +/// Global JVM instance initialized once before benchmarks +static mut JVM: Option = None; + +/// Initialize JVM with classpath pointing to compiled Java classes. 
+/// +/// This function should be called once before running any benchmarks. +/// The classpath is hardcoded to point to the Maven JAR file. +/// +/// The JVM library is dynamically loaded by the jni crate via java-locator, +/// which uses JAVA_HOME environment variable to locate the JVM installation. +fn init_jvm() { + use jni::InitArgsBuilder; + + unsafe { + let jvm_ptr = std::ptr::addr_of_mut!(JVM); + if (*jvm_ptr).is_some() { + return; // Already initialized + } + + // Set up JVM arguments with classpath pointing to the compiled JAR + // Notice you need to run `mvn package` in the jvm directory to compile and package the Java code first + let classpath = "benches/jvm/target/datafusion-jni-benchmark-1.0-SNAPSHOT.jar"; + + // Check if the JAR file exists + let classpath_path = std::path::Path::new(classpath); + if !classpath_path.exists() { + let absolute_path = std::env::current_dir() + .map(|p| p.join(classpath)) + .unwrap_or_else(|_| classpath_path.to_path_buf()); + + panic!( + "JAR file not found at: {}\n\ + Absolute path: {}\n\ + Please compile the Java code first by running:\n\ + cd datafusion/physical-plan/benches/jvm\n\ + mvn package\n\ + cd ../../../../", + classpath, + absolute_path.display() + ); + } + + let classpath_option = format!("-Djava.class.path={}", classpath); + + let jvm_args = InitArgsBuilder::new() + .option(&classpath_option) + .option("--add-opens=java.base/java.nio=ALL-UNNAMED") + .build() + .expect("Failed to build JVM arguments"); + + let jvm = JavaVM::new(jvm_args) + .expect("Failed to create JVM. Ensure JAVA_HOME is set and Java is installed."); + + *jvm_ptr = Some(jvm); + } +} + +/// Get JVM instance (panics if not initialized) +fn get_jvm() -> &'static JavaVM { + unsafe { + let jvm_ptr = std::ptr::addr_of!(JVM); + (*jvm_ptr) + .as_ref() + .expect("JVM not initialized. 
Call init_jvm() first.") + } +} + +// ============================================================================ +// Java UDF Wrapper - Scalar UDF Implementation +// ============================================================================ + +/// Calls Java evaluatePredicate method with Arrow FFI pointers for a single column. +/// +/// This function: +/// 1. Wraps Int32Array in a RecordBatch and converts to FFI pointers +/// 2. Calls Java Udf.evaluatePredicate(schemaPtr, arrayPtr) via JNI +/// 3. Receives result pointers from Java (boolean array in a RecordBatch) +/// 4. Converts result back to BooleanArray using arrow::ffi::from_ffi() +/// +/// # Memory Management +/// - Input FFI pointers are released after Java imports the data +/// - Output FFI pointers are managed by Arrow's Drop implementation +/// - Java is responsible for releasing exported data after Rust imports it +fn call_java_predicate(int_array: &Int32Array) -> Result { + use arrow::datatypes::{Schema, Field}; + use arrow::record_batch::RecordBatch; + + // Create a schema for the input (single Int32 column) + let input_schema = Schema::new(vec![Field::new("colint", DataType::Int32, true)]); + + // Create a RecordBatch with the Int32Array + let record_batch = RecordBatch::try_new( + Arc::new(input_schema), + vec![Arc::new(int_array.clone()) as Arc], + )?; + + // Convert RecordBatch to FFI pointers via StructArray + let struct_array: StructArray = record_batch.into(); + let (ffi_array, ffi_schema) = to_ffi(&struct_array.to_data())?; + + // Get raw pointers for JNI call + let schema_ptr = &ffi_schema as *const FFI_ArrowSchema as jlong; + let array_ptr = &ffi_array as *const FFI_ArrowArray as jlong; + + // Call Java UDF via JNI + let jvm = get_jvm(); + let mut env = jvm.attach_current_thread() + .map_err(|e| datafusion_common::DataFusionError::Execution( + format!("Failed to attach JVM thread: {}", e) + ))?; + + // Call static method: Udf.evaluatePredicate(long, long) -> long[] + let result_ptrs = 
env.call_static_method( + "org/apache/datafusion/benchmark/Udf", + "evaluatePredicate", + "(JJ)[J", + &[JValue::Long(schema_ptr), JValue::Long(array_ptr)], + ).map_err(|e| datafusion_common::DataFusionError::Execution( + format!("Java UDF call failed: {}", e) + ))?; + + // Extract the long[] result containing [schemaPtr, arrayPtr] + let result_array = result_ptrs.l() + .map_err(|e| datafusion_common::DataFusionError::Execution( + format!("Failed to extract result array: {}", e) + ))?; + + let result_array = JLongArray::from(result_array); + + // Get the two pointers from the result array + let mut ptrs = [0i64; 2]; + env.get_long_array_region(&result_array, 0, &mut ptrs) + .map_err(|e| datafusion_common::DataFusionError::Execution( + format!("Failed to read result pointers: {}", e) + ))?; + + let result_schema_ptr = ptrs[0] as *mut FFI_ArrowSchema; + let result_array_ptr = ptrs[1] as *mut FFI_ArrowArray; + + // Safety: We trust that Java has allocated valid FFI structures + // The from_ffi call will take ownership and handle cleanup via release callbacks + let result_array_data = unsafe { + let result_ffi_schema = FFI_ArrowSchema::from_raw(result_schema_ptr); + let result_ffi_array = FFI_ArrowArray::from_raw(result_array_ptr); + from_ffi(result_ffi_array, &result_ffi_schema)? 
+ }; + + // Java returns a VectorSchemaRoot (struct type) with one boolean column + // We need to extract the child array directly from the ArrayData + if !matches!(result_array_data.data_type(), DataType::Struct(_)) { + return Err(datafusion_common::DataFusionError::Execution( + format!("Expected Struct type from Java, got {:?}", result_array_data.data_type()) + )); + } + + // Get the first child array data (the boolean column) directly + // We don't use StructArray::from() because it tries to slice child arrays + // based on the struct's offset/length, which can cause issues + let child_data = result_array_data.child_data().get(0) + .ok_or_else(|| datafusion_common::DataFusionError::Execution( + "Expected at least one child in result struct".to_string() + ))?; + + // Construct the BooleanArray directly from the child ArrayData + let boolean_array = BooleanArray::from(child_data.clone()); + + Ok(boolean_array) +} + +/// Create a scalar UDF that wraps the Java predicate function +fn create_java_predicate_udf() -> ScalarFunctionImplementation { + Arc::new(move |args: &[ColumnarValue]| -> Result { + // Extract the Int32Array from the input + let int_array = match &args[0] { + ColumnarValue::Array(arr) => arr + .as_any() + .downcast_ref::() + .expect("Expected Int32Array") + .clone(), + ColumnarValue::Scalar(ScalarValue::Int32(Some(val))) => { + // Single value - create array with one element + Int32Array::from(vec![*val]) + } + ColumnarValue::Scalar(ScalarValue::Int32(None)) => { + // Null value - create array with one null + Int32Array::from(vec![None as Option]) + } + _ => { + return Err(datafusion_common::DataFusionError::Execution( + "Expected Int32 input".to_string(), + )) + } + }; + + // Call Java predicate + let result_array = call_java_predicate(&int_array)?; + + Ok(ColumnarValue::Array(Arc::new(result_array))) + }) +} + +// ============================================================================ +// Filter Plan Creation with Java UDF +// 
============================================================================ + +/// Creates a FilterExec plan that uses a Java UDF for the predicate evaluation. +/// +/// This function creates a standard DataFusion FilterExec that applies the +/// Java UDF predicate (colInt > 2500) via JNI. +fn create_java_filter_plan(input: Arc) -> Result> { + use datafusion_expr::create_udf; + use datafusion_physical_expr::create_physical_expr; + use datafusion_expr::Expr; + use datafusion_common::DFSchema; + use datafusion_expr::execution_props::ExecutionProps; + + let schema = input.schema(); + + // Create the Java predicate UDF + let java_udf_impl = create_java_predicate_udf(); + + // Create UDF with signature + let java_udf = create_udf( + "java_gt_2500", + vec![DataType::Int32], + DataType::Boolean, + Volatility::Immutable, + java_udf_impl, + ); + + // Create the expression: java_gt_2500(colint) + let col_expr = col("colint"); + let udf_expr = Expr::ScalarFunction(datafusion_expr::expr::ScalarFunction::new_udf( + Arc::new(java_udf), + vec![col_expr], + )); + + // Convert logical expression to physical expression + let df_schema = DFSchema::try_from(schema.as_ref().clone())?; + let execution_props = ExecutionProps::new(); + let physical_expr = create_physical_expr( + &udf_expr, + &df_schema, + &execution_props, + )?; + + // Create FilterExec with the Java UDF predicate + Ok(Arc::new(FilterExec::try_new(physical_expr, input)?)) +} + +// ============================================================================ +// Benchmark Implementation +// ============================================================================ + +/// Main benchmark function for JNI filter execution. +/// +/// This benchmark measures two scenarios: +/// +/// 1. **filter_only**: Filter execution only (using Java UDF) +/// - Isolates the JNI/FFI overhead and Java filter performance +/// - Uses pre-generated batches directly (no deserialization) +/// +/// 2. 
**full_pipeline**: Complete deser + Java filter + output serialization +/// - Real-world end-to-end latency including JNI overhead +/// - Relevant for understanding total cost of Java UDF integration +fn bench_filter(c: &mut Criterion) { + // Initialize JVM once before all benchmarks + init_jvm(); + + // Create a single-threaded Tokio runtime for async execution + let rt = tokio::runtime::Builder::new_current_thread() + .build() + .unwrap(); + + let mut group = c.benchmark_group("filter_jni_benchmark"); + group.sampling_mode(SamplingMode::Flat); + + // Configuration: 1M rows total (10K rows × 100 batches) + let rows_per_batch = 10_000; + let num_batches = 100; + let total_rows = rows_per_batch * num_batches; + + // Test different binary column sizes to understand serialization overhead + let binary_sizes = vec![10, 1024, 2048]; + + for binary_size in binary_sizes { + let label = format!("1M_rows_binary_{binary_size}B"); + + // Generate test data and serialize to IPC format + let schema = create_schema(); + let mut generator = + FunctionalBatchGenerator::new(Arc::clone(&schema), rows_per_batch, num_batches, binary_size); + let batches = generator.generate_batches(); + let ipc_data = serialize_to_ipc(&batches, &schema); + let ipc_size = ipc_data.len(); + let ipc_buffer = Buffer::from_vec(ipc_data); + + // Log configuration + println!( + "Config: {} rows, binary_size={} bytes, IPC size={:.2} MB", + total_rows, + binary_size, + ipc_size as f64 / (1024.0 * 1024.0) + ); + + group.throughput(Throughput::Bytes(ipc_size as u64)); + + // Benchmark 2: Full pipeline (deser + Java filter + output serialization) + group.bench_with_input( + BenchmarkId::new("full_pipeline", &label), + &ipc_buffer, + |b, ipc_buffer| { + b.iter(|| { + rt.block_on(async { + let (schema, batches) = deserialize_zero_copy(ipc_buffer); + let source = Arc::new(BatchSourceExec::new(Arc::clone(&schema), batches)) + as Arc; + let plan = create_java_filter_plan(source).unwrap(); + let task_ctx = 
Arc::new(TaskContext::default()); + let results = collect(plan, task_ctx).await.unwrap(); + black_box(serialize_batches_to_sink(&results, &schema)) + }) + }) + }, + ); + } + + group.finish(); +} + +criterion_group!(benches, bench_filter); +criterion_main!(benches); + +// ============================================================================ +// Tests +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + + /// Test that the Java filter returns the correct number of rows. + /// + /// The filter predicate is `colInt > 2500`, and colInt values follow the pattern `i % 5000`. + /// This means: + /// - Values range from 0 to 4999 + /// - Values > 2500 are: 2501, 2502, ..., 4999 (2499 values) + /// - Expected selectivity: 2499/5000 = 49.98% + /// + /// For 1M total rows (100 batches × 10K rows), we expect: + /// - Filtered rows: 1,000,000 × 0.4998 = 499,800 rows + #[test] + fn test_java_filter_row_count() { + // Initialize JVM + init_jvm(); + + // Create Tokio runtime + let rt = tokio::runtime::Builder::new_current_thread() + .build() + .unwrap(); + + // Configuration matching the benchmark + let rows_per_batch = 10_000; + let num_batches = 100; + let total_rows = rows_per_batch * num_batches; + let binary_size = 10; + + // Generate test data + let schema = create_schema(); + let mut generator = + FunctionalBatchGenerator::new(Arc::clone(&schema), rows_per_batch, num_batches, binary_size); + let batches = generator.generate_batches(); + + // Create execution plan with Java filter + let source = Arc::new(BatchSourceExec::new(Arc::clone(&schema), batches)) + as Arc; + let plan = create_java_filter_plan(source).unwrap(); + + // Execute the plan + let task_ctx = Arc::new(TaskContext::default()); + let results = rt.block_on(async { + collect(plan, task_ctx).await.unwrap() + }); + + // Count total rows in results + let filtered_row_count: usize = results.iter().map(|batch| batch.num_rows()).sum(); + 
+
+        // Calculate expected count
+        // colInt values: i % 5000, so values are 0..4999
+        // Filter: colInt > 2500, so we keep 2501..4999 = 2499 values per 5000
+        // Expected: (total_rows / 5000) * 2499
+        let expected_count = (total_rows / 5000) * 2499;
+
+        assert_eq!(
+            filtered_row_count, expected_count,
+            "Java filter returned {} rows, expected {} rows ({}% selectivity)",
+            filtered_row_count,
+            expected_count,
+            (expected_count as f64 / total_rows as f64) * 100.0
+        );
+
+        println!(
+            "✓ Java filter correctness test passed: {} rows filtered from {} total rows ({:.2}% selectivity)",
+            filtered_row_count,
+            total_rows,
+            (filtered_row_count as f64 / total_rows as f64) * 100.0
+        );
+    }
+
+    /// Test that the Java filter produces the same results as the expected filter logic
+    /// by verifying that all returned values actually satisfy the predicate.
+    #[test]
+    fn test_java_filter_correctness() {
+        // Initialize JVM
+        init_jvm();
+
+        // Create Tokio runtime
+        let rt = tokio::runtime::Builder::new_current_thread()
+            .build()
+            .unwrap();
+
+        // Use smaller dataset for detailed validation
+        let rows_per_batch = 1_000;
+        let num_batches = 10;
+        let binary_size = 10;
+
+        // Generate test data
+        let schema = create_schema();
+        let mut generator =
+            FunctionalBatchGenerator::new(Arc::clone(&schema), rows_per_batch, num_batches, binary_size);
+        let batches = generator.generate_batches();
+
+        // Create execution plan with Java filter
+        let source = Arc::new(BatchSourceExec::new(Arc::clone(&schema), batches))
+            as Arc<dyn ExecutionPlan>;
+        let plan = create_java_filter_plan(source).unwrap();
+
+        // Execute the plan
+        let task_ctx = Arc::new(TaskContext::default());
+        let results = rt.block_on(async {
+            collect(plan, task_ctx).await.unwrap()
+        });
+
+        // Verify all returned rows satisfy the predicate: colInt > 2500
+        for (batch_idx, batch) in results.iter().enumerate() {
+            let colint_array = batch
+                .column(0)
+                .as_any()
+                .downcast_ref::<Int32Array>()
+                .expect("Expected Int32Array for colint");
+
+            for row_idx
in 0..colint_array.len() { + let value = colint_array.value(row_idx); + assert!( + value > 2500, + "Batch {}, row {}: expected value > 2500, got {}", + batch_idx, + row_idx, + value + ); + } + } + + let total_filtered_rows: usize = results.iter().map(|batch| batch.num_rows()).sum(); + println!( + "✓ Java filter correctness test passed: all {} filtered rows satisfy colInt > 2500", + total_filtered_rows + ); + } +} + diff --git a/datafusion/physical-plan/benches/hash_join_bench.rs b/datafusion/physical-plan/benches/hash_join_bench.rs new file mode 100644 index 000000000000..c3d3491626ba --- /dev/null +++ b/datafusion/physical-plan/benches/hash_join_bench.rs @@ -0,0 +1,314 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmark for DataFusion HashJoinExec (inner equi-join). +//! +//! This benchmark measures the performance of hash-based inner joins with +//! varying match rates, key repetition patterns, and data sizes. +//! +//! ## Test configurations: +//! +//! - **Binary sizes**: 10B, 4096B (affects left side row size) +//! - **Match rate**: 0.5, 1.0 (fraction of left keys that match right keys) +//! - **Repeated right keys**: 1, 10, 100 (join fan-out factor) +//! +//! ## Data setup: +//! +//! 
- **Left side (probe)**: 1M rows (100 batches × 10K rows) +//! - Schema: colInt, colLong, colFloat, colDouble, colString, colBinary +//! - colInt values: 0-4999 (using `i % 5000` pattern) +//! +//! - **Right side (build)**: Variable size based on parameters +//! - Schema: colInt (single column) +//! - colInt values: 0 to (matchRate × 5000 - 1) +//! - Each key repeated `repeatedRightKeys` times +//! - This smaller side is used to build the hash table +//! +//! ## Running the benchmark +//! +//! ```bash +//! # Run all configurations +//! cargo bench --bench hash_join_bench -p datafusion-physical-plan +//! +//! # Run with fewer samples for quick testing +//! cargo bench --bench hash_join_bench -p datafusion-physical-plan -- --sample-size 10 +//! +//! # Run specific configuration +//! cargo bench --bench hash_join_bench -p datafusion-physical-plan -- "binary_10B" +//! cargo bench --bench hash_join_bench -p datafusion-physical-plan -- "match_1.0" +//! ``` + +// Include shared benchmark utilities +#[path = "bench_utils.rs"] +mod bench_utils; + +use std::hint::black_box; +use std::sync::Arc; + +use arrow::buffer::Buffer; +use criterion::{ + BatchSize, BenchmarkId, Criterion, SamplingMode, Throughput, criterion_group, criterion_main, +}; +use datafusion_common::{JoinType, NullEquality}; +use datafusion_execution::TaskContext; +use datafusion_physical_expr::expressions::Column; +use datafusion_physical_expr::PhysicalExpr; +use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode}; +use datafusion_physical_plan::{ExecutionPlan, collect}; + +use bench_utils::{ + BatchSourceExec, FunctionalBatchGenerator, JoinBuildSideGenerator, create_schema, + deserialize_zero_copy, serialize_results_to_ipc, serialize_to_ipc, +}; + +// ============================================================================ +// Hash Join Plan Creation +// ============================================================================ + +/// Creates a HashJoinExec that performs an inner equi-join on 
colInt. +/// +/// The join is: `probe.colInt = build.colInt` +/// +/// In DataFusion's HashJoinExec: +/// - First argument (left) is the BUILD side (gets hashed into hash table) +/// - Second argument (right) is the PROBE side (scans and probes hash table) +/// +/// # Arguments +/// * `probe` - Probe side execution plan (larger, 1M rows) +/// * `build` - Build side execution plan (smaller, variable rows) +/// +/// # Returns +/// A HashJoinExec wrapped in Arc +fn create_hash_join_plan( + probe: Arc, + build: Arc, +) -> Arc { + let probe_schema = probe.schema(); + let build_schema = build.schema(); + + // Build join condition: build.colInt = probe.colInt + // Note: In HashJoinExec, the "on" condition is (left_col, right_col) = (build_col, probe_col) + let build_col = Arc::new(Column::new_with_schema("colInt", &build_schema).unwrap()) + as Arc; + let probe_col = Arc::new(Column::new_with_schema("colInt", &probe_schema).unwrap()) + as Arc; + + let on = vec![(build_col, probe_col)]; + + Arc::new( + HashJoinExec::try_new( + build, // Left = build side (gets hashed) + probe, // Right = probe side (probes hash table) + on, + None, // No additional filter + &JoinType::Inner, // Inner join + None, // No projection + PartitionMode::CollectLeft, // Build hash table from left (build) side + NullEquality::NullEqualsNothing, // NULLs don't match + false, // Not null-aware + ) + .unwrap(), + ) +} + +// ============================================================================ +// Benchmark Implementation +// ============================================================================ + +/// Benchmark configurations for different scenarios. 
+#[derive(Debug, Clone)] +struct BenchConfig { + /// Binary column size for left side (affects row size) + binary_size: usize, + /// Match rate: fraction of left keys that match right keys + match_rate: f64, + /// Number of times each right key is repeated (fan-out factor) + repeated_right_keys: usize, +} + +impl BenchConfig { + fn label(&self) -> String { + format!( + "binary_{}B/match_{}/repeat_{}", + self.binary_size, self.match_rate, self.repeated_right_keys + ) + } +} + +/// Main benchmark function for hash join execution. +/// +/// This benchmark measures inner equi-join performance across: +/// - Different left side row sizes (binary column 10B vs 4096B) +/// - Different match rates (0.5 vs 1.0) +/// - Different fan-out factors (repeated right keys: 1, 10, 100) +/// +/// For each configuration, we measure: +/// - **join_only**: Pure HashJoinExec performance using pre-generated batches +/// - **full_pipeline**: Complete deser + join + output serialization +fn bench_hash_join(c: &mut Criterion) { + // Create a single-threaded Tokio runtime for async execution. + // We use current_thread to ensure all async work runs on the benchmark thread, + // making results comparable to single-threaded Java benchmarks. 
+ let rt = tokio::runtime::Builder::new_current_thread() + .build() + .unwrap(); + let mut group = c.benchmark_group("hash_join_bench"); + + // Use flat sampling to collect exactly the requested samples without time constraints + group.sampling_mode(SamplingMode::Flat); + + // Probe side configuration: 1M rows total (10K rows × 100 batches) + let rows_per_batch = 10_000; + let num_batches = 1; + let total_probe_rows = rows_per_batch * num_batches; + + // Generate all benchmark configurations + let configs: Vec = vec![ + // Binary size 10B configurations + BenchConfig { binary_size: 10, match_rate: 0.5, repeated_right_keys: 1 }, + BenchConfig { binary_size: 10, match_rate: 0.5, repeated_right_keys: 10 }, + BenchConfig { binary_size: 10, match_rate: 0.5, repeated_right_keys: 100 }, + BenchConfig { binary_size: 10, match_rate: 1.0, repeated_right_keys: 1 }, + BenchConfig { binary_size: 10, match_rate: 1.0, repeated_right_keys: 10 }, + BenchConfig { binary_size: 10, match_rate: 1.0, repeated_right_keys: 100 }, + // Binary size 4096B configurations + BenchConfig { binary_size: 4096, match_rate: 0.5, repeated_right_keys: 1 }, + BenchConfig { binary_size: 4096, match_rate: 0.5, repeated_right_keys: 10 }, + BenchConfig { binary_size: 4096, match_rate: 0.5, repeated_right_keys: 100 }, + BenchConfig { binary_size: 4096, match_rate: 1.0, repeated_right_keys: 1 }, + BenchConfig { binary_size: 4096, match_rate: 1.0, repeated_right_keys: 10 }, + BenchConfig { binary_size: 4096, match_rate: 1.0, repeated_right_keys: 100 }, + ]; + + for config in &configs { + let label = config.label(); + + // Generate probe side data (left side - larger, 1M rows) + let probe_schema = create_schema(); + let mut probe_generator = FunctionalBatchGenerator::new( + Arc::clone(&probe_schema), + rows_per_batch, + num_batches, + config.binary_size, + ); + let probe_batches = probe_generator.generate_batches(); + + // Generate build side data (smaller, variable rows - used for hash table) + let 
build_generator = + JoinBuildSideGenerator::new(config.match_rate, config.repeated_right_keys); + let build_schema = build_generator.schema(); + let build_batches = build_generator.generate_batches(); + let total_build_rows = build_generator.total_rows(); + + // Serialize batches to IPC format for full pipeline benchmark + let probe_ipc_data = serialize_to_ipc(&probe_batches, &probe_schema); + let build_ipc_data = serialize_to_ipc(&build_batches, &build_schema); + let total_ipc_size = probe_ipc_data.len() + build_ipc_data.len(); + + // Calculate approximate data sizes + let probe_data_size: usize = + probe_batches.iter().map(|b| b.get_array_memory_size()).sum(); + let build_data_size: usize = + build_batches.iter().map(|b| b.get_array_memory_size()).sum(); + + // Log configuration for visibility in benchmark output + println!( + "Config: {}, probe={} rows ({:.2} MB), build={} rows ({:.2} MB), IPC={:.2} MB", + label, + total_probe_rows, + probe_data_size as f64 / (1024.0 * 1024.0), + total_build_rows, + build_data_size as f64 / (1024.0 * 1024.0), + total_ipc_size as f64 / (1024.0 * 1024.0) + ); + + // Set throughput metric for bytes/second calculations (based on input size) + group.throughput(Throughput::Bytes(total_ipc_size as u64)); + + // Benchmark 1: Join execution only + // Uses pre-generated batches directly, isolating HashJoinExec performance + // group.bench_with_input( + // BenchmarkId::new("join_only", &label), + // &(&probe_batches, &build_batches), + // |b, (probe_batches, build_batches)| { + // b.iter_batched( + // // Setup: clone batches (NOT timed) - needed because execution consumes them + // || ((*probe_batches).clone(), (*build_batches).clone()), + // // Benchmark: execute join (TIMED) + // |(probe_batches, build_batches)| { + // rt.block_on(async { + // let probe_source = Arc::new(BatchSourceExec::new( + // Arc::clone(&probe_schema), + // probe_batches, + // )) as Arc; + // let build_source = Arc::new(BatchSourceExec::new( + // 
Arc::clone(&build_schema), + // build_batches, + // )) as Arc; + // let plan = create_hash_join_plan(probe_source, build_source); + // let task_ctx = Arc::new(TaskContext::default()); + // let results = collect(plan, task_ctx).await.unwrap(); + // black_box(results) + // }) + // }, + // BatchSize::SmallInput, + // ) + // }, + // ); + + // Convert to Buffer for zero-copy deserialization + let probe_buffer = Buffer::from_vec(probe_ipc_data.clone()); + let build_buffer = Buffer::from_vec(build_ipc_data.clone()); + + // Benchmark 2: Full pipeline (deser + join + output serialization) + // Measures complete round-trip: IPC in -> join -> IPC out + group.bench_with_input( + BenchmarkId::new("full_pipeline", &label), + &(&probe_buffer, &build_buffer), + |b, (probe_buffer, build_buffer)| { + b.iter(|| { + rt.block_on(async { + let (probe_schema, probe_batches) = deserialize_zero_copy(probe_buffer); + let (build_schema, build_batches) = deserialize_zero_copy(build_buffer); + + let probe_source = Arc::new(BatchSourceExec::new( + Arc::clone(&probe_schema), + probe_batches, + )) as Arc; + let build_source = Arc::new(BatchSourceExec::new( + Arc::clone(&build_schema), + build_batches, + )) as Arc; + + let plan = create_hash_join_plan(probe_source, build_source); + let task_ctx = Arc::new(TaskContext::default()); + let results = collect(plan, task_ctx).await.unwrap(); + + // Serialize results back to IPC format + let output_ipc = serialize_results_to_ipc(&results); + black_box(output_ipc) + }) + }) + }, + ); + } + + group.finish(); +} + +criterion_group!(benches, bench_hash_join); +criterion_main!(benches); diff --git a/datafusion/physical-plan/benches/hash_join_by_type.rs b/datafusion/physical-plan/benches/hash_join_by_type.rs new file mode 100644 index 000000000000..d0c4a7e96791 --- /dev/null +++ b/datafusion/physical-plan/benches/hash_join_by_type.rs @@ -0,0 +1,508 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmark for DataFusion HashJoinExec comparing different column types. +//! +//! This benchmark measures the performance of hash-based inner joins across +//! different column types (Int32, Utf8, Utf8View, Dictionary-encoded strings). +//! +//! ## Test configurations: +//! +//! - **Column types**: Int32, Utf8, Utf8View, Dictionary(Int16, Utf8), Dictionary(Int16, Utf8View) +//! - **Match rate**: 0.5, 1.0 (fraction of left keys that match right keys) +//! - **Repeated right keys**: 1, 10 (join fan-out factor) +//! +//! ## Data setup: +//! +//! - **Left side (probe)**: 10K rows (1 batch × 10K rows) +//! - Schema: colInt, colLong, colFloat, colDouble, colString, colBinary +//! - Binary column: 10 bytes (small rows) +//! - colInt values: 0-4999 (using `i % 5000` pattern) +//! - colString values: "str_0000" to "str_4999" (9 bytes each, 5000 distinct values) +//! +//! - **Right side (build)**: Variable size based on parameters +//! - Schema: Single column (colInt or colString depending on join type) +//! - Values: 0 to (matchRate × 5000 - 1) for Int, "str_0000" to "str_{matchRate × 5000 - 1}" for String +//! - Each key repeated `repeatedRightKeys` times +//! - This smaller side is used to build the hash table +//! +//! ## Column type details: +//! +//! 
- **Int**: Standard Int32 join on colInt +//! - **String**: Utf8 join on colString (9-byte strings) +//! - **StringView**: Utf8View join on colString (optimized for strings ≤12 bytes) +//! - **DictionaryString**: Dictionary(Int16, Utf8) join on colString +//! - **DictionaryStringView**: Dictionary(Int16, Utf8View) join on colString +//! +//! ## Running the benchmark +//! +//! ```bash +//! # Run all configurations +//! cargo bench --bench hash_join_by_type -p datafusion-physical-plan +//! +//! # Run with fewer samples for quick testing +//! cargo bench --bench hash_join_by_type -p datafusion-physical-plan -- --sample-size 10 +//! +//! # Run specific column type +//! cargo bench --bench hash_join_by_type -p datafusion-physical-plan -- "joinInt" +//! cargo bench --bench hash_join_by_type -p datafusion-physical-plan -- "joinStr" +//! cargo bench --bench hash_join_by_type -p datafusion-physical-plan -- "joinStrView" +//! +//! # Run specific configuration +//! cargo bench --bench hash_join_by_type -p datafusion-physical-plan -- "match_1.0" +//! 
``` + +// Include shared benchmark utilities +#[path = "bench_utils.rs"] +mod bench_utils; + +use std::hint::black_box; +use std::sync::Arc; + +use arrow::buffer::Buffer; +use criterion::{ + BenchmarkId, Criterion, SamplingMode, Throughput, criterion_group, criterion_main, +}; +use datafusion_common::{JoinType, NullEquality}; +use datafusion_execution::TaskContext; +use datafusion_physical_expr::expressions::Column; +use datafusion_physical_expr::PhysicalExpr; +use datafusion_physical_plan::joins::{HashJoinExec, PartitionMode}; +use datafusion_physical_plan::{ExecutionPlan, collect}; + +use bench_utils::{ + BatchSourceExec, FunctionalBatchGenerator, JoinBuildSideGenerator, JoinColumnType, + StringColumnType, create_schema_with_string_type, deserialize_zero_copy, + serialize_results_to_ipc, serialize_to_ipc, +}; + +// ============================================================================ +// Hash Join Plan Creation +// ============================================================================ + +/// Creates a HashJoinExec that performs an inner equi-join. 
+/// +/// In DataFusion's HashJoinExec: +/// - First argument (left) is the BUILD side (gets hashed into hash table) +/// - Second argument (right) is the PROBE side (scans and probes hash table) +/// +/// # Arguments +/// * `probe` - Probe side execution plan (larger, 10K rows) +/// * `build` - Build side execution plan (smaller, variable rows) +/// * `join_col` - Name of the column to join on ("colInt" or "colString") +/// +/// # Returns +/// A HashJoinExec wrapped in Arc +fn create_hash_join_plan( + probe: Arc, + build: Arc, + join_col: &str, +) -> Arc { + let probe_schema = probe.schema(); + let build_schema = build.schema(); + + // Build join condition: build.{join_col} = probe.{join_col} + // Note: In HashJoinExec, the "on" condition is (left_col, right_col) = (build_col, probe_col) + let build_col = Arc::new(Column::new_with_schema(join_col, &build_schema).unwrap()) + as Arc; + let probe_col = Arc::new(Column::new_with_schema(join_col, &probe_schema).unwrap()) + as Arc; + + let on = vec![(build_col, probe_col)]; + + Arc::new( + HashJoinExec::try_new( + build, // Left = build side (gets hashed) + probe, // Right = probe side (probes hash table) + on, + None, // No additional filter + &JoinType::Inner, // Inner join + None, // No projection + PartitionMode::CollectLeft, // Build hash table from left (build) side + NullEquality::NullEqualsNothing, // NULLs don't match + false, // Not null-aware + ) + .unwrap(), + ) +} + +// ============================================================================ +// Benchmark Implementation +// ============================================================================ + +/// Benchmark configurations for different scenarios. 
+#[derive(Debug, Clone)] +struct BenchConfig { + /// Match rate: fraction of left keys that match right keys + match_rate: f64, + /// Number of times each right key is repeated (fan-out factor) + repeated_right_keys: usize, + /// Type of column to join on + join_column_type: JoinColumnType, +} + +impl BenchConfig { + fn label(&self) -> String { + format!( + "match_{}/repeat_{}", + self.match_rate, self.repeated_right_keys + ) + } + + fn benchmark_name(&self) -> &'static str { + match self.join_column_type { + JoinColumnType::Int => "joinInt", + JoinColumnType::String => "joinStr", + JoinColumnType::StringView => "joinStrView", + JoinColumnType::DictionaryString => "joinDictStr", + JoinColumnType::DictionaryStringView => "joinDictStrView", + } + } + + fn join_column_name(&self) -> &'static str { + match self.join_column_type { + JoinColumnType::Int => "colint", + JoinColumnType::String + | JoinColumnType::StringView + | JoinColumnType::DictionaryString + | JoinColumnType::DictionaryStringView => "colstring", + } + } + + fn string_column_type(&self) -> StringColumnType { + match self.join_column_type { + JoinColumnType::Int => StringColumnType::Utf8, // Not used for Int joins + JoinColumnType::String => StringColumnType::Utf8, + JoinColumnType::StringView => StringColumnType::Utf8View, + JoinColumnType::DictionaryString => StringColumnType::DictionaryUtf8, + JoinColumnType::DictionaryStringView => StringColumnType::DictionaryUtf8View, + } + } +} + +/// Main benchmark function for hash join execution across different column types. 
+/// +/// This benchmark measures inner equi-join performance across: +/// - Different column types (Int32, Utf8, Utf8View, Dictionary variants) +/// - Different match rates (0.5 vs 1.0) +/// - Different fan-out factors (repeated right keys: 1, 10) +/// +/// For each configuration, we measure the complete pipeline: +/// deser + join + output serialization +fn bench_hash_join_by_type(c: &mut Criterion) { + // Create a single-threaded Tokio runtime for async execution. + // We use current_thread to ensure all async work runs on the benchmark thread, + // making results comparable to single-threaded Java benchmarks. + let rt = tokio::runtime::Builder::new_current_thread() + .build() + .unwrap(); + let mut group = c.benchmark_group("hash_join_by_type"); + + // Use flat sampling to collect exactly the requested samples without time constraints + group.sampling_mode(SamplingMode::Flat); + + // Probe side configuration: 10K rows total (10K rows × 1 batch) + let rows_per_batch = 10_000; + let num_batches = 1; + let total_probe_rows = rows_per_batch * num_batches; + let binary_size = 10; // Small binary column (10 bytes) + + // Generate all benchmark configurations + let mut configs: Vec = Vec::new(); + + // For each base configuration (match_rate × repeated_keys) + for match_rate in &[0.5, 1.0] { + for repeated_right_keys in &[1, 10] { + // Create a config for each column type + for column_type in &[ + JoinColumnType::Int, + JoinColumnType::String, + JoinColumnType::StringView, + JoinColumnType::DictionaryString, + JoinColumnType::DictionaryStringView, + ] { + configs.push(BenchConfig { + match_rate: *match_rate, + repeated_right_keys: *repeated_right_keys, + join_column_type: *column_type, + }); + } + } + } + + for config in &configs { + let label = config.label(); + let benchmark_name = config.benchmark_name(); + let join_column_name = config.join_column_name(); + + // Generate probe side data + let probe_schema = 
create_schema_with_string_type(config.string_column_type()); + let mut probe_generator = FunctionalBatchGenerator::new_with_string_type( + Arc::clone(&probe_schema), + rows_per_batch, + num_batches, + binary_size, + config.string_column_type(), + ); + let probe_batches = probe_generator.generate_batches(); + + // Generate build side data + let build_generator = JoinBuildSideGenerator::new_with_column_type( + config.match_rate, + config.repeated_right_keys, + config.join_column_type, + ); + let build_schema = build_generator.schema(); + let build_batches = build_generator.generate_batches(); + let total_build_rows = build_generator.total_rows(); + + // Serialize batches to IPC format for full pipeline benchmark + let probe_ipc_data = serialize_to_ipc(&probe_batches, &probe_schema); + let build_ipc_data = serialize_to_ipc(&build_batches, &build_schema); + let total_ipc_size = probe_ipc_data.len() + build_ipc_data.len(); + + // Calculate approximate data sizes + let probe_data_size: usize = + probe_batches.iter().map(|b| b.get_array_memory_size()).sum(); + let build_data_size: usize = + build_batches.iter().map(|b| b.get_array_memory_size()).sum(); + + // Log configuration for visibility in benchmark output + println!( + "Config: {} ({}), probe={} rows ({:.2} MB), build={} rows ({:.2} MB), IPC={:.2} MB", + benchmark_name, + label, + total_probe_rows, + probe_data_size as f64 / (1024.0 * 1024.0), + total_build_rows, + build_data_size as f64 / (1024.0 * 1024.0), + total_ipc_size as f64 / (1024.0 * 1024.0) + ); + + // Set throughput metric for bytes/second calculations (based on input size) + group.throughput(Throughput::Bytes(total_ipc_size as u64)); + + // Convert to Buffer for zero-copy deserialization + let probe_buffer = Buffer::from_vec(probe_ipc_data.clone()); + let build_buffer = Buffer::from_vec(build_ipc_data.clone()); + + // Benchmark: Full pipeline (deser + join + output serialization) + // Measures complete round-trip: IPC in -> join -> IPC out + 
group.bench_with_input( + BenchmarkId::new(benchmark_name, &label), + &(&probe_buffer, &build_buffer), + |b, (probe_buffer, build_buffer)| { + b.iter(|| { + rt.block_on(async { + let (probe_schema, probe_batches) = deserialize_zero_copy(probe_buffer); + let (build_schema, build_batches) = deserialize_zero_copy(build_buffer); + + let probe_source = Arc::new(BatchSourceExec::new( + Arc::clone(&probe_schema), + probe_batches, + )) as Arc; + let build_source = Arc::new(BatchSourceExec::new( + Arc::clone(&build_schema), + build_batches, + )) as Arc; + + let plan = create_hash_join_plan(probe_source, build_source, join_column_name); + let task_ctx = Arc::new(TaskContext::default()); + let results = collect(plan, task_ctx).await.unwrap(); + + // Serialize results back to IPC format + let output_ipc = serialize_results_to_ipc(&results); + black_box(output_ipc) + }) + }) + }, + ); + } + + group.finish(); +} + +criterion_group!(benches, bench_hash_join_by_type); +criterion_main!(benches); + +// ============================================================================ +// Tests +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + + /// Calculates the expected number of rows from a hash join. 
+ /// + /// For an inner join: + /// - Probe side: 10K rows with 5000 distinct keys (each key appears 2 times) + /// - Build side: (match_rate * 5000) distinct keys, each repeated `repeated_keys` times + /// + /// Expected output rows: + /// - For each matching key, output = probe_occurrences * build_occurrences + /// - Total = matching_keys * 2 * repeated_keys + fn expected_row_count(match_rate: f64, repeated_keys: usize) -> usize { + let total_probe_rows = 10_000; + let probe_distinct_keys = 5_000; + let probe_key_occurrences = total_probe_rows / probe_distinct_keys; // = 2 + + let build_distinct_keys = (match_rate * 5000.0) as usize; + + // Each matching key produces: probe_occurrences * build_occurrences rows + build_distinct_keys * probe_key_occurrences * repeated_keys + } + + /// Test helper to execute a join and verify row count + async fn test_join_row_count( + match_rate: f64, + repeated_keys: usize, + column_type: JoinColumnType, + ) { + let rows_per_batch = 10_000; + let num_batches = 1; + let binary_size = 10; + + let string_column_type = match column_type { + JoinColumnType::Int => StringColumnType::Utf8, + JoinColumnType::String => StringColumnType::Utf8, + JoinColumnType::StringView => StringColumnType::Utf8View, + JoinColumnType::DictionaryString => StringColumnType::DictionaryUtf8, + JoinColumnType::DictionaryStringView => StringColumnType::DictionaryUtf8View, + }; + + let join_column_name = match column_type { + JoinColumnType::Int => "colint", + _ => "colstring", + }; + + // Generate probe side data + let probe_schema = create_schema_with_string_type(string_column_type); + let mut probe_generator = FunctionalBatchGenerator::new_with_string_type( + Arc::clone(&probe_schema), + rows_per_batch, + num_batches, + binary_size, + string_column_type, + ); + let probe_batches = probe_generator.generate_batches(); + + // Generate build side data + let build_generator = JoinBuildSideGenerator::new_with_column_type( + match_rate, + repeated_keys, + 
column_type, + ); + let build_schema = build_generator.schema(); + let build_batches = build_generator.generate_batches(); + + // Execute join + let probe_source = Arc::new(BatchSourceExec::new( + Arc::clone(&probe_schema), + probe_batches, + )) as Arc; + let build_source = Arc::new(BatchSourceExec::new( + Arc::clone(&build_schema), + build_batches, + )) as Arc; + + let plan = create_hash_join_plan(probe_source, build_source, join_column_name); + let task_ctx = Arc::new(TaskContext::default()); + let results = collect(plan, task_ctx).await.unwrap(); + + // Calculate actual row count + let actual_row_count: usize = results.iter().map(|batch| batch.num_rows()).sum(); + let expected = expected_row_count(match_rate, repeated_keys); + + assert_eq!( + actual_row_count, expected, + "Row count mismatch for match_rate={}, repeated_keys={}, column_type={:?}. Expected {}, got {}", + match_rate, repeated_keys, column_type, expected, actual_row_count + ); + } + + #[tokio::test] + async fn test_join_row_counts_int() { + // Test Int column type with different configurations + test_join_row_count(0.5, 1, JoinColumnType::Int).await; + test_join_row_count(0.5, 10, JoinColumnType::Int).await; + test_join_row_count(1.0, 1, JoinColumnType::Int).await; + test_join_row_count(1.0, 10, JoinColumnType::Int).await; + } + + #[tokio::test] + async fn test_join_row_counts_string() { + // Test String column type with different configurations + test_join_row_count(0.5, 1, JoinColumnType::String).await; + test_join_row_count(0.5, 10, JoinColumnType::String).await; + test_join_row_count(1.0, 1, JoinColumnType::String).await; + test_join_row_count(1.0, 10, JoinColumnType::String).await; + } + + #[tokio::test] + async fn test_join_row_counts_string_view() { + // Test StringView column type with different configurations + test_join_row_count(0.5, 1, JoinColumnType::StringView).await; + test_join_row_count(0.5, 10, JoinColumnType::StringView).await; + test_join_row_count(1.0, 1, 
JoinColumnType::StringView).await; + test_join_row_count(1.0, 10, JoinColumnType::StringView).await; + } + + #[tokio::test] + async fn test_join_row_counts_dictionary_string() { + // Test DictionaryString column type with different configurations + test_join_row_count(0.5, 1, JoinColumnType::DictionaryString).await; + test_join_row_count(0.5, 10, JoinColumnType::DictionaryString).await; + test_join_row_count(1.0, 1, JoinColumnType::DictionaryString).await; + test_join_row_count(1.0, 10, JoinColumnType::DictionaryString).await; + } + + #[tokio::test] + async fn test_join_row_counts_dictionary_string_view() { + // Test DictionaryStringView column type with different configurations + test_join_row_count(0.5, 1, JoinColumnType::DictionaryStringView).await; + test_join_row_count(0.5, 10, JoinColumnType::DictionaryStringView).await; + test_join_row_count(1.0, 1, JoinColumnType::DictionaryStringView).await; + test_join_row_count(1.0, 10, JoinColumnType::DictionaryStringView).await; + } + + #[test] + fn test_expected_row_count_calculation() { + // Verify the expected row count formula + + // match_rate=0.5, repeated_keys=1 + // - Matching keys: 2500 + // - Probe occurrences per key: 2 + // - Build occurrences per key: 1 + // - Total: 2500 * 2 * 1 = 5000 + assert_eq!(expected_row_count(0.5, 1), 5_000); + + // match_rate=0.5, repeated_keys=10 + // - Total: 2500 * 2 * 10 = 50000 + assert_eq!(expected_row_count(0.5, 10), 50_000); + + // match_rate=1.0, repeated_keys=1 + // - Matching keys: 5000 + // - Total: 5000 * 2 * 1 = 10000 + assert_eq!(expected_row_count(1.0, 1), 10_000); + + // match_rate=1.0, repeated_keys=10 + // - Total: 5000 * 2 * 10 = 100000 + assert_eq!(expected_row_count(1.0, 10), 100_000); + } +} + diff --git a/datafusion/physical-plan/benches/jvm/mvnw b/datafusion/physical-plan/benches/jvm/mvnw new file mode 100755 index 000000000000..19529ddf8c6e --- /dev/null +++ b/datafusion/physical-plan/benches/jvm/mvnw @@ -0,0 +1,259 @@ +#!/bin/sh +# 
---------------------------------------------------------------------------- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# ---------------------------------------------------------------------------- + +# ---------------------------------------------------------------------------- +# Apache Maven Wrapper startup batch script, version 3.3.2 +# +# Optional ENV vars +# ----------------- +# JAVA_HOME - location of a JDK home dir, required when download maven via java source +# MVNW_REPOURL - repo url base for downloading maven distribution +# MVNW_USERNAME/MVNW_PASSWORD - user and password for downloading maven +# MVNW_VERBOSE - true: enable verbose log; debug: trace the mvnw script; others: silence the output +# ---------------------------------------------------------------------------- + +set -euf +[ "${MVNW_VERBOSE-}" != debug ] || set -x + +# OS specific support. 
+native_path() { printf %s\\n "$1"; } +case "$(uname)" in +CYGWIN* | MINGW*) + [ -z "${JAVA_HOME-}" ] || JAVA_HOME="$(cygpath --unix "$JAVA_HOME")" + native_path() { cygpath --path --windows "$1"; } + ;; +esac + +# set JAVACMD and JAVACCMD +set_java_home() { + # For Cygwin and MinGW, ensure paths are in Unix format before anything is touched + if [ -n "${JAVA_HOME-}" ]; then + if [ -x "$JAVA_HOME/jre/sh/java" ]; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD="$JAVA_HOME/jre/sh/java" + JAVACCMD="$JAVA_HOME/jre/sh/javac" + else + JAVACMD="$JAVA_HOME/bin/java" + JAVACCMD="$JAVA_HOME/bin/javac" + + if [ ! -x "$JAVACMD" ] || [ ! -x "$JAVACCMD" ]; then + echo "The JAVA_HOME environment variable is not defined correctly, so mvnw cannot run." >&2 + echo "JAVA_HOME is set to \"$JAVA_HOME\", but \"\$JAVA_HOME/bin/java\" or \"\$JAVA_HOME/bin/javac\" does not exist." >&2 + return 1 + fi + fi + else + JAVACMD="$( + 'set' +e + 'unset' -f command 2>/dev/null + 'command' -v java + )" || : + JAVACCMD="$( + 'set' +e + 'unset' -f command 2>/dev/null + 'command' -v javac + )" || : + + if [ ! -x "${JAVACMD-}" ] || [ ! -x "${JAVACCMD-}" ]; then + echo "The java/javac command does not exist in PATH nor is JAVA_HOME set, so mvnw cannot run." >&2 + return 1 + fi + fi +} + +# hash string like Java String::hashCode +hash_string() { + str="${1:-}" h=0 + while [ -n "$str" ]; do + char="${str%"${str#?}"}" + h=$(((h * 31 + $(LC_CTYPE=C printf %d "'$char")) % 4294967296)) + str="${str#?}" + done + printf %x\\n $h +} + +verbose() { :; } +[ "${MVNW_VERBOSE-}" != true ] || verbose() { printf %s\\n "${1-}"; } + +die() { + printf %s\\n "$1" >&2 + exit 1 +} + +trim() { + # MWRAPPER-139: + # Trims trailing and leading whitespace, carriage returns, tabs, and linefeeds. + # Needed for removing poorly interpreted newline sequences when running in more + # exotic environments such as mingw bash on Windows. 
+ printf "%s" "${1}" | tr -d '[:space:]' +} + +# parse distributionUrl and optional distributionSha256Sum, requires .mvn/wrapper/maven-wrapper.properties +while IFS="=" read -r key value; do + case "${key-}" in + distributionUrl) distributionUrl=$(trim "${value-}") ;; + distributionSha256Sum) distributionSha256Sum=$(trim "${value-}") ;; + esac +done <"${0%/*}/.mvn/wrapper/maven-wrapper.properties" +[ -n "${distributionUrl-}" ] || die "cannot read distributionUrl property in ${0%/*}/.mvn/wrapper/maven-wrapper.properties" + +case "${distributionUrl##*/}" in +maven-mvnd-*bin.*) + MVN_CMD=mvnd.sh _MVNW_REPO_PATTERN=/maven/mvnd/ + case "${PROCESSOR_ARCHITECTURE-}${PROCESSOR_ARCHITEW6432-}:$(uname -a)" in + *AMD64:CYGWIN* | *AMD64:MINGW*) distributionPlatform=windows-amd64 ;; + :Darwin*x86_64) distributionPlatform=darwin-amd64 ;; + :Darwin*arm64) distributionPlatform=darwin-aarch64 ;; + :Linux*x86_64*) distributionPlatform=linux-amd64 ;; + *) + echo "Cannot detect native platform for mvnd on $(uname)-$(uname -m), use pure java version" >&2 + distributionPlatform=linux-amd64 + ;; + esac + distributionUrl="${distributionUrl%-bin.*}-$distributionPlatform.zip" + ;; +maven-mvnd-*) MVN_CMD=mvnd.sh _MVNW_REPO_PATTERN=/maven/mvnd/ ;; +*) MVN_CMD="mvn${0##*/mvnw}" _MVNW_REPO_PATTERN=/org/apache/maven/ ;; +esac + +# apply MVNW_REPOURL and calculate MAVEN_HOME +# maven home pattern: ~/.m2/wrapper/dists/{apache-maven-,maven-mvnd--}/ +[ -z "${MVNW_REPOURL-}" ] || distributionUrl="$MVNW_REPOURL$_MVNW_REPO_PATTERN${distributionUrl#*"$_MVNW_REPO_PATTERN"}" +distributionUrlName="${distributionUrl##*/}" +distributionUrlNameMain="${distributionUrlName%.*}" +distributionUrlNameMain="${distributionUrlNameMain%-bin}" +MAVEN_USER_HOME="${MAVEN_USER_HOME:-${HOME}/.m2}" +MAVEN_HOME="${MAVEN_USER_HOME}/wrapper/dists/${distributionUrlNameMain-}/$(hash_string "$distributionUrl")" + +exec_maven() { + unset MVNW_VERBOSE MVNW_USERNAME MVNW_PASSWORD MVNW_REPOURL || : + exec "$MAVEN_HOME/bin/$MVN_CMD" 
"$@" || die "cannot exec $MAVEN_HOME/bin/$MVN_CMD" +} + +if [ -d "$MAVEN_HOME" ]; then + verbose "found existing MAVEN_HOME at $MAVEN_HOME" + exec_maven "$@" +fi + +case "${distributionUrl-}" in +*?-bin.zip | *?maven-mvnd-?*-?*.zip) ;; +*) die "distributionUrl is not valid, must match *-bin.zip or maven-mvnd-*.zip, but found '${distributionUrl-}'" ;; +esac + +# prepare tmp dir +if TMP_DOWNLOAD_DIR="$(mktemp -d)" && [ -d "$TMP_DOWNLOAD_DIR" ]; then + clean() { rm -rf -- "$TMP_DOWNLOAD_DIR"; } + trap clean HUP INT TERM EXIT +else + die "cannot create temp dir" +fi + +mkdir -p -- "${MAVEN_HOME%/*}" + +# Download and Install Apache Maven +verbose "Couldn't find MAVEN_HOME, downloading and installing it ..." +verbose "Downloading from: $distributionUrl" +verbose "Downloading to: $TMP_DOWNLOAD_DIR/$distributionUrlName" + +# select .zip or .tar.gz +if ! command -v unzip >/dev/null; then + distributionUrl="${distributionUrl%.zip}.tar.gz" + distributionUrlName="${distributionUrl##*/}" +fi + +# verbose opt +__MVNW_QUIET_WGET=--quiet __MVNW_QUIET_CURL=--silent __MVNW_QUIET_UNZIP=-q __MVNW_QUIET_TAR='' +[ "${MVNW_VERBOSE-}" != true ] || __MVNW_QUIET_WGET='' __MVNW_QUIET_CURL='' __MVNW_QUIET_UNZIP='' __MVNW_QUIET_TAR=v + +# normalize http auth +case "${MVNW_PASSWORD:+has-password}" in +'') MVNW_USERNAME='' MVNW_PASSWORD='' ;; +has-password) [ -n "${MVNW_USERNAME-}" ] || MVNW_USERNAME='' MVNW_PASSWORD='' ;; +esac + +if [ -z "${MVNW_USERNAME-}" ] && command -v wget >/dev/null; then + verbose "Found wget ... using wget" + wget ${__MVNW_QUIET_WGET:+"$__MVNW_QUIET_WGET"} "$distributionUrl" -O "$TMP_DOWNLOAD_DIR/$distributionUrlName" || die "wget: Failed to fetch $distributionUrl" +elif [ -z "${MVNW_USERNAME-}" ] && command -v curl >/dev/null; then + verbose "Found curl ... 
using curl" + curl ${__MVNW_QUIET_CURL:+"$__MVNW_QUIET_CURL"} -f -L -o "$TMP_DOWNLOAD_DIR/$distributionUrlName" "$distributionUrl" || die "curl: Failed to fetch $distributionUrl" +elif set_java_home; then + verbose "Falling back to use Java to download" + javaSource="$TMP_DOWNLOAD_DIR/Downloader.java" + targetZip="$TMP_DOWNLOAD_DIR/$distributionUrlName" + cat >"$javaSource" <<-END + public class Downloader extends java.net.Authenticator + { + protected java.net.PasswordAuthentication getPasswordAuthentication() + { + return new java.net.PasswordAuthentication( System.getenv( "MVNW_USERNAME" ), System.getenv( "MVNW_PASSWORD" ).toCharArray() ); + } + public static void main( String[] args ) throws Exception + { + setDefault( new Downloader() ); + java.nio.file.Files.copy( java.net.URI.create( args[0] ).toURL().openStream(), java.nio.file.Paths.get( args[1] ).toAbsolutePath().normalize() ); + } + } + END + # For Cygwin/MinGW, switch paths to Windows format before running javac and java + verbose " - Compiling Downloader.java ..." + "$(native_path "$JAVACCMD")" "$(native_path "$javaSource")" || die "Failed to compile Downloader.java" + verbose " - Running Downloader.java ..." + "$(native_path "$JAVACMD")" -cp "$(native_path "$TMP_DOWNLOAD_DIR")" Downloader "$distributionUrl" "$(native_path "$targetZip")" +fi + +# If specified, validate the SHA-256 sum of the Maven distribution zip file +if [ -n "${distributionSha256Sum-}" ]; then + distributionSha256Result=false + if [ "$MVN_CMD" = mvnd.sh ]; then + echo "Checksum validation is not supported for maven-mvnd." >&2 + echo "Please disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties." 
>&2 + exit 1 + elif command -v sha256sum >/dev/null; then + if echo "$distributionSha256Sum $TMP_DOWNLOAD_DIR/$distributionUrlName" | sha256sum -c >/dev/null 2>&1; then + distributionSha256Result=true + fi + elif command -v shasum >/dev/null; then + if echo "$distributionSha256Sum $TMP_DOWNLOAD_DIR/$distributionUrlName" | shasum -a 256 -c >/dev/null 2>&1; then + distributionSha256Result=true + fi + else + echo "Checksum validation was requested but neither 'sha256sum' or 'shasum' are available." >&2 + echo "Please install either command, or disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties." >&2 + exit 1 + fi + if [ $distributionSha256Result = false ]; then + echo "Error: Failed to validate Maven distribution SHA-256, your Maven distribution might be compromised." >&2 + echo "If you updated your Maven version, you need to update the specified distributionSha256Sum property." >&2 + exit 1 + fi +fi + +# unzip and move +if command -v unzip >/dev/null; then + unzip ${__MVNW_QUIET_UNZIP:+"$__MVNW_QUIET_UNZIP"} "$TMP_DOWNLOAD_DIR/$distributionUrlName" -d "$TMP_DOWNLOAD_DIR" || die "failed to unzip" +else + tar xzf${__MVNW_QUIET_TAR:+"$__MVNW_QUIET_TAR"} "$TMP_DOWNLOAD_DIR/$distributionUrlName" -C "$TMP_DOWNLOAD_DIR" || die "failed to untar" +fi +printf %s\\n "$distributionUrl" >"$TMP_DOWNLOAD_DIR/$distributionUrlNameMain/mvnw.url" +mv -- "$TMP_DOWNLOAD_DIR/$distributionUrlNameMain" "$MAVEN_HOME" || [ -d "$MAVEN_HOME" ] || die "fail to move MAVEN_HOME" + +clean || : +exec_maven "$@" diff --git a/datafusion/physical-plan/benches/jvm/pom.xml b/datafusion/physical-plan/benches/jvm/pom.xml new file mode 100644 index 000000000000..86b399ef58ab --- /dev/null +++ b/datafusion/physical-plan/benches/jvm/pom.xml @@ -0,0 +1,125 @@ + + + + 4.0.0 + org.apache.datafusion + datafusion-jni-benchmark + 1.0-SNAPSHOT + DataFusion JNI Benchmark + + + 21 + 21 + UTF-8 + 15.0.2 + + + + + + org.apache.arrow + arrow-c-data + ${arrow.version} + + + + + 
org.apache.arrow + arrow-memory-netty + ${arrow.version} + + + + + org.apache.arrow + arrow-vector + ${arrow.version} + + + + + org.junit.jupiter + junit-jupiter + 5.10.1 + test + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 3.2.3 + + --add-opens=java.base/java.nio=ALL-UNNAMED + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.11.0 + + 21 + 21 + + + + org.apache.maven.plugins + maven-shade-plugin + 3.5.1 + + + package + + shade + + + false + + + + ${project.name} + ${project.version} + + + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + false + + + + + + + + diff --git a/datafusion/physical-plan/benches/jvm/src/main/java/org/apache/datafusion/benchmark/Udf.java b/datafusion/physical-plan/benches/jvm/src/main/java/org/apache/datafusion/benchmark/Udf.java new file mode 100644 index 000000000000..f0eb5f76ac31 --- /dev/null +++ b/datafusion/physical-plan/benches/jvm/src/main/java/org/apache/datafusion/benchmark/Udf.java @@ -0,0 +1,116 @@ +package org.apache.datafusion.benchmark; + + +import org.apache.arrow.c.ArrowArray; +import org.apache.arrow.c.ArrowSchema; +import org.apache.arrow.c.CDataDictionaryProvider; +import org.apache.arrow.c.Data; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.types.pojo.Schema; + + +/** + * Java UDF for filtering Arrow RecordBatches via JNI with zero-copy FFI. + * + * This class receives Arrow data from Rust via the C Data Interface, + * applies a filter predicate (colInt > 2500), and returns a boolean array + * indicating which rows pass the filter. 
+ */ +public class Udf { + + // Static allocator initialized once for all operations + private static final BufferAllocator allocator = new RootAllocator(); + + /** + * Evaluates the predicate: value > 2500 + * + * Returns a boolean array indicating which rows pass the filter. For each filter, + * there is a corresponding boolean value: true if the row passes, false otherwise. + * + * @param schemaPtr Pointer to FFI_ArrowSchema (C Data Interface) + * @param arrayPtr Pointer to FFI_ArrowArray (C Data Interface) + * @return Array of [newSchemaPtr, newArrayPtr] for the boolean result array + */ + public static long[] evaluatePredicate(long schemaPtr, long arrayPtr) { + try { + // Import Array from FFI pointers + ArrowSchema arrowSchema = ArrowSchema.wrap(schemaPtr); + ArrowArray arrowArray = ArrowArray.wrap(arrayPtr); + + VectorSchemaRoot root = Data.importVectorSchemaRoot( + allocator, + arrowArray, + arrowSchema, + new CDataDictionaryProvider() + ); + + // Get the integer column (assuming single column input) + IntVector intVector = (IntVector) root.getVector(0); + if (intVector == null) { + throw new RuntimeException("Expected integer vector as input"); + } + + int rowCount = root.getRowCount(); + + // Create result schema root with single boolean column + Field field = new Field("result", FieldType.nullable(new ArrowType.Bool()), null); + Schema resultSchema = new Schema(java.util.Collections.singletonList(field)); + + VectorSchemaRoot resultRoot = VectorSchemaRoot.create(resultSchema, allocator); + resultRoot.setRowCount(rowCount); + + // Get the BitVector from the result root + BitVector resultVector = (BitVector) resultRoot.getVector(0); + resultVector.allocateNew(rowCount); + + // Evaluate predicate for each row + for (int i = 0; i < rowCount; i++) { + boolean passes = !intVector.isNull(i) && intVector.get(i) > 2500; + if (passes) { + resultVector.set(i, 1); + } + // Note: BitVector bits are initialized to 0 by allocateNew(), so no need to explicitly set false 
values + } + resultVector.setValueCount(rowCount); + + // Export result to FFI pointers + ArrowArray resultArray = ArrowArray.allocateNew(allocator); + ArrowSchema resultArrowSchema = ArrowSchema.allocateNew(allocator); + + Data.exportVectorSchemaRoot( + allocator, + resultRoot, + new CDataDictionaryProvider(), + resultArray, + resultArrowSchema + ); + + // Clean up input root (can be closed after export since we've consumed it) + root.close(); + + // NOTE: resultRoot is NOT closed here because the exported FFI pointers + // reference its memory. Arrow's C Data Interface handles cleanup via + // release callbacks when Rust calls from_ffi() to import the data. + // The release callback will eventually free the resultRoot memory. + + return new long[] { resultArrowSchema.memoryAddress(), resultArray.memoryAddress() }; + } catch (Exception e) { + throw new RuntimeException("Error in Java UDF evaluation", e); + } + } + + /** + * Legacy method for backward compatibility - filters entire batches + * For use with standard DataFusion filter, use evaluatePredicate instead + */ + public static long[] filterBatch(long schemaPtr, long arrayPtr) { + return evaluatePredicate(schemaPtr, arrayPtr); + } +} diff --git a/datafusion/physical-plan/benches/jvm/src/test/java/org/apache/datafusion/benchmark/UdfTest.java b/datafusion/physical-plan/benches/jvm/src/test/java/org/apache/datafusion/benchmark/UdfTest.java new file mode 100644 index 000000000000..6021c5234032 --- /dev/null +++ b/datafusion/physical-plan/benches/jvm/src/test/java/org/apache/datafusion/benchmark/UdfTest.java @@ -0,0 +1,284 @@ +package org.apache.datafusion.benchmark; + +import org.apache.arrow.c.ArrowArray; +import org.apache.arrow.c.ArrowSchema; +import org.apache.arrow.c.CDataDictionaryProvider; +import org.apache.arrow.c.Data; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.IntVector; 
+import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.types.pojo.Schema; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.util.Collections; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Test for the Udf.evaluatePredicate method to verify correct behavior + * when returning boolean arrays via Arrow FFI. + */ +public class UdfTest { + + private BufferAllocator allocator; + + @BeforeEach + public void setup() { + allocator = new RootAllocator(Long.MAX_VALUE); + } + + @AfterEach + public void tearDown() { + allocator.close(); + } + + @Test + public void testEvaluatePredicate_BasicFunctionality() { + // Create input with 10 rows + int rowCount = 10; + + // Create input schema and data + Field intField = new Field("colint", FieldType.nullable(new ArrowType.Int(32, true)), null); + Schema inputSchema = new Schema(Collections.singletonList(intField)); + + VectorSchemaRoot inputRoot = VectorSchemaRoot.create(inputSchema, allocator); + inputRoot.setRowCount(rowCount); + + IntVector intVector = (IntVector) inputRoot.getVector(0); + + // Fill with test data: [1000, 2000, 2500, 2501, 3000, 3500, 4000, null, 1500, 2600] + int[] testValues = {1000, 2000, 2500, 2501, 3000, 3500, 4000, -1, 1500, 2600}; + boolean[] expectedResults = {false, false, false, true, true, true, true, false, false, true}; + + for (int i = 0; i < rowCount; i++) { + if (testValues[i] == -1) { + intVector.setNull(i); + } else { + intVector.set(i, testValues[i]); + } + } + intVector.setValueCount(rowCount); + + // Export input to FFI + ArrowArray inputArray = ArrowArray.allocateNew(allocator); + ArrowSchema inputArrowSchema = ArrowSchema.allocateNew(allocator); + + Data.exportVectorSchemaRoot( + allocator, + inputRoot, + new 
CDataDictionaryProvider(), + inputArray, + inputArrowSchema + ); + + long inputSchemaPtr = inputArrowSchema.memoryAddress(); + long inputArrayPtr = inputArray.memoryAddress(); + + // Call the UDF + long[] resultPtrs = Udf.evaluatePredicate(inputSchemaPtr, inputArrayPtr); + + assertNotNull(resultPtrs, "Result pointers should not be null"); + assertEquals(2, resultPtrs.length, "Should return array with 2 pointers [schemaPtr, arrayPtr]"); + + // Import result from FFI + ArrowSchema resultArrowSchema = ArrowSchema.wrap(resultPtrs[0]); + ArrowArray resultArray = ArrowArray.wrap(resultPtrs[1]); + + VectorSchemaRoot resultRoot = Data.importVectorSchemaRoot( + allocator, + resultArray, + resultArrowSchema, + new CDataDictionaryProvider() + ); + + // Clean up input resources after UDF call + inputArray.close(); + inputArrowSchema.close(); + inputRoot.close(); + + // Verify result structure + assertEquals(1, resultRoot.getFieldVectors().size(), "Result should have 1 column"); + assertEquals(rowCount, resultRoot.getRowCount(), "Result row count should match input"); + + // Verify result is a BitVector (boolean) + assertTrue(resultRoot.getVector(0) instanceof BitVector, + "Result column should be BitVector, got: " + resultRoot.getVector(0).getClass().getName()); + + BitVector resultVector = (BitVector) resultRoot.getVector(0); + assertEquals(rowCount, resultVector.getValueCount(), + "Result vector value count should be " + rowCount + ", got: " + resultVector.getValueCount()); + + // Verify predicate results + for (int i = 0; i < rowCount; i++) { + boolean actual = resultVector.isSet(i) != 0; + assertEquals(expectedResults[i], actual, + String.format("Row %d: value=%s, expected=%s, got=%s", + i, + testValues[i] == -1 ? 
"null" : testValues[i], + expectedResults[i], + actual)); + } + + // Cleanup + resultRoot.close(); + resultArrowSchema.close(); + resultArray.close(); + } + + @Test + public void testEvaluatePredicate_LargeDataset() { + // Test with 10,000 rows (same as benchmark) + int rowCount = 10000; + + Field intField = new Field("colint", FieldType.nullable(new ArrowType.Int(32, true)), null); + Schema inputSchema = new Schema(Collections.singletonList(intField)); + + VectorSchemaRoot inputRoot = VectorSchemaRoot.create(inputSchema, allocator); + inputRoot.setRowCount(rowCount); + + IntVector intVector = (IntVector) inputRoot.getVector(0); + + // Fill with values 0 to 9999 + int expectedPassCount = 0; + for (int i = 0; i < rowCount; i++) { + intVector.set(i, i); + if (i > 2500) { + expectedPassCount++; + } + } + intVector.setValueCount(rowCount); + + // Export input to FFI + ArrowArray inputArray = ArrowArray.allocateNew(allocator); + ArrowSchema inputArrowSchema = ArrowSchema.allocateNew(allocator); + + Data.exportVectorSchemaRoot( + allocator, + inputRoot, + new CDataDictionaryProvider(), + inputArray, + inputArrowSchema + ); + + long inputSchemaPtr = inputArrowSchema.memoryAddress(); + long inputArrayPtr = inputArray.memoryAddress(); + + // Call the UDF + long[] resultPtrs = Udf.evaluatePredicate(inputSchemaPtr, inputArrayPtr); + + // Import result + ArrowSchema resultArrowSchema = ArrowSchema.wrap(resultPtrs[0]); + ArrowArray resultArray = ArrowArray.wrap(resultPtrs[1]); + + VectorSchemaRoot resultRoot = Data.importVectorSchemaRoot( + allocator, + resultArray, + resultArrowSchema, + new CDataDictionaryProvider() + ); + + // Clean up input resources after UDF call + inputArray.close(); + inputArrowSchema.close(); + inputRoot.close(); + + // Verify structure + assertEquals(rowCount, resultRoot.getRowCount(), + "Result row count should be " + rowCount + ", got: " + resultRoot.getRowCount()); + + BitVector resultVector = (BitVector) resultRoot.getVector(0); + 
assertEquals(rowCount, resultVector.getValueCount(), + "Result vector value count should be " + rowCount + ", got: " + resultVector.getValueCount()); + + // Count how many pass the predicate + int actualPassCount = 0; + for (int i = 0; i < rowCount; i++) { + if (resultVector.isSet(i) != 0) { + actualPassCount++; + } + } + + assertEquals(expectedPassCount, actualPassCount, + String.format("Expected %d rows to pass predicate (> 2500), but got %d", + expectedPassCount, actualPassCount)); + + // Verify specific values + assertFalse(resultVector.isSet(0) != 0, "Value 0 should not pass (0 <= 2500)"); + assertFalse(resultVector.isSet(2500) != 0, "Value 2500 should not pass (2500 <= 2500)"); + assertTrue(resultVector.isSet(2501) != 0, "Value 2501 should pass (2501 > 2500)"); + assertTrue(resultVector.isSet(9999) != 0, "Value 9999 should pass (9999 > 2500)"); + + // Cleanup + resultRoot.close(); + resultArrowSchema.close(); + resultArray.close(); + } + + @Test + public void testEvaluatePredicate_AllNull() { + int rowCount = 100; + + Field intField = new Field("colint", FieldType.nullable(new ArrowType.Int(32, true)), null); + Schema inputSchema = new Schema(Collections.singletonList(intField)); + + VectorSchemaRoot inputRoot = VectorSchemaRoot.create(inputSchema, allocator); + inputRoot.setRowCount(rowCount); + + IntVector intVector = (IntVector) inputRoot.getVector(0); + + // All null values + for (int i = 0; i < rowCount; i++) { + intVector.setNull(i); + } + intVector.setValueCount(rowCount); + + // Export and call UDF + ArrowArray inputArray = ArrowArray.allocateNew(allocator); + ArrowSchema inputArrowSchema = ArrowSchema.allocateNew(allocator); + + Data.exportVectorSchemaRoot( + allocator, + inputRoot, + new CDataDictionaryProvider(), + inputArray, + inputArrowSchema + ); + + long[] resultPtrs = Udf.evaluatePredicate(inputArrowSchema.memoryAddress(), inputArray.memoryAddress()); + + // Import result + ArrowSchema resultArrowSchema = ArrowSchema.wrap(resultPtrs[0]); + 
ArrowArray resultArray = ArrowArray.wrap(resultPtrs[1]); + + VectorSchemaRoot resultRoot = Data.importVectorSchemaRoot( + allocator, + resultArray, + resultArrowSchema, + new CDataDictionaryProvider() + ); + + // Clean up input resources after UDF call + inputArray.close(); + inputArrowSchema.close(); + inputRoot.close(); + + // All nulls should result in false (not passing predicate) + BitVector resultVector = (BitVector) resultRoot.getVector(0); + for (int i = 0; i < rowCount; i++) { + assertFalse(resultVector.isSet(i) != 0, + "Null values should not pass predicate"); + } + + // Cleanup + resultRoot.close(); + resultArrowSchema.close(); + resultArray.close(); + } +} diff --git a/datafusion/physical-plan/benches/serde.rs b/datafusion/physical-plan/benches/serde.rs new file mode 100644 index 000000000000..ed598d724a18 --- /dev/null +++ b/datafusion/physical-plan/benches/serde.rs @@ -0,0 +1,166 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmark for Arrow IPC serialization performance. +//! +//! ## Running the benchmark +//! +//! ```bash +//! # Run all configurations +//! cargo bench --bench serde -p datafusion-physical-plan +//! +//! # Run with fewer samples for quick testing +//! 
cargo bench --bench serde -p datafusion-physical-plan -- --sample-size 10 +//! +//! # Run only the sink benchmark +//! cargo bench --bench serde -p datafusion-physical-plan -- serialize_to_sink +//! +//! # Run only the memory benchmark +//! cargo bench --bench serde -p datafusion-physical-plan -- serialize_to_memory +//! +//! # Change measurement time (per benchmark, default is 5 seconds) +//! cargo bench --bench serde -p datafusion-physical-plan -- --measurement-time 10 +//! +//! # Run specific configuration +//! cargo bench --bench serde -p datafusion-physical-plan -- "1M_rows_binary_10B" +//! ``` +//! +//! ## Baseline Management +//! +//! ```bash +//! # Save current results as a named baseline +//! cargo bench --bench serde -p datafusion-physical-plan -- --save-baseline my-baseline +//! +//! # Compare against a specific baseline +//! cargo bench --bench serde -p datafusion-physical-plan -- --baseline my-baseline +//! +//! # Delete all benchmark history and start fresh +//! rm -rf target/criterion +//! ``` + +// Include shared benchmark utilities +#[path = "bench_utils.rs"] +mod bench_utils; + +use std::hint::black_box; +use std::sync::Arc; + +use criterion::{ + BenchmarkId, Criterion, SamplingMode, Throughput, criterion_group, criterion_main, +}; + +use bench_utils::{ + FunctionalBatchGenerator, create_schema, serialize_batches_to_sink +}; + +// ============================================================================ +// Benchmark Implementation +// ============================================================================ + +/// Benchmarks serialization to a sink that drops all data. +/// +/// This measures the pure CPU cost of Arrow IPC serialization without +/// including memory allocation or I/O overhead. Useful for understanding +/// the baseline serialization cost. 
+fn bench_serialize(c: &mut Criterion) { + let mut group = c.benchmark_group("serialize"); + + // Use flat sampling to collect exactly the requested samples without time constraints + group.sampling_mode(SamplingMode::Flat); + + // Configuration: 1M rows total (10K rows × 100 batches) + let rows_per_batch = 10_000; + let num_batches_vec = vec![1, 100]; + for num_batches in num_batches_vec { + let total_rows = rows_per_batch * num_batches; + + // Test different binary column sizes to understand serialization overhead + let binary_sizes = vec![10, 1024, 2048]; + + for binary_size in binary_sizes { + let label = format!("{num_batches}_batches/rows_binary_{binary_size}B"); + + // Generate test data + let schema = create_schema(); + let mut generator = FunctionalBatchGenerator::new( + Arc::clone(&schema), + rows_per_batch, + num_batches, + binary_size, + ); + let batches = generator.generate_batches(); + + // Calculate expected output size for throughput metric + let expected_size = estimate_serialized_size(&batches, binary_size, total_rows); + + // Set throughput metric for bytes/second calculations + group.throughput(Throughput::Bytes(expected_size as u64)); + + // Log configuration + println!( + "Config (sink): {} rows, binary_size={} bytes, estimated output={:.2} MB", + total_rows, + binary_size, + expected_size as f64 / (1024.0 * 1024.0) + ); + + group.bench_with_input( + BenchmarkId::from_parameter(&label), + &batches, + |b, batches| { + b.iter(|| { + let bytes_written = serialize_batches_to_sink(batches, &schema); + // black_box prevents compiler from optimizing away unused results + black_box(bytes_written) + }) + }, + ); + } + } + + group.finish(); +} + +/// Estimates the serialized size of batches for throughput calculations. +/// +/// This is an approximation based on the data types and sizes. For accurate +/// measurements, we could do one actual serialization, but this is good enough +/// for throughput reporting. 
+fn estimate_serialized_size(batches: &[arrow::array::RecordBatch], binary_size: usize, total_rows: usize) -> usize { + // Rough estimate of IPC overhead: + // - File header/footer: ~1KB + // - Per-batch metadata: ~200 bytes per batch + // - Per-column data: actual column size + alignment padding + + let num_batches = batches.len(); + let overhead = 1024 + (num_batches * 200); + + // Data size estimation per row: + // - Int32: 4 bytes + // - Int64: 8 bytes + // - Float32: 4 bytes + // - Float64: 8 bytes + // - StringView: ~16 bytes (view) + actual string data (~6 bytes for "str_XX") + // - BinaryView: ~16 bytes (view) + binary_size bytes + let per_row_size = 4 + 8 + 4 + 8 + 16 + 6 + 16 + binary_size; + + overhead + (total_rows * per_row_size) +} + +criterion_group!(benches, bench_serialize); +criterion_main!(benches); + diff --git a/datafusion/physical-plan/benches/sort_bench.rs b/datafusion/physical-plan/benches/sort_bench.rs new file mode 100644 index 000000000000..27bddd59a07f --- /dev/null +++ b/datafusion/physical-plan/benches/sort_bench.rs @@ -0,0 +1,291 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmark for DataFusion SortExec with Arrow IPC serialization. +//! +//! 
This benchmark measures the end-to-end latency of: +//! 1. Deserializing Arrow IPC data into RecordBatches +//! 2. Executing a SortExec operator (ORDER BY colInt ASC) +//! 3. Serializing the output back to Arrow IPC format +//! +//! Two sort variants are tested: +//! - **Full sort**: Sort all 1M rows (no limit) +//! - **TopK sort**: Sort with LIMIT 10,000 (uses heap-based TopK algorithm) +//! +//! The benchmark helps understand sort performance characteristics, +//! how the TopK optimization affects latency, and the overhead of +//! serializing sorted results. +//! +//! ## Running the benchmark +//! +//! ```bash +//! # Run all configurations +//! cargo bench --bench sort_bench -p datafusion-physical-plan +//! +//! # Run with fewer samples for quick testing +//! cargo bench --bench sort_bench -p datafusion-physical-plan -- --sample-size 10 +//! +//! # Run specific configuration +//! cargo bench --bench sort_bench -p datafusion-physical-plan -- "sort_no_limit" +//! ``` + +// Include shared benchmark utilities +#[path = "bench_utils.rs"] +mod bench_utils; + +use std::hint::black_box; +use std::sync::Arc; +use arrow::buffer::Buffer; +use arrow::compute::SortOptions; +use arrow::datatypes::SchemaRef; +use criterion::{ + BatchSize, BenchmarkId, Criterion, SamplingMode, Throughput, criterion_group, criterion_main, +}; +use datafusion_execution::TaskContext; +use datafusion_physical_expr::expressions::Column; +use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; +use datafusion_physical_plan::sorts::sort::SortExec; +use datafusion_physical_plan::{ExecutionPlan, collect}; + +use bench_utils::{ + BatchSourceExec, FunctionalBatchGenerator, create_schema, deserialize_zero_copy, + serialize_batches_to_sink, serialize_to_ipc, +}; + +// ============================================================================ +// Sort Plan Creation +// ============================================================================ + +/// Creates a SortExec that sorts 
by `colInt ASC`.
+///
+/// With the data generation pattern `colInt = i % 5000`, the sort will
+/// group all rows with the same colInt value together, with values
+/// ranging from 0 to 4999.
+///
+/// # Arguments
+/// * `input` - The input execution plan (typically BatchSourceExec)
+/// * `schema` - Schema of the input data
+/// * `fetch` - Optional limit for TopK optimization:
+///   - `None`: Full sort of all rows
+///   - `Some(n)`: TopK sort returning only top n rows
+///
+/// # Returns
+/// A SortExec wrapped in Arc
+fn create_sort_plan(
+    input: Arc<dyn ExecutionPlan>,
+    schema: &SchemaRef,
+    fetch: Option<usize>,
+) -> Arc<dyn ExecutionPlan> {
+    // Build sort expression: ORDER BY colInt ASC
+    let col_int = Arc::new(Column::new_with_schema("colInt", schema).unwrap());
+    let sort_expr = PhysicalSortExpr::new(col_int, SortOptions::default());
+    let sort_exprs = LexOrdering::new(vec![sort_expr]).unwrap();
+
+    // Create SortExec with optional fetch limit
+    let sort = SortExec::new(sort_exprs, input);
+    let sort = if let Some(limit) = fetch {
+        // TopK optimization: uses a heap to track only the top `limit` rows
+        sort.with_fetch(Some(limit))
+    } else {
+        sort
+    };
+
+    Arc::new(sort)
+}
+
+// ============================================================================
+// Benchmark Implementation
+// ============================================================================
+
+/// Main benchmark function for sort execution.
+///
+/// This benchmark measures six scenarios for each binary column size:
+///
+/// 1. **deser_only**: Just IPC deserialization, no execution
+///    - Establishes baseline deserialization cost
+///
+/// 2. **ser_only**: Just IPC serialization, no execution
+///    - Establishes baseline serialization cost
+///    - Uses pre-generated batches directly
+///
+/// 3. **sort_no_limit**: Full sort execution only
+///    - Sorts all 1M rows by colInt
+///    - Uses pre-generated batches directly (no deserialization)
+///    - Isolates SortExec performance
+///
+/// 4.
**sort_limit_10k**: TopK sort execution only +/// - Uses TopK algorithm to find top 10,000 rows +/// - Uses pre-generated batches directly (no deserialization) +/// - Should be faster than full sort for large datasets +/// +/// 5. **full_pipeline_no_limit**: Complete deser + full sort + output serialization +/// - Real-world latency for ORDER BY queries including result serialization +/// +/// 6. **full_pipeline_limit_10k**: Complete deser + TopK sort + output serialization +/// - Real-world latency for LIMIT queries including result serialization +fn bench_sort(c: &mut Criterion) { + // Create a single-threaded Tokio runtime for async execution. + // We use current_thread to ensure all async work runs on the benchmark thread, + // making results comparable to single-threaded Java benchmarks. + let rt = tokio::runtime::Builder::new_current_thread() + .build() + .unwrap(); + let mut group = c.benchmark_group("sort_bench"); + + // Use flat sampling to collect exactly the requested samples without time constraints + group.sampling_mode(SamplingMode::Flat); + + // Configuration: 1M rows total (10K rows × 100 batches) + let rows_per_batch = 10_000; + let num_batches = 100; + let total_rows = rows_per_batch * num_batches; + + // Test different binary column sizes to understand serialization overhead + let binary_sizes = vec![10, 1024, 2048]; + + for binary_size in binary_sizes { + let label = format!("1M_rows_binary_{binary_size}B"); + + // Generate test data and serialize to IPC format + let schema = create_schema(); + let mut generator = FunctionalBatchGenerator::new( + Arc::clone(&schema), + rows_per_batch, + num_batches, + binary_size, + ); + let batches = generator.generate_batches(); + let ipc_data = serialize_to_ipc(&batches, &schema); + let ipc_size = ipc_data.len(); + let ipc_buffer = Buffer::from_vec(ipc_data); + + // Log configuration for visibility in benchmark output + println!( + "Config: {} rows, binary_size={} bytes, IPC size={:.2} MB", + total_rows, + 
binary_size,
+            ipc_size as f64 / (1024.0 * 1024.0)
+        );
+
+        // Set throughput metric for bytes/second calculations
+        group.throughput(Throughput::Bytes(ipc_size as u64));
+
+        // Benchmark 3: Full sort (no limit) - execution only
+        // Uses pre-generated batches directly, isolating SortExec performance
+        group.bench_with_input(
+            BenchmarkId::new("sort_no_limit", &label),
+            &batches,
+            |b, batches| {
+                b.iter_batched(
+                    // Setup: clone batches (NOT timed) - needed because execution consumes them
+                    || batches.clone(),
+                    // Benchmark: execute sort (TIMED)
+                    |batches| {
+                        rt.block_on(async {
+                            let source = Arc::new(BatchSourceExec::new(
+                                Arc::clone(&schema),
+                                batches,
+                            )) as Arc<dyn ExecutionPlan>;
+                            let plan = create_sort_plan(source, &schema, None);
+                            let task_ctx = Arc::new(TaskContext::default());
+                            let results = collect(plan, task_ctx).await.unwrap();
+                            black_box(results)
+                        })
+                    },
+                    BatchSize::SmallInput,
+                )
+            },
+        );
+
+        // Benchmark 4: TopK sort (LIMIT 10,000) - execution only
+        // Uses pre-generated batches directly; should be faster than full sort
+        group.bench_with_input(
+            BenchmarkId::new("sort_limit_10k", &label),
+            &batches,
+            |b, batches| {
+                b.iter_batched(
+                    // Setup: clone batches (NOT timed) - needed because execution consumes them
+                    || batches.clone(),
+                    // Benchmark: execute TopK sort (TIMED)
+                    |batches| {
+                        rt.block_on(async {
+                            let source = Arc::new(BatchSourceExec::new(
+                                Arc::clone(&schema),
+                                batches,
+                            )) as Arc<dyn ExecutionPlan>;
+                            let plan = create_sort_plan(source, &schema, Some(10_000));
+                            let task_ctx = Arc::new(TaskContext::default());
+                            let results = collect(plan, task_ctx).await.unwrap();
+                            black_box(results)
+                        })
+                    },
+                    BatchSize::SmallInput,
+                )
+            },
+        );
+
+        // Benchmark 5: Full pipeline with full sort + output serialization
+        // Measures complete round-trip: IPC in -> sort all rows -> IPC out
+        group.bench_with_input(
+            BenchmarkId::new("full_pipeline_no_limit", &label),
+            &ipc_buffer,
+            |b, ipc_buffer| {
+                b.iter(|| {
+                    rt.block_on(async {
+                        let (schema,
batches) = deserialize_zero_copy(ipc_buffer);
+                        let source = Arc::new(BatchSourceExec::new(
+                            Arc::clone(&schema),
+                            batches,
+                        )) as Arc<dyn ExecutionPlan>;
+                        let plan = create_sort_plan(source, &schema, None);
+                        let task_ctx = Arc::new(TaskContext::default());
+                        let results = collect(plan, task_ctx).await.unwrap();
+                        black_box(serialize_batches_to_sink(&results, &schema))
+                    })
+                })
+            },
+        );
+
+        // Benchmark 6: Full pipeline with TopK sort + output serialization
+        // Measures complete round-trip: IPC in -> TopK sort -> IPC out
+        // Output size is limited to 10K rows, so serialization should be faster
+        group.bench_with_input(
+            BenchmarkId::new("full_pipeline_limit_10k", &label),
+            &ipc_buffer,
+            |b, ipc_buffer| {
+                b.iter(|| {
+                    rt.block_on(async {
+                        let (schema, batches) = deserialize_zero_copy(ipc_buffer);
+                        let source = Arc::new(BatchSourceExec::new(
+                            Arc::clone(&schema),
+                            batches,
+                        )) as Arc<dyn ExecutionPlan>;
+                        let plan = create_sort_plan(source, &schema, Some(10_000));
+                        let task_ctx = Arc::new(TaskContext::default());
+                        let results = collect(plan, task_ctx).await.unwrap();
+                        black_box(serialize_batches_to_sink(&results, &schema))
+                    })
+                })
+            },
+        );
+    }
+
+    group.finish();
+}
+
+criterion_group!(benches, bench_sort);
+criterion_main!(benches);