From 4b065461d42eb94094291f0cafa61ff97c0f83ca Mon Sep 17 00:00:00 2001 From: wiedld Date: Tue, 1 Apr 2025 04:22:39 -0700 Subject: [PATCH 1/2] feat: example lexical range handling --- Cargo.lock | 2 + datafusion/physical-optimizer/Cargo.toml | 2 + datafusion/physical-optimizer/src/lib.rs | 1 + .../src/progressive_evaluation.rs | 108 +++ .../progressive_evaluation/extract_ranges.rs | 900 ++++++++++++++++++ .../progressive_evaluation/lexical_ranges.rs | 774 +++++++++++++++ .../src/progressive_evaluation/statistics.rs | 294 ++++++ .../src/progressive_evaluation/util.rs | 171 ++++ 8 files changed, 2252 insertions(+) create mode 100644 datafusion/physical-optimizer/src/progressive_evaluation.rs create mode 100644 datafusion/physical-optimizer/src/progressive_evaluation/extract_ranges.rs create mode 100644 datafusion/physical-optimizer/src/progressive_evaluation/lexical_ranges.rs create mode 100644 datafusion/physical-optimizer/src/progressive_evaluation/statistics.rs create mode 100644 datafusion/physical-optimizer/src/progressive_evaluation/util.rs diff --git a/Cargo.lock b/Cargo.lock index c708c516ab36..d9f4cdd381d4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2444,6 +2444,8 @@ version = "46.0.1" dependencies = [ "arrow", "datafusion-common", + "datafusion-datasource", + "datafusion-datasource-parquet", "datafusion-execution", "datafusion-expr", "datafusion-expr-common", diff --git a/datafusion/physical-optimizer/Cargo.toml b/datafusion/physical-optimizer/Cargo.toml index aaadb09bcc98..7fddca52f5fb 100644 --- a/datafusion/physical-optimizer/Cargo.toml +++ b/datafusion/physical-optimizer/Cargo.toml @@ -40,6 +40,7 @@ recursive_protection = ["dep:recursive"] [dependencies] arrow = { workspace = true } datafusion-common = { workspace = true, default-features = true } +datafusion-datasource = { workspace = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } datafusion-expr-common = { workspace = true, default-features = true } @@ -52,5 +53,6 @@ recursive = { workspace = true, optional = true } [dev-dependencies] datafusion-expr = { workspace = true } +datafusion-datasource-parquet = { workspace = true } datafusion-functions-nested = { workspace = true } insta = { workspace = true } diff --git a/datafusion/physical-optimizer/src/lib.rs b/datafusion/physical-optimizer/src/lib.rs index 35503f3b0b5f..231957d21e11 100644 --- a/datafusion/physical-optimizer/src/lib.rs +++ b/datafusion/physical-optimizer/src/lib.rs @@ -34,6 +34,7 @@ pub mod limit_pushdown; pub mod limited_distinct_aggregation; pub mod optimizer; pub mod output_requirements; +pub mod progressive_evaluation; pub mod projection_pushdown; pub mod pruning; pub mod sanity_checker; diff --git a/datafusion/physical-optimizer/src/progressive_evaluation.rs b/datafusion/physical-optimizer/src/progressive_evaluation.rs new file mode 100644 index 000000000000..b162973e6a3a --- /dev/null +++ b/datafusion/physical-optimizer/src/progressive_evaluation.rs @@ -0,0 +1,108 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! TODO: physical optimizer run to conditionally swap the SPM with the ProgressiveEvalExec + +mod extract_ranges; +mod lexical_ranges; +mod statistics; +mod util; + +use itertools::Itertools; +use std::sync::Arc; + +use datafusion_common::{ + tree_node::{Transformed, TreeNode}, + Result, +}; +use datafusion_physical_plan::{ + sorts::sort_preserving_merge::SortPreservingMergeExec, union::UnionExec, + ExecutionPlan, +}; +use extract_ranges::extract_disjoint_ranges_from_plan; +use util::split_parquet_files; + +use crate::PhysicalOptimizerRule; + +#[allow(dead_code)] +#[derive(Debug)] +struct InsertProgressiveEval; + +impl PhysicalOptimizerRule for InsertProgressiveEval { + fn name(&self) -> &str { + "TBD" + } + + fn schema_check(&self) -> bool { + false + } + + fn optimize( + &self, + plan: Arc, + _config: &datafusion_common::config::ConfigOptions, + ) -> Result> { + plan.transform_up(|plan| { + // Find SortPreservingMergeExec + let Some(sort_preserving_merge_exec) = + plan.as_any().downcast_ref::() + else { + return Ok(Transformed::no(plan)); + }; + + // Split file groups to maximize potential disjoint ranges. + let new_inputs: Vec> = sort_preserving_merge_exec + .children() + .into_iter() + .map(|spm_child| { + Arc::clone(spm_child) + .transform_down(|plan| { + split_parquet_files(plan, sort_preserving_merge_exec.expr()) + }) + .map(|t| t.data) + }) + .try_collect()?; + let transformed_input_plan = Arc::new(UnionExec::new(new_inputs)) as _; + + // try to extract the lexical ranges for the input partitions + let Ok(Some(_lexical_ranges)) = extract_disjoint_ranges_from_plan( + sort_preserving_merge_exec.expr(), + &transformed_input_plan, + ) else { + return Ok(Transformed::no(plan)); + }; + + // confirm we still have the ordering needed for the SPM + assert!(transformed_input_plan + .properties() + .equivalence_properties() + .ordering_satisfy(sort_preserving_merge_exec.expr())); + + // Replace SortPreservingMergeExec with ProgressiveEvalExec + // TODO: have the ProgressiveEvalExec perform that partition mapping + // let progresive_eval_exec = Arc::new(ProgressiveEvalExec::new( + // transformed_input_plan, + // lexical_ranges, + // sort_preserving_merge_exec.fetch(), + // )); + // Ok(Transformed::yes(progresive_eval_exec)) + + Ok(Transformed::no(plan)) + }) + .map(|t| t.data) + } +} diff --git a/datafusion/physical-optimizer/src/progressive_evaluation/extract_ranges.rs b/datafusion/physical-optimizer/src/progressive_evaluation/extract_ranges.rs new file mode 100644 index 000000000000..e26ea8119bf5 --- /dev/null +++ b/datafusion/physical-optimizer/src/progressive_evaluation/extract_ranges.rs @@ -0,0 +1,900 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Extract [`NonOverlappingOrderedLexicalRanges`] from different sources. + +use arrow::compute::SortOptions; +use arrow::datatypes::Schema; +use datafusion_common::tree_node::TreeNode; +use datafusion_common::{ColumnStatistics, Result, ScalarValue}; +use datafusion_datasource::PartitionedFile; +use datafusion_physical_expr::expressions::Column; +use datafusion_physical_expr::LexOrdering; +use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties}; +use std::sync::Arc; + +use super::lexical_ranges::{LexicalRange, NonOverlappingOrderedLexicalRanges}; +use super::statistics::{column_statistics_min_max, PartitionStatisticsVisitor}; + +/// Attempt to extract LexicalRanges for the given sort keys and input plan +/// +/// Output will have N ranges where N is the number of output partitions +/// +/// Returns None if not possible to determine ranges +pub fn extract_disjoint_ranges_from_plan( + exprs: &LexOrdering, + input_plan: &Arc, +) -> Result> { + // if the ordering does not match, then we cannot confirm proper ranges + if !input_plan + .properties() + .equivalence_properties() + .ordering_satisfy(exprs) + { + return Ok(None); + } + + let num_input_partitions = partition_count(input_plan); + + // one builder for each output partition + let mut builders = vec![LexicalRange::builder(); num_input_partitions]; + for sort_expr in exprs.iter() { + let Some(column) = sort_expr.expr.as_any().downcast_ref::() else { + return Ok(None); + }; + let col_name = column.name(); + + for (partition, builder) in + builders.iter_mut().enumerate().take(num_input_partitions) + { + let Some((min, max)) = min_max_for_partition( + col_name, + &sort_expr.options, + partition, + input_plan, + )? + else { + return Ok(None); + }; + builder.push(min, max); + } + } + + let sort_options = exprs.iter().map(|e| e.options).collect::>(); + let ranges_per_partition = builders + .into_iter() + .map(|builder| builder.build()) + .collect::>(); + + NonOverlappingOrderedLexicalRanges::try_new(&sort_options, ranges_per_partition) +} + +fn partition_count(plan: &Arc) -> usize { + plan.output_partitioning().partition_count() +} + +/// Returns the min and max value for the specified partition. +/// +/// Eventually this would be represented by the Statistics structure in +/// DataFusion. +fn min_max_for_partition( + col_name: &str, + sort_options: &SortOptions, + partition: usize, + plan: &Arc, +) -> Result> { + // Check if the column is a constant value according to the equivalence properties (TODO) + + let mut extractor = PartitionStatisticsVisitor::new(col_name, partition); + plan.visit(&mut extractor)?; + let stats = extractor.result(); + + if let Some(ColumnStatistics { + min_value, + max_value, + null_count, + .. 
+ }) = stats + { + let (Some(min), Some(max)) = (min_value.get_value(), max_value.get_value()) + else { + return Ok(None); + }; + + let mut min = min.clone(); + let mut max = max.clone(); + if *null_count.get_value().unwrap_or(&0) > 0 { + let nulls_as_min = !sort_options.descending && sort_options.nulls_first // ASC nulls first + || sort_options.descending && !sort_options.nulls_first; // DESC nulls last + + // TODO(wiedld): use ScalarValue::Null after using type coersion in ConvertedRows? + // For now, return None if fails to convert. + let Some(null) = use_scalar_none(&min) else { + return Ok(None); + }; + + if nulls_as_min { + min = null; + } else { + max = null; + } + } + + Ok(Some((min, max))) + } else { + Ok(None) + } +} + +/// Attempt to extract LexicalRanges for the given sort keys and partitioned files +/// +/// Output will have N ranges where N is the number of partitioned files +/// +/// Returns None if not possible to determine disjoint ranges +pub fn extract_ranges_from_files( + exprs: &LexOrdering, + files: &Vec<&PartitionedFile>, + schema: Arc, +) -> Result> { + let num_input_partitions = files.len(); + + // one builder for each output partition + let mut builders = vec![LexicalRange::builder(); num_input_partitions]; + for sort_expr in exprs.iter() { + let Some(column) = sort_expr.expr.as_any().downcast_ref::() else { + return Ok(None); + }; + let col_name = column.name(); + + for (file, builder) in files.iter().zip(builders.iter_mut()) { + let Some((min, max)) = min_max_for_partitioned_file(col_name, file, &schema)? + else { + return Ok(None); + }; + builder.push(min, max); + } + } + + let sort_options = exprs.iter().map(|e| e.options).collect::>(); + let ranges_per_partition = builders + .into_iter() + .map(|builder| builder.build()) + .collect::>(); + + NonOverlappingOrderedLexicalRanges::try_new(&sort_options, ranges_per_partition) +} + +/// Returns the min and max value for the specified partitioned file. +/// +/// Eventually this will not be required, since we will generalize the solution to not require DAG plan modification. +fn min_max_for_partitioned_file( + col_name: &str, + file: &PartitionedFile, + schema: &Arc, +) -> Result> { + let (Some((col_idx, _)), Some(file_stats)) = + (schema.fields().find(col_name), &file.statistics) + else { + return Ok(None); + }; + let col_stats = file_stats.column_statistics[col_idx].clone(); + Ok(column_statistics_min_max(col_stats)) +} + +/// TODO: remove this function. +/// +/// This is due to our [`ConvertedRows`] not yet handling type coersion of [`ScalarValue::Null`]. 
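+///
+/// A minimal sketch of the intended mapping (illustrative value, not taken from the
+/// patch): a typed scalar such as `ScalarValue::Int32(Some(5))` is replaced by the
+/// typed null of the same data type, so the row converter can still compare it
+/// against non-null values of that type.
+///
+/// ```ignore
+/// use datafusion_common::ScalarValue;
+///
+/// assert_eq!(
+///     use_scalar_none(&ScalarValue::Int32(Some(5))),
+///     Some(ScalarValue::Int32(None))
+/// );
+/// // Variants without an explicit arm (including `ScalarValue::Null` itself) map to `None`.
+/// assert_eq!(use_scalar_none(&ScalarValue::Null), None);
+/// ```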
+fn use_scalar_none(value: &ScalarValue) -> Option { + match value { + ScalarValue::Boolean(_) => Some(ScalarValue::Boolean(None)), + ScalarValue::Float16(_) => Some(ScalarValue::Float16(None)), + ScalarValue::Float32(_) => Some(ScalarValue::Float32(None)), + ScalarValue::Float64(_) => Some(ScalarValue::Float64(None)), + ScalarValue::Decimal128(_, a, b) => Some(ScalarValue::Decimal128(None, *a, *b)), + ScalarValue::Decimal256(_, a, b) => Some(ScalarValue::Decimal256(None, *a, *b)), + ScalarValue::Int8(_) => Some(ScalarValue::Int8(None)), + ScalarValue::Int16(_) => Some(ScalarValue::Int16(None)), + ScalarValue::Int32(_) => Some(ScalarValue::Int32(None)), + ScalarValue::Int64(_) => Some(ScalarValue::Int64(None)), + ScalarValue::UInt8(_) => Some(ScalarValue::UInt8(None)), + ScalarValue::UInt16(_) => Some(ScalarValue::UInt16(None)), + ScalarValue::UInt32(_) => Some(ScalarValue::UInt32(None)), + ScalarValue::UInt64(_) => Some(ScalarValue::UInt64(None)), + ScalarValue::Utf8(_) => Some(ScalarValue::Utf8(None)), + ScalarValue::Utf8View(_) => Some(ScalarValue::Utf8View(None)), + ScalarValue::LargeUtf8(_) => Some(ScalarValue::LargeUtf8(None)), + ScalarValue::Binary(_) => Some(ScalarValue::Binary(None)), + ScalarValue::BinaryView(_) => Some(ScalarValue::BinaryView(None)), + ScalarValue::FixedSizeBinary(i, _) => { + Some(ScalarValue::FixedSizeBinary(*i, None)) + } + ScalarValue::LargeBinary(_) => Some(ScalarValue::LargeBinary(None)), + ScalarValue::Date32(_) => Some(ScalarValue::Date32(None)), + ScalarValue::Date64(_) => Some(ScalarValue::Date64(None)), + ScalarValue::Time32Second(_) => Some(ScalarValue::Time32Second(None)), + ScalarValue::Time32Millisecond(_) => Some(ScalarValue::Time32Millisecond(None)), + ScalarValue::Time64Microsecond(_) => Some(ScalarValue::Time64Microsecond(None)), + ScalarValue::Time64Nanosecond(_) => Some(ScalarValue::Time64Nanosecond(None)), + ScalarValue::TimestampSecond(_, tz) => { + Some(ScalarValue::TimestampSecond(None, tz.clone())) + } + ScalarValue::TimestampMillisecond(_, tz) => { + Some(ScalarValue::TimestampMillisecond(None, tz.clone())) + } + ScalarValue::TimestampMicrosecond(_, tz) => { + Some(ScalarValue::TimestampMicrosecond(None, tz.clone())) + } + ScalarValue::TimestampNanosecond(_, tz) => { + Some(ScalarValue::TimestampNanosecond(None, tz.clone())) + } + ScalarValue::IntervalYearMonth(_) => Some(ScalarValue::IntervalYearMonth(None)), + ScalarValue::IntervalDayTime(_) => Some(ScalarValue::IntervalDayTime(None)), + ScalarValue::IntervalMonthDayNano(_) => { + Some(ScalarValue::IntervalMonthDayNano(None)) + } + ScalarValue::DurationSecond(_) => Some(ScalarValue::DurationSecond(None)), + ScalarValue::DurationMillisecond(_) => { + Some(ScalarValue::DurationMillisecond(None)) + } + ScalarValue::DurationMicrosecond(_) => { + Some(ScalarValue::DurationMicrosecond(None)) + } + ScalarValue::DurationNanosecond(_) => Some(ScalarValue::DurationNanosecond(None)), + + // for now, don't support others. 
+ _ => None, + } +} + +#[cfg(test)] +mod tests { + use std::cmp; + + use super::*; + + use arrow::{ + compute::SortOptions, + datatypes::{DataType, Field}, + }; + use datafusion_common::{stats::Precision, ColumnStatistics, Statistics}; + use datafusion_datasource::{ + file_groups::FileGroup, file_scan_config::FileScanConfigBuilder, + source::DataSourceExec, + }; + use datafusion_datasource_parquet::source::ParquetSource; + use datafusion_execution::object_store::ObjectStoreUrl; + use datafusion_physical_expr::{LexOrdering, PhysicalSortExpr}; + use datafusion_physical_plan::{ + expressions::col, sorts::sort::SortExec, union::UnionExec, + }; + use itertools::Itertools; + + struct KeyRange { + min: Option, + max: Option, + null_count: usize, + } + + fn schema() -> Arc { + Arc::new(Schema::new(vec![Field::new("a", DataType::Int64, true)])) + } + + /// Create a single parquet + fn parquet_exec_with_sort_with_statistics( + output_ordering: Vec, + key_ranges: &[&KeyRange], + ) -> Arc { + let mut statistics = Statistics::new_unknown(&schema()); + let mut file_groups = Vec::with_capacity(key_ranges.len()); + + let mut cum_null_count = 0; + let mut cum_min = None; + let mut cum_max = None; + for key_range in key_ranges { + let KeyRange { + min, + max, + null_count, + } = key_range; + + // update cummulative statistics for entire parquet exec + cum_min = match (cum_min, min) { + (None, x) => *x, + (x, None) => x, + (Some(a), Some(b)) => Some(cmp::min(a, *b)), + }; + cum_max = match (cum_max, max) { + (None, x) => *x, + (x, None) => x, + (Some(a), Some(b)) => Some(cmp::max(a, *b)), + }; + cum_null_count += *null_count; + + // Create file with statistics. + let mut file = PartitionedFile::new("x".to_string(), 100); + let stats = Statistics { + num_rows: Precision::Absent, + total_byte_size: Precision::Absent, + column_statistics: vec![ColumnStatistics { + null_count: Precision::Exact(*null_count), + min_value: Precision::Exact(ScalarValue::Int32(*min)), + max_value: Precision::Exact(ScalarValue::Int32(*max)), + ..Default::default() + }], + }; + file.statistics = Some(Arc::new(stats.clone())); + file_groups.push(FileGroup::new(vec![file]).with_statistics(stats)); + } + + // add stats, for the whole parquet exec, for a single column + statistics.column_statistics[0] = ColumnStatistics { + null_count: Precision::Exact(cum_null_count), + min_value: Precision::Exact(ScalarValue::Int32(cum_min)), + max_value: Precision::Exact(ScalarValue::Int32(cum_max)), + ..Default::default() + }; + + let config = Arc::new( + FileScanConfigBuilder::new( + ObjectStoreUrl::parse("test:///").unwrap(), + schema(), + Arc::new(ParquetSource::new(Default::default())), + ) + .with_file_groups(file_groups) + .with_output_ordering(output_ordering) + .with_statistics(statistics) + .build(), + ); + + Arc::new(DataSourceExec::new(config)) + } + + fn union_exec(input: Vec>) -> Arc { + Arc::new(UnionExec::new(input)) + } + + fn sort_exec( + sort_exprs: LexOrdering, + input: Arc, + preserve_partitioning: bool, + ) -> Arc { + let new_sort = SortExec::new(sort_exprs, input) + .with_preserve_partitioning(preserve_partitioning); + Arc::new(new_sort) + } + + fn str_lexical_ranges(lex_ranges: &[LexicalRange]) -> String { + lex_ranges + .iter() + .map(|lr| lr.to_string()) + .collect_vec() + .join(", ") + } + + /// Build a test physical plan like: + /// UNION + /// ParquetExec (range_a) + /// SORT + /// UNION + /// ParquetExec (range_b_1) + /// ParquetExec (range_b_2) + fn build_test_case( + ordering: &LexOrdering, + key_ranges: &[KeyRange; 3], + 
) -> Arc { + let [range_a, range_b_1, range_b_2] = key_ranges; + + let datasrc_a = + parquet_exec_with_sort_with_statistics(vec![ordering.clone()], &[range_a]); + + let datasrc_b1 = + parquet_exec_with_sort_with_statistics(vec![ordering.clone()], &[range_b_1]); + let datasrc_b2 = + parquet_exec_with_sort_with_statistics(vec![ordering.clone()], &[range_b_2]); + let b = sort_exec( + ordering.clone(), + union_exec(vec![datasrc_b1, datasrc_b2]), + false, + ); + + union_exec(vec![datasrc_a, b]) + } + + #[test] + fn test_extract_disjoint_ranges() -> Result<()> { + let plan_ranges_disjoint = &[ + KeyRange { + min: Some(1000), + max: Some(2000), + null_count: 0, + }, + KeyRange { + min: Some(2001), + max: Some(3000), + null_count: 0, + }, + KeyRange { + min: Some(3001), + max: Some(3500), + null_count: 0, + }, + ]; + let plan_ranges_overlapping = &[ + KeyRange { + min: Some(1000), + max: Some(2010), + null_count: 0, + }, + KeyRange { + min: Some(2001), + max: Some(3000), + null_count: 0, + }, + KeyRange { + min: Some(3001), + max: Some(3500), + null_count: 0, + }, + ]; + + // Test: ASC, is disjoint (not overlapping) + // (1000)->(2000), (2001)->(3500) + let options = SortOptions { + descending: false, + ..Default::default() + }; + let sort_expr = PhysicalSortExpr::new(col("a", &schema())?, options); + let lex_ordering = LexOrdering::new(vec![sort_expr]); + let plan = build_test_case(&lex_ordering, plan_ranges_disjoint); + let Some(actual) = extract_disjoint_ranges_from_plan(&lex_ordering, &plan)? + else { + unreachable!("should find disjoint") + }; + assert_eq!(actual.indices(), &[0, 1], "should find proper ordering"); + assert_eq!( + format!("{}", str_lexical_ranges(actual.value_ranges())), + "(1000)->(2000), (2001)->(3500)", + "should find proper ranges" + ); + + // Test: ASC, is overlapping + // (1000)->(2010), (2001)->(3500) + let options = SortOptions { + descending: false, + ..Default::default() + }; + let sort_expr = PhysicalSortExpr::new(col("a", &schema())?, options); + let lex_ordering = LexOrdering::new(vec![sort_expr]); + let plan = build_test_case(&lex_ordering, plan_ranges_overlapping); + let actual = extract_disjoint_ranges_from_plan(&lex_ordering, &plan)?; + assert!(actual.is_none(), "should not find disjoint range"); + + // Test: DESC, is disjoint (not overlapping) + // (2001)->(3500), (1000)->(2000) + let options = SortOptions { + descending: true, + ..Default::default() + }; + let sort_expr = PhysicalSortExpr::new(col("a", &schema())?, options); + let lex_ordering = LexOrdering::new(vec![sort_expr]); + let plan = build_test_case(&lex_ordering, plan_ranges_disjoint); + let Some(actual) = extract_disjoint_ranges_from_plan(&lex_ordering, &plan)? 
+ else { + unreachable!("should find disjoint") + }; + assert_eq!(actual.indices(), &[1, 0], "should find proper ordering"); // NOTE THE INVERSE ORDER + assert_eq!( + format!("{}", str_lexical_ranges(actual.value_ranges())), + "(2001)->(3500), (1000)->(2000)", + "should find proper ranges" + ); + + // Test: DESC, is overlapping + // (2001)->(3500), (1000)->(2010) + let options = SortOptions { + descending: false, + ..Default::default() + }; + let sort_expr = PhysicalSortExpr::new(col("a", &schema())?, options); + let lex_ordering = LexOrdering::new(vec![sort_expr]); + let plan = build_test_case(&lex_ordering, plan_ranges_overlapping); + let actual = extract_disjoint_ranges_from_plan(&lex_ordering, &plan)?; + assert!(actual.is_none(), "should not find disjoint range"); + + Ok(()) + } + + /// Same as `test_extract_disjoint_ranges`, but include nulls first. + #[test] + fn test_extract_disjoint_ranges_nulls_first() -> Result<()> { + // NULL is min_value if ASC + let plan_ranges_disjoint_if_asc = &[ + KeyRange { + min: Some(1000), + max: Some(2000), + null_count: 1, // null is min value + }, + KeyRange { + min: Some(2001), + max: Some(3000), + null_count: 0, + }, + KeyRange { + min: Some(3001), + max: Some(3500), + null_count: 0, + }, + ]; + let plan_ranges_overlap_if_asc = &[ + KeyRange { + min: Some(1000), + max: Some(2000), + null_count: 0, + }, + KeyRange { + min: Some(2001), + max: Some(3000), + null_count: 0, + }, + KeyRange { + min: Some(3001), + max: Some(3500), + null_count: 1, // null is min value, this will be overlapping + }, + ]; + + // NULL is max_value if DESC + let plan_ranges_disjoint_if_desc = &[ + KeyRange { + min: Some(1000), + max: Some(2000), + null_count: 0, + }, + KeyRange { + min: Some(2001), + max: Some(3000), + null_count: 0, + }, + KeyRange { + min: Some(3001), + max: Some(3500), + null_count: 1, // null is max value + }, + ]; + let plan_ranges_overlap_if_desc = &[ + KeyRange { + min: Some(1000), + max: Some(2000), + null_count: 1, // null is max value, this will be overlapping + }, + KeyRange { + min: Some(2001), + max: Some(3000), + null_count: 0, + }, + KeyRange { + min: Some(3001), + max: Some(3500), + null_count: 0, + }, + ]; + + // Test: ASC, is disjoint (not overlapping) + // (1000)->(2000), (2001)->(3500), and partition<(1000)->(2000)> has the NULL (min value) + let options = SortOptions { + descending: false, + nulls_first: true, + }; + let sort_expr = PhysicalSortExpr::new(col("a", &schema())?, options); + let lex_ordering = LexOrdering::new(vec![sort_expr]); + let plan = build_test_case(&lex_ordering, plan_ranges_disjoint_if_asc); + let Some(actual) = extract_disjoint_ranges_from_plan(&lex_ordering, &plan)? 
+ else { + unreachable!("should find disjoint") + }; + assert_eq!(actual.indices(), &[0, 1], "should find proper ordering"); + assert_eq!( + format!("{}", str_lexical_ranges(actual.value_ranges())), + "(NULL)->(2000), (2001)->(3500)", + "should find proper ranges" + ); + + // Test: ASC, is overlapping + // (1000)->(2000), (2001)->(3500), and partition<(2001)->(3500)> has the NULL (min value) + let options = SortOptions { + descending: false, + nulls_first: true, + }; + let sort_expr = PhysicalSortExpr::new(col("a", &schema())?, options); + let lex_ordering = LexOrdering::new(vec![sort_expr]); + let plan = build_test_case(&lex_ordering, plan_ranges_overlap_if_asc); + let actual = extract_disjoint_ranges_from_plan(&lex_ordering, &plan)?; + assert!(actual.is_none(), "should not find disjoint range"); + + // Test: DESC, is disjoint (not overlapping) + // (2001)->(3500), (1000)->(2000), and partition<(2001)->(3500)> has the NULL (max value) + let options = SortOptions { + descending: true, + nulls_first: true, + }; + let sort_expr = PhysicalSortExpr::new(col("a", &schema())?, options); + let lex_ordering = LexOrdering::new(vec![sort_expr]); + let plan = build_test_case(&lex_ordering, plan_ranges_disjoint_if_desc); + let Some(actual) = extract_disjoint_ranges_from_plan(&lex_ordering, &plan)? + else { + unreachable!("should find disjoint") + }; + assert_eq!(actual.indices(), &[1, 0], "should find proper ordering"); // NOTE THE INVERSE ORDER + assert_eq!( + format!("{}", str_lexical_ranges(actual.value_ranges())), + "(2001)->(NULL), (1000)->(2000)", + "should find proper ranges" + ); + + // Test: DESC, is overlapping + // (2001)->(3500), (1000)->(2000), and partition<(1000)->(2000)> has the NULL (max value) + let options = SortOptions { + descending: true, + nulls_first: true, + }; + let sort_expr = PhysicalSortExpr::new(col("a", &schema())?, options); + let lex_ordering = LexOrdering::new(vec![sort_expr]); + let plan = build_test_case(&lex_ordering, plan_ranges_overlap_if_desc); + let actual = extract_disjoint_ranges_from_plan(&lex_ordering, &plan)?; + assert!(actual.is_none(), "should not find disjoint range"); + + Ok(()) + } + + /// Same as `test_extract_disjoint_ranges`, but include nulls last. 
+ #[test] + fn test_extract_disjoint_ranges_nulls_last() -> Result<()> { + // NULL is max_value if ASC + let plan_ranges_disjoint_if_asc = &[ + KeyRange { + min: Some(1000), + max: Some(2000), + null_count: 0, + }, + KeyRange { + min: Some(2001), + max: Some(3000), + null_count: 0, + }, + KeyRange { + min: Some(3001), + max: Some(3500), + null_count: 1, // null is max value + }, + ]; + let plan_ranges_overlap_if_asc = &[ + KeyRange { + min: Some(1000), + max: Some(2000), + null_count: 1, // null is max value, this will be overlapping + }, + KeyRange { + min: Some(2001), + max: Some(3000), + null_count: 0, + }, + KeyRange { + min: Some(3001), + max: Some(3500), + null_count: 0, + }, + ]; + + // NULL is min_value if DESC + let plan_ranges_disjoint_if_desc = &[ + KeyRange { + min: Some(1000), + max: Some(2000), + null_count: 1, // null is min value + }, + KeyRange { + min: Some(2001), + max: Some(3000), + null_count: 0, + }, + KeyRange { + min: Some(3001), + max: Some(3500), + null_count: 0, + }, + ]; + let plan_ranges_overlap_if_desc = &[ + KeyRange { + min: Some(1000), + max: Some(2000), + null_count: 0, + }, + KeyRange { + min: Some(2001), + max: Some(3000), + null_count: 0, + }, + KeyRange { + min: Some(3001), + max: Some(3500), + null_count: 1, // null is min value, this will be overlapping + }, + ]; + + // Test: ASC, is disjoint (not overlapping) + // (1000)->(2000), (2001)->(3500), and partition<(2001)->(3500)> has the NULL (max value) + let options = SortOptions { + descending: false, + nulls_first: false, + }; + let sort_expr = PhysicalSortExpr::new(col("a", &schema())?, options); + let lex_ordering = LexOrdering::new(vec![sort_expr]); + let plan = build_test_case(&lex_ordering, plan_ranges_disjoint_if_asc); + let Some(actual) = extract_disjoint_ranges_from_plan(&lex_ordering, &plan)? + else { + unreachable!("should find disjoint") + }; + assert_eq!(actual.indices(), &[0, 1], "should find proper ordering"); + assert_eq!( + format!("{}", str_lexical_ranges(actual.value_ranges())), + "(1000)->(2000), (2001)->(NULL)", + "should find proper ranges" + ); + + // Test: ASC, is overlapping + // (1000)->(2000), (2001)->(3500), and partition<(1000)->(2000)> has the NULL (max value) + let options = SortOptions { + descending: false, + nulls_first: false, + }; + let sort_expr = PhysicalSortExpr::new(col("a", &schema())?, options); + let lex_ordering = LexOrdering::new(vec![sort_expr]); + let plan = build_test_case(&lex_ordering, plan_ranges_overlap_if_asc); + let actual = extract_disjoint_ranges_from_plan(&lex_ordering, &plan)?; + assert!(actual.is_none(), "should not find disjoint range"); + + // Test: DESC, is disjoint (not overlapping) + // (2001)->(3500), (1000)->(2000), and partition<(1000)->(2000)> has the NULL (min value) + let options = SortOptions { + descending: true, + nulls_first: false, + }; + let sort_expr = PhysicalSortExpr::new(col("a", &schema())?, options); + let lex_ordering = LexOrdering::new(vec![sort_expr]); + let plan = build_test_case(&lex_ordering, plan_ranges_disjoint_if_desc); + let Some(actual) = extract_disjoint_ranges_from_plan(&lex_ordering, &plan)? 
+ else { + unreachable!("should find disjoint") + }; + assert_eq!(actual.indices(), &[1, 0], "should find proper ordering"); // NOTE THE INVERSE ORDER + assert_eq!( + format!("{}", str_lexical_ranges(actual.value_ranges())), + "(2001)->(3500), (NULL)->(2000)", + "should find proper ranges" + ); + + // Test: DESC, is overlapping + // (2001)->(3500), (1000)->(2000), and partition<(2001)->(3500)> has the NULL (min value) + let options = SortOptions { + descending: true, + nulls_first: false, + }; + let sort_expr = PhysicalSortExpr::new(col("a", &schema())?, options); + let lex_ordering = LexOrdering::new(vec![sort_expr]); + let plan = build_test_case(&lex_ordering, plan_ranges_overlap_if_desc); + let actual = extract_disjoint_ranges_from_plan(&lex_ordering, &plan)?; + assert!(actual.is_none(), "should not find disjoint range"); + + Ok(()) + } + + /// Build a test physical plan like: + /// UNION + /// ParquetExec (range_a) + /// ParquetExec (range_b_1, range_b_2) + fn build_test_case_multiple_partition_parquet_exec( + ordering: &LexOrdering, + key_ranges: &[KeyRange; 3], + ) -> Arc { + let [range_a, range_b_1, range_b_2] = key_ranges; + + let datasrc_a = + parquet_exec_with_sort_with_statistics(vec![ordering.clone()], &[range_a]); + let datasrc_b = parquet_exec_with_sort_with_statistics( + vec![ordering.clone()], + &[range_b_1, range_b_2], + ); + + union_exec(vec![datasrc_a, datasrc_b]) + } + + #[test] + fn test_extract_multiple_partitions_union_parquet_exec() -> Result<()> { + let plan_ranges_disjoint = &[ + KeyRange { + min: Some(1000), + max: Some(2000), + null_count: 0, + }, + KeyRange { + min: Some(2001), + max: Some(3000), + null_count: 0, + }, + KeyRange { + min: Some(3001), + max: Some(3500), + null_count: 0, + }, + ]; + let plan_ranges_overlapping = &[ + KeyRange { + min: Some(1000), + max: Some(2010), + null_count: 0, + }, + KeyRange { + min: Some(2001), + max: Some(3000), + null_count: 0, + }, + KeyRange { + min: Some(3001), + max: Some(3500), + null_count: 0, + }, + ]; + + // Test: ASC, is disjoint (not overlapping) + let options = SortOptions { + descending: false, + ..Default::default() + }; + let sort_expr = PhysicalSortExpr::new(col("a", &schema())?, options); + let lex_ordering = LexOrdering::new(vec![sort_expr]); + let plan = build_test_case_multiple_partition_parquet_exec( + &lex_ordering, + plan_ranges_disjoint, + ); + let Some(actual) = extract_disjoint_ranges_from_plan(&lex_ordering, &plan)? 
+ else { + unreachable!("should find disjoint") + }; + assert_eq!(actual.indices(), &[0, 1, 2], "should find proper ordering"); + assert_eq!( + format!("{}", str_lexical_ranges(actual.value_ranges())), + "(1000)->(2000), (2001)->(3000), (3001)->(3500)", + "should find proper ranges" + ); + + // Test: ASC, is overlapping + let options = SortOptions { + descending: false, + ..Default::default() + }; + let sort_expr = PhysicalSortExpr::new(col("a", &schema())?, options); + let lex_ordering = LexOrdering::new(vec![sort_expr]); + let plan = build_test_case_multiple_partition_parquet_exec( + &lex_ordering, + plan_ranges_overlapping, + ); + let actual = extract_disjoint_ranges_from_plan(&lex_ordering, &plan)?; + assert!(actual.is_none(), "should not find disjoint range"); + + Ok(()) + } +} diff --git a/datafusion/physical-optimizer/src/progressive_evaluation/lexical_ranges.rs b/datafusion/physical-optimizer/src/progressive_evaluation/lexical_ranges.rs new file mode 100644 index 000000000000..36783466f2c0 --- /dev/null +++ b/datafusion/physical-optimizer/src/progressive_evaluation/lexical_ranges.rs @@ -0,0 +1,774 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [`NonOverlappingOrderedLexicalRanges`] represents ranges of lexically ordered values. + +use arrow::array::ArrayRef; +use arrow::compute::SortOptions; +use arrow::row::{Row, RowConverter, Rows, SortField}; +use datafusion_common::{error::DataFusionError, Result, ScalarValue}; +use std::fmt::Display; +use std::sync::Arc; + +/// # Lexical Space +/// +/// The "Lexical Space" is all possible values of a sort order (set of sort +/// expressions). +/// +/// For example, given data with a sort order of `A ASC, B ASC` +/// (`A` ascending, `B` ascending), then the lexical space is all the unique +/// combinations of `(A, B)`. +/// +/// # Lexical Range +/// +/// The "lexical range" of an input in this lexical space is +/// the minimum and maximum sort key values for that range. 
+/// +/// For example, for data like +/// | `a` | `b` | +/// |--------|--------| +/// | 1 | 100 | +/// | 1 | 200 | +/// | 1 | 300 | +/// | 2 | 100 | +/// | 2 | 200 | +/// | 3 | 50 | +/// +/// The lexical range is `min --> max`: `(1,100) --> (3,50)` +#[derive(Debug, Default, Clone)] +pub struct LexicalRange { + /// The minimum value in the lexical space (one for each sort key) + min: Vec, + /// The maximum value in the lexical space (one for each sort key) + max: Vec, +} + +impl LexicalRange { + pub fn builder() -> LexicalRangeBuilder { + LexicalRangeBuilder::new() + } +} + +impl Display for LexicalRange { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "({})->({})", + self.min + .iter() + .map(|v| v.to_string()) + .collect::>() + .join(","), + self.max + .iter() + .map(|v| v.to_string()) + .collect::>() + .join(",") + ) + } +} + +/// Builder for [`LexicalRange`] +#[derive(Debug, Default, Clone)] +pub struct LexicalRangeBuilder { + inner: LexicalRange, +} + +impl LexicalRangeBuilder { + pub fn new() -> Self { + Self::default() + } + + pub fn build(self) -> LexicalRange { + self.inner + } + + /// Adds min and max to the end of the in-progress ranges + pub fn push(&mut self, min: ScalarValue, max: ScalarValue) { + self.inner.min.push(min); + self.inner.max.push(max); + } +} + +/// Represents ranges of lexically ordered values in a sort order. +/// +/// One element for each input partition + +#[derive(Debug)] +pub struct NonOverlappingOrderedLexicalRanges { + /// Corresponding lexical range per partition, already reordered to match indices + value_ranges: Vec, + + /// The order of the input partitions (rows of ranges) that would provide ensure they are + /// sorted in lexical order + indices: Vec, +} + +impl Display for NonOverlappingOrderedLexicalRanges { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "LexicalRanges\n {:?}", self.indices,) + } +} + +impl NonOverlappingOrderedLexicalRanges { + /// Attempt to create a new [`NonOverlappingOrderedLexicalRanges`] + /// + /// Returns None if: + /// * - There are no ranges + /// * - The ranges are not disjoint (aka they overlap in the lexical space) + /// + /// Returns Err if there is an error converting values + pub fn try_new( + sort_options: &[SortOptions], + ranges_per_partition: Vec, + ) -> Result> { + if ranges_per_partition.is_empty() { + return Ok(None); + } + // convert to min/maxes, as VecPerPartition + let (mins, maxes) = ranges_per_partition.clone().into_iter().fold( + (vec![], vec![]), + |(mut mins, mut maxes), partition_range| { + let LexicalRange { + min: min_per_sort_key, + max: max_per_sort_key, + } = partition_range; + mins.push(min_per_sort_key); + maxes.push(max_per_sort_key); + (mins, maxes) + }, + ); + let rows = ConvertedRows::try_new(sort_options, mins, maxes)?; + + let mut indices = (0..rows.num_rows()).collect::>(); + indices.sort_by_key(|&i| rows.min_row(i)); + + // check that the ranges are disjoint + if !rows.are_disjoint(&indices) { + return Ok(None); + }; + + // reorder lexical ranges + let mut value_ranges = ranges_per_partition + .into_iter() + .enumerate() + .map(|(input_partition_idx, range)| { + let reordering_idx = indices + .iter() + .position(|reorder_idx| *reorder_idx == input_partition_idx) + .expect("missing partition in reordering indices"); + (reordering_idx, range) + }) + .collect::>(); + value_ranges.sort_by_key(|(reord_i, _)| *reord_i); + + let value_ranges = value_ranges + .into_iter() + .map(|(_, range)| range) + 
.collect::>(); + + Ok(Some(Self { + value_ranges, + indices, + })) + } + + /// Returns the indices that describe the order input partitions must be reordered + /// to be non overlapping and ordered in lexical order. + pub fn indices(&self) -> &[usize] { + &self.indices + } + + /// Returns the lexical ranges already re-ordered + pub fn value_ranges(&self) -> &[LexicalRange] { + &self.value_ranges + } +} + +/// Result of converting multiple-column ScalarValue rows to columns. +#[derive(Debug)] +struct ConvertedRows { + /// Use the same row converter for mins and maxes, otherwise they cannot be compared. + converter: RowConverter, + + mins: Rows, + maxes: Rows, +} + +impl ConvertedRows { + /// Create new [`ConvertedRows`] from the vector of sort keys and specified options. + /// + /// Keys are in the format `VecPerPartition>` + fn try_new( + sort_options: &[SortOptions], + min_keys: Vec>, + max_keys: Vec>, + ) -> Result { + assert_eq!(sort_options.len(), min_keys[0].len()); + assert_eq!(sort_options.len(), max_keys[0].len()); + + // build converter using min keys + let arrays = pivot_to_arrays(min_keys)?; + let converter_fields = arrays + .iter() + .zip(sort_options.iter()) + .map(|(a, options)| { + SortField::new_with_options(a.data_type().clone(), *options) + }) + .collect::>(); + let converter = RowConverter::new(converter_fields)?; + let mins = converter.convert_columns(&arrays)?; + + // build maxes + let arrays = pivot_to_arrays(max_keys)?; + let maxes = converter.convert_columns(&arrays)?; + + Ok(Self { + converter, + mins, + maxes, + }) + } + + fn num_rows(&self) -> usize { + self.mins.num_rows() + } + + /// Return the min (as Row) at the specified index + fn min_row(&self, index: usize) -> Row<'_> { + self.mins.row(index) + } + + /// Return the max (as Row) at the specified index + fn max_row(&self, index: usize) -> Row<'_> { + self.maxes.row(index) + } + + /// Return the min value at the specified index + fn min_value(&self, index: usize) -> Result> { + let values = self + .converter + .convert_rows([self.min_row(index)]) + .map_err(|e| DataFusionError::ArrowError(e, None))?; + Ok(values.iter().map(Arc::clone).collect::>()) + } + + /// Return the max value at the specified index + fn max_value(&self, index: usize) -> Result> { + let values = self + .converter + .convert_rows([self.max_row(index)]) + .map_err(|e| DataFusionError::ArrowError(e, None))?; + Ok(values.iter().map(Arc::clone).collect::>()) + } + + // Return true if ranges are disjoint when order according to ordered_partition_idx. + fn are_disjoint(&self, ordered_by_min_partition_indices: &[usize]) -> bool { + for index_index in 1..ordered_by_min_partition_indices.len() { + let index = ordered_by_min_partition_indices[index_index]; + let prev_index = ordered_by_min_partition_indices[index_index - 1]; + + // Ordering is by sort key, and may be desc. + // Therefore need to confirm that the min & max of the current range is greater than the previous range. 
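+            // Illustrative walk-through (hypothetical single-key, ascending ranges,
+            // already ordered by min): with previous = (1)->(10) and current = (11)->(20),
+            // both the current min row and the current max row sort strictly after the
+            // previous max row, so the pair is disjoint; with current = (5)->(20) the
+            // check below fails because the current min does not sort after the previous
+            // max, i.e. the ranges overlap.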
+ let start_exclusive = self.min_row(index) > self.min_row(prev_index) + && self.min_row(index) > self.max_row(prev_index); + let end_exclusive = self.max_row(index) > self.min_row(prev_index) + && self.max_row(index) > self.max_row(prev_index); + if !(start_exclusive && end_exclusive) { + return false; + } + } + true + } +} + +/// Convert a multi-column ScalarValue row to columns +fn pivot_to_arrays(keys: Vec>) -> Result> { + let mut arrays = vec![]; + for col in 0..keys[0].len() { + let mut column = vec![]; + for row in &keys { + // todo avoid this clone (with take) + column.push(row[col].clone()); + } + arrays.push(ScalarValue::iter_to_array(column)?) + } + Ok(arrays) +} + +#[cfg(test)] +mod tests { + use arrow::compute::SortOptions; + use datafusion_common::ScalarValue; + use itertools::Itertools; + + use super::{LexicalRange, NonOverlappingOrderedLexicalRanges}; + + struct TestCase { + partitions: Vec, + num_sort_keys: usize, + name: &'static str, + expected_ranges_per_partition: Vec<&'static str>, // before ordering + expect_disjoint: bool, + expected_ordered_indices: Vec, // after ordering + } + + impl TestCase { + fn run(self) { + // Test: confirm found proper lexical ranges + let lexical_ranges = self.build_lexical_ranges(); + let expected = self + .expected_ranges_per_partition + .iter() + .map(|str| str.to_string()) + .collect_vec(); + let actual = lexical_ranges + .iter() + .map(|range| format!("{range}")) + .collect_vec(); + assert_eq!( + actual, expected, + "ERROR {}: expected ranges {:?} but found {:?}", + self.name, expected, actual + ); + + // Test: confirm found proper non overlapping (or not) ranges per given sort ordering + [ + TestSortOption::AscNullsFirst, + TestSortOption::AscNullsLast, + TestSortOption::DescNullsFirst, + TestSortOption::DescNullsLast, + ].into_iter().for_each(|sort_ordering| { + if let Some(nonoverlapping) = NonOverlappingOrderedLexicalRanges::try_new(&sort_ordering.sort_options(self.num_sort_keys).as_slice(), lexical_ranges.clone()).expect("should not error") { + assert!(self.expect_disjoint, "ERROR {} for {:?}: expected ranges to overlap, instead found disjoint ranges", self.name, &sort_ordering); + + let expected_ordered_indices = self.find_expected_indices(&sort_ordering); + assert_eq!(expected_ordered_indices, nonoverlapping.indices(), "ERROR {} for {:?}: expected to find indices ordered {:?}, instead found ordering {:?}", self.name, &sort_ordering, expected_ordered_indices, nonoverlapping.indices()); + } else { + assert!(!self.expect_disjoint, "ERROR {} for {:?}: expected to find disjoint ranges, instead could either not detect ranges or found overlapping ranges", self.name, &sort_ordering); + }; + }); + } + + fn build_lexical_ranges(&self) -> Vec { + self.partitions + .iter() + .map(|partition| { + let mut builder = LexicalRange::builder(); + for SortKeyRange { min, max } in &partition.range_per_sort_key { + builder.push(min.clone(), max.clone()); + } + builder.build() + }) + .collect_vec() + } + + fn find_expected_indices(&self, sort_ordering: &TestSortOption) -> &[usize] { + self.expected_ordered_indices + .iter() + .find(|ord| ord.sort_ordering == *sort_ordering) + .expect("should have expected outcome") + .expected_indices + .as_ref() + } + } + + struct TestPartition { + range_per_sort_key: Vec, + } + + /// Range of a sort key. Note that this is not impacted by directionality of ordering (e.g. [`SortOptions`]). 
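+    /// For example (illustrative values): a key range with `min = 1` and `max = 10`
+    /// is recorded as `(1, 10)` whether the column is later sorted ASC or DESC; the
+    /// sort options only change which partition is expected to come first.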
+ struct SortKeyRange { + min: ScalarValue, + max: ScalarValue, + } + + fn build_partition_with_single_sort_key( + ints: (Option, Option), + ) -> TestPartition { + let range_per_sort_key = vec![SortKeyRange { + min: ScalarValue::Int64(ints.0), + max: ScalarValue::Int64(ints.1), + }]; + TestPartition { range_per_sort_key } + } + + fn build_partition_with_multiple_sort_keys( + ints: (Option, Option), + strings: (Option, Option), + times: (Option, Option), + ) -> TestPartition { + let range_per_sort_key = vec![ + SortKeyRange { + min: ScalarValue::Int64(ints.0), + max: ScalarValue::Int64(ints.1), + }, + SortKeyRange { + min: ScalarValue::Utf8(strings.0), + max: ScalarValue::Utf8(strings.1), + }, + SortKeyRange { + min: ScalarValue::TimestampNanosecond(times.0, None), + max: ScalarValue::TimestampNanosecond(times.1, None), + }, + ]; + TestPartition { range_per_sort_key } + } + + #[derive(Debug, PartialEq)] + enum TestSortOption { + AscNullsLast, + AscNullsFirst, + DescNullsLast, + DescNullsFirst, + } + + impl TestSortOption { + fn sort_options(&self, len: usize) -> Vec { + match self { + Self::AscNullsLast => std::iter::repeat_n( + SortOptions { + descending: false, + nulls_first: false, + }, + len, + ) + .collect_vec(), + Self::AscNullsFirst => std::iter::repeat_n( + SortOptions { + descending: false, + nulls_first: true, + }, + len, + ) + .collect_vec(), + Self::DescNullsLast => std::iter::repeat_n( + SortOptions { + descending: true, + nulls_first: false, + }, + len, + ) + .collect_vec(), + Self::DescNullsFirst => std::iter::repeat_n( + SortOptions { + descending: true, + nulls_first: true, + }, + len, + ) + .collect_vec(), + } + } + } + + struct ExpectedOrderedIndices { + /// the ordering (e.g. [`SortOptions`]) applied to all columns in the sort key. + sort_ordering: TestSortOption, + /// Expected outcome ordering with this sort_ordering applied. + expected_indices: Vec, + } + + impl From<(TestSortOption, Vec)> for ExpectedOrderedIndices { + fn from(value: (TestSortOption, Vec)) -> Self { + Self { + sort_ordering: value.0, + expected_indices: value.1, + } + } + } + + #[test] + fn test_disjointness_single_key() { + let cases = [ + TestCase { + partitions: vec![ + build_partition_with_single_sort_key((Some(1), Some(10))), + build_partition_with_single_sort_key((Some(2), Some(10))), + build_partition_with_single_sort_key((Some(0), Some(0))), + ], + num_sort_keys: 1, + name: "order_by_single_sort_key__overlapping", + expected_ranges_per_partition: vec!["(1)->(10)", "(2)->(10)", "(0)->(0)"], + expect_disjoint: false, + expected_ordered_indices: vec![], + }, + TestCase { + partitions: vec![ + build_partition_with_single_sort_key((Some(1), Some(10))), + build_partition_with_single_sort_key((Some(11), Some(20))), + build_partition_with_single_sort_key((Some(0), Some(0))), + ], + num_sort_keys: 1, + name: "order_by_single_sort_key__disjoint", + expected_ranges_per_partition: vec![ + "(1)->(10)", + "(11)->(20)", + "(0)->(0)", + ], + expect_disjoint: true, + expected_ordered_indices: vec![ + (TestSortOption::AscNullsFirst, vec![2, 0, 1]).into(), + (TestSortOption::AscNullsLast, vec![2, 0, 1]).into(), + (TestSortOption::DescNullsFirst, vec![1, 0, 2]).into(), + (TestSortOption::DescNullsLast, vec![1, 0, 2]).into(), + ], + }, + ]; + + cases.into_iter().for_each(|test_case| test_case.run()); + } + + #[test] + fn test_disjointness_multiple_sort_keys() { + let cases = [ + /* Using the first sort key, an integer, as the decider. 
*/ + TestCase { + partitions: vec![ + build_partition_with_multiple_sort_keys( + (Some(1), Some(10)), + (Some("same".into()), Some("same".into())), + (Some(1), Some(1)), + ), + build_partition_with_multiple_sort_keys( + (Some(2), Some(10)), + (Some("same".into()), Some("same".into())), + (Some(1), Some(1)), + ), + build_partition_with_multiple_sort_keys( + (Some(0), Some(0)), + (Some("same".into()), Some("same".into())), + (Some(1), Some(1)), + ), + ], + num_sort_keys: 3, + name: "order_by_first_sort_key__overlapping", + expected_ranges_per_partition: vec![ + "(1,same,1)->(10,same,1)", + "(2,same,1)->(10,same,1)", + "(0,same,1)->(0,same,1)", + ], + expect_disjoint: false, + expected_ordered_indices: vec![], + }, + TestCase { + partitions: vec![ + build_partition_with_multiple_sort_keys( + (Some(1), Some(10)), + (Some("same".into()), Some("same".into())), + (Some(1), Some(1)), + ), + build_partition_with_multiple_sort_keys( + (Some(11), Some(20)), + (Some("same".into()), Some("same".into())), + (Some(1), Some(1)), + ), + build_partition_with_multiple_sort_keys( + (Some(0), Some(0)), + (Some("same".into()), Some("same".into())), + (Some(1), Some(1)), + ), + ], + num_sort_keys: 3, + name: "order_by_first_sort_key__disjoint", + expected_ranges_per_partition: vec![ + "(1,same,1)->(10,same,1)", + "(11,same,1)->(20,same,1)", + "(0,same,1)->(0,same,1)", + ], + expect_disjoint: true, + expected_ordered_indices: vec![ + (TestSortOption::AscNullsFirst, vec![2, 0, 1]).into(), + (TestSortOption::AscNullsLast, vec![2, 0, 1]).into(), + (TestSortOption::DescNullsFirst, vec![1, 0, 2]).into(), + (TestSortOption::DescNullsLast, vec![1, 0, 2]).into(), + ], + }, + /* Using the middle sort key, a string, as the decider. */ + TestCase { + partitions: vec![ + build_partition_with_multiple_sort_keys( + (Some(1), Some(1)), + (Some("a".into()), Some("d".into())), + (Some(1), Some(1)), + ), + build_partition_with_multiple_sort_keys( + (Some(1), Some(1)), + (Some("f".into()), Some("g".into())), + (Some(1), Some(1)), + ), + build_partition_with_multiple_sort_keys( + (Some(1), Some(1)), + (Some("c".into()), Some("e".into())), + (Some(1), Some(1)), + ), + ], + num_sort_keys: 3, + name: "order_by_middle_sort_key__overlapping", + expected_ranges_per_partition: vec![ + "(1,a,1)->(1,d,1)", + "(1,f,1)->(1,g,1)", + "(1,c,1)->(1,e,1)", + ], + expect_disjoint: false, + expected_ordered_indices: vec![], + }, + TestCase { + partitions: vec![ + build_partition_with_multiple_sort_keys( + (Some(1), Some(1)), + (Some("a".into()), Some("b".into())), + (Some(1), Some(1)), + ), + build_partition_with_multiple_sort_keys( + (Some(1), Some(1)), + (Some("f".into()), Some("g".into())), + (Some(1), Some(1)), + ), + build_partition_with_multiple_sort_keys( + (Some(1), Some(1)), + (Some("c".into()), Some("e".into())), + (Some(1), Some(1)), + ), + ], + num_sort_keys: 3, + name: "order_by_middle_sort_key__disjoint", + expected_ranges_per_partition: vec![ + "(1,a,1)->(1,b,1)", + "(1,f,1)->(1,g,1)", + "(1,c,1)->(1,e,1)", + ], + expect_disjoint: true, + expected_ordered_indices: vec![ + (TestSortOption::AscNullsFirst, vec![0, 2, 1]).into(), + (TestSortOption::AscNullsLast, vec![0, 2, 1]).into(), + (TestSortOption::DescNullsFirst, vec![1, 2, 0]).into(), + (TestSortOption::DescNullsLast, vec![1, 2, 0]).into(), + ], + }, + /* Using the last sort key, a nanosecond timestamp, as the decider. 
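+            The integer and string sort keys are identical across these partitions,
+            so disjointness hinges on the timestamp ranges alone.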
*/ + TestCase { + partitions: vec![ + build_partition_with_multiple_sort_keys( + (Some(1), Some(1)), + (Some("same".into()), Some("same".into())), + (Some(50000000), Some(50000001)), + ), + build_partition_with_multiple_sort_keys( + (Some(1), Some(1)), + (Some("same".into()), Some("same".into())), + (Some(700000), Some(50000001)), + ), + build_partition_with_multiple_sort_keys( + (Some(1), Some(1)), + (Some("same".into()), Some("same".into())), + (Some(100000000), Some(100000001)), + ), + ], + num_sort_keys: 3, + name: "order_by_last_sort_key__overlapping", + expected_ranges_per_partition: vec![ + "(1,same,50000000)->(1,same,50000001)", + "(1,same,700000)->(1,same,50000001)", + "(1,same,100000000)->(1,same,100000001)", + ], + expect_disjoint: false, + expected_ordered_indices: vec![], + }, + TestCase { + partitions: vec![ + build_partition_with_multiple_sort_keys( + (Some(1), Some(1)), + (Some("same".into()), Some("same".into())), + (Some(50000000), Some(50000001)), + ), + build_partition_with_multiple_sort_keys( + (Some(1), Some(1)), + (Some("same".into()), Some("same".into())), + (Some(700000), Some(7000001)), + ), + build_partition_with_multiple_sort_keys( + (Some(1), Some(1)), + (Some("same".into()), Some("same".into())), + (Some(100000000), Some(100000001)), + ), + ], + num_sort_keys: 3, + name: "order_by_last_sort_key__disjoint", + expected_ranges_per_partition: vec![ + "(1,same,50000000)->(1,same,50000001)", + "(1,same,700000)->(1,same,7000001)", + "(1,same,100000000)->(1,same,100000001)", + ], + expect_disjoint: true, + expected_ordered_indices: vec![ + (TestSortOption::AscNullsFirst, vec![1, 0, 2]).into(), + (TestSortOption::AscNullsLast, vec![1, 0, 2]).into(), + (TestSortOption::DescNullsFirst, vec![2, 0, 1]).into(), + (TestSortOption::DescNullsLast, vec![2, 0, 1]).into(), + ], + }, + ]; + + cases.into_iter().for_each(|test_case| test_case.run()); + } + + #[test] + fn test_disjointness_with_nulls() { + let cases = [ + TestCase { + partitions: vec![ + build_partition_with_single_sort_key((Some(1), Some(10))), + build_partition_with_single_sort_key((Some(2), Some(10))), + build_partition_with_single_sort_key((None, None)), + ], + num_sort_keys: 1, + name: "order_by_nulls__overlapping", + expected_ranges_per_partition: vec![ + "(1)->(10)", + "(2)->(10)", + "(NULL)->(NULL)", + ], + expect_disjoint: false, + expected_ordered_indices: vec![], + }, + TestCase { + partitions: vec![ + build_partition_with_single_sort_key((Some(1), Some(10))), + build_partition_with_single_sort_key((Some(11), Some(20))), + build_partition_with_single_sort_key((None, None)), + ], + num_sort_keys: 1, + name: "order_by_nulls__disjoint", + expected_ranges_per_partition: vec![ + "(1)->(10)", + "(11)->(20)", + "(NULL)->(NULL)", + ], + expect_disjoint: true, + expected_ordered_indices: vec![ + (TestSortOption::AscNullsFirst, vec![2, 0, 1]).into(), + (TestSortOption::AscNullsLast, vec![0, 1, 2]).into(), + (TestSortOption::DescNullsFirst, vec![2, 1, 0]).into(), + (TestSortOption::DescNullsLast, vec![1, 0, 2]).into(), + ], + }, + ]; + + cases.into_iter().for_each(|test_case| test_case.run()); + } +} diff --git a/datafusion/physical-optimizer/src/progressive_evaluation/statistics.rs b/datafusion/physical-optimizer/src/progressive_evaluation/statistics.rs new file mode 100644 index 000000000000..2b2ae31f39bb --- /dev/null +++ b/datafusion/physical-optimizer/src/progressive_evaluation/statistics.rs @@ -0,0 +1,294 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license 
agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use datafusion_common::stats::Precision; +use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor}; +use datafusion_common::{ + internal_datafusion_err, Result as DatafusionResult, ScalarValue, +}; +use datafusion_datasource::file_scan_config::FileScanConfig; +use datafusion_datasource::source::DataSourceExec; +use datafusion_physical_plan::aggregates::AggregateExec; +use datafusion_physical_plan::empty::EmptyExec; +use datafusion_physical_plan::joins::{ + CrossJoinExec, HashJoinExec, NestedLoopJoinExec, SymmetricHashJoinExec, +}; +use datafusion_physical_plan::placeholder_row::PlaceholderRowExec; +use datafusion_physical_plan::repartition::RepartitionExec; +use datafusion_physical_plan::sorts::sort::SortExec; +use datafusion_physical_plan::ColumnStatistics; +use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties}; + +/// Similar to our in-house `StatisticsVisitor`, except it performs the statistics extraction per partition, +/// and not on the entire node. +#[derive(Debug)] +pub struct PartitionStatisticsVisitor<'a> { + column_name: &'a str, + partition: usize, + statistics: Option, +} + +impl<'a> PartitionStatisticsVisitor<'a> { + pub fn new(column_name: &'a str, partition: usize) -> Self { + Self { + column_name, + partition, + statistics: Some(ColumnStatistics::new_unknown()), + } + } + + /// Consumes self, returning the found [`ColumnStatistics`]. + /// + /// Returning None means the [`PartitionStatisticsVisitor`] could not extract the stats. + pub fn result(&mut self) -> Option { + std::mem::take(&mut self.statistics) + } + + /// Merge stats. + fn merge_stats<'i>( + mut stats: impl Iterator>, + ) -> Option { + let Some(start) = stats.next() else { + // stats is empty + return None; + }; + stats.fold(start.cloned(), |a, b| { + if let (Some(a), Some(b)) = (a, b) { + Some(merge_stats(a, b)) + } else { + None // we hit something that prevented stats extraction + } + }) + } + + /// Find partition within multiple children, and return the partition stats. + fn find_stats_per_partition_within_multiple_children( + &mut self, + node: &Arc, + ) -> DatafusionResult> { + // figure out which input of the union to use + let mut child_partition = self.partition; + for child in node.children() { + let num_child_partitions = partition_count(child); + if child_partition < num_child_partitions { + self.partition = child_partition; + child.visit(self)?; + return Ok(self.result()); + } + child_partition -= num_child_partitions; + } + unreachable!("didn't find the partition in children"); + } + + /// Applied per node, merge all statistics across partitions in node. 
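+    ///
+    /// A minimal sketch of the merge semantics (illustrative values, not from the
+    /// patch): two input partitions reporting `min=1, max=10, nulls=0` and
+    /// `min=5, max=20, nulls=2` combine into `min=1, max=20, nulls=2`, while distinct
+    /// and sum counts are dropped (see `merge_stats` at the bottom of this module).
+    ///
+    /// ```ignore
+    /// use datafusion_common::stats::Precision;
+    /// use datafusion_common::{ColumnStatistics, ScalarValue};
+    ///
+    /// let a = ColumnStatistics {
+    ///     null_count: Precision::Exact(0),
+    ///     min_value: Precision::Exact(ScalarValue::Int64(Some(1))),
+    ///     max_value: Precision::Exact(ScalarValue::Int64(Some(10))),
+    ///     ..Default::default()
+    /// };
+    /// let b = ColumnStatistics {
+    ///     null_count: Precision::Exact(2),
+    ///     min_value: Precision::Exact(ScalarValue::Int64(Some(5))),
+    ///     max_value: Precision::Exact(ScalarValue::Int64(Some(20))),
+    ///     ..Default::default()
+    /// };
+    /// let merged = merge_stats(a, &b);
+    /// assert_eq!(merged.null_count, Precision::Exact(2));
+    /// assert_eq!(merged.min_value, Precision::Exact(ScalarValue::Int64(Some(1))));
+    /// assert_eq!(merged.max_value, Precision::Exact(ScalarValue::Int64(Some(20))));
+    /// ```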
+ fn merge_column_statistics_across_partitions( + &mut self, + node: &Arc, + ) -> DatafusionResult> { + let cnt_partitions = partition_count_children(node); + let mut extracted_stats = Vec::with_capacity(cnt_partitions); + + for child in node.children() { + for partition in 0..partition_count(child) { + let mut extractor = Self::new(self.column_name, partition); + child.visit(&mut extractor)?; + extracted_stats.push(extractor.result()); + } + } + + Ok(Self::merge_stats( + extracted_stats.iter().map(|a| a.as_ref()), + )) + } + + /// Extract column statistics from file_group of parquet exec. + /// + /// This is required since we need the statistics per partition. + /// + /// This is a hack until upstream code is in place. + fn extract_statistics_from_file_group( + &self, + datasrc_exec: &DataSourceExec, + ) -> DatafusionResult> { + let Ok(col_idx) = datasrc_exec.schema().index_of(self.column_name) else { + // maybe alias + return Ok(None); + }; + + // Extract partitioned files from the ParquetExec + let Some(base_config) = datasrc_exec + .data_source() + .as_any() + .downcast_ref::() + else { + return Ok(None); + }; + + let file_group_partition = base_config.file_groups.get(self.partition).ok_or( + internal_datafusion_err!( + "requested partition does not exist in datasource exec" + ), + )?; + + Ok(file_group_partition + .statistics() + .as_ref() + .and_then(|stats| stats.column_statistics.get(col_idx).cloned())) + } + + /// Extract from datasource, with consideration of partitioning. + fn extract_from_data_source( + &mut self, + datasrc: &Arc, + ) -> DatafusionResult> { + if datasrc.as_any().downcast_ref::().is_some() + || datasrc + .as_any() + .downcast_ref::() + .is_some() + { + Ok(self.statistics.clone()) + } else if let Some(datasrc_exec) = + datasrc.as_any().downcast_ref::() + { + let datarsc_stats = + match self.extract_statistics_from_file_group(datasrc_exec) { + Ok(Some(col_stats)) => Some(convert_min_max_to_exact(col_stats)), + _ => None, + }; + Ok(datarsc_stats) + } else { + // specific constraint on our plans + unreachable!("should not have another unsupported data source") + } + } +} + +impl<'n> TreeNodeVisitor<'n> for PartitionStatisticsVisitor<'_> { + type Node = Arc; + + fn f_down(&mut self, node: &'n Self::Node) -> DatafusionResult { + if !is_supported(node) { + self.statistics = None; + Ok(TreeNodeRecursion::Stop) + } else if is_leaf(node) { + self.statistics = self.extract_from_data_source(node)?; + Ok(TreeNodeRecursion::Stop) + } else if should_merge_partition_statistics(node) { + self.statistics = self.merge_column_statistics_across_partitions(node)?; + Ok(TreeNodeRecursion::Jump) + } else if should_pass_thru_partition_statistics(node) { + self.statistics = + self.find_stats_per_partition_within_multiple_children(node)?; + Ok(TreeNodeRecursion::Stop) + } else { + self.statistics = None; + Ok(TreeNodeRecursion::Stop) + } + } +} + +fn is_supported(plan: &Arc) -> bool { + !(plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some() // aggregate values can be different than min/max + || plan.as_any().downcast_ref::().is_some()) +} + +/// Has multiple input partitions, and 1 output partition. Merge stats. +/// +/// This is a heuristic (based on our plans) and contingent on [`is_supported`]. 
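+///
+/// For example, a `SortExec` with `preserve_partitioning=false` reading a
+/// multi-partition `UnionExec` collapses its input into a single output
+/// partition, so the column statistics of that one output partition are the
+/// merge of the statistics of all child partitions.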
+fn should_merge_partition_statistics(plan: &Arc) -> bool { + let sort_preserving_partitioning = + if let Some(sort_exec) = plan.as_any().downcast_ref::() { + sort_exec.preserve_partitioning() + } else { + false + }; + partition_count(plan) == 1 + && has_multiple_child_partitions(plan) + && !sort_preserving_partitioning +} + +/// The input partitions and output partitions are expected to have unchanged statistics. +/// +/// This is a heuristic (based on our plans) and contingent on [`is_supported`]. +fn should_pass_thru_partition_statistics(plan: &Arc) -> bool { + plan.children().len() == 1 || partition_count(plan) == partition_count_children(plan) +} + +fn is_leaf(plan: &Arc) -> bool { + plan.children().is_empty() +} + +fn has_multiple_child_partitions(plan: &Arc) -> bool { + plan.children().len() > 1 || partition_count_children(plan) > 1 +} + +fn partition_count_children(plan: &Arc) -> usize { + plan.children() + .iter() + .map(|child| partition_count(child)) + .sum::() +} + +fn partition_count(plan: &Arc) -> usize { + plan.output_partitioning().partition_count() +} + +fn merge_stats(a: ColumnStatistics, b: &ColumnStatistics) -> ColumnStatistics { + ColumnStatistics { + null_count: a.null_count.add(&b.null_count), + min_value: a.min_value.min(&b.min_value), + max_value: a.max_value.max(&b.max_value), + distinct_count: Precision::Absent, + sum_value: Precision::Absent, + } +} + +/// DataFusion statistics can't distinguish "inexact" from "known to be within the range" +/// +/// Converts inexact values to exact if possible for the purposes of +/// analysis by [`PartitionStatisticsVisitor`] +fn convert_min_max_to_exact(mut column_statistics: ColumnStatistics) -> ColumnStatistics { + column_statistics.min_value = to_exact(column_statistics.min_value); + column_statistics.max_value = to_exact(column_statistics.max_value); + column_statistics +} + +/// Convert [`Precision::Inexact`] to [`Precision::Exact`] if possible +fn to_exact(v: Precision) -> Precision { + match v { + Precision::Exact(v) | Precision::Inexact(v) => Precision::Exact(v), + Precision::Absent => Precision::Absent, + } +} + +/// Return min max of a ColumnStatistics with precise values +pub fn column_statistics_min_max( + column_statistics: ColumnStatistics, +) -> Option<(ScalarValue, ScalarValue)> { + match (column_statistics.min_value, column_statistics.max_value) { + (Precision::Exact(min), Precision::Exact(max)) => Some((min, max)), + // the statistics values are absent or imprecise + _ => None, + } +} diff --git a/datafusion/physical-optimizer/src/progressive_evaluation/util.rs b/datafusion/physical-optimizer/src/progressive_evaluation/util.rs new file mode 100644 index 000000000000..c6b76e6e77e9 --- /dev/null +++ b/datafusion/physical-optimizer/src/progressive_evaluation/util.rs @@ -0,0 +1,171 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use datafusion_common::{ + error::Result, + tree_node::{Transformed, TreeNodeRecursion}, +}; +use datafusion_datasource::{ + file_groups::FileGroup, file_scan_config::FileScanConfig, source::DataSourceExec, + PartitionedFile, +}; +use datafusion_physical_expr::LexOrdering; +use datafusion_physical_plan::{sorts::sort::SortExec, ExecutionPlan}; + +use itertools::Itertools; + +use crate::progressive_evaluation::extract_ranges::extract_ranges_from_files; + +// This is an optimization for the most recent value query `ORDER BY time DESC/ASC LIMIT n` +// See: https://github.com/influxdata/influxdb_iox/issues/12205 +// The observation is recent data is mostly in first file, so the plan should avoid reading the others unless necessary +// +/// This function is to split all files in the same ParquetExec into different groups/DF partitions and +/// set the `preserve_partitioning` so they will be executed sequentially. The files will later be re-ordered +/// (if non-overlapping) by lexical range. +pub fn split_parquet_files( + plan: Arc, + ordering_req: &LexOrdering, +) -> Result>> { + if let Some(sort_exec) = plan.as_any().downcast_ref::() { + if !sort_exec + .properties() + .equivalence_properties() + .ordering_satisfy(ordering_req) + { + // halt on DAG branch + Ok(Transformed::new(plan, false, TreeNodeRecursion::Stop)) + } else { + // continue down + Ok(Transformed::no(plan)) + } + } else if let Some(parquet_exec) = plan.as_any().downcast_ref::() { + if let Some(transformed) = + transform_parquet_exec_single_file_each_group(parquet_exec, ordering_req)? + { + Ok(Transformed::yes(transformed)) + } else { + Ok(Transformed::no(plan)) + } + } else { + Ok(Transformed::no(plan)) + } +} + +/// Transform a ParquetExec with N files in various groupings, +/// into a ParquetExec into N groups each include one file. +/// +/// The function is only called when the plan does not include DeduplicateExec and includes only one ParquetExec. 
+/// +/// This function will return error if +/// - There are no statsitics for the given column (including the when the column is missing from the file +/// and produce null values that leads to absent statistics) +/// - Some files overlap (the min/max time ranges are disjoint) +/// - There is a DeduplicateExec in the plan which means the data of the plan overlaps +/// +/// The output ParquetExec's are ordered such that the file with the most recent time ranges is read first +/// +/// For example +/// ```text +/// ParquetExec(groups=[[file1,file2], [file3]]) +/// ``` +/// Is rewritten so each file is in its own group and the files are ordered by time range +/// ```text +/// ParquetExec(groups=[[file1], [file2], [file3]]) +/// ``` +fn transform_parquet_exec_single_file_each_group( + datasrc_exec: &DataSourceExec, + ordering_req: &LexOrdering, +) -> Result>> { + if datasrc_exec + .properties() + .output_partitioning() + .partition_count() + == 1 + { + return Ok(None); + } + + // Extract partitioned files from the ParquetExec + let Some(base_config) = datasrc_exec + .data_source() + .as_any() + .downcast_ref::() + else { + return Ok(None); + }; + let files = base_config + .file_groups + .iter() + .flat_map(|group| group.files()) + .collect_vec(); + let schema = Arc::clone(&base_config.file_schema); + + // extract disjoint lexical ranges + // if cannot find, then is not disjoint + let Some(lexical_ranges) = extract_ranges_from_files(ordering_req, &files, schema)? + else { + return Ok(None); + }; + + // reorder partitioned files by lexical indices + let indices = lexical_ranges.indices(); + assert_eq!( + indices.len(), + files.len(), + "should have every file listed in the sorted indices" + ); + let mut new_partitioned_file_groups = files + .into_iter() + .enumerate() + .map(|(file_idx, file)| { + ( + indices + .iter() + .position(|sorted_idx| *sorted_idx == file_idx) + .expect("file should be listed in indices"), + file, + ) + }) + .collect::>(); + new_partitioned_file_groups.sort_by_key(|(idx, _)| *idx); + + // create new file grouping + let new_partitioned_file_groups = new_partitioned_file_groups + .into_iter() + .map(|(_, file)| { + // each file group has 1 file + build_file_group_with_stats(file) + }) + .collect_vec(); + + // Assigned new partitioned file groups to the new base config + let mut new_base_config = base_config.clone(); + new_base_config.file_groups = new_partitioned_file_groups; + + Ok(Some(DataSourceExec::from_data_source(new_base_config))) +} + +fn build_file_group_with_stats(file: &PartitionedFile) -> FileGroup { + let mut group = FileGroup::new(vec![file.clone()]); + if let Some(stats) = &file.statistics { + group = group.with_statistics(Arc::unwrap_or_clone(stats.to_owned())) + } + group +} From 4871fdd77b4a84cd88d4c1a5e98dfa9cc8137e29 Mon Sep 17 00:00:00 2001 From: wiedld Date: Thu, 8 May 2025 11:24:44 -0700 Subject: [PATCH 2/2] refactor: port to upstream the massive refactoring from the past few weeks --- datafusion/physical-optimizer/Cargo.toml | 2 +- .../src/progressive_evaluation.rs | 1 + .../progressive_evaluation/extract_ranges.rs | 1200 +++++------ .../progressive_evaluation/lexical_ranges.rs | 441 +++- .../src/progressive_evaluation/statistics.rs | 1856 +++++++++++++++-- .../statistics/fetch.rs | 211 ++ .../statistics/filter.rs | 68 + .../statistics/project_schema.rs | 632 ++++++ .../progressive_evaluation/statistics/util.rs | 111 + .../src/progressive_evaluation/util.rs | 259 ++- datafusion/physical-plan/src/filter.rs | 2 +- 11 files changed, 3752 
insertions(+), 1031 deletions(-) create mode 100644 datafusion/physical-optimizer/src/progressive_evaluation/statistics/fetch.rs create mode 100644 datafusion/physical-optimizer/src/progressive_evaluation/statistics/filter.rs create mode 100644 datafusion/physical-optimizer/src/progressive_evaluation/statistics/project_schema.rs create mode 100644 datafusion/physical-optimizer/src/progressive_evaluation/statistics/util.rs diff --git a/datafusion/physical-optimizer/Cargo.toml b/datafusion/physical-optimizer/Cargo.toml index 7fddca52f5fb..49eacc591fd7 100644 --- a/datafusion/physical-optimizer/Cargo.toml +++ b/datafusion/physical-optimizer/Cargo.toml @@ -52,7 +52,7 @@ log = { workspace = true } recursive = { workspace = true, optional = true } [dev-dependencies] -datafusion-expr = { workspace = true } datafusion-datasource-parquet = { workspace = true } +datafusion-expr = { workspace = true } datafusion-functions-nested = { workspace = true } insta = { workspace = true } diff --git a/datafusion/physical-optimizer/src/progressive_evaluation.rs b/datafusion/physical-optimizer/src/progressive_evaluation.rs index b162973e6a3a..60a130c30962 100644 --- a/datafusion/physical-optimizer/src/progressive_evaluation.rs +++ b/datafusion/physical-optimizer/src/progressive_evaluation.rs @@ -18,6 +18,7 @@ //! TODO: physical optimizer run to conditionally swap the SPM with the ProgressiveEvalExec mod extract_ranges; +#[allow(dead_code)] mod lexical_ranges; mod statistics; mod util; diff --git a/datafusion/physical-optimizer/src/progressive_evaluation/extract_ranges.rs b/datafusion/physical-optimizer/src/progressive_evaluation/extract_ranges.rs index e26ea8119bf5..3a37e7996bda 100644 --- a/datafusion/physical-optimizer/src/progressive_evaluation/extract_ranges.rs +++ b/datafusion/physical-optimizer/src/progressive_evaluation/extract_ranges.rs @@ -19,8 +19,9 @@ use arrow::compute::SortOptions; use arrow::datatypes::Schema; -use datafusion_common::tree_node::TreeNode; -use datafusion_common::{ColumnStatistics, Result, ScalarValue}; +use datafusion_common::{ + internal_datafusion_err, ColumnStatistics, Result, ScalarValue, Statistics, +}; use datafusion_datasource::PartitionedFile; use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::LexOrdering; @@ -28,7 +29,7 @@ use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties}; use std::sync::Arc; use super::lexical_ranges::{LexicalRange, NonOverlappingOrderedLexicalRanges}; -use super::statistics::{column_statistics_min_max, PartitionStatisticsVisitor}; +use super::statistics::{column_statistics_min_max, statistics_by_partition}; /// Attempt to extract LexicalRanges for the given sort keys and input plan /// @@ -50,22 +51,31 @@ pub fn extract_disjoint_ranges_from_plan( let num_input_partitions = partition_count(input_plan); - // one builder for each output partition + // One builder for each output partition. + // Each builder will contain multiple sort keys let mut builders = vec![LexicalRange::builder(); num_input_partitions]; + + // get partitioned stats for the plan + // TODO: this will be replace with the `ExecutionPlan::statistics_by_partition`, but it doesn't yet cover our + // use cases. (It does cover joins, which we don't address.) 
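+    // For illustration (values borrowed from the tests below): given two input
+    // partitions whose sort column spans [1000, 2000] and [2001, 3500], the
+    // builders accumulate the lexical ranges (1000)->(2000) and (2001)->(3500),
+    // which are disjoint, so the partitions can later be reordered rather than merged.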
+ let partitioned_stats = statistics_by_partition(input_plan.as_ref())?; + + // add per sort key for sort_expr in exprs.iter() { let Some(column) = sort_expr.expr.as_any().downcast_ref::() else { return Ok(None); }; - let col_name = column.name(); + let Ok(col_idx) = input_plan.schema().index_of(column.name()) else { + return Ok(None); + }; - for (partition, builder) in - builders.iter_mut().enumerate().take(num_input_partitions) + // add per partition + for (builder, stats_for_partition) in builders.iter_mut().zip(&partitioned_stats) { - let Some((min, max)) = min_max_for_partition( - col_name, + let Some((min, max)) = min_max_for_stats_with_sort_options( + col_idx, &sort_expr.options, - partition, - input_plan, + stats_for_partition, )? else { return Ok(None); @@ -87,57 +97,52 @@ fn partition_count(plan: &Arc) -> usize { plan.output_partitioning().partition_count() } -/// Returns the min and max value for the specified partition. +// Returns the min and max value for the [`Statistics`], +/// and handles null values based on [`SortOptions`]. /// -/// Eventually this would be represented by the Statistics structure in -/// DataFusion. -fn min_max_for_partition( - col_name: &str, +/// Returns None if statistics are absent/unknown. +fn min_max_for_stats_with_sort_options( + col_idx: usize, sort_options: &SortOptions, - partition: usize, - plan: &Arc, + stats: &Statistics, ) -> Result> { // Check if the column is a constant value according to the equivalence properties (TODO) - let mut extractor = PartitionStatisticsVisitor::new(col_name, partition); - plan.visit(&mut extractor)?; - let stats = extractor.result(); - - if let Some(ColumnStatistics { - min_value, + let Some(ColumnStatistics { max_value, + min_value, null_count, .. - }) = stats - { - let (Some(min), Some(max)) = (min_value.get_value(), max_value.get_value()) - else { - return Ok(None); - }; + }) = stats.column_statistics.get(col_idx) + else { + return Err(internal_datafusion_err!( + "extracted statistics is missing @{:?}, only has {:?} columns", + col_idx, + stats.column_statistics.len() + )); + }; - let mut min = min.clone(); - let mut max = max.clone(); - if *null_count.get_value().unwrap_or(&0) > 0 { - let nulls_as_min = !sort_options.descending && sort_options.nulls_first // ASC nulls first - || sort_options.descending && !sort_options.nulls_first; // DESC nulls last + let (Some(min), Some(max)) = (min_value.get_value(), max_value.get_value()) else { + return Ok(None); + }; - // TODO(wiedld): use ScalarValue::Null after using type coersion in ConvertedRows? - // For now, return None if fails to convert. - let Some(null) = use_scalar_none(&min) else { - return Ok(None); - }; + let mut min = min.clone(); + let mut max = max.clone(); + if *null_count.get_value().unwrap_or(&0) > 0 { + let nulls_as_min = !sort_options.descending && sort_options.nulls_first // ASC nulls first + || sort_options.descending && !sort_options.nulls_first; // DESC nulls last - if nulls_as_min { - min = null; - } else { - max = null; - } - } + // Get the typed null value for the data type of min/max + let null: ScalarValue = min.data_type().try_into()?; - Ok(Some((min, max))) - } else { - Ok(None) + if nulls_as_min { + min = null; + } else { + max = null; + } } + + Ok(Some((min, max))) } /// Attempt to extract LexicalRanges for the given sort keys and partitioned files @@ -195,706 +200,485 @@ fn min_max_for_partitioned_file( Ok(column_statistics_min_max(col_stats)) } -/// TODO: remove this function. 
-/// -/// This is due to our [`ConvertedRows`] not yet handling type coersion of [`ScalarValue::Null`]. -fn use_scalar_none(value: &ScalarValue) -> Option { - match value { - ScalarValue::Boolean(_) => Some(ScalarValue::Boolean(None)), - ScalarValue::Float16(_) => Some(ScalarValue::Float16(None)), - ScalarValue::Float32(_) => Some(ScalarValue::Float32(None)), - ScalarValue::Float64(_) => Some(ScalarValue::Float64(None)), - ScalarValue::Decimal128(_, a, b) => Some(ScalarValue::Decimal128(None, *a, *b)), - ScalarValue::Decimal256(_, a, b) => Some(ScalarValue::Decimal256(None, *a, *b)), - ScalarValue::Int8(_) => Some(ScalarValue::Int8(None)), - ScalarValue::Int16(_) => Some(ScalarValue::Int16(None)), - ScalarValue::Int32(_) => Some(ScalarValue::Int32(None)), - ScalarValue::Int64(_) => Some(ScalarValue::Int64(None)), - ScalarValue::UInt8(_) => Some(ScalarValue::UInt8(None)), - ScalarValue::UInt16(_) => Some(ScalarValue::UInt16(None)), - ScalarValue::UInt32(_) => Some(ScalarValue::UInt32(None)), - ScalarValue::UInt64(_) => Some(ScalarValue::UInt64(None)), - ScalarValue::Utf8(_) => Some(ScalarValue::Utf8(None)), - ScalarValue::Utf8View(_) => Some(ScalarValue::Utf8View(None)), - ScalarValue::LargeUtf8(_) => Some(ScalarValue::LargeUtf8(None)), - ScalarValue::Binary(_) => Some(ScalarValue::Binary(None)), - ScalarValue::BinaryView(_) => Some(ScalarValue::BinaryView(None)), - ScalarValue::FixedSizeBinary(i, _) => { - Some(ScalarValue::FixedSizeBinary(*i, None)) - } - ScalarValue::LargeBinary(_) => Some(ScalarValue::LargeBinary(None)), - ScalarValue::Date32(_) => Some(ScalarValue::Date32(None)), - ScalarValue::Date64(_) => Some(ScalarValue::Date64(None)), - ScalarValue::Time32Second(_) => Some(ScalarValue::Time32Second(None)), - ScalarValue::Time32Millisecond(_) => Some(ScalarValue::Time32Millisecond(None)), - ScalarValue::Time64Microsecond(_) => Some(ScalarValue::Time64Microsecond(None)), - ScalarValue::Time64Nanosecond(_) => Some(ScalarValue::Time64Nanosecond(None)), - ScalarValue::TimestampSecond(_, tz) => { - Some(ScalarValue::TimestampSecond(None, tz.clone())) - } - ScalarValue::TimestampMillisecond(_, tz) => { - Some(ScalarValue::TimestampMillisecond(None, tz.clone())) - } - ScalarValue::TimestampMicrosecond(_, tz) => { - Some(ScalarValue::TimestampMicrosecond(None, tz.clone())) - } - ScalarValue::TimestampNanosecond(_, tz) => { - Some(ScalarValue::TimestampNanosecond(None, tz.clone())) - } - ScalarValue::IntervalYearMonth(_) => Some(ScalarValue::IntervalYearMonth(None)), - ScalarValue::IntervalDayTime(_) => Some(ScalarValue::IntervalDayTime(None)), - ScalarValue::IntervalMonthDayNano(_) => { - Some(ScalarValue::IntervalMonthDayNano(None)) - } - ScalarValue::DurationSecond(_) => Some(ScalarValue::DurationSecond(None)), - ScalarValue::DurationMillisecond(_) => { - Some(ScalarValue::DurationMillisecond(None)) - } - ScalarValue::DurationMicrosecond(_) => { - Some(ScalarValue::DurationMicrosecond(None)) - } - ScalarValue::DurationNanosecond(_) => Some(ScalarValue::DurationNanosecond(None)), - - // for now, don't support others. 
- _ => None, - } -} - #[cfg(test)] mod tests { - use std::cmp; - use super::*; - - use arrow::{ - compute::SortOptions, - datatypes::{DataType, Field}, - }; - use datafusion_common::{stats::Precision, ColumnStatistics, Statistics}; - use datafusion_datasource::{ - file_groups::FileGroup, file_scan_config::FileScanConfigBuilder, - source::DataSourceExec, - }; - use datafusion_datasource_parquet::source::ParquetSource; - use datafusion_execution::object_store::ObjectStoreUrl; - use datafusion_physical_expr::{LexOrdering, PhysicalSortExpr}; - use datafusion_physical_plan::{ - expressions::col, sorts::sort::SortExec, union::UnionExec, + use crate::progressive_evaluation::util::test_utils::{ + parquet_exec_with_sort_with_statistics, + parquet_exec_with_sort_with_statistics_and_schema, sort_exec, union_exec, + SortKeyRange, }; - use itertools::Itertools; - - struct KeyRange { - min: Option, - max: Option, - null_count: usize, - } + use std::fmt::{Debug, Display, Formatter}; - fn schema() -> Arc { - Arc::new(Schema::new(vec![Field::new("a", DataType::Int64, true)])) - } - - /// Create a single parquet - fn parquet_exec_with_sort_with_statistics( - output_ordering: Vec, - key_ranges: &[&KeyRange], - ) -> Arc { - let mut statistics = Statistics::new_unknown(&schema()); - let mut file_groups = Vec::with_capacity(key_ranges.len()); - - let mut cum_null_count = 0; - let mut cum_min = None; - let mut cum_max = None; - for key_range in key_ranges { - let KeyRange { - min, - max, - null_count, - } = key_range; - - // update cummulative statistics for entire parquet exec - cum_min = match (cum_min, min) { - (None, x) => *x, - (x, None) => x, - (Some(a), Some(b)) => Some(cmp::min(a, *b)), - }; - cum_max = match (cum_max, max) { - (None, x) => *x, - (x, None) => x, - (Some(a), Some(b)) => Some(cmp::max(a, *b)), - }; - cum_null_count += *null_count; - - // Create file with statistics. 
- let mut file = PartitionedFile::new("x".to_string(), 100); - let stats = Statistics { - num_rows: Precision::Absent, - total_byte_size: Precision::Absent, - column_statistics: vec![ColumnStatistics { - null_count: Precision::Exact(*null_count), - min_value: Precision::Exact(ScalarValue::Int32(*min)), - max_value: Precision::Exact(ScalarValue::Int32(*max)), - ..Default::default() - }], - }; - file.statistics = Some(Arc::new(stats.clone())); - file_groups.push(FileGroup::new(vec![file]).with_statistics(stats)); - } - - // add stats, for the whole parquet exec, for a single column - statistics.column_statistics[0] = ColumnStatistics { - null_count: Precision::Exact(cum_null_count), - min_value: Precision::Exact(ScalarValue::Int32(cum_min)), - max_value: Precision::Exact(ScalarValue::Int32(cum_max)), - ..Default::default() - }; - - let config = Arc::new( - FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test:///").unwrap(), - schema(), - Arc::new(ParquetSource::new(Default::default())), - ) - .with_file_groups(file_groups) - .with_output_ordering(output_ordering) - .with_statistics(statistics) - .build(), - ); - - Arc::new(DataSourceExec::new(config)) - } - - fn union_exec(input: Vec>) -> Arc { - Arc::new(UnionExec::new(input)) - } - - fn sort_exec( - sort_exprs: LexOrdering, - input: Arc, - preserve_partitioning: bool, - ) -> Arc { - let new_sort = SortExec::new(sort_exprs, input) - .with_preserve_partitioning(preserve_partitioning); - Arc::new(new_sort) - } + use arrow::compute::SortOptions; + use arrow::datatypes::{DataType, Field}; + use datafusion_physical_expr::{LexOrdering, PhysicalSortExpr}; + use datafusion_physical_plan::displayable; + use datafusion_physical_plan::expressions::col; - fn str_lexical_ranges(lex_ranges: &[LexicalRange]) -> String { - lex_ranges - .iter() - .map(|lr| lr.to_string()) - .collect_vec() - .join(", ") - } + use insta::assert_snapshot; - /// Build a test physical plan like: - /// UNION - /// ParquetExec (range_a) - /// SORT - /// UNION - /// ParquetExec (range_b_1) - /// ParquetExec (range_b_2) - fn build_test_case( - ordering: &LexOrdering, - key_ranges: &[KeyRange; 3], - ) -> Arc { - let [range_a, range_b_1, range_b_2] = key_ranges; - - let datasrc_a = - parquet_exec_with_sort_with_statistics(vec![ordering.clone()], &[range_a]); - - let datasrc_b1 = - parquet_exec_with_sort_with_statistics(vec![ordering.clone()], &[range_b_1]); - let datasrc_b2 = - parquet_exec_with_sort_with_statistics(vec![ordering.clone()], &[range_b_2]); - let b = sort_exec( - ordering.clone(), - union_exec(vec![datasrc_b1, datasrc_b2]), - false, - ); - - union_exec(vec![datasrc_a, b]) - } - - #[test] - fn test_extract_disjoint_ranges() -> Result<()> { - let plan_ranges_disjoint = &[ - KeyRange { + /// test with three partition ranges that are disjoint (non overlapping) + fn test_case_disjoint_3() -> TestCaseBuilder { + TestCaseBuilder::new() + .with_key_range(SortKeyRange { min: Some(1000), max: Some(2000), null_count: 0, - }, - KeyRange { + }) + .with_key_range(SortKeyRange { min: Some(2001), max: Some(3000), null_count: 0, - }, - KeyRange { + }) + .with_key_range(SortKeyRange { min: Some(3001), max: Some(3500), null_count: 0, - }, - ]; - let plan_ranges_overlapping = &[ - KeyRange { + }) + } + + /// test with three partition ranges that are NOT disjoint (are overlapping) + fn test_case_overlapping_3() -> TestCaseBuilder { + TestCaseBuilder::new() + .with_key_range(SortKeyRange { min: Some(1000), max: Some(2010), null_count: 0, - }, - KeyRange { + }) + 
.with_key_range(SortKeyRange { min: Some(2001), max: Some(3000), null_count: 0, - }, - KeyRange { + }) + .with_key_range(SortKeyRange { min: Some(3001), max: Some(3500), null_count: 0, - }, - ]; - - // Test: ASC, is disjoint (not overlapping) - // (1000)->(2000), (2001)->(3500) - let options = SortOptions { - descending: false, - ..Default::default() - }; - let sort_expr = PhysicalSortExpr::new(col("a", &schema())?, options); - let lex_ordering = LexOrdering::new(vec![sort_expr]); - let plan = build_test_case(&lex_ordering, plan_ranges_disjoint); - let Some(actual) = extract_disjoint_ranges_from_plan(&lex_ordering, &plan)? - else { - unreachable!("should find disjoint") - }; - assert_eq!(actual.indices(), &[0, 1], "should find proper ordering"); - assert_eq!( - format!("{}", str_lexical_ranges(actual.value_ranges())), - "(1000)->(2000), (2001)->(3500)", - "should find proper ranges" - ); - - // Test: ASC, is overlapping - // (1000)->(2010), (2001)->(3500) - let options = SortOptions { - descending: false, - ..Default::default() - }; - let sort_expr = PhysicalSortExpr::new(col("a", &schema())?, options); - let lex_ordering = LexOrdering::new(vec![sort_expr]); - let plan = build_test_case(&lex_ordering, plan_ranges_overlapping); - let actual = extract_disjoint_ranges_from_plan(&lex_ordering, &plan)?; - assert!(actual.is_none(), "should not find disjoint range"); - - // Test: DESC, is disjoint (not overlapping) - // (2001)->(3500), (1000)->(2000) - let options = SortOptions { - descending: true, - ..Default::default() - }; - let sort_expr = PhysicalSortExpr::new(col("a", &schema())?, options); - let lex_ordering = LexOrdering::new(vec![sort_expr]); - let plan = build_test_case(&lex_ordering, plan_ranges_disjoint); - let Some(actual) = extract_disjoint_ranges_from_plan(&lex_ordering, &plan)? - else { - unreachable!("should find disjoint") - }; - assert_eq!(actual.indices(), &[1, 0], "should find proper ordering"); // NOTE THE INVERSE ORDER - assert_eq!( - format!("{}", str_lexical_ranges(actual.value_ranges())), - "(2001)->(3500), (1000)->(2000)", - "should find proper ranges" - ); - - // Test: DESC, is overlapping - // (2001)->(3500), (1000)->(2010) - let options = SortOptions { - descending: false, - ..Default::default() - }; - let sort_expr = PhysicalSortExpr::new(col("a", &schema())?, options); - let lex_ordering = LexOrdering::new(vec![sort_expr]); - let plan = build_test_case(&lex_ordering, plan_ranges_overlapping); - let actual = extract_disjoint_ranges_from_plan(&lex_ordering, &plan)?; - assert!(actual.is_none(), "should not find disjoint range"); + }) + } - Ok(()) + #[test] + fn test_union_sort_union_disjoint_ranges_asc() { + assert_snapshot!( + test_case_disjoint_3() + .with_descending(false) + .with_sort_expr("a") + .union_sort_union_plan(), + @r" + UnionExec + DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + UnionExec + DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + + Input Ranges + (Some(1000))->(Some(2000)) + (Some(2001))->(Some(3000)) + (Some(3001))->(Some(3500)) + Output Ranges: [0, 1] + (1000)->(2000) + (2001)->(3500) + "); } - /// Same as `test_extract_disjoint_ranges`, but include nulls first. 
#[test] - fn test_extract_disjoint_ranges_nulls_first() -> Result<()> { - // NULL is min_value if ASC - let plan_ranges_disjoint_if_asc = &[ - KeyRange { - min: Some(1000), - max: Some(2000), - null_count: 1, // null is min value - }, - KeyRange { - min: Some(2001), - max: Some(3000), - null_count: 0, - }, - KeyRange { - min: Some(3001), - max: Some(3500), - null_count: 0, - }, - ]; - let plan_ranges_overlap_if_asc = &[ - KeyRange { - min: Some(1000), - max: Some(2000), - null_count: 0, - }, - KeyRange { - min: Some(2001), - max: Some(3000), - null_count: 0, - }, - KeyRange { - min: Some(3001), - max: Some(3500), - null_count: 1, // null is min value, this will be overlapping - }, - ]; + fn test_union_sort_union_overlapping_ranges_asc() { + assert_snapshot!( + test_case_overlapping_3() + .with_descending(false) + .with_sort_expr("a") + .union_sort_union_plan(), + @r" + UnionExec + DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + UnionExec + DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + + Input Ranges + (Some(1000))->(Some(2010)) + (Some(2001))->(Some(3000)) + (Some(3001))->(Some(3500)) + No disjoint ranges found + "); + } - // NULL is max_value if DESC - let plan_ranges_disjoint_if_desc = &[ - KeyRange { - min: Some(1000), - max: Some(2000), - null_count: 0, - }, - KeyRange { - min: Some(2001), - max: Some(3000), - null_count: 0, - }, - KeyRange { - min: Some(3001), - max: Some(3500), - null_count: 1, // null is max value - }, - ]; - let plan_ranges_overlap_if_desc = &[ - KeyRange { - min: Some(1000), - max: Some(2000), - null_count: 1, // null is max value, this will be overlapping - }, - KeyRange { - min: Some(2001), - max: Some(3000), - null_count: 0, - }, - KeyRange { - min: Some(3001), - max: Some(3500), - null_count: 0, - }, - ]; - - // Test: ASC, is disjoint (not overlapping) - // (1000)->(2000), (2001)->(3500), and partition<(1000)->(2000)> has the NULL (min value) - let options = SortOptions { - descending: false, - nulls_first: true, - }; - let sort_expr = PhysicalSortExpr::new(col("a", &schema())?, options); - let lex_ordering = LexOrdering::new(vec![sort_expr]); - let plan = build_test_case(&lex_ordering, plan_ranges_disjoint_if_asc); - let Some(actual) = extract_disjoint_ranges_from_plan(&lex_ordering, &plan)? 
- else { - unreachable!("should find disjoint") - }; - assert_eq!(actual.indices(), &[0, 1], "should find proper ordering"); - assert_eq!( - format!("{}", str_lexical_ranges(actual.value_ranges())), - "(NULL)->(2000), (2001)->(3500)", - "should find proper ranges" - ); - - // Test: ASC, is overlapping - // (1000)->(2000), (2001)->(3500), and partition<(2001)->(3500)> has the NULL (min value) - let options = SortOptions { - descending: false, - nulls_first: true, - }; - let sort_expr = PhysicalSortExpr::new(col("a", &schema())?, options); - let lex_ordering = LexOrdering::new(vec![sort_expr]); - let plan = build_test_case(&lex_ordering, plan_ranges_overlap_if_asc); - let actual = extract_disjoint_ranges_from_plan(&lex_ordering, &plan)?; - assert!(actual.is_none(), "should not find disjoint range"); - - // Test: DESC, is disjoint (not overlapping) - // (2001)->(3500), (1000)->(2000), and partition<(2001)->(3500)> has the NULL (max value) - let options = SortOptions { - descending: true, - nulls_first: true, - }; - let sort_expr = PhysicalSortExpr::new(col("a", &schema())?, options); - let lex_ordering = LexOrdering::new(vec![sort_expr]); - let plan = build_test_case(&lex_ordering, plan_ranges_disjoint_if_desc); - let Some(actual) = extract_disjoint_ranges_from_plan(&lex_ordering, &plan)? - else { - unreachable!("should find disjoint") - }; - assert_eq!(actual.indices(), &[1, 0], "should find proper ordering"); // NOTE THE INVERSE ORDER - assert_eq!( - format!("{}", str_lexical_ranges(actual.value_ranges())), - "(2001)->(NULL), (1000)->(2000)", - "should find proper ranges" - ); - - // Test: DESC, is overlapping - // (2001)->(3500), (1000)->(2000), and partition<(1000)->(2000)> has the NULL (max value) - let options = SortOptions { - descending: true, - nulls_first: true, - }; - let sort_expr = PhysicalSortExpr::new(col("a", &schema())?, options); - let lex_ordering = LexOrdering::new(vec![sort_expr]); - let plan = build_test_case(&lex_ordering, plan_ranges_overlap_if_desc); - let actual = extract_disjoint_ranges_from_plan(&lex_ordering, &plan)?; - assert!(actual.is_none(), "should not find disjoint range"); + #[test] + fn test_union_sort_union_disjoint_ranges_desc_nulls_first() { + assert_snapshot!( + test_case_disjoint_3() + .with_descending(true) + .with_sort_expr("a") + .union_sort_union_plan(), + @r" + UnionExec + DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[a], output_ordering=[a@0 DESC], file_type=parquet + SortExec: expr=[a@0 DESC], preserve_partitioning=[false] + UnionExec + DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[a], output_ordering=[a@0 DESC], file_type=parquet + DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[a], output_ordering=[a@0 DESC], file_type=parquet + + Input Ranges + (Some(1000))->(Some(2000)) + (Some(2001))->(Some(3000)) + (Some(3001))->(Some(3500)) + Output Ranges: [1, 0] + (2001)->(3500) + (1000)->(2000) + "); + } - Ok(()) + #[test] + fn test_union_sort_union_overlapping_ranges_desc_nulls_first() { + assert_snapshot!( + test_case_overlapping_3() + .with_descending(true) + .with_sort_expr("a") + .union_sort_union_plan(), + @r" + UnionExec + DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[a], output_ordering=[a@0 DESC], file_type=parquet + SortExec: expr=[a@0 DESC], preserve_partitioning=[false] + UnionExec + DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[a], output_ordering=[a@0 DESC], file_type=parquet + DataSourceExec: file_groups={1 group: [[0.parquet]]}, 
projection=[a], output_ordering=[a@0 DESC], file_type=parquet + + Input Ranges + (Some(1000))->(Some(2010)) + (Some(2001))->(Some(3000)) + (Some(3001))->(Some(3500)) + No disjoint ranges found + "); } - /// Same as `test_extract_disjoint_ranges`, but include nulls last. + // default is NULLS FIRST so try NULLS LAST #[test] - fn test_extract_disjoint_ranges_nulls_last() -> Result<()> { - // NULL is max_value if ASC - let plan_ranges_disjoint_if_asc = &[ - KeyRange { - min: Some(1000), - max: Some(2000), - null_count: 0, - }, - KeyRange { - min: Some(2001), - max: Some(3000), - null_count: 0, - }, - KeyRange { - min: Some(3001), - max: Some(3500), - null_count: 1, // null is max value - }, - ]; - let plan_ranges_overlap_if_asc = &[ - KeyRange { - min: Some(1000), - max: Some(2000), - null_count: 1, // null is max value, this will be overlapping - }, - KeyRange { - min: Some(2001), - max: Some(3000), - null_count: 0, - }, - KeyRange { - min: Some(3001), - max: Some(3500), - null_count: 0, - }, - ]; + fn test_union_sort_union_disjoint_ranges_asc_nulls_last() { + assert_snapshot!( + test_case_disjoint_3() + .with_descending(false) + .with_nulls_first(false) + .with_sort_expr("a") + .union_sort_union_plan(), + @r" + UnionExec + DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[a], output_ordering=[a@0 ASC NULLS LAST], file_type=parquet + SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[false] + UnionExec + DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[a], output_ordering=[a@0 ASC NULLS LAST], file_type=parquet + DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[a], output_ordering=[a@0 ASC NULLS LAST], file_type=parquet + + Input Ranges + (Some(1000))->(Some(2000)) + (Some(2001))->(Some(3000)) + (Some(3001))->(Some(3500)) + Output Ranges: [0, 1] + (1000)->(2000) + (2001)->(3500) + "); + } - // NULL is min_value if DESC - let plan_ranges_disjoint_if_desc = &[ - KeyRange { - min: Some(1000), - max: Some(2000), - null_count: 1, // null is min value - }, - KeyRange { - min: Some(2001), - max: Some(3000), - null_count: 0, - }, - KeyRange { - min: Some(3001), - max: Some(3500), - null_count: 0, - }, - ]; - let plan_ranges_overlap_if_desc = &[ - KeyRange { - min: Some(1000), - max: Some(2000), - null_count: 0, - }, - KeyRange { - min: Some(2001), - max: Some(3000), - null_count: 0, - }, - KeyRange { - min: Some(3001), - max: Some(3500), - null_count: 1, // null is min value, this will be overlapping - }, - ]; - - // Test: ASC, is disjoint (not overlapping) - // (1000)->(2000), (2001)->(3500), and partition<(2001)->(3500)> has the NULL (max value) - let options = SortOptions { - descending: false, - nulls_first: false, - }; - let sort_expr = PhysicalSortExpr::new(col("a", &schema())?, options); - let lex_ordering = LexOrdering::new(vec![sort_expr]); - let plan = build_test_case(&lex_ordering, plan_ranges_disjoint_if_asc); - let Some(actual) = extract_disjoint_ranges_from_plan(&lex_ordering, &plan)? 
- else { - unreachable!("should find disjoint") - }; - assert_eq!(actual.indices(), &[0, 1], "should find proper ordering"); - assert_eq!( - format!("{}", str_lexical_ranges(actual.value_ranges())), - "(1000)->(2000), (2001)->(NULL)", - "should find proper ranges" - ); - - // Test: ASC, is overlapping - // (1000)->(2000), (2001)->(3500), and partition<(1000)->(2000)> has the NULL (max value) - let options = SortOptions { - descending: false, - nulls_first: false, - }; - let sort_expr = PhysicalSortExpr::new(col("a", &schema())?, options); - let lex_ordering = LexOrdering::new(vec![sort_expr]); - let plan = build_test_case(&lex_ordering, plan_ranges_overlap_if_asc); - let actual = extract_disjoint_ranges_from_plan(&lex_ordering, &plan)?; - assert!(actual.is_none(), "should not find disjoint range"); - - // Test: DESC, is disjoint (not overlapping) - // (2001)->(3500), (1000)->(2000), and partition<(1000)->(2000)> has the NULL (min value) - let options = SortOptions { - descending: true, - nulls_first: false, - }; - let sort_expr = PhysicalSortExpr::new(col("a", &schema())?, options); - let lex_ordering = LexOrdering::new(vec![sort_expr]); - let plan = build_test_case(&lex_ordering, plan_ranges_disjoint_if_desc); - let Some(actual) = extract_disjoint_ranges_from_plan(&lex_ordering, &plan)? - else { - unreachable!("should find disjoint") - }; - assert_eq!(actual.indices(), &[1, 0], "should find proper ordering"); // NOTE THE INVERSE ORDER - assert_eq!( - format!("{}", str_lexical_ranges(actual.value_ranges())), - "(2001)->(3500), (NULL)->(2000)", - "should find proper ranges" - ); - - // Test: DESC, is overlapping - // (2001)->(3500), (1000)->(2000), and partition<(2001)->(3500)> has the NULL (min value) - let options = SortOptions { - descending: true, - nulls_first: false, - }; - let sort_expr = PhysicalSortExpr::new(col("a", &schema())?, options); - let lex_ordering = LexOrdering::new(vec![sort_expr]); - let plan = build_test_case(&lex_ordering, plan_ranges_overlap_if_desc); - let actual = extract_disjoint_ranges_from_plan(&lex_ordering, &plan)?; - assert!(actual.is_none(), "should not find disjoint range"); + // default is NULLS FIRST so try NULLS LAST + #[test] + fn test_union_sort_union_overlapping_ranges_asc_nulls_last() { + assert_snapshot!( + test_case_overlapping_3() + .with_descending(false) + .with_nulls_first(false) + .with_sort_expr("a") + .union_sort_union_plan(), + @r" + UnionExec + DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[a], output_ordering=[a@0 ASC NULLS LAST], file_type=parquet + SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[false] + UnionExec + DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[a], output_ordering=[a@0 ASC NULLS LAST], file_type=parquet + DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[a], output_ordering=[a@0 ASC NULLS LAST], file_type=parquet + + Input Ranges + (Some(1000))->(Some(2010)) + (Some(2001))->(Some(3000)) + (Some(3001))->(Some(3500)) + No disjoint ranges found + "); + } - Ok(()) + #[test] + fn test_union_disjoint_ranges_asc() { + assert_snapshot!( + test_case_disjoint_3() + .with_descending(false) + .with_sort_expr("a") + .union_plan(), + @r" + UnionExec + DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + DataSourceExec: file_groups={2 groups: [[0.parquet], [1.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + + Input Ranges + (Some(1000))->(Some(2000)) + (Some(2001))->(Some(3000)) 
+ (Some(3001))->(Some(3500)) + Output Ranges: [0, 1, 2] + (1000)->(2000) + (2001)->(3000) + (3001)->(3500) + "); } - /// Build a test physical plan like: - /// UNION - /// ParquetExec (range_a) - /// ParquetExec (range_b_1, range_b_2) - fn build_test_case_multiple_partition_parquet_exec( - ordering: &LexOrdering, - key_ranges: &[KeyRange; 3], - ) -> Arc { - let [range_a, range_b_1, range_b_2] = key_ranges; - - let datasrc_a = - parquet_exec_with_sort_with_statistics(vec![ordering.clone()], &[range_a]); - let datasrc_b = parquet_exec_with_sort_with_statistics( - vec![ordering.clone()], - &[range_b_1, range_b_2], - ); - - union_exec(vec![datasrc_a, datasrc_b]) + #[test] + fn test_union_overlapping_ranges_asc() { + assert_snapshot!( + test_case_overlapping_3() + .with_descending(false) + .with_sort_expr("a") + .union_plan(), + @r" + UnionExec + DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + DataSourceExec: file_groups={2 groups: [[0.parquet], [1.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + + Input Ranges + (Some(1000))->(Some(2010)) + (Some(2001))->(Some(3000)) + (Some(3001))->(Some(3500)) + No disjoint ranges found + "); } #[test] - fn test_extract_multiple_partitions_union_parquet_exec() -> Result<()> { - let plan_ranges_disjoint = &[ - KeyRange { - min: Some(1000), - max: Some(2000), - null_count: 0, - }, - KeyRange { - min: Some(2001), - max: Some(3000), - null_count: 0, - }, - KeyRange { - min: Some(3001), - max: Some(3500), - null_count: 0, - }, - ]; - let plan_ranges_overlapping = &[ - KeyRange { - min: Some(1000), - max: Some(2010), - null_count: 0, - }, - KeyRange { - min: Some(2001), - max: Some(3000), - null_count: 0, - }, - KeyRange { - min: Some(3001), - max: Some(3500), - null_count: 0, - }, - ]; + fn test_union_disjoint_ranges_desc() { + assert_snapshot!( + test_case_disjoint_3() + .with_descending(true) + .with_sort_expr("a") + .union_plan(), + @r" + UnionExec + DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[a], output_ordering=[a@0 DESC], file_type=parquet + DataSourceExec: file_groups={2 groups: [[0.parquet], [1.parquet]]}, projection=[a], output_ordering=[a@0 DESC], file_type=parquet + + Input Ranges + (Some(1000))->(Some(2000)) + (Some(2001))->(Some(3000)) + (Some(3001))->(Some(3500)) + Output Ranges: [2, 1, 0] + (3001)->(3500) + (2001)->(3000) + (1000)->(2000) + "); + } - // Test: ASC, is disjoint (not overlapping) - let options = SortOptions { - descending: false, - ..Default::default() - }; - let sort_expr = PhysicalSortExpr::new(col("a", &schema())?, options); - let lex_ordering = LexOrdering::new(vec![sort_expr]); - let plan = build_test_case_multiple_partition_parquet_exec( - &lex_ordering, - plan_ranges_disjoint, - ); - let Some(actual) = extract_disjoint_ranges_from_plan(&lex_ordering, &plan)? 
- else { - unreachable!("should find disjoint") - }; - assert_eq!(actual.indices(), &[0, 1, 2], "should find proper ordering"); - assert_eq!( - format!("{}", str_lexical_ranges(actual.value_ranges())), - "(1000)->(2000), (2001)->(3000), (3001)->(3500)", - "should find proper ranges" - ); - - // Test: ASC, is overlapping - let options = SortOptions { - descending: false, - ..Default::default() - }; - let sort_expr = PhysicalSortExpr::new(col("a", &schema())?, options); - let lex_ordering = LexOrdering::new(vec![sort_expr]); - let plan = build_test_case_multiple_partition_parquet_exec( - &lex_ordering, - plan_ranges_overlapping, - ); - let actual = extract_disjoint_ranges_from_plan(&lex_ordering, &plan)?; - assert!(actual.is_none(), "should not find disjoint range"); - - Ok(()) + #[test] + fn test_union_overlapping_ranges_desc() { + assert_snapshot!( + test_case_overlapping_3() + .with_descending(true) + .with_sort_expr("a") + .union_plan(), + @r" + UnionExec + DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[a], output_ordering=[a@0 DESC], file_type=parquet + DataSourceExec: file_groups={2 groups: [[0.parquet], [1.parquet]]}, projection=[a], output_ordering=[a@0 DESC], file_type=parquet + + Input Ranges + (Some(1000))->(Some(2010)) + (Some(2001))->(Some(3000)) + (Some(3001))->(Some(3500)) + No disjoint ranges found + "); + } + + /// Helper for building up patterns for testing statistics extraction from + /// ExecutionPlans + #[derive(Debug)] + struct TestCaseBuilder { + input_ranges: Vec, + sort_options: SortOptions, + sort_exprs: Vec, + schema: Arc, + } + + impl TestCaseBuilder { + /// Creates a new `TestCaseBuilder` instance with default values. + fn new() -> Self { + let schema = + Arc::new(Schema::new(vec![Field::new("a", DataType::Int64, true)])); + + Self { + input_ranges: vec![], + sort_options: SortOptions::default(), + sort_exprs: vec![], + schema, + } + } + + /// Add a key range + pub fn with_key_range(mut self, key_range: SortKeyRange) -> Self { + self.input_ranges.push(key_range); + self + } + + /// set SortOptions::descending flag + pub fn with_descending(mut self, descending: bool) -> Self { + self.sort_options.descending = descending; + self + } + + /// set SortOptions::nulls_first flag + pub fn with_nulls_first(mut self, nulls_first: bool) -> Self { + self.sort_options.nulls_first = nulls_first; + self + } + + /// Add a sort expression to the ordering (created with the current SortOptions) + pub fn with_sort_expr(mut self, column_name: &str) -> Self { + let expr = PhysicalSortExpr::new( + col(column_name, &self.schema).unwrap(), + self.sort_options, + ); + self.sort_exprs.push(expr); + self + } + + /// Build a test physical plan like the following, and extract disjoint ranges from it: + /// + /// ```text + /// UNION + /// ParquetExec (key_ranges[0]) (range_a) + /// SORT + /// UNION + /// ParquetExec (key_ranges[1]) (range_b_1) + /// ParquetExec (key_ranges[2]) (range_b_2) + /// ``` + fn union_sort_union_plan(self) -> TestResult { + let Self { + input_ranges, + sort_options: _, // used to create sort exprs, nothere + sort_exprs, + schema, + } = self; + let lex_ordering = LexOrdering::new(sort_exprs); + + assert_eq!(input_ranges.len(), 3); + let range_a = &input_ranges[0]; + let range_b_1 = &input_ranges[1]; + let range_b_2 = &input_ranges[2]; + + let datasrc_a = parquet_exec_with_sort_with_statistics_and_schema( + &schema, + vec![lex_ordering.clone()], + &[range_a], + ); + + let datasrc_b1 = parquet_exec_with_sort_with_statistics_and_schema( + &schema, + 
vec![lex_ordering.clone()], + &[range_b_1], + ); + let datasrc_b2 = parquet_exec_with_sort_with_statistics_and_schema( + &schema, + vec![lex_ordering.clone()], + &[range_b_2], + ); + let b = sort_exec( + &lex_ordering, + &union_exec(vec![datasrc_b1, datasrc_b2]), + false, + ); + + let plan = union_exec(vec![datasrc_a, b]); + + let actual = extract_disjoint_ranges_from_plan(&lex_ordering, &plan) + .expect("Error extracting disjoint ranges from plan"); + TestResult { + input_ranges, + plan, + actual, + } + } + + /// Build a test physical plan like the following, and extract disjoint ranges from it: + /// + /// ```text + /// UNION + /// ParquetExec (key_ranges[0]) (range_a) + /// ParquetExec (key_ranges[1], key_ranges[2]) (range_b_1, range_b_2) + /// ``` + fn union_plan(self) -> TestResult { + let Self { + input_ranges, + sort_options: _, // used to create sort exprs, nothere + sort_exprs, + schema, + } = self; + + let lex_ordering = LexOrdering::new(sort_exprs); + + assert_eq!(input_ranges.len(), 3); + let range_a = &input_ranges[0]; + let range_b_1 = &input_ranges[1]; + let range_b_2 = &input_ranges[2]; + let datasrc_a = parquet_exec_with_sort_with_statistics( + vec![lex_ordering.clone()], + &[range_a], + ); + let datasrc_b = parquet_exec_with_sort_with_statistics_and_schema( + &schema, + vec![lex_ordering.clone()], + &[range_b_1, range_b_2], + ); + + let plan = union_exec(vec![datasrc_a, datasrc_b]); + + let actual = extract_disjoint_ranges_from_plan(&lex_ordering, &plan) + .expect("Error extracting disjoint ranges from plan"); + TestResult { + input_ranges, + plan, + actual, + } + } + } + + /// Result of running a test case, including the input ranges, the execution + /// plan, and the actual disjoint ranges found. + /// + /// This struct implements `Display` to provide a formatted output of the + /// test case results that can be easily compared using `insta` snapshots. + struct TestResult { + input_ranges: Vec, + plan: Arc, + actual: Option, + } + + impl TestResult {} + + impl Display for TestResult { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let displayable_plan = displayable(self.plan.as_ref()).indent(false); + writeln!(f, "{}", displayable_plan)?; + + writeln!(f, "Input Ranges")?; + for range in &self.input_ranges { + writeln!(f, " {}", range)?; + } + + match self.actual.as_ref() { + Some(actual) => { + writeln!(f, "Output Ranges: {:?}", actual.indices())?; + for range in actual.ordered_ranges() { + writeln!(f, " {}", range)?; + } + } + None => { + writeln!(f, "No disjoint ranges found")?; + } + } + Ok(()) + } } } diff --git a/datafusion/physical-optimizer/src/progressive_evaluation/lexical_ranges.rs b/datafusion/physical-optimizer/src/progressive_evaluation/lexical_ranges.rs index 36783466f2c0..e9c978c96b12 100644 --- a/datafusion/physical-optimizer/src/progressive_evaluation/lexical_ranges.rs +++ b/datafusion/physical-optimizer/src/progressive_evaluation/lexical_ranges.rs @@ -20,14 +20,16 @@ use arrow::array::ArrayRef; use arrow::compute::SortOptions; use arrow::row::{Row, RowConverter, Rows, SortField}; -use datafusion_common::{error::DataFusionError, Result, ScalarValue}; +use datafusion_common::{internal_err, DataFusionError, Result, ScalarValue}; use std::fmt::Display; use std::sync::Arc; +/// Represents a range of sort key values within a lexical space. +/// /// # Lexical Space /// -/// The "Lexical Space" is all possible values of a sort order (set of sort -/// expressions). 
+/// The "Lexical Space" is all possible values of columns in a sort order (set +/// of sort expressions). /// /// For example, given data with a sort order of `A ASC, B ASC` /// (`A` ascending, `B` ascending), then the lexical space is all the unique @@ -35,32 +37,47 @@ use std::sync::Arc; /// /// # Lexical Range /// -/// The "lexical range" of an input in this lexical space is -/// the minimum and maximum sort key values for that range. +/// The "lexical range" is defined by two points in a lexical space (the +/// minimum and maximum sort key values for some range) /// /// For example, for data like -/// | `a` | `b` | -/// |--------|--------| +/// +/// |`a`| `b` | +/// |---|-----| /// | 1 | 100 | /// | 1 | 200 | /// | 1 | 300 | /// | 2 | 100 | /// | 2 | 200 | -/// | 3 | 50 | +/// | 3 | 50 | /// -/// The lexical range is `min --> max`: `(1,100) --> (3,50)` +/// The lexical range is +/// * `min --> max` +/// * `(a_min, b_min) -> (a_max, b_max)` +/// * `(1,100) --> (3,50)` #[derive(Debug, Default, Clone)] pub struct LexicalRange { - /// The minimum value in the lexical space (one for each sort key) + /// The minimum multi-column value in the lexical space (one `ScalarValue` + /// for each sort key) min: Vec, - /// The maximum value in the lexical space (one for each sort key) + /// The maximum multi-column value in the lexical space (one `ScalarValue` + /// for each sort key) max: Vec, } impl LexicalRange { + /// Create a [`LexicalRangeBuilder`] pub fn builder() -> LexicalRangeBuilder { LexicalRangeBuilder::new() } + + /// Create a new [`LexicalRange`] with the same min and max values. + pub fn new_from_constants(constants: Vec) -> Self { + Self { + min: constants.clone(), + max: constants, + } + } } impl Display for LexicalRange { @@ -83,6 +100,8 @@ impl Display for LexicalRange { } /// Builder for [`LexicalRange`] +/// +/// This builds up a multi-column min/max range one pair at a time. #[derive(Debug, Default, Clone)] pub struct LexicalRangeBuilder { inner: LexicalRange, @@ -97,24 +116,48 @@ impl LexicalRangeBuilder { self.inner } - /// Adds min and max to the end of the in-progress ranges + /// Adds a min and max to the end of the in-progress ranges pub fn push(&mut self, min: ScalarValue, max: ScalarValue) { self.inner.min.push(min); self.inner.max.push(max); } } -/// Represents ranges of lexically ordered values in a sort order. +/// Represents a set of non overlapping [`LexicalRange`]s ordered according to +/// minimum value. +/// +/// # Single Column +/// +/// For example given a set of ranges +/// ```text +/// (11, 20) // partition 0 +/// (1, 10) // partition 1 +/// (21, 30) // partition 2 +/// ``` +/// +/// Then `[1, 0, 2]` is a non-overlapping ordered set of lexical ranges. When +/// ordered by minimum value: `(1, 10)` comes first, then `(11, 20)` and then +/// `(21, 30)`. +/// +/// There are no `NonOverlappingOrderedLexicalRanges` if the ranges are not +/// disjoint (they overlap). For example, the following ranges overlap between +/// 11 and 15: +/// +/// ```text +/// (11, 20) // partition 0 +/// (1, 15) // partition 1 +/// ``` /// -/// One element for each input partition - #[derive(Debug)] pub struct NonOverlappingOrderedLexicalRanges { - /// Corresponding lexical range per partition, already reordered to match indices + /// lexical ranges. 
+ /// + /// These are typically used to represent the value ranges of each + /// DataFusion (not IOx) partition value_ranges: Vec, - /// The order of the input partitions (rows of ranges) that would provide ensure they are - /// sorted in lexical order + /// Indexes into `value_ranges` that define a non overlapping ordering by + /// minimum value. indices: Vec, } @@ -125,35 +168,31 @@ impl Display for NonOverlappingOrderedLexicalRanges { } impl NonOverlappingOrderedLexicalRanges { - /// Attempt to create a new [`NonOverlappingOrderedLexicalRanges`] + /// Attempt to create a new [`NonOverlappingOrderedLexicalRanges`] given the + /// specified [`SortOptions`]. + /// + /// See struct documentation for more details /// /// Returns None if: /// * - There are no ranges - /// * - The ranges are not disjoint (aka they overlap in the lexical space) + /// * - The ranges are overlap in lexical space (are not disjoint) /// - /// Returns Err if there is an error converting values + /// Returns Err if there is an error converting values. pub fn try_new( sort_options: &[SortOptions], - ranges_per_partition: Vec, + value_ranges: Vec, ) -> Result> { - if ranges_per_partition.is_empty() { + if value_ranges.is_empty() { return Ok(None); } - // convert to min/maxes, as VecPerPartition - let (mins, maxes) = ranges_per_partition.clone().into_iter().fold( - (vec![], vec![]), - |(mut mins, mut maxes), partition_range| { - let LexicalRange { - min: min_per_sort_key, - max: max_per_sort_key, - } = partition_range; - mins.push(min_per_sort_key); - maxes.push(max_per_sort_key); - (mins, maxes) - }, - ); - let rows = ConvertedRows::try_new(sort_options, mins, maxes)?; + // convert to rows, for row ordering per sort key + let rows = ConvertedRows::try_new_from_lexical_ranges( + sort_options, + value_ranges.clone(), + )?; + + // order by minimum value let mut indices = (0..rows.num_rows()).collect::>(); indices.sort_by_key(|&i| rows.min_row(i)); @@ -162,47 +201,30 @@ impl NonOverlappingOrderedLexicalRanges { return Ok(None); }; - // reorder lexical ranges - let mut value_ranges = ranges_per_partition - .into_iter() - .enumerate() - .map(|(input_partition_idx, range)| { - let reordering_idx = indices - .iter() - .position(|reorder_idx| *reorder_idx == input_partition_idx) - .expect("missing partition in reordering indices"); - (reordering_idx, range) - }) - .collect::>(); - value_ranges.sort_by_key(|(reord_i, _)| *reord_i); - - let value_ranges = value_ranges - .into_iter() - .map(|(_, range)| range) - .collect::>(); - Ok(Some(Self { value_ranges, indices, })) } - /// Returns the indices that describe the order input partitions must be reordered - /// to be non overlapping and ordered in lexical order. + /// Indices that define an ordered list of non overlapping value ranges. pub fn indices(&self) -> &[usize] { &self.indices } - /// Returns the lexical ranges already re-ordered - pub fn value_ranges(&self) -> &[LexicalRange] { - &self.value_ranges + /// Iterator over the in lexical ranges in order + pub fn ordered_ranges(&self) -> impl Iterator { + self.indices.iter().map(|i| &self.value_ranges[*i]) } } /// Result of converting multiple-column ScalarValue rows to columns. #[derive(Debug)] struct ConvertedRows { - /// Use the same row converter for mins and maxes, otherwise they cannot be compared. + /// converter for mins and maxes + /// + /// Must use the same conveted rows for both, otherwise they cannot be + /// compared. 
converter: RowConverter, mins: Rows, @@ -218,8 +240,20 @@ impl ConvertedRows { min_keys: Vec>, max_keys: Vec>, ) -> Result { - assert_eq!(sort_options.len(), min_keys[0].len()); - assert_eq!(sort_options.len(), max_keys[0].len()); + if sort_options.len() != min_keys[0].len() { + return internal_err!( + "Expected number of sort options ({}) to match number of sort keys in min_keys ({})", + sort_options.len(), + min_keys[0].len() + ); + } + if sort_options.len() != max_keys[0].len() { + return internal_err!( + "Expected number of sort options ({}) to match number of sort keys in max_keys ({})", + sort_options.len(), + max_keys.len() + ); + } // build converter using min keys let arrays = pivot_to_arrays(min_keys)?; @@ -244,21 +278,45 @@ impl ConvertedRows { }) } + fn try_new_from_lexical_ranges( + sort_options: &[SortOptions], + lexical_ranges: Vec, + ) -> Result { + // convert to min/maxes, as VecPerPartition + let (mins, maxes) = lexical_ranges.clone().into_iter().fold( + (vec![], vec![]), + |(mut mins, mut maxes), partition_range| { + let LexicalRange { + min: min_per_sort_key, + max: max_per_sort_key, + } = partition_range; + mins.push(min_per_sort_key); + maxes.push(max_per_sort_key); + (mins, maxes) + }, + ); + Self::try_new(sort_options, mins, maxes) + } + + /// Return the number of partitions (rows) in the converted rows. fn num_rows(&self) -> usize { self.mins.num_rows() } - /// Return the min (as Row) at the specified index + /// Return the min (as [`Row`]) at the specified index fn min_row(&self, index: usize) -> Row<'_> { self.mins.row(index) } - /// Return the max (as Row) at the specified index + /// Return the max (as [`Row`]) at the specified index fn max_row(&self, index: usize) -> Row<'_> { self.maxes.row(index) } - /// Return the min value at the specified index + /// Return the min value at the specified index as a list of single + /// row arrays + /// + /// Used for debugging fn min_value(&self, index: usize) -> Result> { let values = self .converter @@ -267,7 +325,10 @@ impl ConvertedRows { Ok(values.iter().map(Arc::clone).collect::>()) } - /// Return the max value at the specified index + /// Return the max value at the specified index as a list of single + /// row arrays + /// + /// Used for debugging fn max_value(&self, index: usize) -> Result> { let values = self .converter @@ -276,7 +337,20 @@ impl ConvertedRows { Ok(values.iter().map(Arc::clone).collect::>()) } - // Return true if ranges are disjoint when order according to ordered_partition_idx. + /// Return true if ranges are disjoint assuming the ranges are sorted by + /// `ordered_by_min_partition_indices` + /// + /// This checks each pair of adjacent ranges in the sorted order. + /// + /// (0 -> 10) (11 -> 20) (21 -> 30) => disjoint=true + /// output = true + /// + /// (0 -> 10) (10 -> 20) (21 -> 30) => disjoint=true because 10==10 and does not invalidate the ordering + /// output = true + /// + /// (0 -> 13) (12 -> 20) (21 -> 30) => disjoint=false + /// output = None + /// fn are_disjoint(&self, ordered_by_min_partition_indices: &[usize]) -> bool { for index_index in 1..ordered_by_min_partition_indices.len() { let index = ordered_by_min_partition_indices[index_index]; @@ -285,9 +359,10 @@ impl ConvertedRows { // Ordering is by sort key, and may be desc. // Therefore need to confirm that the min & max of the current range is greater than the previous range. 
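+            // Note: `>=` is used when comparing against the previous range's
+            // opposite boundary so that "touching" ranges (previous max ==
+            // current min) are still considered disjoint, per the examples above.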
let start_exclusive = self.min_row(index) > self.min_row(prev_index) - && self.min_row(index) > self.max_row(prev_index); - let end_exclusive = self.max_row(index) > self.min_row(prev_index) + && self.min_row(index) >= self.max_row(prev_index); + let end_exclusive = self.max_row(index) >= self.min_row(prev_index) && self.max_row(index) > self.max_row(prev_index); + if !(start_exclusive && end_exclusive) { return false; } @@ -312,12 +387,10 @@ fn pivot_to_arrays(keys: Vec>) -> Result> { #[cfg(test)] mod tests { + use super::*; use arrow::compute::SortOptions; - use datafusion_common::ScalarValue; use itertools::Itertools; - use super::{LexicalRange, NonOverlappingOrderedLexicalRanges}; - struct TestCase { partitions: Vec, num_sort_keys: usize, @@ -329,6 +402,7 @@ mod tests { impl TestCase { fn run(self) { + println!("Beginning test : {}", self.name); // Test: confirm found proper lexical ranges let lexical_ranges = self.build_lexical_ranges(); let expected = self @@ -353,13 +427,19 @@ mod tests { TestSortOption::DescNullsFirst, TestSortOption::DescNullsLast, ].into_iter().for_each(|sort_ordering| { - if let Some(nonoverlapping) = NonOverlappingOrderedLexicalRanges::try_new(&sort_ordering.sort_options(self.num_sort_keys).as_slice(), lexical_ranges.clone()).expect("should not error") { - assert!(self.expect_disjoint, "ERROR {} for {:?}: expected ranges to overlap, instead found disjoint ranges", self.name, &sort_ordering); + if let Some(nonoverlapping) = NonOverlappingOrderedLexicalRanges::try_new(&sort_ordering.sort_options(self.num_sort_keys), lexical_ranges.clone()).expect("should not error") { + assert!(self.expect_disjoint, + "ERROR {} for {:?}: expected ranges to overlap, instead found disjoint ranges", + self.name, &sort_ordering); let expected_ordered_indices = self.find_expected_indices(&sort_ordering); - assert_eq!(expected_ordered_indices, nonoverlapping.indices(), "ERROR {} for {:?}: expected to find indices ordered {:?}, instead found ordering {:?}", self.name, &sort_ordering, expected_ordered_indices, nonoverlapping.indices()); + assert_eq!(expected_ordered_indices, nonoverlapping.indices(), + "ERROR {} for {:?}: expected to find indices ordered {:?}, instead found ordering {:?}", + self.name, &sort_ordering, expected_ordered_indices, nonoverlapping.indices()); } else { - assert!(!self.expect_disjoint, "ERROR {} for {:?}: expected to find disjoint ranges, instead could either not detect ranges or found overlapping ranges", self.name, &sort_ordering); + assert!(!self.expect_disjoint, + "ERROR {} for {:?}: expected to find disjoint ranges, instead could either not detect ranges or found overlapping ranges", + self.name, &sort_ordering); }; }); } @@ -407,9 +487,13 @@ mod tests { TestPartition { range_per_sort_key } } + /// Build min/maxes for a partition with three sort keys: + /// 1. An integer sort key + /// 2. A string sort key + /// 3. 
A timestamp sort key (nanoseconds) fn build_partition_with_multiple_sort_keys( ints: (Option, Option), - strings: (Option, Option), + strings: (Option<&str>, Option<&str>), times: (Option, Option), ) -> TestPartition { let range_per_sort_key = vec![ @@ -418,8 +502,8 @@ mod tests { max: ScalarValue::Int64(ints.1), }, SortKeyRange { - min: ScalarValue::Utf8(strings.0), - max: ScalarValue::Utf8(strings.1), + min: ScalarValue::from(strings.0), + max: ScalarValue::from(strings.1), }, SortKeyRange { min: ScalarValue::TimestampNanosecond(times.0, None), @@ -541,17 +625,17 @@ mod tests { partitions: vec![ build_partition_with_multiple_sort_keys( (Some(1), Some(10)), - (Some("same".into()), Some("same".into())), + (Some("same"), Some("same")), (Some(1), Some(1)), ), build_partition_with_multiple_sort_keys( (Some(2), Some(10)), - (Some("same".into()), Some("same".into())), + (Some("same"), Some("same")), (Some(1), Some(1)), ), build_partition_with_multiple_sort_keys( (Some(0), Some(0)), - (Some("same".into()), Some("same".into())), + (Some("same"), Some("same")), (Some(1), Some(1)), ), ], @@ -569,17 +653,17 @@ mod tests { partitions: vec![ build_partition_with_multiple_sort_keys( (Some(1), Some(10)), - (Some("same".into()), Some("same".into())), + (Some("same"), Some("same")), (Some(1), Some(1)), ), build_partition_with_multiple_sort_keys( (Some(11), Some(20)), - (Some("same".into()), Some("same".into())), + (Some("same"), Some("same")), (Some(1), Some(1)), ), build_partition_with_multiple_sort_keys( (Some(0), Some(0)), - (Some("same".into()), Some("same".into())), + (Some("same"), Some("same")), (Some(1), Some(1)), ), ], @@ -603,17 +687,17 @@ mod tests { partitions: vec![ build_partition_with_multiple_sort_keys( (Some(1), Some(1)), - (Some("a".into()), Some("d".into())), + (Some("a"), Some("d")), (Some(1), Some(1)), ), build_partition_with_multiple_sort_keys( (Some(1), Some(1)), - (Some("f".into()), Some("g".into())), + (Some("f"), Some("g")), (Some(1), Some(1)), ), build_partition_with_multiple_sort_keys( (Some(1), Some(1)), - (Some("c".into()), Some("e".into())), + (Some("c"), Some("e")), (Some(1), Some(1)), ), ], @@ -631,17 +715,17 @@ mod tests { partitions: vec![ build_partition_with_multiple_sort_keys( (Some(1), Some(1)), - (Some("a".into()), Some("b".into())), + (Some("a"), Some("b")), (Some(1), Some(1)), ), build_partition_with_multiple_sort_keys( (Some(1), Some(1)), - (Some("f".into()), Some("g".into())), + (Some("f"), Some("g")), (Some(1), Some(1)), ), build_partition_with_multiple_sort_keys( (Some(1), Some(1)), - (Some("c".into()), Some("e".into())), + (Some("c"), Some("e")), (Some(1), Some(1)), ), ], @@ -665,17 +749,17 @@ mod tests { partitions: vec![ build_partition_with_multiple_sort_keys( (Some(1), Some(1)), - (Some("same".into()), Some("same".into())), + (Some("same"), Some("same")), (Some(50000000), Some(50000001)), ), build_partition_with_multiple_sort_keys( (Some(1), Some(1)), - (Some("same".into()), Some("same".into())), + (Some("same"), Some("same")), (Some(700000), Some(50000001)), ), build_partition_with_multiple_sort_keys( (Some(1), Some(1)), - (Some("same".into()), Some("same".into())), + (Some("same"), Some("same")), (Some(100000000), Some(100000001)), ), ], @@ -693,17 +777,17 @@ mod tests { partitions: vec![ build_partition_with_multiple_sort_keys( (Some(1), Some(1)), - (Some("same".into()), Some("same".into())), + (Some("same"), Some("same")), (Some(50000000), Some(50000001)), ), build_partition_with_multiple_sort_keys( (Some(1), Some(1)), - (Some("same".into()), 
Some("same".into())), + (Some("same"), Some("same")), (Some(700000), Some(7000001)), ), build_partition_with_multiple_sort_keys( (Some(1), Some(1)), - (Some("same".into()), Some("same".into())), + (Some("same"), Some("same")), (Some(100000000), Some(100000001)), ), ], @@ -767,6 +851,165 @@ mod tests { (TestSortOption::DescNullsLast, vec![1, 0, 2]).into(), ], }, + TestCase { + partitions: vec![ + build_partition_with_multiple_sort_keys( + (Some(1), Some(1)), + (None, Some("e")), + (Some(1), Some(1)), + ), + build_partition_with_multiple_sort_keys( + (Some(1), Some(1)), + (Some("e"), None), + (Some(1), Some(1)), + ), + ], + num_sort_keys: 3, + name: "order_by_nulls__overlap", + expected_ranges_per_partition: vec![ + "(1,NULL,1)->(1,e,1)", + "(1,e,1)->(1,NULL,1)", + ], + expect_disjoint: false, + expected_ordered_indices: vec![], + }, + TestCase { + partitions: vec![ + build_partition_with_multiple_sort_keys( + (Some(10), Some(20)), + (None, Some("c")), + (Some(1), Some(1)), + ), + build_partition_with_multiple_sort_keys( + (Some(21), Some(22)), + (Some("e"), Some("f")), + (Some(2), Some(3)), + ), + ], + num_sort_keys: 3, + name: "order_by_nulls__disjoint2", + expected_ranges_per_partition: vec![ + "(10,NULL,1)->(20,c,1)", + "(21,e,2)->(22,f,3)", + ], + expect_disjoint: true, + expected_ordered_indices: vec![ + (TestSortOption::AscNullsFirst, vec![0, 1]).into(), + (TestSortOption::AscNullsLast, vec![0, 1]).into(), + (TestSortOption::DescNullsFirst, vec![1, 0]).into(), + (TestSortOption::DescNullsLast, vec![1, 0]).into(), + ], + }, + ]; + + cases.into_iter().for_each(|test_case| test_case.run()); + } + + #[test] + fn test_disjointness_with_touching_disjoint_ranges() { + let cases = [ + /* Using the first sort key, an integer, and has a start(current)==end(previous). 
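+            A start(current)==end(previous) boundary still counts as disjoint,
+            since the equal boundary values do not invalidate the ordering.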
*/ + TestCase { + partitions: vec![ + build_partition_with_multiple_sort_keys( + (Some(1), Some(10)), // ends with 10 + (Some("same"), Some("same")), + (Some(1), Some(1)), + ), + build_partition_with_multiple_sort_keys( + (Some(10), Some(20)), // starts with 10 + (Some("same"), Some("same")), + (Some(1), Some(1)), + ), + build_partition_with_multiple_sort_keys( + (Some(0), Some(0)), + (Some("same"), Some("same")), + (Some(1), Some(1)), + ), + ], + num_sort_keys: 3, + name: "order_by_first_sort_key__disjoint_touching", + expected_ranges_per_partition: vec![ + "(1,same,1)->(10,same,1)", + "(10,same,1)->(20,same,1)", + "(0,same,1)->(0,same,1)", + ], + expect_disjoint: true, + expected_ordered_indices: vec![ + (TestSortOption::AscNullsFirst, vec![2, 0, 1]).into(), + (TestSortOption::AscNullsLast, vec![2, 0, 1]).into(), + (TestSortOption::DescNullsFirst, vec![1, 0, 2]).into(), + (TestSortOption::DescNullsLast, vec![1, 0, 2]).into(), + ], + }, + /* Using the middle sort key, a string, as the decider, and has a start(current)==end(previous) */ + TestCase { + partitions: vec![ + build_partition_with_multiple_sort_keys( + (Some(1), Some(1)), + (Some("a"), Some("c")), // ends with c + (Some(1), Some(1)), + ), + build_partition_with_multiple_sort_keys( + (Some(1), Some(1)), + (Some("f"), Some("g")), + (Some(1), Some(1)), + ), + build_partition_with_multiple_sort_keys( + (Some(1), Some(1)), + (Some("c"), Some("e")), // starts with c + (Some(1), Some(1)), + ), + ], + num_sort_keys: 3, + name: "order_by_middle_sort_key__disjoint_touching", + expected_ranges_per_partition: vec![ + "(1,a,1)->(1,c,1)", + "(1,f,1)->(1,g,1)", + "(1,c,1)->(1,e,1)", + ], + expect_disjoint: true, + expected_ordered_indices: vec![ + (TestSortOption::AscNullsFirst, vec![0, 2, 1]).into(), + (TestSortOption::AscNullsLast, vec![0, 2, 1]).into(), + (TestSortOption::DescNullsFirst, vec![1, 2, 0]).into(), + (TestSortOption::DescNullsLast, vec![1, 2, 0]).into(), + ], + }, + /* Using the last sort key, a nanosecond timestamp, as the decider, and has a start(current)==end(previous) */ + TestCase { + partitions: vec![ + build_partition_with_multiple_sort_keys( + (Some(1), Some(1)), + (Some("same"), Some("same")), + (Some(50000000), Some(50000001)), // ends with 50000001 + ), + build_partition_with_multiple_sort_keys( + (Some(1), Some(1)), + (Some("same"), Some("same")), + (Some(700000), Some(7000001)), + ), + build_partition_with_multiple_sort_keys( + (Some(1), Some(1)), + (Some("same"), Some("same")), + (Some(50000001), Some(100000001)), // starts with 50000001 + ), + ], + num_sort_keys: 3, + name: "order_by_last_sort_key__disjoint_touching", + expected_ranges_per_partition: vec![ + "(1,same,50000000)->(1,same,50000001)", + "(1,same,700000)->(1,same,7000001)", + "(1,same,50000001)->(1,same,100000001)", + ], + expect_disjoint: true, + expected_ordered_indices: vec![ + (TestSortOption::AscNullsFirst, vec![1, 0, 2]).into(), + (TestSortOption::AscNullsLast, vec![1, 0, 2]).into(), + (TestSortOption::DescNullsFirst, vec![2, 0, 1]).into(), + (TestSortOption::DescNullsLast, vec![2, 0, 1]).into(), + ], + }, ]; cases.into_iter().for_each(|test_case| test_case.run()); diff --git a/datafusion/physical-optimizer/src/progressive_evaluation/statistics.rs b/datafusion/physical-optimizer/src/progressive_evaluation/statistics.rs index 2b2ae31f39bb..8fe706284222 100644 --- a/datafusion/physical-optimizer/src/progressive_evaluation/statistics.rs +++ b/datafusion/physical-optimizer/src/progressive_evaluation/statistics.rs @@ -17,278 +17,1692 @@ use 
std::sync::Arc; -use datafusion_common::stats::Precision; -use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor}; use datafusion_common::{ - internal_datafusion_err, Result as DatafusionResult, ScalarValue, + stats::Precision, ColumnStatistics, DataFusionError, Result, ScalarValue, Statistics, }; -use datafusion_datasource::file_scan_config::FileScanConfig; -use datafusion_datasource::source::DataSourceExec; -use datafusion_physical_plan::aggregates::AggregateExec; -use datafusion_physical_plan::empty::EmptyExec; -use datafusion_physical_plan::joins::{ - CrossJoinExec, HashJoinExec, NestedLoopJoinExec, SymmetricHashJoinExec, +use datafusion_datasource::{file_scan_config::FileScanConfig, source::DataSourceExec}; +use datafusion_physical_plan::{ + coalesce_batches::CoalesceBatchesExec, + coalesce_partitions::CoalescePartitionsExec, + empty::EmptyExec, + filter::FilterExec, + limit::{GlobalLimitExec, LocalLimitExec}, + placeholder_row::PlaceholderRowExec, + projection::ProjectionExec, + repartition::RepartitionExec, + sorts::{sort::SortExec, sort_preserving_merge::SortPreservingMergeExec}, + union::UnionExec, + ExecutionPlan, PhysicalExpr, }; -use datafusion_physical_plan::placeholder_row::PlaceholderRowExec; -use datafusion_physical_plan::repartition::RepartitionExec; -use datafusion_physical_plan::sorts::sort::SortExec; -use datafusion_physical_plan::ColumnStatistics; -use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties}; - -/// Similar to our in-house `StatisticsVisitor`, except it performs the statistics extraction per partition, -/// and not on the entire node. -#[derive(Debug)] -pub struct PartitionStatisticsVisitor<'a> { - column_name: &'a str, - partition: usize, - statistics: Option, -} +use fetch::apply_fetch; +use filter::apply_filter; +use itertools::Itertools; +use project_schema::{proj_exec_stats, project_select_subset_of_column_statistics}; +use util::{make_column_statistics_inexact, merge_stats_collection, partition_count}; -impl<'a> PartitionStatisticsVisitor<'a> { - pub fn new(column_name: &'a str, partition: usize) -> Self { - Self { - column_name, - partition, - statistics: Some(ColumnStatistics::new_unknown()), - } +mod fetch; +mod filter; +mod project_schema; +pub(crate) use project_schema::project_schema_onto_datasrc_statistics; +mod util; + +/// Return min max of a ColumnStatistics with precise values +pub fn column_statistics_min_max( + column_statistics: ColumnStatistics, +) -> Option<(ScalarValue, ScalarValue)> { + match (column_statistics.min_value, column_statistics.max_value) { + (Precision::Exact(min), Precision::Exact(max)) => Some((min, max)), + // the statistics values are absent or imprecise + _ => None, } +} - /// Consumes self, returning the found [`ColumnStatistics`]. +/// This matches the planned API for DataFusion's `PartitionedStatistics`. +pub type PartitionedStatistics = Vec>; + +pub(crate) trait PartitionStatistics { + /// Returns a [`Statistics`] per partition. /// - /// Returning None means the [`PartitionStatisticsVisitor`] could not extract the stats. - pub fn result(&mut self) -> Option { - std::mem::take(&mut self.statistics) + /// This function matches the planned API for DataFusion's `ExecutionPlan::statistics_by_partition`. 
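+    ///
+    /// A minimal usage sketch (`plan` is a hypothetical value assumed to
+    /// implement this trait):
+    ///
+    /// ```text
+    /// let per_partition: PartitionedStatistics = plan.statistics_by_partition()?;
+    /// // one `Statistics` entry per output partition of `plan`
+    /// assert_eq!(
+    ///     per_partition.len(),
+    ///     plan.properties().output_partitioning().partition_count(),
+    /// );
+    /// ```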
+ fn statistics_by_partition(&self) -> Result; +} + +impl PartitionStatistics for EmptyExec { + fn statistics_by_partition(&self) -> Result { + let mut stats = Statistics::new_unknown(&self.schema()); + stats.num_rows = Precision::Exact(0); // tis empty + let data = Arc::new(stats); + + Ok(vec![data; partition_count(self)]) } +} + +impl PartitionStatistics for PlaceholderRowExec { + fn statistics_by_partition(&self) -> Result { + // all partitions have the same single row, and same stats. + // refer to `PlaceholderRowExec::execute`. + #[expect(deprecated)] + let data = self.statistics()?.into(); + Ok(vec![data; partition_count(self)]) + } +} - /// Merge stats. - fn merge_stats<'i>( - mut stats: impl Iterator>, - ) -> Option { - let Some(start) = stats.next() else { - // stats is empty - return None; +impl PartitionStatistics for DataSourceExec { + fn statistics_by_partition(&self) -> Result { + // Extract partitioned files from the Parquet, which is the datasource we use. + let Some(base_config) = + self.data_source().as_any().downcast_ref::() + else { + return Ok(unknown_statistics_by_partition(self)); }; - stats.fold(start.cloned(), |a, b| { - if let (Some(a), Some(b)) = (a, b) { - Some(merge_stats(a, b)) - } else { - None // we hit something that prevented stats extraction - } - }) + + let file_schema = &base_config.file_schema; + let target_schema = Arc::clone(&self.schema()); + + // get per partition (a.k.a. file group) + let per_partition = base_config + .file_groups + .iter() + .map(|file_group| { + file_group + .iter() + .map(|file| { + // per file, get projected statistics + let statistics = if let Some(file_stats) = &file.statistics { + project_schema_onto_datasrc_statistics( + file_stats, + file_schema, + &target_schema, + )? + } else { + // doesn't have file stats + Arc::new(Statistics::new_unknown(&target_schema)) + }; + Ok::, DataFusionError>(statistics) + }) + .process_results(|stats| { + Arc::new(merge_stats_collection( + stats.into_iter(), + &target_schema, + )) + }) + }) + .try_collect()?; + + Ok(per_partition) } +} - /// Find partition within multiple children, and return the partition stats. - fn find_stats_per_partition_within_multiple_children( - &mut self, - node: &Arc, - ) -> DatafusionResult> { - // figure out which input of the union to use - let mut child_partition = self.partition; - for child in node.children() { - let num_child_partitions = partition_count(child); - if child_partition < num_child_partitions { - self.partition = child_partition; - child.visit(self)?; - return Ok(self.result()); - } - child_partition -= num_child_partitions; - } - unreachable!("didn't find the partition in children"); +impl PartitionStatistics for UnionExec { + fn statistics_by_partition(&self) -> Result { + preserve_partitioning_no_projection(self) } +} - /// Applied per node, merge all statistics across partitions in node. 
- fn merge_column_statistics_across_partitions( - &mut self, - node: &Arc, - ) -> DatafusionResult> { - let cnt_partitions = partition_count_children(node); - let mut extracted_stats = Vec::with_capacity(cnt_partitions); - - for child in node.children() { - for partition in 0..partition_count(child) { - let mut extractor = Self::new(self.column_name, partition); - child.visit(&mut extractor)?; - extracted_stats.push(extractor.result()); - } - } +impl PartitionStatistics for CoalesceBatchesExec { + fn statistics_by_partition(&self) -> Result { + preserve_partitioning_no_projection_apply_fetch(self, self.fetch(), 0) + } +} - Ok(Self::merge_stats( - extracted_stats.iter().map(|a| a.as_ref()), - )) +impl PartitionStatistics for LocalLimitExec { + fn statistics_by_partition(&self) -> Result { + preserve_partitioning_no_projection_apply_fetch(self, Some(self.fetch()), 0) } +} - /// Extract column statistics from file_group of parquet exec. - /// - /// This is required since we need the statistics per partition. - /// - /// This is a hack until upstream code is in place. - fn extract_statistics_from_file_group( - &self, - datasrc_exec: &DataSourceExec, - ) -> DatafusionResult> { - let Ok(col_idx) = datasrc_exec.schema().index_of(self.column_name) else { - // maybe alias - return Ok(None); - }; +impl PartitionStatistics for GlobalLimitExec { + fn statistics_by_partition(&self) -> Result { + preserve_partitioning_no_projection_apply_fetch(self, self.fetch(), self.skip()) + } +} - // Extract partitioned files from the ParquetExec - let Some(base_config) = datasrc_exec - .data_source() - .as_any() - .downcast_ref::() - else { - return Ok(None); - }; +impl PartitionStatistics for FilterExec { + fn statistics_by_partition(&self) -> Result { + preserve_partitioning_with_schema_projection_apply_filter( + self, + self.projection(), + self.predicate(), + self.default_selectivity(), + ) + } +} - let file_group_partition = base_config.file_groups.get(self.partition).ok_or( - internal_datafusion_err!( - "requested partition does not exist in datasource exec" - ), +impl PartitionStatistics for ProjectionExec { + fn statistics_by_partition(&self) -> Result { + // use the specific `proj_exec_stats` to project schema on every incoming partition + let partition_cnt = partition_count(self); + + let all_partition_stats = self.children().into_iter().try_fold( + Vec::with_capacity(partition_cnt), + |mut acc, child| { + let child_stats = statistics_by_partition(child.as_ref())?; + + let child_stats_with_project_exec_projected = + child_stats.into_iter().map(|stats| { + proj_exec_stats( + Arc::unwrap_or_clone(stats), + self.expr().iter(), + &self.schema(), + ) + }); + + acc.extend(child_stats_with_project_exec_projected); + Ok::(acc) + }, )?; - Ok(file_group_partition - .statistics() - .as_ref() - .and_then(|stats| stats.column_statistics.get(col_idx).cloned())) + Ok(all_partition_stats) } +} - /// Extract from datasource, with consideration of partitioning. 
- fn extract_from_data_source( - &mut self, - datasrc: &Arc, - ) -> DatafusionResult> { - if datasrc.as_any().downcast_ref::().is_some() - || datasrc - .as_any() - .downcast_ref::() - .is_some() - { - Ok(self.statistics.clone()) - } else if let Some(datasrc_exec) = - datasrc.as_any().downcast_ref::() - { - let datarsc_stats = - match self.extract_statistics_from_file_group(datasrc_exec) { - Ok(Some(col_stats)) => Some(convert_min_max_to_exact(col_stats)), - _ => None, - }; - Ok(datarsc_stats) - } else { - // specific constraint on our plans - unreachable!("should not have another unsupported data source") - } +impl PartitionStatistics for SortPreservingMergeExec { + fn statistics_by_partition(&self) -> Result { + merge_partitions_no_projection_apply_fetch(self, self.fetch(), 0) } } -impl<'n> TreeNodeVisitor<'n> for PartitionStatisticsVisitor<'_> { - type Node = Arc; - - fn f_down(&mut self, node: &'n Self::Node) -> DatafusionResult { - if !is_supported(node) { - self.statistics = None; - Ok(TreeNodeRecursion::Stop) - } else if is_leaf(node) { - self.statistics = self.extract_from_data_source(node)?; - Ok(TreeNodeRecursion::Stop) - } else if should_merge_partition_statistics(node) { - self.statistics = self.merge_column_statistics_across_partitions(node)?; - Ok(TreeNodeRecursion::Jump) - } else if should_pass_thru_partition_statistics(node) { - self.statistics = - self.find_stats_per_partition_within_multiple_children(node)?; - Ok(TreeNodeRecursion::Stop) +// TODO: once this is implemented +// impl PartitionStatistics for ProgressiveEvalExec { +// fn statistics_by_partition(&self) -> Result { +// merge_partitions_no_projection_apply_fetch(self, self.fetch(), 0) +// } +// } + +impl PartitionStatistics for CoalescePartitionsExec { + fn statistics_by_partition(&self) -> Result { + merge_partitions_no_projection(self) + } +} + +impl PartitionStatistics for SortExec { + fn statistics_by_partition(&self) -> Result { + if self.preserve_partitioning() { + preserve_partitioning_no_projection_apply_fetch(self, self.fetch(), 0) } else { - self.statistics = None; - Ok(TreeNodeRecursion::Stop) + merge_partitions_no_projection_apply_fetch(self, self.fetch(), 0) } } } -fn is_supported(plan: &Arc) -> bool { - !(plan.as_any().downcast_ref::().is_some() - || plan.as_any().downcast_ref::().is_some() - || plan.as_any().downcast_ref::().is_some() - || plan.as_any().downcast_ref::().is_some() - || plan.as_any().downcast_ref::().is_some() // aggregate values can be different than min/max - || plan.as_any().downcast_ref::().is_some()) -} +impl PartitionStatistics for RepartitionExec { + fn statistics_by_partition(&self) -> Result { + // takes N input partitions, returning M output partitions. -/// Has multiple input partitions, and 1 output partition. Merge stats. -/// -/// This is a heuristic (based on our plans) and contingent on [`is_supported`]. 
-fn should_merge_partition_statistics(plan: &Arc) -> bool { - let sort_preserving_partitioning = - if let Some(sort_exec) = plan.as_any().downcast_ref::() { - sort_exec.preserve_partitioning() - } else { - false - }; - partition_count(plan) == 1 - && has_multiple_child_partitions(plan) - && !sort_preserving_partitioning + // the statistics are "merged" into the same value (because any input can go to any output) + let mut merged = merge_partitions_no_projection(self)?; + let merged_single = merged.pop().expect("should have a single merged statistic"); + let merged_single = Arc::unwrap_or_clone(merged_single); + + // then the merged stat is turned to inexact (since unknown division across output partitions) + let inexact_merged = Arc::new(Statistics { + num_rows: Precision::Absent, + total_byte_size: Precision::Absent, + column_statistics: make_column_statistics_inexact( + merged_single.column_statistics, + ), + }); + + // finally, all output partitions have the same merged stat + Ok(vec![inexact_merged; partition_count(self)]) + } } -/// The input partitions and output partitions are expected to have unchanged statistics. -/// -/// This is a heuristic (based on our plans) and contingent on [`is_supported`]. -fn should_pass_thru_partition_statistics(plan: &Arc) -> bool { - plan.children().len() == 1 || partition_count(plan) == partition_count_children(plan) +/// Handle downcasting of the `dyn ExecutionPlan` to the specific execution nodes which +/// have implemented [`PartitionStatistics`]. +pub fn statistics_by_partition( + plan: &dyn ExecutionPlan, +) -> Result { + if let Some(exec) = plan.as_any().downcast_ref::() { + exec.statistics_by_partition() + } else if let Some(exec) = plan.as_any().downcast_ref::() { + exec.statistics_by_partition() + } else if let Some(exec) = plan.as_any().downcast_ref::() { + exec.statistics_by_partition() + } else if let Some(exec) = plan.as_any().downcast_ref::() { + exec.statistics_by_partition() + } else if let Some(exec) = plan.as_any().downcast_ref::() { + exec.statistics_by_partition() + } else if let Some(exec) = plan.as_any().downcast_ref::() { + exec.statistics_by_partition() + } else if let Some(exec) = plan.as_any().downcast_ref::() { + exec.statistics_by_partition() + } else if let Some(exec) = plan.as_any().downcast_ref::() { + exec.statistics_by_partition() + } else if let Some(exec) = plan.as_any().downcast_ref::() { + exec.statistics_by_partition() + } else if let Some(exec) = plan.as_any().downcast_ref::() { + exec.statistics_by_partition() + // } else if let Some(exec) = plan.as_any().downcast_ref::() { + // exec.statistics_by_partition() + } else if let Some(exec) = plan.as_any().downcast_ref::() { + exec.statistics_by_partition() + } else if let Some(exec) = plan.as_any().downcast_ref::() { + exec.statistics_by_partition() + } else if let Some(exec) = plan.as_any().downcast_ref::() { + exec.statistics_by_partition() + } else { + /* These include, but not limited to, the following operators used in our example plans: + WindowAggExec + SymmetricHashJoinExec + HashJoinExec + NestedLoopJoinExec + ValuesExec + AggregateExec + */ + Ok(unknown_statistics_by_partition(plan)) + } } -fn is_leaf(plan: &Arc) -> bool { - plan.children().is_empty() +/// Provide unknown/absent statistics for all partitions in the plan. 
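+///
+/// Each entry is `Statistics::new_unknown` for the plan's schema, one per
+/// output partition.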
+fn unknown_statistics_by_partition(plan: &dyn ExecutionPlan) -> PartitionedStatistics { + let data = Arc::new(Statistics::new_unknown(&plan.schema())); + vec![data; partition_count(plan)] } -fn has_multiple_child_partitions(plan: &Arc) -> bool { - plan.children().len() > 1 || partition_count_children(plan) > 1 +/// Preserve partitioning from plan inputs. +/// +/// This does not perform any schema projection. +fn preserve_partitioning_no_projection( + plan: &dyn ExecutionPlan, +) -> Result { + preserve_partitioning_with_schema_projection(plan, None) } -fn partition_count_children(plan: &Arc) -> usize { - plan.children() - .iter() - .map(|child| partition_count(child)) - .sum::() +/// Preserve partitioning from plan inputs. +/// Apply fetch variables (fetch/skip) to modify the [`Statistics::num_rows`]. +/// +/// This does not perform any schema projection. +fn preserve_partitioning_no_projection_apply_fetch( + plan: &dyn ExecutionPlan, + fetch: Option, + skip: usize, +) -> Result { + let partition_cnt = partition_count(plan); + let all_partition_stats = plan.children().into_iter().try_fold( + Vec::with_capacity(partition_cnt), + |mut acc, child| { + let child_stats = statistics_by_partition(child.as_ref())?; + + let child_stats_with_fetch = child_stats + .into_iter() + .map(|stats| apply_fetch(stats, fetch, skip)); + + acc.extend(child_stats_with_fetch); + Ok::(acc) + }, + )?; + + Ok(all_partition_stats) } -fn partition_count(plan: &Arc) -> usize { - plan.output_partitioning().partition_count() +/// Preserve partitioning from plan inputs. +/// Performs a schema projection to modify the [`Statistics::column_statistics`]. +/// +/// A schema projection is only required if either a subset of input fields are projected +/// into the plan output (e.g. it has a `self.projection`), or we need to add column_statistics +/// for a chunk order column. +fn preserve_partitioning_with_schema_projection( + plan: &dyn ExecutionPlan, + subset_selected: Option<&Vec>, +) -> Result { + let target_schema = plan.schema(); + let mut all_partition_stats = Vec::with_capacity(partition_count(plan)); + + for child in plan.children() { + let child_stats = statistics_by_partition(child.as_ref())?; + + child_stats + .into_iter() + .map(|stats| { + if let Some(proj_idx) = subset_selected { + // apply a schema projection + project_select_subset_of_column_statistics( + &stats, + &child.schema(), + proj_idx, + &target_schema, + ) + } else { + Ok(stats) + } + }) + .process_results(|iter| all_partition_stats.extend(iter))?; + } + + Ok(all_partition_stats) } -fn merge_stats(a: ColumnStatistics, b: &ColumnStatistics) -> ColumnStatistics { - ColumnStatistics { - null_count: a.null_count.add(&b.null_count), - min_value: a.min_value.min(&b.min_value), - max_value: a.max_value.max(&b.max_value), - distinct_count: Precision::Absent, - sum_value: Precision::Absent, +/// Preserve partitioning from plan inputs. +/// Apply filter variables to modify the num_rows, total_byte_size, column_statistics in [`Statistics`]. +/// +/// Then performs a schema projection to modify the [`Statistics::column_statistics`]. 
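+///
+/// The filter's effect is estimated against each child partition's statistics
+/// (using the child schema) before the optional projection onto the plan's
+/// output schema.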
+fn preserve_partitioning_with_schema_projection_apply_filter( + plan: &dyn ExecutionPlan, + project: Option<&Vec>, + predicate: &Arc, + default_selectivity: u8, +) -> Result { + let target_schema = plan.schema(); + let mut all_partition_stats = Vec::with_capacity(partition_count(plan)); + + for child in plan.children() { + let child_stats = statistics_by_partition(child.as_ref())?; + + child_stats + .into_iter() + .map(|stats| { + // apply filter first on input child + apply_filter( + Arc::unwrap_or_clone(stats), + &child.schema(), + predicate, + default_selectivity, + ) + }) + // then apply schema projection on output + .map_ok(|stats| { + if let Some(proj_idx) = project { + project_select_subset_of_column_statistics( + &stats, + &child.schema(), + proj_idx, + &target_schema, + ) + } else { + Ok(Arc::new(stats)) + } + }) + .flatten_ok() + .process_results(|iter| all_partition_stats.extend(iter))?; } + + Ok(all_partition_stats) } -/// DataFusion statistics can't distinguish "inexact" from "known to be within the range" +/// Merge partition stats across plan inputs. /// -/// Converts inexact values to exact if possible for the purposes of -/// analysis by [`PartitionStatisticsVisitor`] -fn convert_min_max_to_exact(mut column_statistics: ColumnStatistics) -> ColumnStatistics { - column_statistics.min_value = to_exact(column_statistics.min_value); - column_statistics.max_value = to_exact(column_statistics.max_value); - column_statistics +/// This does not perform any schema projection. +fn merge_partitions_no_projection( + plan: &dyn ExecutionPlan, +) -> Result { + merge_partitions_no_projection_apply_fetch(plan, None, 0) } -/// Convert [`Precision::Inexact`] to [`Precision::Exact`] if possible -fn to_exact(v: Precision) -> Precision { - match v { - Precision::Exact(v) | Precision::Inexact(v) => Precision::Exact(v), - Precision::Absent => Precision::Absent, - } +/// Merge partition stats across plan inputs. +/// Apply fetch variables (fetch/skip) to modify the [`Statistics::num_rows`]. +/// +/// This does not perform any schema projection. 
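+///
+/// The fetch/skip adjustment (`apply_fetch`) is applied to each input
+/// partition's statistics before merging; the merged result is returned as a
+/// single-partition statistics vector.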
+fn merge_partitions_no_projection_apply_fetch( + plan: &dyn ExecutionPlan, + fetch: Option, + skip: usize, +) -> Result { + let merged_partition_stats = plan + .children() + .into_iter() + .map(|child| { + let child_stats = statistics_by_partition(child.as_ref())?; + + // merge stats for all partitions in each child + let merged_child_partitions = if fetch.is_some() || skip > 0 { + let child_stats_with_fetch = child_stats + .into_iter() + .map(|stats| apply_fetch(stats, fetch, skip)); + + merge_stats_collection(child_stats_with_fetch, &plan.schema()) + } else { + merge_stats_collection(child_stats.into_iter(), &plan.schema()) + }; + + Ok::(merged_child_partitions) + }) + .process_results(|stats| { + // merge stats across children + Arc::new(merge_stats_collection(stats, &plan.schema())) + })?; + + Ok(vec![merged_partition_stats]) } -/// Return min max of a ColumnStatistics with precise values -pub fn column_statistics_min_max( - column_statistics: ColumnStatistics, -) -> Option<(ScalarValue, ScalarValue)> { - match (column_statistics.min_value, column_statistics.max_value) { - (Precision::Exact(min), Precision::Exact(max)) => Some((min, max)), - // the statistics values are absent or imprecise - _ => None, +#[cfg(test)] +mod test { + use std::fmt::{Display, Formatter}; + + use crate::progressive_evaluation::util::test_utils::{ + coalesce_exec, crossjoin_exec, file_scan_config, filter_exec, limit_exec, + parquet_exec_with_sort_with_statistics, proj_exec, repartition_exec, + single_column_schema, sort_exec, spm_exec, union_exec, PartitionedFilesAndRanges, + SortKeyRange, + }; + + use super::*; + use arrow::datatypes::{DataType, Field}; + use datafusion_common::{ColumnStatistics, DataFusionError}; + use datafusion_expr::Operator; + use datafusion_physical_expr::{LexOrdering, PhysicalSortExpr}; + use datafusion_physical_plan::{ + displayable, + expressions::{col, lit, BinaryExpr, IsNullExpr, NoOp}, + union::InterleaveExec, + Partitioning, + }; + + use insta::assert_snapshot; + use itertools::Itertools; + + /// For running test cases on the [`statistics_by_partition`]. + #[derive(Debug)] + struct TestCase<'a> { + /// Input place to test. + input_plan: &'a Arc, + /// Column to extract. + col_name: &'a str, + /// Expected column statistics, per partition. + expected_ranges_per_partition: Option<&'a [&'a SortKeyRange]>, + /// Actual results per partition, populated after [`TestCase::run`]. + result_per_partition: PartitionedStatistics, + } + + impl<'a> TestCase<'a> { + fn new( + input_plan: &'a Arc, + col_name: &'a str, + expected_ranges_per_partition: Option<&'a [&'a SortKeyRange]>, + ) -> Self { + Self { + input_plan, + col_name, + expected_ranges_per_partition, + result_per_partition: vec![], + } + } + + /// Run the test cases, and populate the results in [`TestCase::result_per_partition`]. 
+ fn run(mut self) -> Result { + let partition_cnt = self.input_partition_cnt(); + + if let Some(per_partition) = &self.expected_ranges_per_partition { + assert_eq!( + per_partition.len(), + partition_cnt, + "failure in test setup, the count of expected ranges should equal the partition count" + ); + }; + + // run test case with PartitionStatisitics implementations + self.result_per_partition = + statistics_by_partition(self.input_plan.as_ref())?; + + Ok(self) + } + + fn input_partition_cnt(&self) -> usize { + self.input_plan + .properties() + .output_partitioning() + .partition_count() + } + + /// Resultant [`ColumnStatistics`] per partition, extracted from the schema [`Statistics`], + /// for the test column. + /// + /// If the schema does not have the resultant test column (e.g. the output plan projection is `select another-col`) + /// Then the result is None. + fn results_for_col_name(&self) -> Vec> { + if let Ok(col_idx) = self.input_plan.schema().index_of(self.col_name) { + self.result_per_partition + .iter() + .map(|stats| Some(stats.column_statistics[col_idx].clone())) + .collect_vec() + } else { + vec![None; self.input_partition_cnt()] + } + } + } + + impl Display for TestCase<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let displayable_plan = displayable(self.input_plan.as_ref()).indent(false); + writeln!(f, "{}", displayable_plan)?; + + writeln!(f, "Expected column statistics per partition:")?; + for partition in 0..self.input_partition_cnt() { + if let Some(expected_per_partition) = self.expected_ranges_per_partition { + writeln!( + f, + " partition {:?}: {}", + partition, expected_per_partition[partition] + )?; + } else { + writeln!(f, " partition {:?}: None", partition)?; + } + } + + writeln!(f, "\nActual column statistics per partition:")?; + for (partition, actual_stats) in + self.results_for_col_name().iter().enumerate() + { + writeln!( + f, + " partition {:?}: {}", + partition, + str_column_stats(actual_stats) + )?; + } + + Ok(()) + } + } + + /// Provide the [`ColumnStatistics`] as a string for insta. + fn str_column_stats(stats: &Option) -> String { + let Some(stats) = stats else { + return "None".into(); + }; + if matches!(stats.min_value, Precision::Absent) + && matches!(stats.max_value, Precision::Absent) + { + return "None".into(); + } + + if stats.null_count.get_value().cloned().unwrap_or_default() > 0 { + format!( + "({:?})->({:?}) null_count={}", + stats.min_value.get_value().unwrap(), + stats.max_value.get_value().unwrap(), + stats.null_count.get_value().unwrap() + ) + } else { + format!( + "({:?})->({:?})", + stats.min_value.get_value().unwrap(), + stats.max_value.get_value().unwrap() + ) + } + } + + /// Plan nodes which generate the original statistics (a.k.a. the data sources). 
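+    /// Covers `DataSourceExec` over parquet files, `EmptyExec`, and
+    /// `PlaceholderRowExec`.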
+ #[test] + fn test_handles_datasources() -> Result<(), DataFusionError> { + let col_name = "a"; + let lex_ordering = LexOrdering::new(vec![PhysicalSortExpr::new( + col(col_name, &single_column_schema())?, + Default::default(), + )]); + let ranges_per_partition = &[ + &SortKeyRange { + min: Some(1000), + max: Some(2000), + null_count: 0, + }, + &SortKeyRange { + min: Some(2001), + max: Some(3000), + null_count: 0, + }, + &SortKeyRange { + min: Some(3001), + max: Some(3500), + null_count: 0, + }, + ]; + + /* Test Case: parquet */ + let plan = parquet_exec_with_sort_with_statistics( + vec![lex_ordering.clone()], + ranges_per_partition, + ); + let test_case = TestCase::new(&plan, col_name, Some(ranges_per_partition)); + assert_snapshot!( + test_case.run()?, + @r" + DataSourceExec: file_groups={3 groups: [[0.parquet], [1.parquet], [2.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + + Expected column statistics per partition: + partition 0: (Some(1000))->(Some(2000)) + partition 1: (Some(2001))->(Some(3000)) + partition 2: (Some(3001))->(Some(3500)) + + Actual column statistics per partition: + partition 0: (Int32(1000))->(Int32(2000)) + partition 1: (Int32(2001))->(Int32(3000)) + partition 2: (Int32(3001))->(Int32(3500)) + "); + + /* Test Case: empty exec */ + // note: empty exec always has only 1 partition, and has absent stats (except for 0 num_rows) + let plan = Arc::new(EmptyExec::new(single_column_schema())) as _; + let test_case = TestCase::new(&plan, col_name, None); + let result = test_case.run()?; + assert_snapshot!( + result, + @r" + EmptyExec + + Expected column statistics per partition: + partition 0: None + + Actual column statistics per partition: + partition 0: None + "); + assert_eq!( + result.result_per_partition[0].num_rows.get_value(), + Some(&0), + "empty exec should have zero rows" + ); + + /* Test Case: placeholder row */ + // note: placeholder row is 1 row with null values for all columns + let plan = + Arc::new(PlaceholderRowExec::new(single_column_schema()).with_partitions(2)) + as _; + let test_case = TestCase::new(&plan, col_name, None); + let result = test_case.run()?; + assert_snapshot!( + result, + @r" + PlaceholderRowExec + + Expected column statistics per partition: + partition 0: None + partition 1: None + + Actual column statistics per partition: + partition 0: None + partition 1: None + "); + assert_eq!( + result.result_per_partition.len(), + 2, + "should have stats for 2 partitions" + ); + let [p0, p1] = &result.result_per_partition[..] else { + unreachable!() + }; + assert_eq!( + p0.num_rows.get_value(), + Some(1).as_ref(), + "should have only 1 row" + ); + assert_eq!( + p0.column_statistics[0].null_count.get_value(), + Some(1).as_ref(), + "should be a null value" + ); + assert_eq!( + p1.num_rows.get_value(), + Some(1).as_ref(), + "should have only 1 row" + ); + assert_eq!( + p1.column_statistics[0].null_count.get_value(), + Some(1).as_ref(), + "should be a null value" + ); + + Ok(()) + } + + /// Plan nodes which pass through the partitions without impacting range. 
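+    /// Covers `UnionExec`, `SortExec` with `preserve_partitioning=true`,
+    /// `CoalesceBatchesExec`, `LocalLimitExec`, `ProjectionExec`, and `FilterExec`.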
+ #[test] + fn test_handles_partition_pass_thru() -> Result<(), DataFusionError> { + let col_name = "a"; + let lex_ordering = LexOrdering::new(vec![PhysicalSortExpr::new( + col(col_name, &single_column_schema())?, + Default::default(), + )]); + let ranges_per_partition = &[ + &SortKeyRange { + min: Some(1000), + max: Some(2000), + null_count: 0, + }, + &SortKeyRange { + min: Some(2001), + max: Some(3000), + null_count: 0, + }, + &SortKeyRange { + min: Some(3001), + max: Some(3500), + null_count: 0, + }, + ]; + + /* Test Case: union */ + let left_plan = parquet_exec_with_sort_with_statistics( + vec![lex_ordering.clone()], + &ranges_per_partition[0..1], + ); + let right_plan = parquet_exec_with_sort_with_statistics( + vec![lex_ordering.clone()], + &ranges_per_partition[1..], + ); + let union = union_exec(vec![left_plan, right_plan]); + let test_case = TestCase::new(&union, col_name, Some(ranges_per_partition)); + assert_snapshot!( + test_case.run()?, + @r" + UnionExec + DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + DataSourceExec: file_groups={2 groups: [[0.parquet], [1.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + + Expected column statistics per partition: + partition 0: (Some(1000))->(Some(2000)) + partition 1: (Some(2001))->(Some(3000)) + partition 2: (Some(3001))->(Some(3500)) + + Actual column statistics per partition: + partition 0: (Int32(1000))->(Int32(2000)) + partition 1: (Int32(2001))->(Int32(3000)) + partition 2: (Int32(3001))->(Int32(3500)) + "); + + /* Test Case: sorts, with preserve partitioning */ + let preserve_partitioning = true; + let input = parquet_exec_with_sort_with_statistics( + vec![lex_ordering.clone()], + ranges_per_partition, + ); + let sort = sort_exec(&lex_ordering, &input, preserve_partitioning); + let test_case = TestCase::new(&sort, col_name, Some(ranges_per_partition)); + assert_snapshot!( + test_case.run()?, + @r" + SortExec: expr=[a@0 ASC], preserve_partitioning=[true] + DataSourceExec: file_groups={3 groups: [[0.parquet], [1.parquet], [2.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + + Expected column statistics per partition: + partition 0: (Some(1000))->(Some(2000)) + partition 1: (Some(2001))->(Some(3000)) + partition 2: (Some(3001))->(Some(3500)) + + Actual column statistics per partition: + partition 0: (Int32(1000))->(Int32(2000)) + partition 1: (Int32(2001))->(Int32(3000)) + partition 2: (Int32(3001))->(Int32(3500)) + "); + + /* Test Case: coalesce */ + let coalesce = coalesce_exec(&input); + let test_case = TestCase::new(&coalesce, col_name, Some(ranges_per_partition)); + assert_snapshot!( + test_case.run()?, + @r" + CoalesceBatchesExec: target_batch_size=10 + DataSourceExec: file_groups={3 groups: [[0.parquet], [1.parquet], [2.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + + Expected column statistics per partition: + partition 0: (Some(1000))->(Some(2000)) + partition 1: (Some(2001))->(Some(3000)) + partition 2: (Some(3001))->(Some(3500)) + + Actual column statistics per partition: + partition 0: (Int32(1000))->(Int32(2000)) + partition 1: (Int32(2001))->(Int32(3000)) + partition 2: (Int32(3001))->(Int32(3500)) + "); + + /* Test Case: limit */ + let limit = limit_exec(&input, 2); + let test_case = TestCase::new(&limit, col_name, Some(ranges_per_partition)); + assert_snapshot!( + test_case.run()?, + @r" + LocalLimitExec: fetch=2 + DataSourceExec: file_groups={3 groups: [[0.parquet], 
[1.parquet], [2.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + + Expected column statistics per partition: + partition 0: (Some(1000))->(Some(2000)) + partition 1: (Some(2001))->(Some(3000)) + partition 2: (Some(3001))->(Some(3500)) + + Actual column statistics per partition: + partition 0: (Int32(1000))->(Int32(2000)) + partition 1: (Int32(2001))->(Int32(3000)) + partition 2: (Int32(3001))->(Int32(3500)) + "); + + /* Test Case: empty projection */ + let proj = proj_exec( + &input, + vec![(col(col_name, &single_column_schema())?, "a".into())], + ); + let test_case = TestCase::new(&proj, col_name, Some(ranges_per_partition)); + assert_snapshot!( + test_case.run()?, + @r" + ProjectionExec: expr=[a@0 as a] + DataSourceExec: file_groups={3 groups: [[0.parquet], [1.parquet], [2.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + + Expected column statistics per partition: + partition 0: (Some(1000))->(Some(2000)) + partition 1: (Some(2001))->(Some(3000)) + partition 2: (Some(3001))->(Some(3500)) + + Actual column statistics per partition: + partition 0: (Int32(1000))->(Int32(2000)) + partition 1: (Int32(2001))->(Int32(3000)) + partition 2: (Int32(3001))->(Int32(3500)) + "); + + /* Test Case: pass thru projection */ + let pass_thru_single_col = col("a", &single_column_schema())?; + let proj = proj_exec( + &input, + vec![(Arc::clone(&pass_thru_single_col), "a".into())], + ); + let test_case = TestCase::new(&proj, col_name, Some(ranges_per_partition)); + assert_snapshot!( + test_case.run()?, + @r" + ProjectionExec: expr=[a@0 as a] + DataSourceExec: file_groups={3 groups: [[0.parquet], [1.parquet], [2.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + + Expected column statistics per partition: + partition 0: (Some(1000))->(Some(2000)) + partition 1: (Some(2001))->(Some(3000)) + partition 2: (Some(3001))->(Some(3500)) + + Actual column statistics per partition: + partition 0: (Int32(1000))->(Int32(2000)) + partition 1: (Int32(2001))->(Int32(3000)) + partition 2: (Int32(3001))->(Int32(3500)) + "); + + /* Test Case: projection that modifies -> will not pass thru*/ + let col_plus_2 = Arc::new(BinaryExpr::new( + pass_thru_single_col, + Operator::Plus, + lit(2), + )); + let proj = proj_exec(&input, vec![(col_plus_2, "foo".into())]); + let test_case = TestCase::new(&proj, col_name, Some(ranges_per_partition)); + assert_snapshot!( + test_case.run()?, + @r" + ProjectionExec: expr=[a@0 + 2 as foo] + DataSourceExec: file_groups={3 groups: [[0.parquet], [1.parquet], [2.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + + Expected column statistics per partition: + partition 0: (Some(1000))->(Some(2000)) + partition 1: (Some(2001))->(Some(3000)) + partition 2: (Some(3001))->(Some(3500)) + + Actual column statistics per partition: + partition 0: None + partition 1: None + partition 2: None + "); + + /* Test Case: filter (for now, we don't narrow the range further) */ + let filter = + filter_exec(&input, Arc::new(IsNullExpr::new(Arc::new(NoOp::new())))); + let test_case = TestCase::new(&filter, col_name, Some(ranges_per_partition)); + assert_snapshot!( + test_case.run()?, + @r" + FilterExec: NoOp IS NULL + DataSourceExec: file_groups={3 groups: [[0.parquet], [1.parquet], [2.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + + Expected column statistics per partition: + partition 0: (Some(1000))->(Some(2000)) + partition 1: (Some(2001))->(Some(3000)) + partition 2: (Some(3001))->(Some(3500)) 
+ + Actual column statistics per partition: + partition 0: (Int32(1000))->(Int32(2000)) + partition 1: (Int32(2001))->(Int32(3000)) + partition 2: (Int32(3001))->(Int32(3500)) + "); + + Ok(()) + } + + /// Plan nodes which merge the partitions into a single output partition, producing a range + /// predictable from the merged input partitions. + #[test] + fn test_handles_partition_merging() -> Result<(), DataFusionError> { + let col_name = "a"; + let lex_ordering = LexOrdering::new(vec![PhysicalSortExpr::new( + col(col_name, &single_column_schema())?, + Default::default(), + )]); + let ranges_per_partition = &[ + &SortKeyRange { + min: Some(1000), + max: Some(2000), + null_count: 0, + }, + &SortKeyRange { + min: Some(2001), + max: Some(3000), + null_count: 0, + }, + &SortKeyRange { + min: Some(3001), + max: Some(3500), + null_count: 0, + }, + ]; + + // Expected result from all test cases. + let partitioned_input = parquet_exec_with_sort_with_statistics( + vec![lex_ordering.clone()], + ranges_per_partition, + ); + let expect_merged = &[&SortKeyRange { + min: Some(1000), + max: Some(3500), + null_count: 0, + }]; + + /* Test Case: SPM */ + let spm = spm_exec(&partitioned_input, &lex_ordering); + let test_case = TestCase::new(&spm, col_name, Some(expect_merged)); + assert_snapshot!( + test_case.run()?, + @r" + SortPreservingMergeExec: [a@0 ASC] + DataSourceExec: file_groups={3 groups: [[0.parquet], [1.parquet], [2.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + + Expected column statistics per partition: + partition 0: (Some(1000))->(Some(3500)) + + Actual column statistics per partition: + partition 0: (Int32(1000))->(Int32(3500)) + "); + + /* Test Case: sorts, without preserve partitioning */ + let sort = sort_exec(&lex_ordering, &partitioned_input, false); + let test_case = TestCase::new(&sort, col_name, Some(expect_merged)); + assert_snapshot!( + test_case.run()?, + @r" + SortExec: expr=[a@0 ASC], preserve_partitioning=[false] + DataSourceExec: file_groups={3 groups: [[0.parquet], [1.parquet], [2.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + + Expected column statistics per partition: + partition 0: (Some(1000))->(Some(3500)) + + Actual column statistics per partition: + partition 0: (Int32(1000))->(Int32(3500)) + "); + + Ok(()) + } + + /// Plan nodes which has N input partitions and M output partitions, where N may not equal M + /// and partitions are reshuffled. + #[test] + fn test_handles_repartitioning() -> Result<(), DataFusionError> { + let col_name = "a"; + let lex_ordering = LexOrdering::new(vec![PhysicalSortExpr::new( + col(col_name, &single_column_schema())?, + Default::default(), + )]); + let ranges_per_partition = &[ + &SortKeyRange { + min: Some(1000), + max: Some(2000), + null_count: 0, + }, + &SortKeyRange { + min: Some(2001), + max: Some(3000), + null_count: 0, + }, + &SortKeyRange { + min: Some(3001), + max: Some(3500), + null_count: 0, + }, + ]; + + // Expected result from all test cases. 
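+        // (hash repartitioning may route any input row to any output partition,
+        // so every output partition can span the full merged input range)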
+ let expect_merged = &[&SortKeyRange { + min: Some(1000), + max: Some(3500), + null_count: 0, + }]; + + /* Test Case: Repartitioning */ + let partitioned_input = parquet_exec_with_sort_with_statistics( + vec![lex_ordering.clone()], + ranges_per_partition, + ); + let partitioning = Partitioning::Hash(vec![], 4); + let repartition = repartition_exec(&partitioned_input, partitioning); + // expect all 4 hashed partitions to potentially cover the same range + let expected = std::iter::repeat_n(expect_merged[0], 4).collect_vec(); + let test_case = TestCase::new(&repartition, col_name, Some(&expected)); + assert_snapshot!( + test_case.run()?, + @r" + RepartitionExec: partitioning=Hash([], 4), input_partitions=3 + DataSourceExec: file_groups={3 groups: [[0.parquet], [1.parquet], [2.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + + Expected column statistics per partition: + partition 0: (Some(1000))->(Some(3500)) + partition 1: (Some(1000))->(Some(3500)) + partition 2: (Some(1000))->(Some(3500)) + partition 3: (Some(1000))->(Some(3500)) + + Actual column statistics per partition: + partition 0: (Int32(1000))->(Int32(3500)) + partition 1: (Int32(1000))->(Int32(3500)) + partition 2: (Int32(1000))->(Int32(3500)) + partition 3: (Int32(1000))->(Int32(3500)) + "); + + Ok(()) + } + + fn build_interleave_plan( + lex_ordering: LexOrdering, + ranges_per_partition: &[&SortKeyRange], + ) -> Result, DataFusionError> { + let partitioning = Partitioning::Hash(vec![], 4); + let left_plan = repartition_exec( + &parquet_exec_with_sort_with_statistics( + vec![lex_ordering.clone()], + &ranges_per_partition[0..1], + ), + partitioning.clone(), + ); + let right_plan = repartition_exec( + &parquet_exec_with_sort_with_statistics( + vec![lex_ordering], + &ranges_per_partition[1..], + ), + partitioning, + ); + Ok(Arc::new(InterleaveExec::try_new(vec![ + left_plan, right_plan, + ])?)) + } + + /// Plan nodes which short circuit the statistics_by_partition, returning None (or absent) statistics. 
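+ /// Covered below: CrossJoinExec, InterleaveExec, and a merge whose inputs
+ /// include such a node (the None propagates upward).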
+ #[test] + fn test_returns_none() -> Result<(), DataFusionError> { + let col_name = "a"; + let lex_ordering = LexOrdering::new(vec![PhysicalSortExpr::new( + col(col_name, &single_column_schema())?, + Default::default(), + )]); + let ranges_per_partition = &[ + &SortKeyRange { + min: Some(1000), + max: Some(2000), + null_count: 0, + }, + &SortKeyRange { + min: Some(2001), + max: Some(3000), + null_count: 0, + }, + &SortKeyRange { + min: Some(3001), + max: Some(3500), + null_count: 0, + }, + ]; + + /* Test Case: Joins */ + let left_plan = parquet_exec_with_sort_with_statistics( + vec![lex_ordering.clone()], + &ranges_per_partition[0..1], + ); + let right_plan = parquet_exec_with_sort_with_statistics( + vec![lex_ordering.clone()], + &ranges_per_partition[1..], + ); + let crossjoin = crossjoin_exec(&left_plan, &right_plan); + let test_case = TestCase::new(&crossjoin, col_name, None); + assert_snapshot!( + test_case.run()?, + @r" + CrossJoinExec + DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + DataSourceExec: file_groups={2 groups: [[0.parquet], [1.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + + Expected column statistics per partition: + partition 0: None + partition 1: None + + Actual column statistics per partition: + partition 0: None + partition 1: None + "); + + /* Test Case: interleave */ + let interleave = + build_interleave_plan(lex_ordering.clone(), ranges_per_partition)?; + let test_case = TestCase::new(&interleave, col_name, None); + assert_snapshot!( + test_case.run()?, + @r" + InterleaveExec + RepartitionExec: partitioning=Hash([], 4), input_partitions=1 + DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + RepartitionExec: partitioning=Hash([], 4), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[0.parquet], [1.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + + Expected column statistics per partition: + partition 0: None + partition 1: None + partition 2: None + partition 3: None + + Actual column statistics per partition: + partition 0: None + partition 1: None + partition 2: None + partition 3: None + "); + + /* Test Case: None will override later merges with partitions having stats */ + // (because None means cannot determine all of the subplan stats) + let partitioned_input = parquet_exec_with_sort_with_statistics( + vec![lex_ordering.clone()], + ranges_per_partition, + ); + let spm = spm_exec( + &union_exec(vec![partitioned_input, interleave]), + &lex_ordering, + ); + let test_case = TestCase::new(&spm, col_name, None); + assert_snapshot!( + test_case.run()?, + @r" + SortPreservingMergeExec: [a@0 ASC] + UnionExec + DataSourceExec: file_groups={3 groups: [[0.parquet], [1.parquet], [2.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + InterleaveExec + RepartitionExec: partitioning=Hash([], 4), input_partitions=1 + DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + RepartitionExec: partitioning=Hash([], 4), input_partitions=2 + DataSourceExec: file_groups={2 groups: [[0.parquet], [1.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + + Expected column statistics per partition: + partition 0: None + + Actual column statistics per partition: + partition 0: None + "); + + Ok(()) + } + + /// How null counts are handled. 
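+ /// Null counts are carried through per partition for pass-through nodes,
+ /// and are summed when partitions are merged.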
+ #[test] + fn test_null_counts() -> Result<(), DataFusionError> { + let col_name = "a"; + let lex_ordering = LexOrdering::new(vec![PhysicalSortExpr::new( + col(col_name, &single_column_schema())?, + Default::default(), + )]); + let ranges_per_partition = &[ + &SortKeyRange { + min: Some(1000), + max: Some(2000), + null_count: 1, + }, + &SortKeyRange { + min: Some(2001), + max: Some(3000), + null_count: 4, + }, + &SortKeyRange { + min: Some(3001), + max: Some(3500), + null_count: 3, + }, + ]; + + /* Test Case: keeps null counts separate with pass thru */ + let left_plan = parquet_exec_with_sort_with_statistics( + vec![lex_ordering.clone()], + &ranges_per_partition[0..1], + ); + let right_plan = parquet_exec_with_sort_with_statistics( + vec![lex_ordering.clone()], + &ranges_per_partition[1..], + ); + let union = union_exec(vec![left_plan, right_plan]); + let test_case = TestCase::new(&union, col_name, Some(ranges_per_partition)); + assert_snapshot!( + test_case.run()?, + @r" + UnionExec + DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + DataSourceExec: file_groups={2 groups: [[0.parquet], [1.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + + Expected column statistics per partition: + partition 0: (Some(1000))->(Some(2000)) null_count=1 + partition 1: (Some(2001))->(Some(3000)) null_count=4 + partition 2: (Some(3001))->(Some(3500)) null_count=3 + + Actual column statistics per partition: + partition 0: (Int32(1000))->(Int32(2000)) null_count=1 + partition 1: (Int32(2001))->(Int32(3000)) null_count=4 + partition 2: (Int32(3001))->(Int32(3500)) null_count=3 + "); + + /* Test Case: merges null counts */ + let spm = spm_exec(&union, &lex_ordering); + let expect_merged = &[&SortKeyRange { + min: Some(1000), + max: Some(3500), + null_count: 8, + }]; + let test_case = TestCase::new(&spm, col_name, Some(expect_merged)); + assert_snapshot!( + test_case.run()?, + @r" + SortPreservingMergeExec: [a@0 ASC] + UnionExec + DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + DataSourceExec: file_groups={2 groups: [[0.parquet], [1.parquet]]}, projection=[a], output_ordering=[a@0 ASC], file_type=parquet + + Expected column statistics per partition: + partition 0: (Some(1000))->(Some(3500)) null_count=8 + + Actual column statistics per partition: + partition 0: (Int32(1000))->(Int32(3500)) null_count=8 + "); + + Ok(()) + } + + /// Test we are using the proper file schema + #[test] + fn test_file_group() -> Result<(), DataFusionError> { + // target column + let col_name = "C"; + + // File schema, vs plan schema. + let file_schema = Arc::new(arrow::datatypes::Schema::new(vec![ + Field::new("a", DataType::Int64, true), + Field::new("b", DataType::Int64, true), + Field::new(col_name, DataType::Int64, true), + ])); + let plan_schema = Arc::new(arrow::datatypes::Schema::new(vec![Field::new( + col_name, + DataType::Int64, + true, + )])); + + // File scan config uses the file schema and ALL columns in file. 
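+ // Per-file ranges below are listed in file-schema column order: a, b, then C.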
+ let ranges_for_file_0 = vec![ + // col a + SortKeyRange { + min: Some(20), + max: Some(30), + null_count: 1, + }, + // col b + SortKeyRange { + min: Some(2000), + max: Some(3000), + null_count: 1, + }, + // col C (our tested key to extract) + SortKeyRange { + min: Some(200_000), + max: Some(300_000), + null_count: 1, + }, + ]; + let ranges_for_file_1 = vec![ + // col a + SortKeyRange { + min: Some(30), + max: Some(40), + null_count: 1, + }, + // col b + SortKeyRange { + min: Some(1000), + max: Some(2000), + null_count: 1, + }, + // col C (our tested key to extract) + SortKeyRange { + min: Some(500_000), + max: Some(700_000), + null_count: 1, + }, + ]; + let lex_ordering_on_c = LexOrdering::new(vec![PhysicalSortExpr::new( + col(col_name, &file_schema)?, // file_schema + Default::default(), + )]); + let multiple_column_key_ranges_per_file = PartitionedFilesAndRanges { + per_file: vec![ranges_for_file_0.clone(), ranges_for_file_1.clone()], + }; + #[expect(deprecated)] + let filegroups_config = file_scan_config( + &file_schema, + vec![lex_ordering_on_c], + multiple_column_key_ranges_per_file, + ) + .with_projection(Some(vec![2])); + + // use a plan with only col C + let parquet_exec = DataSourceExec::from_data_source(filegroups_config) as _; + let projected = col(col_name, &plan_schema)?; // plan_schema + let plan = proj_exec(&parquet_exec, vec![(projected, "C".into())]); + insta::assert_snapshot!( + displayable(plan.as_ref()).indent(true), + @r" + ProjectionExec: expr=[C@0 as C] + DataSourceExec: file_groups={2 groups: [[0.parquet], [1.parquet]]}, projection=[C], output_ordering=[C@0 ASC], file_type=parquet + ", + ); + + // Test Case: show that plan schema is different from file schema + assert_ne!( + &plan_schema, &file_schema, + "plan and file schema are not equivalent" + ); + assert_eq!( + plan.schema().fields().len(), + 1, + "plan schema should have only 1 field" + ); + assert_eq!( + plan.schema(), + plan_schema, + "plan schema should only have col C" + ); + let Some(parquet_exec) = + plan.children()[0].as_any().downcast_ref::() + else { + unreachable!() + }; + assert_eq!( + parquet_exec.schema(), + plan_schema, + "parquet exec plan schema should only have col C" + ); + + /* Test Case: the statistics_by_partition will still extract the proper file_stats for col C */ + let actual = statistics_by_partition(plan.as_ref())?; + let [actual_partition_0, actual_partition_1] = &actual[..] 
else { + panic!("should have stats for 2 partitions"); + }; + assert_eq!( + actual_partition_0.column_statistics.len(), + 1, + "should have only 1 column for the ProjectExec C@0" + ); + assert_eq!( + actual_partition_1.column_statistics.len(), + 1, + "should have only 1 column for the ProjectExec C@0" + ); + + // partition 0 == ranges_for_file_0 + let expected = ranges_for_file_0[2]; + assert_eq!(actual_partition_0.column_statistics[0], expected.into()); + + // partition 1 == ranges_for_file_1 + let expected = ranges_for_file_1[2]; + assert_eq!(actual_partition_1.column_statistics[0], expected.into()); + + Ok(()) + } + + #[test] + fn test_extracts_multiple_cols_at_once() -> Result<(), DataFusionError> { + // plan with multiple fields + let mut fields = vec![ + Field::new("_not_used_file_col", DataType::Int64, true), + Field::new("b", DataType::Int64, true), + Field::new("c", DataType::Int64, true), + ]; + let file_schema = Arc::new(arrow::datatypes::Schema::new(fields.clone())); + let plan_schema = Arc::new(arrow::datatypes::Schema::new(fields.split_off(1))); + + // File scan config uses the file schema and ALL columns in file. + let ranges_for_file_0 = vec![ + // col _not_used_file_col + SortKeyRange { + min: Some(20), + max: Some(30), + null_count: 1, + }, + // col b + SortKeyRange { + min: Some(2000), + max: Some(3000), + null_count: 1, + }, + // col c + SortKeyRange { + min: Some(200_000), + max: Some(300_000), + null_count: 1, + }, + ]; + let ranges_for_file_1 = vec![ + // col _not_used_file_col + SortKeyRange { + min: Some(30), + max: Some(40), + null_count: 1, + }, + // col b + SortKeyRange { + min: Some(1000), + max: Some(2000), + null_count: 1, + }, + // col c + SortKeyRange { + min: Some(500_000), + max: Some(700_000), + null_count: 1, + }, + ]; + + // make file config, using file schema + let lex_ordering_on_c = LexOrdering::new(vec![ + PhysicalSortExpr::new(col("c", &file_schema)?, Default::default()), + PhysicalSortExpr::new(col("b", &file_schema)?, Default::default()), + ]); + let multiple_column_key_ranges_per_file = PartitionedFilesAndRanges { + per_file: vec![ranges_for_file_0.clone(), ranges_for_file_1.clone()], + }; + #[expect(deprecated)] + let filegroups_config = file_scan_config( + &file_schema, // file_schema + vec![lex_ordering_on_c], + multiple_column_key_ranges_per_file, + ) + .with_projection(Some(vec![1, 2])); + + // make plan config, using a plan with only cols b & c + let parquet_exec = DataSourceExec::from_data_source(filegroups_config) as _; + let proj_c = col("c", &plan_schema)?; // plan_schema + let proj_b = col("b", &plan_schema)?; // plan_schema + // plan reverses the 2 cols, c then b + let plan = proj_exec( + &parquet_exec, + vec![(proj_c, "c".into()), (proj_b, "b".into())], + ); + insta::assert_snapshot!( + displayable(plan.as_ref()).indent(true), + @r" + ProjectionExec: expr=[c@1 as c, b@0 as b] + DataSourceExec: file_groups={2 groups: [[0.parquet], [1.parquet]]}, projection=[b, c], output_ordering=[c@1 ASC, b@0 ASC], file_type=parquet + ", + ); + + /* Test Case: the statistics_by_partition will still extract the proper file_stats for both cols c and b */ + let actual = statistics_by_partition(plan.as_ref())?; + let [actual_partition_0, actual_partition_1] = &actual[..] 
else { + panic!("should have stats for 2 partitions"); + }; + + // partition 0 == ranges_for_file_0 + // use cols c then b, in reverse order for the projection [2..=1] + let expected: Vec = ranges_for_file_0[1..=2] + .iter() + .rev() + .map(|sort_range| (*sort_range).into()) + .collect_vec(); + assert_eq!( + &expected, + &[ + // col c, partition 0 + ColumnStatistics { + null_count: Precision::Exact(1), + min_value: Precision::Exact(ScalarValue::Int32(Some(200_000))), + max_value: Precision::Exact(ScalarValue::Int32(Some(300_000))), + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + }, + // col b, partition 0 + ColumnStatistics { + null_count: Precision::Exact(1), + min_value: Precision::Exact(ScalarValue::Int32(Some(2000))), + max_value: Precision::Exact(ScalarValue::Int32(Some(3000))), + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + }, + ] + ); + assert_eq!(actual_partition_0.column_statistics, expected); + + // partition 1 == ranges_for_file_1 + // use cols c then b, in reverse order for the projection [2..=1] + let expected: Vec = ranges_for_file_1[1..=2] + .iter() + .rev() + .map(|sort_range| (*sort_range).into()) + .collect_vec(); + assert_eq!( + &expected, + &[ + // col c, partition 1 + ColumnStatistics { + null_count: Precision::Exact(1), + min_value: Precision::Exact(ScalarValue::Int32(Some(500_000))), + max_value: Precision::Exact(ScalarValue::Int32(Some(700_000))), + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + }, + // col b, partition 1 + ColumnStatistics { + null_count: Precision::Exact(1), + min_value: Precision::Exact(ScalarValue::Int32(Some(1000))), + max_value: Precision::Exact(ScalarValue::Int32(Some(2000))), + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + }, + ] + ); + assert_eq!(actual_partition_1.column_statistics, expected); + + Ok(()) + } + + #[test] + fn test_will_not_extract_for_non_passthru_projections() -> Result<(), DataFusionError> + { + // plan with multiple fields + let file_schema = Arc::new(arrow::datatypes::Schema::new(vec![ + Field::new("a", DataType::Int64, true), + Field::new("b", DataType::Int64, true), + Field::new("c", DataType::Int64, true), + ])); + let plan_schema = Arc::new(arrow::datatypes::Schema::new(vec![ + Field::new("b", DataType::Int64, true), // will be `a + 1 as b`` + Field::new("c", DataType::Int64, true), + ])); + + // file stats for 3 columns in file_schema + let ranges_for_file = vec![ + // col a + SortKeyRange { + min: Some(20), + max: Some(30), + null_count: 1, + }, + // col b + SortKeyRange { + min: Some(2000), + max: Some(3000), + null_count: 1, + }, + // col c + SortKeyRange { + min: Some(200_000), + max: Some(300_000), + null_count: 1, + }, + ]; + + // make scan plan + let multiple_column_key_ranges_per_file = PartitionedFilesAndRanges { + per_file: vec![ranges_for_file], + }; + #[expect(deprecated)] + let filegroups_config = file_scan_config( + &file_schema, // file_schema + vec![], + multiple_column_key_ranges_per_file, + ) + .with_projection(Some(vec![0, 1, 2])); + let scan = DataSourceExec::from_data_source(filegroups_config) as _; + + // make projection which modifies columns and aliases to an existing columns + let pass_thru_c = col("c", &file_schema)?; + let make_new = Arc::new(BinaryExpr::new( + col("a", &file_schema)?, + Operator::Minus, + lit(1), + )) as _; + let plan = proj_exec( + &scan, + vec![(make_new, "b".into()), (pass_thru_c, "c".into())], + ); + insta::assert_snapshot!( + displayable(plan.as_ref()).indent(true), + @r" + 
ProjectionExec: expr=[a@0 - 1 as b, c@2 as c] + DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[a, b, c], file_type=parquet + ", + ); + assert_eq!( + plan.schema(), + plan_schema, + "should have plan schema with the 2 final projected columns" + ); + + /* Test Case: the statistics_by_partition will still extract the proper file_stats for both cols c and b */ + let actual = statistics_by_partition(plan.as_ref())?; + + // will selectively detect which projection exprs are not a pass thru, and return absent statistics + assert_eq!( + &actual[0].column_statistics, + &[ + // a + 1 as b, partition 0 + ColumnStatistics::new_unknown(), + // col c, partition 0 + ColumnStatistics { + null_count: Precision::Exact(1), + min_value: Precision::Exact(ScalarValue::Int32(Some(200_000))), + max_value: Precision::Exact(ScalarValue::Int32(Some(300_000))), + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + }, + ] + ); + + Ok(()) } } diff --git a/datafusion/physical-optimizer/src/progressive_evaluation/statistics/fetch.rs b/datafusion/physical-optimizer/src/progressive_evaluation/statistics/fetch.rs new file mode 100644 index 000000000000..4498c16378fa --- /dev/null +++ b/datafusion/physical-optimizer/src/progressive_evaluation/statistics/fetch.rs @@ -0,0 +1,211 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use datafusion_common::{stats::Precision, Statistics}; + +use super::util::make_column_statistics_inexact; + +/// Applies fetch and skip to a num_rows. +/// +/// Fetch is the total num of rows fetched, after the skip is applied. +/// +/// This logic is extracted from [`Statistics::with_fetch`]. Refer to +/// . +pub(super) fn apply_fetch( + stats: Arc, + fetch: Option, + skip: usize, +) -> Arc { + let fetch_applied = match (stats.num_rows, fetch, skip) { + (num_rows, None, 0) => num_rows, + (Precision::Absent, None, _) => Precision::Absent, + + // absent, but can estimate + (Precision::Absent, Some(fetch), _) => Precision::Inexact(fetch), + + // if we are skipping more rows than we have => then 0 output row. 
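+ // (num_rows is Exact or Inexact in this arm: the Absent cases are handled
+ // above, so `get_value` cannot return None here)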
+ (num_rows, _, skip) + if skip >= *num_rows.get_value().expect("should not be absent") => + { + Precision::Exact(0) + } + + // have exact, return exact + (Precision::Exact(num_rows), fetch, skip) => { + let num_rows_remaining = num_rows.saturating_sub(skip); + let fetched = + std::cmp::min(num_rows_remaining, fetch.unwrap_or(num_rows_remaining)); + Precision::Exact(fetched) + } + + // have inexact, return inexact + (Precision::Inexact(num_rows), fetch, skip) => { + let num_rows_remaining = num_rows.saturating_sub(skip); + let fetched = + std::cmp::min(num_rows_remaining, fetch.unwrap_or(num_rows_remaining)); + Precision::Inexact(fetched) + } + }; + + if stats.num_rows == fetch_applied { + stats + } else { + let mut stats = Arc::unwrap_or_clone(stats); + stats.num_rows = fetch_applied; + + // since the num_rows have to change, also need to modify other stats + stats.total_byte_size = Precision::Absent; + stats.column_statistics = make_column_statistics_inexact(stats.column_statistics); + + Arc::new(stats) + } +} + +#[cfg(test)] +mod tests { + use crate::progressive_evaluation::util::test_utils::single_column_schema; + + use super::*; + + fn build_stats(num_rows: Precision) -> Arc { + let mut stats = Statistics::new_unknown(&single_column_schema()); + stats.num_rows = num_rows; + stats.into() + } + + #[test] + fn test_apply_fetch_to_num_rows() { + /* starting with num_rows = Precision::Absent */ + + // nothing is there + let stats_absent = build_stats(Precision::Absent); + let actual = apply_fetch(Arc::clone(&stats_absent), None, 0); + assert_eq!(actual.num_rows, Precision::Absent, "should still be absent"); + + // absent input, + fetch + let actual = apply_fetch(stats_absent, Some(42), 42); + assert_eq!( + actual.num_rows, + Precision::Inexact(42), + "should return an inexact(fetch) when then input rows are unknown" + ); + + /* starting with num_rows = Precision::Exact */ + + // input rows=100 + let starting_num_rows = Precision::Exact(100); + let stats = build_stats(starting_num_rows); + let actual = apply_fetch(Arc::clone(&stats), None, 0); + assert_eq!( + actual.num_rows, starting_num_rows, + "should return starting input, without fetch nor skip" + ); + + // input rows=100, + skip 25 + let actual = apply_fetch(Arc::clone(&stats), None, 25); + assert_eq!( + actual.num_rows, + Precision::Exact(75), + "should account for skipped rows" + ); + + // input rows=100, + fetch 80 => returns 80 + let actual = apply_fetch(Arc::clone(&stats), Some(80), 0); + assert_eq!( + actual.num_rows, + Precision::Exact(80), + "should account for fetched rows" + ); + + // input rows=100, + fetch 200 => returns 100 + let actual = apply_fetch(Arc::clone(&stats), Some(200), 0); + assert_eq!( + actual.num_rows, + Precision::Exact(100), + "should account for total rows, even with >fetch" + ); + + // input rows=100, + fetch 80 + skip 25 => returns 75 + let actual = apply_fetch(Arc::clone(&stats), Some(80), 25); + assert_eq!( + actual.num_rows, + Precision::Exact(75), + "should be able to return fewer rows, after skip" + ); + + // input rows=100, + fetch 80 + skip 100 => returns 0 + let actual = apply_fetch(Arc::clone(&stats), Some(80), 100); + assert_eq!( + actual.num_rows, + Precision::Exact(0), + "should have no rows to return after skip" + ); + + /* starting with num_rows = Precision::InExact */ + + // input rows=100 + let starting_num_rows = Precision::Inexact(100); + let stats = build_stats(starting_num_rows); + let actual = apply_fetch(Arc::clone(&stats), None, 0); + assert_eq!( + actual.num_rows, 
starting_num_rows, + "should return starting input, without fetch nor skip" + ); + + // input rows=100, + skip 25 + let actual = apply_fetch(Arc::clone(&stats), None, 25); + assert_eq!( + actual.num_rows, + Precision::Inexact(75), + "should account for skipped rows" + ); + + // input rows=100, + fetch 80 => returns 80 + let actual = apply_fetch(Arc::clone(&stats), Some(80), 0); + assert_eq!( + actual.num_rows, + Precision::Inexact(80), + "should account for fetched rows" + ); + + // input rows=100, + fetch 200 => returns 100 + let actual = apply_fetch(Arc::clone(&stats), Some(200), 0); + assert_eq!( + actual.num_rows, + Precision::Inexact(100), + "should account for total rows, even with >fetch" + ); + + // input rows=100, + fetch 80 + skip 25 => returns 75 + let actual = apply_fetch(Arc::clone(&stats), Some(80), 25); + assert_eq!( + actual.num_rows, + Precision::Inexact(75), + "should be able to return fewer rows, after skip" + ); + + // input rows=100, + fetch 80 + skip 100 => returns 0 + let actual = apply_fetch(Arc::clone(&stats), Some(80), 100); + assert_eq!( + actual.num_rows, + Precision::Exact(0), + "should have no rows to return after skip" + ); + } +} diff --git a/datafusion/physical-optimizer/src/progressive_evaluation/statistics/filter.rs b/datafusion/physical-optimizer/src/progressive_evaluation/statistics/filter.rs new file mode 100644 index 000000000000..5c0f0e5585ed --- /dev/null +++ b/datafusion/physical-optimizer/src/progressive_evaluation/statistics/filter.rs @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use arrow::datatypes::SchemaRef; +use datafusion_common::Statistics; +use datafusion_physical_expr::{ + analyze, intervals::utils::check_support, AnalysisContext, +}; +use datafusion_physical_plan::{filter::collect_new_statistics, PhysicalExpr}; + +/// Calculates [`Statistics`] by applying selectivity (either default, or estimated) to input statistics. +/// +/// This is a (slightly) modified from a private function in datafusion. 
Refer to: +/// +pub(super) fn apply_filter( + input_stats: Statistics, + input_schema: &SchemaRef, + predicate: &Arc, + default_selectivity: u8, +) -> datafusion_common::Result { + if !check_support(predicate, input_schema) { + let selectivity = default_selectivity as f64 / 100.0; + let mut stats = input_stats.to_inexact(); + stats.num_rows = stats.num_rows.with_estimated_selectivity(selectivity); + stats.total_byte_size = stats + .total_byte_size + .with_estimated_selectivity(selectivity); + return Ok(stats); + } + + let num_rows = input_stats.num_rows; + let total_byte_size = input_stats.total_byte_size; + let input_analysis_ctx = AnalysisContext::try_from_statistics( + input_schema, + &input_stats.column_statistics, + )?; + + let analysis_ctx = analyze(predicate, input_analysis_ctx, input_schema)?; + + // Estimate (inexact) selectivity of predicate + let selectivity = analysis_ctx.selectivity.unwrap_or(1.0); + let num_rows = num_rows.with_estimated_selectivity(selectivity); + let total_byte_size = total_byte_size.with_estimated_selectivity(selectivity); + + let column_statistics = + collect_new_statistics(&input_stats.column_statistics, analysis_ctx.boundaries); + Ok(Statistics { + num_rows, + total_byte_size, + column_statistics, + }) +} diff --git a/datafusion/physical-optimizer/src/progressive_evaluation/statistics/project_schema.rs b/datafusion/physical-optimizer/src/progressive_evaluation/statistics/project_schema.rs new file mode 100644 index 000000000000..1984c2c32bfe --- /dev/null +++ b/datafusion/physical-optimizer/src/progressive_evaluation/statistics/project_schema.rs @@ -0,0 +1,632 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{borrow::Borrow, sync::Arc}; + +use arrow::datatypes::SchemaRef; +use datafusion_common::{ + internal_datafusion_err, stats::Precision, ColumnStatistics, Result, ScalarValue, + Statistics, +}; +use datafusion_physical_plan::{expressions::Column, PhysicalExpr}; +use itertools::Itertools; + +use super::util::pretty_fmt_fields; + +/// Takes in a datasource schema and statistics, and projects the scan schema on top. +/// +/// This handles use cases where we may be either: +/// (1) scanning a subset of columns, (b) adding a __chunk_order column, (c) resolving different schema across files. +/// +/// This will error if the src_schema (e.g. file schema) has a field without a corresponding column_statistics. +pub(crate) fn project_schema_onto_datasrc_statistics( + src_statistics: &Statistics, + src_schema: &SchemaRef, + project_schema: &SchemaRef, +) -> Result> { + // (1) scanning a subset of columns (a.k.a. 
project_schema.fields) + let column_statistics = project_schema + .fields() + .iter() + .map(|field| { + let Ok(col_idx) = src_schema.index_of(field.name()) else { + // adding an empty column (e.g. different parquet files could have slightly different schema). + return Ok(ColumnStatistics { + null_count: src_statistics.num_rows, + max_value: Precision::Exact(ScalarValue::Null), + min_value: Precision::Exact(ScalarValue::Null), + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + }); + }; + + let Some(col_stats) = src_statistics.column_statistics.get(col_idx) else { + return Err(internal_datafusion_err!( + "missing column statistics for {}@{}", + field.name(), + col_idx + )); + }; + Ok(col_stats.clone()) + }) + .try_collect()?; + + Ok(Arc::new(Statistics { + num_rows: src_statistics.num_rows, + total_byte_size: src_statistics.total_byte_size, + column_statistics, + })) +} + +/// Determine the output statistics based upon a subset of columns. +/// +/// [`Statistics`] have column_statistics indexed to the plan's schema fields. +/// When the output projected schema has fewer columns than the input schema, +/// (a.k.a. it takes a subselection of columns), +/// then we also need to take a subselection of column stats. +/// +/// This does not have a concept of aliasing, as it requires a `subset_selected` of columns. +/// +/// This will error if the src_schema has a field without a corresponding src column_statistics. +pub(super) fn project_select_subset_of_column_statistics< + T: Borrow + Clone + Into> + std::fmt::Debug, +>( + src_statistics: &T, + src_schema: &SchemaRef, + subset_selected: &Vec, + project_schema: &SchemaRef, +) -> Result> { + // The src_statistics columns does not match the src schema. + if src_statistics.borrow().column_statistics.len() != src_schema.fields().len() { + return Err(internal_datafusion_err!( + "statistics do not have the same number of columns as the schema, the schema has {:?} fields whereas the statistics have {:?} columns", + src_schema.fields().len(), + src_statistics.borrow().column_statistics.len() + )); + } + + // Check if projection schema matches the len of the subset_selected. + // Do this before using the subset_selected for projecting onto the src_schema. + if project_schema.fields().len() != subset_selected.len() { + return Err(internal_datafusion_err!( + "projected schema cannot be a subset of columns, the schema has {:?} fields whereas the projection indices have {:?} columns", + project_schema.fields().len(), + subset_selected.len() + )); + } + + // Select a subset of columns, erroring if the output column does not exist in the input schema. + let column_statistics = src_schema + .project(subset_selected)? 
+ .fields() + .iter() + .zip(subset_selected) + .enumerate() + .map(|(proj_col_idx, (src_field, src_col_idx))| { + let project_field = &project_schema.fields()[proj_col_idx]; + + // aliasing not permitted + if src_field.name() != project_field.name() { + return Err(internal_datafusion_err!( + "field `{}`@{} does not exist in input schema, found fields [{}]", + project_field.name(), + src_col_idx, + pretty_fmt_fields(src_schema) + )); + } + + // confirm idx is in the col_stats + let Some(col_stats) = + src_statistics.borrow().column_statistics.get(*src_col_idx) + else { + return Err(internal_datafusion_err!( + "missing column statistics for {}@{}", + src_field.name(), + src_col_idx + )); + }; + Ok(col_stats.clone()) + }) + .try_collect()?; + + Ok(Arc::new(Statistics { + num_rows: src_statistics.borrow().num_rows, + total_byte_size: src_statistics.borrow().total_byte_size, + column_statistics, + })) +} + +/// For [`ProjectionExec`](datafusion_physical_plan::projection::ProjectionExec), +/// selectively cast schema projection on [`Statistics`]. +/// +/// This differs from other schema projections, as non-[`Column`] expressions may be present +/// or aliasing may be used. +/// In these circumstances, instead of erroring it returns unknown/absent column_statistics. +/// +/// This code is pulled from a private internal DataFusion function: +/// +pub(super) fn proj_exec_stats<'a>( + mut stats: Statistics, + exprs: impl Iterator, String)>, + projexec_schema: &SchemaRef, +) -> Arc { + let mut primitive_row_size = 0; + let mut primitive_row_size_possible = true; + let mut column_statistics = vec![]; + for (expr, _) in exprs { + let col_stats = if let Some(col) = expr.as_any().downcast_ref::() { + stats.column_statistics[col.index()].clone() + } else { + // TODO stats: estimate more statistics from expressions + // (expressions should compute their statistics themselves) + ColumnStatistics::new_unknown() + }; + column_statistics.push(col_stats); + if let Ok(data_type) = expr.data_type(projexec_schema) { + if let Some(value) = data_type.primitive_width() { + primitive_row_size += value; + continue; + } + } + primitive_row_size_possible = false; + } + + if primitive_row_size_possible { + stats.total_byte_size = + Precision::Exact(primitive_row_size).multiply(&stats.num_rows); + } + stats.column_statistics = column_statistics; + Arc::new(stats) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::datatypes::{Field, Schema}; + use datafusion_physical_plan::expressions::{col, NoOp}; + + fn build_schema(col_names: Vec<&str>) -> SchemaRef { + let fields = col_names + .into_iter() + .map(|col| Field::new(col, arrow::datatypes::DataType::Int64, false)) + .collect_vec() + .into(); + Arc::new(Schema { + fields, + metadata: Default::default(), + }) + } + + fn build_col_stats( + values: Vec<(&str, i64, i64, usize)>, + ) -> (Arc, SchemaRef) { + let (column_statistics, fields) = values.into_iter().fold( + (vec![], vec![]), + |(mut col_stats, mut fields), (col_name, min, max, nulls)| { + let col_stat = if col_name == "I am an alias" + || col_name == "I am an alias for possible idx bounds-failure" + { + ColumnStatistics::new_unknown() + } else { + ColumnStatistics { + min_value: Precision::Exact(ScalarValue::Int64(Some(min))), + max_value: Precision::Exact(ScalarValue::Int64(Some(max))), + null_count: Precision::Exact(nulls), + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + } + }; + + col_stats.push(col_stat); + fields.push(col_name); + (col_stats, fields) + }, + ); + + ( + 
Arc::new(Statistics { + column_statistics, + num_rows: Precision::Absent, + total_byte_size: Precision::Absent, + }), + build_schema(fields), + ) + } + + #[test] + fn test_retain_all_columns() { + let (src_stats, src_schema) = build_col_stats(vec![ + ("col_a", 0, 24, 1), + ("col_b", 354, 242650, 2), + ("col_c", 67, 1111, 3), + ]); + let (expected_stats, project_schema) = build_col_stats(vec![ + ("col_a", 0, 24, 1), + ("col_b", 354, 242650, 2), + ("col_c", 67, 1111, 3), + ]); + + /* Test: works for subset projection */ + let actual = project_select_subset_of_column_statistics( + &src_stats, + &src_schema, + &vec![0, 1, 2], + &project_schema, + ) + .expect("ok"); + assert_eq!( + actual, expected_stats, + "should be able to project all columns" + ); + + /* Test: works for datasource projection */ + let actual = project_schema_onto_datasrc_statistics( + &src_stats, + &src_schema, + &project_schema, + ) + .expect("ok"); + assert_eq!( + actual, expected_stats, + "should be able to project all columns" + ); + + /* Test: works for projection_exec */ + let exprs = [ + (col("col_a", &src_schema).unwrap(), "col_a".into()), + (col("col_b", &src_schema).unwrap(), "col_b".into()), + (col("col_c", &src_schema).unwrap(), "col_c".into()), + ]; + let actual = proj_exec_stats( + Arc::unwrap_or_clone(src_stats), + exprs.iter(), + &project_schema, + ); + assert_eq!( + actual, expected_stats, + "should be able to project all columns" + ); + } + + #[test] + fn test_remove_and_reorder_columns() { + let (src_stats, src_schema) = build_col_stats(vec![ + ("col_a", 0, 24, 1), + ("col_b", 354, 242650, 2), + ("col_c", 67, 1111, 3), + ]); + let (expected_stats, project_schema) = build_col_stats(vec![ + ("col_c", 67, 1111, 3), + // projection removed col_b + ("col_a", 0, 24, 1), + ]); + + /* Test: works for subset projection */ + let actual = project_select_subset_of_column_statistics( + &src_stats, + &src_schema, + &vec![2, 0], + &project_schema, + ) + .expect("ok"); + assert_eq!( + actual, expected_stats, + "should be able to remove and re-order columns" + ); + + /* Test: correctly errors for subset projection, if the subset_selected doesn't match the projection schema */ + let err = project_select_subset_of_column_statistics( + &src_stats, + &src_schema, + &vec![0, 2], // this is incorrect + &project_schema, + ) + .unwrap_err(); + assert!(err.message().contains( + "field `col_c`@0 does not exist in input schema, found fields [col_a,col_b,col_c]" + )); + + /* Test: works for datasource projection */ + let actual = project_schema_onto_datasrc_statistics( + &src_stats, + &src_schema, + &project_schema, + ) + .expect("ok"); + assert_eq!( + actual, expected_stats, + "should be able to remove and re-order columns" + ); + + /* Test: works for projection_exec */ + let exprs = [ + (col("col_c", &src_schema).unwrap(), "col_c".into()), + (col("col_a", &src_schema).unwrap(), "col_a".into()), + ]; + let actual = proj_exec_stats( + Arc::unwrap_or_clone(src_stats), + exprs.iter(), + &project_schema, + ); + assert_eq!( + actual, expected_stats, + "should be able to remove and re-order columns" + ); + } + + #[test] + fn test_projection_with_aliases() { + /* Test: can handle schema with aliases */ + let (src_stats, src_schema) = + build_col_stats(vec![("col_a", 0, 24, 1), ("col_b", 354, 242650, 2)]); + let (expected_stats, project_schema) = build_col_stats(vec![ + ("col_a", 0, 24, 1), + ("I am an alias", 0, 0, 0), + ("I am an alias for possible idx bounds-failure", 0, 0, 0), + ]); + + /* Test: errors for subset projection, when number 
of subset_selected don't match the number of projected fields */ + let err = project_select_subset_of_column_statistics( + &src_stats, + &src_schema, + &vec![0], + &project_schema, + ) + .unwrap_err(); + assert!(err.message().contains( + "projected schema cannot be a subset of columns, the schema has 3 fields whereas the projection indices have 1 columns" + )); + + /* Test: errors for subset projection, when field names in the src vs projected schema do not match */ + let err = project_select_subset_of_column_statistics( + &src_stats, + &src_schema, + &vec![0, 1, 1], + &project_schema, + ) + .unwrap_err(); + assert!(err.message().contains( + "field `I am an alias`@1 does not exist in input schema, found fields [col_a,col_b]." + )); + + /* Test: works for datasource projection */ + // ** SPECIAL FOR DATASRC = fills in nulls for aliases missing in filegroup schema ** + let actual = project_schema_onto_datasrc_statistics( + &src_stats, + &src_schema, + &project_schema, + ) + .expect("ok"); + let datasrc_null_columns = ColumnStatistics { + null_count: expected_stats.num_rows, + max_value: Precision::Exact(ScalarValue::Null), + min_value: Precision::Exact(ScalarValue::Null), + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + }; + let expected_stats_with_nulls_at_datasrc = Arc::new(Statistics { + num_rows: expected_stats.num_rows, + total_byte_size: expected_stats.total_byte_size, + column_statistics: vec![ + expected_stats.column_statistics[0].clone(), + datasrc_null_columns.clone(), + datasrc_null_columns.clone(), + ], + }); + assert_eq!( + actual, expected_stats_with_nulls_at_datasrc, + "should be able to handle schema with aliases" + ); + assert_eq!( + actual.column_statistics[1], datasrc_null_columns, + "aliased column should have NULL min/max" + ); + + /* Test: works for projection_exec */ + let exprs = [ + (col("col_a", &src_schema).unwrap(), "col_a".into()), + (Arc::new(NoOp::new()), "I am an alias".into()), + ( + Arc::new(NoOp::new()), + "I am an alias for possible idx bounds-failure".into(), + ), + ]; + let actual = proj_exec_stats( + Arc::unwrap_or_clone(src_stats), + exprs.iter(), + &project_schema, + ); + assert_eq!( + actual, expected_stats, + "should be able to handle schema with aliases" + ); + assert_eq!( + actual.column_statistics[1], + ColumnStatistics::new_unknown(), + "aliased column should have unknown statistics" + ); + } + + #[test] + fn test_src_statistics_mismatches_src_schema() { + /* Test: will error if src-statistics and src-schema have a different number of columns, for a non-scan node */ + let (src_stats_2_cols, _) = build_col_stats(vec![ + ("col_a", 0, 24, 1), + ("make an extra column stats", 0, 24, 1), + ]); + let (_, src_schema_1_col) = build_col_stats(vec![("col_a", 0, 24, 1)]); + let (_expected_stats, project_schema) = + build_col_stats(vec![("col_a", 0, 24, 1)]); + let expected_error = + "statistics do not have the same number of columns as the schema"; + let actual_err = project_select_subset_of_column_statistics( + &src_stats_2_cols, + &src_schema_1_col, + &vec![0, 1], + &project_schema, + ) + .unwrap_err(); + assert!(actual_err.message().contains(expected_error)); + } + + /// col_a (0), col_a (1) --> col_a (0), col_a (1) + #[test] + fn test_projection_with_2_fields_having_same_name_and_same_projection_order() { + let (src_stats, src_schema) = build_col_stats(vec![ + ("col_a", 0, 24, 1), // first field with smaller stats + ("col_a", 10, 100, 42), // second field with larger stats + ]); + let (expected_stats, project_schema) = + 
build_col_stats(vec![("col_a", 0, 24, 1), ("col_a", 10, 100, 42)]); + + /* Test: works for subset projection */ + let actual = project_select_subset_of_column_statistics( + &src_stats, + &src_schema, + &vec![0, 1], + &project_schema, + ) + .expect("ok"); + assert_eq!( + actual, expected_stats, + "should be able to handle schema with same-named fields" + ); + assert_eq!( + actual.column_statistics[0].null_count.get_value(), + Some(&1), + "smaller stats col_a should come first" + ); + assert_eq!( + actual.column_statistics[1].null_count.get_value(), + Some(&42), + "larger stats col_a should come second" + ); + + /* Test: DOES NOT WORK for datasource projection, we take the first matching field_name in the schema */ + let actual = project_schema_onto_datasrc_statistics( + &src_stats, + &src_schema, + &project_schema, + ) + .expect("ok"); + assert_eq!( + actual.column_statistics[0].null_count.get_value(), + Some(&1), + "smaller stats col_a should come first" + ); + assert_eq!( + actual.column_statistics[1].null_count.get_value(), + Some(&1), + "** should take the first matching, which is again the smaller stats **" + ); + + /* Test: works for projection_exec */ + let exprs = [ + (Arc::new(Column::new("col_a", 0)) as _, "col_a_idx0".into()), + (Arc::new(Column::new("col_a", 1)) as _, "col_a_idx1".into()), + ]; + let actual = proj_exec_stats( + Arc::unwrap_or_clone(src_stats), + exprs.iter(), + &project_schema, + ); + assert_eq!( + actual, expected_stats, + "should be able to handle schema with same-named fields" + ); + assert_eq!( + actual.column_statistics[0].null_count.get_value(), + Some(&1), + "smaller stats col_a should come first" + ); + assert_eq!( + actual.column_statistics[1].null_count.get_value(), + Some(&42), + "larger stats col_a should come second" + ); + } + + /// col_a (0), col_a (1) --> col_a (1), col_a (0) + #[test] + fn test_projection_with_2_fields_having_same_name_and_inverse_projection_order() { + let (src_stats, src_schema) = build_col_stats(vec![ + ("col_a", 0, 24, 1), // first field with smaller stats + ("col_a", 10, 100, 42), // second field with larger stats + ]); + // project as [1,0] a.k.a. 
reverse the columns + let (expected_stats, project_schema) = + build_col_stats(vec![("col_a", 10, 100, 42), ("col_a", 0, 24, 1)]); + + /* Test: works for subset projection */ + let actual = project_select_subset_of_column_statistics( + &src_stats, + &src_schema, + &vec![1, 0], // reverse the columns + &project_schema, + ) + .expect("ok"); + assert_eq!( + actual, expected_stats, + "should be able to handle schema with same-named fields, reversed ordering" + ); + assert_eq!( + actual.column_statistics[0].null_count.get_value(), + Some(&42), + "bigger stats col_a should come first" + ); + + /* Test: DOES NOT WORK for datasource projection, we take the first matching field_name in the schema */ + let actual = project_schema_onto_datasrc_statistics( + &src_stats, + &src_schema, + &project_schema, + ) + .expect("ok"); + assert_eq!( + actual.column_statistics[0].null_count.get_value(), + Some(&1), + "smaller stats col_a should come first" + ); + assert_eq!( + actual.column_statistics[1].null_count.get_value(), + Some(&1), + "** should take the first matching, which is again the smaller stats **" + ); + + /* Test: NOT POSSIBLE for chunk order projection, since it's a pass-thru with no re-ordering of columns */ + + /* Test: works for projection_exec */ + let exprs = [ + (Arc::new(Column::new("col_a", 1)) as _, "col_a_idx1".into()), + (Arc::new(Column::new("col_a", 0)) as _, "col_a_idx0".into()), + ]; + let actual = proj_exec_stats( + Arc::unwrap_or_clone(src_stats), + exprs.iter(), + &project_schema, + ); + assert_eq!( + actual, expected_stats, + "should be able to handle schema with same-named fields, reversed ordering" + ); + assert_eq!( + actual.column_statistics[0].null_count.get_value(), + Some(&42), + "bigger stats col_a should come first" + ); + } +} diff --git a/datafusion/physical-optimizer/src/progressive_evaluation/statistics/util.rs b/datafusion/physical-optimizer/src/progressive_evaluation/statistics/util.rs new file mode 100644 index 000000000000..e382c2428e5f --- /dev/null +++ b/datafusion/physical-optimizer/src/progressive_evaluation/statistics/util.rs @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
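+
+ //! Shared helpers for per-partition statistics: merging [`Statistics`],
+ //! downgrading exact column statistics to inexact, and small formatting utilities.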
+ +use std::{borrow::Borrow, fmt::Debug, sync::Arc}; + +use arrow::datatypes::SchemaRef; +use datafusion_common::{stats::Precision, ColumnStatistics, Statistics}; +use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties}; +use itertools::Itertools; + +pub(super) fn partition_count(plan: &dyn ExecutionPlan) -> usize { + plan.output_partitioning().partition_count() +} + +pub(super) fn pretty_fmt_fields(schema: &SchemaRef) -> String { + schema.fields().iter().map(|field| field.name()).join(",") +} + +pub(super) fn make_column_statistics_inexact( + column_statistics: Vec, +) -> Vec { + column_statistics + .into_iter() + .map( + |ColumnStatistics { + null_count, + min_value, + max_value, + sum_value, + distinct_count, + }| { + ColumnStatistics { + null_count: make_inexact_or_keep_absent(null_count), + min_value: make_inexact_or_keep_absent(min_value), + max_value: make_inexact_or_keep_absent(max_value), + sum_value: make_inexact_or_keep_absent(sum_value), + distinct_count: make_inexact_or_keep_absent(distinct_count), + } + }, + ) + .collect_vec() +} + +fn make_inexact_or_keep_absent( + val: Precision, +) -> Precision { + match val { + Precision::Exact(val) => Precision::Inexact(val), + _ => val, + } +} + +/// Merge a collection of [`Statistics`] into a single stat. +/// +/// This takes statistics references, which may or may not be arc'ed. +pub(super) fn merge_stats_collection< + T: Borrow + Clone + Into>, +>( + mut stats: impl Iterator, + target_schema: &SchemaRef, +) -> Statistics { + let Some(start) = stats.next() else { + // stats is empty + return Statistics::new_unknown(target_schema); + }; + stats.fold(start.borrow().clone(), |a, b| merge_stats(a, b.borrow())) +} + +/// Merge together two [`Statistics`]. +fn merge_stats(a: Statistics, b: &Statistics) -> Statistics { + assert_eq!( + a.column_statistics.len(), + b.column_statistics.len(), + "failed to merge statistics, due to different column schema" + ); + + Statistics { + num_rows: a.num_rows.add(&b.num_rows), + total_byte_size: a.total_byte_size.add(&b.total_byte_size), + column_statistics: a + .column_statistics + .into_iter() + .zip(b.column_statistics.iter()) + .map(|(a, b)| merge_col_stats(a, b)) + .collect_vec(), + } +} + +fn merge_col_stats(a: ColumnStatistics, b: &ColumnStatistics) -> ColumnStatistics { + ColumnStatistics { + null_count: a.null_count.add(&b.null_count), + min_value: a.min_value.min(&b.min_value), + max_value: a.max_value.max(&b.max_value), + sum_value: a.sum_value.add(&b.sum_value), + distinct_count: Precision::Absent, + } +} diff --git a/datafusion/physical-optimizer/src/progressive_evaluation/util.rs b/datafusion/physical-optimizer/src/progressive_evaluation/util.rs index c6b76e6e77e9..0f53dc29b89d 100644 --- a/datafusion/physical-optimizer/src/progressive_evaluation/util.rs +++ b/datafusion/physical-optimizer/src/progressive_evaluation/util.rs @@ -165,7 +165,264 @@ fn transform_parquet_exec_single_file_each_group( fn build_file_group_with_stats(file: &PartitionedFile) -> FileGroup { let mut group = FileGroup::new(vec![file.clone()]); if let Some(stats) = &file.statistics { - group = group.with_statistics(Arc::unwrap_or_clone(stats.to_owned())) + group = group.with_statistics(Arc::clone(stats)) } group } + +#[cfg(test)] +pub(crate) mod test_utils { + use arrow::datatypes::DataType; + use arrow::datatypes::Field; + use arrow::datatypes::SchemaRef; + use datafusion_common::ScalarValue; + use datafusion_common::{stats::Precision, Statistics}; + use 
datafusion_datasource::file_scan_config::FileScanConfig; + use datafusion_datasource::file_scan_config::FileScanConfigBuilder; + use datafusion_datasource::source::DataSourceExec; + use datafusion_datasource::PartitionedFile; + use datafusion_datasource_parquet::source::ParquetSource; + use datafusion_execution::object_store::ObjectStoreUrl; + use datafusion_physical_expr::LexOrdering; + use datafusion_physical_plan::{ + coalesce_batches::CoalesceBatchesExec, + filter::FilterExec, + joins::CrossJoinExec, + limit::LocalLimitExec, + projection::ProjectionExec, + repartition::RepartitionExec, + sorts::{sort::SortExec, sort_preserving_merge::SortPreservingMergeExec}, + union::UnionExec, + Partitioning, PhysicalExpr, + }; + use datafusion_physical_plan::{ColumnStatistics, ExecutionPlan}; + + use itertools::Itertools; + + use std::{ + fmt::{self, Display, Formatter}, + sync::Arc, + }; + + /// Return a schema with a single column `a` of type int64. + pub fn single_column_schema() -> SchemaRef { + Arc::new(arrow::datatypes::Schema::new(vec![Field::new( + "a", + DataType::Int64, + true, + )])) + } + + #[derive(Debug, Copy, Clone)] + pub struct SortKeyRange { + pub min: Option, + pub max: Option, + pub null_count: usize, + } + + impl From for ColumnStatistics { + fn from(val: SortKeyRange) -> Self { + Self { + null_count: Precision::Exact(val.null_count), + max_value: Precision::Exact(ScalarValue::Int32(val.max)), + min_value: Precision::Exact(ScalarValue::Int32(val.min)), + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + } + } + } + + impl Display for SortKeyRange { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "({:?})->({:?})", self.min, self.max)?; + if self.null_count > 0 { + write!(f, " null_count={}", self.null_count)?; + } + Ok(()) + } + } + + /// Create a single parquet, with a given ordering and using the statistics from the [`SortKeyRange`] + pub fn parquet_exec_with_sort_with_statistics( + output_ordering: Vec, + key_ranges: &[&SortKeyRange], + ) -> Arc { + parquet_exec_with_sort_with_statistics_and_schema( + &single_column_schema(), + output_ordering, + key_ranges, + ) + } + + pub type RangeForMultipleColumns = Vec; // vec![col0, col1, col2] + pub struct PartitionedFilesAndRanges { + pub per_file: Vec, + } + + /// Create a single parquet exec, with multiple parquet, with a given ordering and using the statistics from the [`SortKeyRange`]. + /// Assumes a single column schema. + pub fn parquet_exec_with_sort_with_statistics_and_schema( + schema: &SchemaRef, + output_ordering: Vec, + key_ranges_for_single_column_multiple_files: &[&SortKeyRange], // VecPerFile + ) -> Arc { + let per_file_ranges = PartitionedFilesAndRanges { + per_file: key_ranges_for_single_column_multiple_files + .iter() + .map(|single_col_range_per_file| vec![**single_col_range_per_file]) + .collect_vec(), + }; + + let file_scan_config = file_scan_config(schema, output_ordering, per_file_ranges); + + DataSourceExec::from_data_source(file_scan_config) + } + + /// Create a file scan config with a given file [`SchemaRef`], ordering, + /// and [`ColumnStatistics`] for multiple columns. 
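+ /// Each entry in [`PartitionedFilesAndRanges::per_file`] becomes one `PartitionedFile`
+ /// with per-column statistics; cumulative statistics for the whole scan are also
+ /// attached to the resulting config.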
+ pub fn file_scan_config( + schema: &SchemaRef, + output_ordering: Vec, + multiple_column_key_ranges_per_file: PartitionedFilesAndRanges, + ) -> FileScanConfig { + let PartitionedFilesAndRanges { per_file } = multiple_column_key_ranges_per_file; + let mut statistics = Statistics::new_unknown(schema); + let mut file_groups = Vec::with_capacity(per_file.len()); + + // cummulative statistics for the entire parquet exec, per sort key + let num_sort_keys = per_file[0].len(); + let mut cum_null_count = vec![0; num_sort_keys]; + let mut cum_min = vec![None; num_sort_keys]; + let mut cum_max = vec![None; num_sort_keys]; + + // iterate thru files, creating the PartitionedFile and the associated statistics + for (file_idx, multiple_column_key_ranges_per_file) in + per_file.into_iter().enumerate() + { + // gather stats for all columns + let mut per_file_col_stats = Vec::with_capacity(num_sort_keys); + for (col_idx, key_range) in + multiple_column_key_ranges_per_file.into_iter().enumerate() + { + let SortKeyRange { + min, + max, + null_count, + } = key_range; + + // update per file stats + per_file_col_stats.push(ColumnStatistics { + null_count: Precision::Exact(null_count), + min_value: Precision::Exact(ScalarValue::Int32(min)), + max_value: Precision::Exact(ScalarValue::Int32(max)), + ..Default::default() + }); + + // update cummulative statistics for entire parquet exec + cum_min[col_idx] = match (cum_min[col_idx], min) { + (None, x) => x, + (x, None) => x, + (Some(a), Some(b)) => Some(std::cmp::min(a, b)), + }; + cum_max[col_idx] = match (cum_max[col_idx], max) { + (None, x) => x, + (x, None) => x, + (Some(a), Some(b)) => Some(std::cmp::max(a, b)), + }; + cum_null_count[col_idx] += null_count; + } + + // Create single file with statistics. + let mut file = PartitionedFile::new(format!("{file_idx}.parquet"), 100); + file.statistics = Some(Arc::new(Statistics { + num_rows: Precision::Absent, + total_byte_size: Precision::Absent, + column_statistics: per_file_col_stats, + })); + file_groups.push(vec![file].into()); + } + + // add stats, for the whole parquet exec, for all columns + for col_idx in 0..num_sort_keys { + statistics.column_statistics[col_idx] = ColumnStatistics { + null_count: Precision::Exact(cum_null_count[col_idx]), + min_value: Precision::Exact(ScalarValue::Int32(cum_min[col_idx])), + max_value: Precision::Exact(ScalarValue::Int32(cum_max[col_idx])), + ..Default::default() + }; + } + + FileScanConfigBuilder::new( + ObjectStoreUrl::parse("test:///").unwrap(), + Arc::clone(schema), + Arc::new(ParquetSource::new(Default::default())), + ) + .with_file_groups(file_groups) + .with_output_ordering(output_ordering) + .with_statistics(statistics) + .build() + } + + pub fn union_exec(input: Vec>) -> Arc { + Arc::new(UnionExec::new(input)) + } + + pub fn sort_exec( + sort_exprs: &LexOrdering, + input: &Arc, + preserve_partitioning: bool, + ) -> Arc { + let new_sort = SortExec::new(sort_exprs.clone(), Arc::clone(input)) + .with_preserve_partitioning(preserve_partitioning); + Arc::new(new_sort) + } + + pub fn coalesce_exec(input: &Arc) -> Arc { + Arc::new(CoalesceBatchesExec::new(Arc::clone(input), 10)) + } + + pub fn filter_exec( + input: &Arc, + predicate: Arc, + ) -> Arc { + Arc::new(FilterExec::try_new(predicate, Arc::clone(input)).unwrap()) + } + + pub fn limit_exec( + input: &Arc, + fetch: usize, + ) -> Arc { + Arc::new(LocalLimitExec::new(Arc::clone(input), fetch)) + } + + pub fn proj_exec( + input: &Arc, + projects: Vec<(Arc, String)>, + ) -> Arc { + 
Arc::new(ProjectionExec::try_new(projects, Arc::clone(input)).unwrap()) + } + + pub fn spm_exec( + input: &Arc, + sort_exprs: &LexOrdering, + ) -> Arc { + Arc::new(SortPreservingMergeExec::new( + sort_exprs.clone(), + Arc::clone(input), + )) + } + + pub fn repartition_exec( + input: &Arc, + partitioning: Partitioning, + ) -> Arc { + Arc::new(RepartitionExec::try_new(Arc::clone(input), partitioning).unwrap()) + } + + pub fn crossjoin_exec( + left: &Arc, + right: &Arc, + ) -> Arc { + Arc::new(CrossJoinExec::new(Arc::clone(left), Arc::clone(right))) + } +} diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index b99bef38f324..acff6234a7e2 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -591,7 +591,7 @@ impl EmbeddedProjection for FilterExec { /// converted to closed bounds. If a lower/upper bound is initially open, it /// is adjusted by using the next/previous value for its data type to convert /// it into a closed bound. -fn collect_new_statistics( +pub fn collect_new_statistics( input_column_stats: &[ColumnStatistics], analysis_boundaries: Vec, ) -> Vec {