@@ -1433,7 +1433,7 @@ pub(crate) mod tests {
1433
1433
use datafusion_physical_optimizer:: output_requirements:: OutputRequirements ;
1434
1434
1435
1435
use arrow:: datatypes:: { DataType , Field , Schema , SchemaRef } ;
1436
- use datafusion_common:: ScalarValue ;
1436
+ use datafusion_common:: { ColumnStatistics , ScalarValue } ;
1437
1437
use datafusion_expr:: { AggregateUDF , Operator } ;
1438
1438
use datafusion_physical_expr:: expressions:: { BinaryExpr , Literal } ;
1439
1439
use datafusion_physical_expr:: {
@@ -1550,6 +1550,25 @@ pub(crate) mod tests {
1550
1550
] ) )
1551
1551
}
1552
1552
1553
+ fn int64_stats ( ) -> ColumnStatistics {
1554
+ ColumnStatistics {
1555
+ null_count : Precision :: Absent ,
1556
+ max_value : Precision :: Exact ( 1_000_000 . into ( ) ) ,
1557
+ min_value : Precision :: Exact ( 0 . into ( ) ) ,
1558
+ distinct_count : Precision :: Absent ,
1559
+ }
1560
+ }
1561
+
1562
+ fn column_stats ( ) -> Vec < ColumnStatistics > {
1563
+ vec ! [
1564
+ int64_stats( ) , // a
1565
+ int64_stats( ) , // b
1566
+ int64_stats( ) , // c
1567
+ ColumnStatistics :: default ( ) ,
1568
+ ColumnStatistics :: default ( ) ,
1569
+ ]
1570
+ }
1571
+
1553
1572
fn parquet_exec ( ) -> Arc < ParquetExec > {
1554
1573
parquet_exec_with_sort ( vec ! [ ] )
1555
1574
}
@@ -1566,6 +1585,20 @@ pub(crate) mod tests {
1566
1585
. build_arc ( )
1567
1586
}
1568
1587
1588
+ fn parquet_exec_with_stats ( ) -> Arc < ParquetExec > {
1589
+ let mut statistics = Statistics :: new_unknown ( & schema ( ) ) ;
1590
+ statistics. num_rows = Precision :: Inexact ( 10 ) ;
1591
+ statistics. column_statistics = column_stats ( ) ;
1592
+
1593
+ let config =
1594
+ FileScanConfig :: new ( ObjectStoreUrl :: parse ( "test:///" ) . unwrap ( ) , schema ( ) )
1595
+ . with_file ( PartitionedFile :: new ( "x" . to_string ( ) , 10000 ) )
1596
+ . with_statistics ( statistics) ;
1597
+ assert_eq ! ( config. statistics. num_rows, Precision :: Inexact ( 10 ) ) ;
1598
+
1599
+ ParquetExec :: builder ( config) . build_arc ( )
1600
+ }
1601
+
1569
1602
fn parquet_exec_multiple ( ) -> Arc < ParquetExec > {
1570
1603
parquet_exec_multiple_sorted ( vec ! [ ] )
1571
1604
}
@@ -1870,6 +1903,10 @@ pub(crate) mod tests {
1870
1903
} ;
1871
1904
1872
1905
( $EXPECTED_LINES: expr, $PLAN: expr, $FIRST_ENFORCE_DIST: expr, $PREFER_EXISTING_SORT: expr, $TARGET_PARTITIONS: expr, $REPARTITION_FILE_SCANS: expr, $REPARTITION_FILE_MIN_SIZE: expr, $PREFER_EXISTING_UNION: expr) => {
1906
+ assert_optimized!( $EXPECTED_LINES, $PLAN, $FIRST_ENFORCE_DIST, $PREFER_EXISTING_SORT, $TARGET_PARTITIONS, $REPARTITION_FILE_SCANS, $REPARTITION_FILE_MIN_SIZE, $PREFER_EXISTING_UNION, 1 ) ;
1907
+ } ;
1908
+
1909
+ ( $EXPECTED_LINES: expr, $PLAN: expr, $FIRST_ENFORCE_DIST: expr, $PREFER_EXISTING_SORT: expr, $TARGET_PARTITIONS: expr, $REPARTITION_FILE_SCANS: expr, $REPARTITION_FILE_MIN_SIZE: expr, $PREFER_EXISTING_UNION: expr, $BATCH_SIZE: expr) => {
1873
1910
let expected_lines: Vec <& str > = $EXPECTED_LINES. iter( ) . map( |s| * s) . collect( ) ;
1874
1911
1875
1912
let mut config = ConfigOptions :: new( ) ;
@@ -1879,7 +1916,12 @@ pub(crate) mod tests {
1879
1916
config. optimizer. prefer_existing_sort = $PREFER_EXISTING_SORT;
1880
1917
config. optimizer. prefer_existing_union = $PREFER_EXISTING_UNION;
1881
1918
// Use a small batch size, to trigger RoundRobin in tests
1882
- config. execution. batch_size = 1 ;
1919
+ config. execution. batch_size = $BATCH_SIZE;
1920
+
1921
+ // This triggers the use of column statisticals estimates in the repartition calculation.
1922
+ // Without this setting, the testing of `get_repartition_requirement_status` misses
1923
+ // several branches.
1924
+ config. execution. use_row_number_estimates_to_optimize_partitioning = true ;
1883
1925
1884
1926
// NOTE: These tests verify the joint `EnforceDistribution` + `EnforceSorting` cascade
1885
1927
// because they were written prior to the separation of `BasicEnforcement` into
@@ -3548,9 +3590,25 @@ pub(crate) mod tests {
3548
3590
}
3549
3591
}
3550
3592
3593
+ macro_rules! assert_optimized_without_forced_roundrobin {
3594
+ ( $EXPECTED_LINES: expr, $PLAN: expr, $FIRST_ENFORCE_DIST: expr) => {
3595
+ assert_optimized!(
3596
+ $EXPECTED_LINES,
3597
+ $PLAN,
3598
+ $FIRST_ENFORCE_DIST,
3599
+ false ,
3600
+ 10 ,
3601
+ false ,
3602
+ 1024 ,
3603
+ false ,
3604
+ 100
3605
+ ) ;
3606
+ } ;
3607
+ }
3608
+
3551
3609
#[ test]
3552
3610
fn repartitions_for_extension_node_with_aggregate_after_union ( ) -> Result < ( ) > {
3553
- let union = union_exec ( vec ! [ parquet_exec ( ) ; 2 ] ) ;
3611
+ let union = union_exec ( vec ! [ parquet_exec_with_stats ( ) ; 2 ] ) ;
3554
3612
let plan =
3555
3613
aggregate_exec_with_alias ( union, vec ! [ ( "a" . to_string( ) , "a1" . to_string( ) ) ] ) ;
3556
3614
let plan: Arc < dyn ExecutionPlan > = Arc :: new ( MyExtensionNode :: new ( plan) ) ;
@@ -3563,23 +3621,22 @@ pub(crate) mod tests {
3563
3621
"MyExtensionNode" ,
3564
3622
"CoalescePartitionsExec" ,
3565
3623
"AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[]" ,
3566
- "RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10 " ,
3624
+ "RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=2 " ,
3567
3625
"AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[]" ,
3568
- "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2" ,
3569
3626
"UnionExec" ,
3570
3627
"ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]" ,
3571
3628
"ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]" ,
3572
3629
] ;
3573
3630
3574
- assert_optimized ! ( expected, plan. clone( ) , true ) ;
3575
- assert_optimized ! ( expected, plan. clone( ) , false ) ;
3631
+ assert_optimized_without_forced_roundrobin ! ( expected, plan. clone( ) , true ) ;
3632
+ assert_optimized_without_forced_roundrobin ! ( expected, plan. clone( ) , false ) ;
3576
3633
3577
3634
Ok ( ( ) )
3578
3635
}
3579
3636
3580
3637
#[ test]
3581
3638
fn repartitions_for_aggregate_after_sorted_union ( ) -> Result < ( ) > {
3582
- let union = union_exec ( vec ! [ parquet_exec ( ) ; 2 ] ) ;
3639
+ let union = union_exec ( vec ! [ parquet_exec_with_stats ( ) ; 2 ] ) ;
3583
3640
let schema = schema ( ) ;
3584
3641
let sort_key = LexOrdering :: new ( vec ! [ PhysicalSortExpr {
3585
3642
expr: col( "a" , & schema) . unwrap( ) ,
@@ -3593,45 +3650,48 @@ pub(crate) mod tests {
3593
3650
let checker = check_plan_sanity ( plan. clone ( ) , & Default :: default ( ) ) ;
3594
3651
assert ! ( checker. is_ok( ) ) ;
3595
3652
3596
- // it still repartitions
3653
+ // does not repartition on the first run
3597
3654
let expected_after_first_run = & [
3598
3655
"AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[], ordering_mode=Sorted" ,
3599
- "SortExec: expr=[a1@0 ASC NULLS LAST], preserve_partitioning=[true]" ,
3600
- "RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10" ,
3601
3656
"AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[], ordering_mode=Sorted" ,
3602
- "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1" ,
3603
3657
"SortPreservingMergeExec: [a@0 ASC]" ,
3604
3658
"UnionExec" ,
3605
3659
"SortExec: expr=[a@0 ASC], preserve_partitioning=[false]" ,
3606
3660
"ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]" ,
3607
3661
"SortExec: expr=[a@0 ASC], preserve_partitioning=[false]" ,
3608
3662
"ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]" ,
3609
3663
] ;
3610
- assert_optimized ! ( expected_after_first_run, plan. clone( ) , true ) ;
3664
+ assert_optimized_without_forced_roundrobin ! (
3665
+ expected_after_first_run,
3666
+ plan. clone( ) ,
3667
+ true
3668
+ ) ;
3611
3669
3670
+ // does repartition on the second run
3612
3671
let expected_after_second_run = & [
3613
3672
"AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[], ordering_mode=Sorted" ,
3614
3673
"SortExec: expr=[a1@0 ASC NULLS LAST], preserve_partitioning=[true]" ,
3615
- "RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10 " ,
3674
+ "RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=2 " ,
3616
3675
"SortExec: expr=[a1@0 ASC NULLS LAST], preserve_partitioning=[true]" ,
3617
3676
"AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[], ordering_mode=Sorted" ,
3618
- "SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]" , // adds another sort
3619
- "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2" ,
3620
- "SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]" , // removes the SPM
3621
3677
"UnionExec" ,
3622
3678
"SortExec: expr=[a@0 ASC], preserve_partitioning=[false]" ,
3623
3679
"ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]" ,
3624
3680
"SortExec: expr=[a@0 ASC], preserve_partitioning=[false]" ,
3625
3681
"ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]" ,
3626
3682
] ;
3627
- assert_optimized ! ( expected_after_second_run, plan. clone( ) , false ) ;
3683
+ assert_optimized_without_forced_roundrobin ! (
3684
+ expected_after_second_run,
3685
+ plan. clone( ) ,
3686
+ false
3687
+ ) ;
3628
3688
3629
3689
Ok ( ( ) )
3630
3690
}
3631
3691
3632
3692
#[ test]
3633
- fn repartition_for_aggregate_after_sorted_union_projection ( ) -> Result < ( ) > {
3634
- let union = union_exec ( vec ! [ parquet_exec ( ) ; 2 ] ) ;
3693
+ fn does_not_repartition_for_aggregate_after_sorted_union_projection ( ) -> Result < ( ) > {
3694
+ let union = union_exec ( vec ! [ parquet_exec_with_stats ( ) ; 2 ] ) ;
3635
3695
let projection = projection_exec_with_alias (
3636
3696
union,
3637
3697
vec ! [
@@ -3652,45 +3712,47 @@ pub(crate) mod tests {
3652
3712
let checker = check_plan_sanity ( plan. clone ( ) , & Default :: default ( ) ) ;
3653
3713
assert ! ( checker. is_ok( ) ) ;
3654
3714
3655
- // it still repartitions
3715
+ // does not repartition on the first run
3656
3716
let expected_after_first_run = & [
3657
3717
"AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[], ordering_mode=Sorted" ,
3658
- "SortExec: expr=[a1@0 ASC NULLS LAST], preserve_partitioning=[true]" ,
3659
- "RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10" ,
3660
3718
"AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[], ordering_mode=Sorted" ,
3661
- "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1" ,
3662
3719
"SortPreservingMergeExec: [a@0 ASC]" ,
3663
3720
"SortExec: expr=[a@0 ASC], preserve_partitioning=[true]" ,
3664
3721
"ProjectionExec: expr=[a@0 as a, b@1 as value]" ,
3665
3722
"UnionExec" ,
3666
3723
"ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]" ,
3667
3724
"ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]" ,
3668
3725
] ;
3669
- assert_optimized ! ( expected_after_first_run, plan. clone( ) , true ) ;
3726
+ assert_optimized_without_forced_roundrobin ! (
3727
+ expected_after_first_run,
3728
+ plan. clone( ) ,
3729
+ true
3730
+ ) ;
3670
3731
3732
+ // does not repartition on the second run
3671
3733
let expected_after_second_run = & [
3672
3734
"AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[], ordering_mode=Sorted" ,
3673
- "SortExec: expr=[a1@0 ASC NULLS LAST], preserve_partitioning=[true]" ,
3674
- "RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10" ,
3675
- "SortExec: expr=[a1@0 ASC NULLS LAST], preserve_partitioning=[true]" , // adds another sort
3676
3735
"AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[], ordering_mode=Sorted" ,
3677
- "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1" ,
3678
- // removes the SPM
3679
3736
"SortExec: expr=[a@0 ASC], preserve_partitioning=[false]" ,
3680
- "CoalescePartitionsExec" , // adds the coalesce
3737
+ "CoalescePartitionsExec" ,
3681
3738
"ProjectionExec: expr=[a@0 as a, b@1 as value]" ,
3682
3739
"UnionExec" ,
3683
3740
"ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]" ,
3684
3741
"ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]" ,
3685
3742
] ;
3686
- assert_optimized ! ( expected_after_second_run, plan. clone( ) , false ) ;
3743
+ assert_optimized_without_forced_roundrobin ! (
3744
+ expected_after_second_run,
3745
+ plan. clone( ) ,
3746
+ false
3747
+ ) ;
3687
3748
3688
3749
Ok ( ( ) )
3689
3750
}
3690
3751
3691
3752
#[ test]
3692
- fn repartition_for_aggregate_sum_after_sorted_union_projection ( ) -> Result < ( ) > {
3693
- let union = union_exec ( vec ! [ parquet_exec( ) ; 2 ] ) ;
3753
+ fn does_not_repartition_for_aggregate_sum_after_sorted_union_projection ( ) -> Result < ( ) >
3754
+ {
3755
+ let union = union_exec ( vec ! [ parquet_exec_with_stats( ) ; 2 ] ) ;
3694
3756
let projection = projection_exec_with_alias (
3695
3757
union,
3696
3758
vec ! [
@@ -3715,41 +3777,41 @@ pub(crate) mod tests {
3715
3777
let checker = check_plan_sanity ( plan. clone ( ) , & Default :: default ( ) ) ;
3716
3778
assert ! ( checker. is_ok( ) ) ;
3717
3779
3718
- // it still repartitions
3780
+ // does not repartition on the first run
3719
3781
let expected_after_first_run = & [
3720
3782
"MyExtensionNode" ,
3721
- "CoalescePartitionsExec" ,
3722
3783
"AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[sum], ordering_mode=Sorted" ,
3723
- "SortExec: expr=[a1@0 ASC NULLS LAST], preserve_partitioning=[true]" ,
3724
- "RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10" ,
3725
3784
"AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[sum], ordering_mode=Sorted" ,
3726
- "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1" ,
3727
3785
"SortPreservingMergeExec: [a@0 ASC]" ,
3728
3786
"SortExec: expr=[a@0 ASC], preserve_partitioning=[true]" ,
3729
3787
"ProjectionExec: expr=[a@0 as a, b@1 as b]" ,
3730
3788
"UnionExec" ,
3731
3789
"ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]" ,
3732
3790
"ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]" ,
3733
3791
] ;
3734
- assert_optimized ! ( expected_after_first_run, plan. clone( ) , true ) ;
3792
+ assert_optimized_without_forced_roundrobin ! (
3793
+ expected_after_first_run,
3794
+ plan. clone( ) ,
3795
+ true
3796
+ ) ;
3735
3797
3798
+ // does not repartition on the second run
3736
3799
let expected_after_second_run = & [
3737
3800
"MyExtensionNode" ,
3738
- "CoalescePartitionsExec" ,
3739
3801
"AggregateExec: mode=FinalPartitioned, gby=[a1@0 as a1], aggr=[sum], ordering_mode=Sorted" ,
3740
- "SortExec: expr=[a1@0 ASC NULLS LAST], preserve_partitioning=[true]" ,
3741
- "RepartitionExec: partitioning=Hash([a1@0], 10), input_partitions=10" ,
3742
- "SortExec: expr=[a1@0 ASC NULLS LAST], preserve_partitioning=[true]" ,
3743
3802
"AggregateExec: mode=Partial, gby=[a@0 as a1], aggr=[sum], ordering_mode=Sorted" ,
3744
- "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1" ,
3745
3803
"SortExec: expr=[a@0 ASC], preserve_partitioning=[false]" ,
3746
3804
"CoalescePartitionsExec" ,
3747
3805
"ProjectionExec: expr=[a@0 as a, b@1 as b]" ,
3748
3806
"UnionExec" ,
3749
3807
"ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]" ,
3750
3808
"ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]" ,
3751
3809
] ;
3752
- assert_optimized ! ( expected_after_second_run, plan. clone( ) , false ) ;
3810
+ assert_optimized_without_forced_roundrobin ! (
3811
+ expected_after_second_run,
3812
+ plan. clone( ) ,
3813
+ false
3814
+ ) ;
3753
3815
3754
3816
Ok ( ( ) )
3755
3817
}
0 commit comments