@@ -1473,6 +1473,7 @@ mod tests {
14731473 use arrow:: error:: { ArrowError , Result as ArrowResult } ;
14741474 use arrow_schema:: SortOptions ;
14751475
1476+ use datafusion_common:: stats:: Precision :: { Absent , Exact , Inexact } ;
14761477 use datafusion_common:: { arrow_datafusion_err, arrow_err, ScalarValue } ;
14771478
14781479 fn check ( left : & [ Column ] , right : & [ Column ] , on : & [ ( Column , Column ) ] ) -> Result < ( ) > {
@@ -1640,25 +1641,26 @@ mod tests {
16401641 }
16411642
16421643 fn create_column_stats (
1643- min : Option < i64 > ,
1644- max : Option < i64 > ,
1645- distinct_count : Option < usize > ,
1644+ min : Precision < i64 > ,
1645+ max : Precision < i64 > ,
1646+ distinct_count : Precision < usize > ,
1647+ null_count : Precision < usize > ,
16461648 ) -> ColumnStatistics {
16471649 ColumnStatistics {
1648- distinct_count : distinct_count
1649- . map ( Precision :: Inexact )
1650- . unwrap_or ( Precision :: Absent ) ,
1651- min_value : min
1652- . map ( |size| Precision :: Inexact ( ScalarValue :: from ( size) ) )
1653- . unwrap_or ( Precision :: Absent ) ,
1654- max_value : max
1655- . map ( |size| Precision :: Inexact ( ScalarValue :: from ( size) ) )
1656- . unwrap_or ( Precision :: Absent ) ,
1657- ..Default :: default ( )
1650+ distinct_count,
1651+ min_value : min. map ( ScalarValue :: from) ,
1652+ max_value : max. map ( ScalarValue :: from) ,
1653+ null_count,
16581654 }
16591655 }
16601656
1661- type PartialStats = ( usize , Option < i64 > , Option < i64 > , Option < usize > ) ;
1657+ type PartialStats = (
1658+ usize ,
1659+ Precision < i64 > ,
1660+ Precision < i64 > ,
1661+ Precision < usize > ,
1662+ Precision < usize > ,
1663+ ) ;
16621664
16631665 // This is mainly for validating all the edge cases of the estimation, but
16641666 // more advanced (and real world test cases) are below where we need some control
@@ -1675,133 +1677,156 @@ mod tests {
16751677 //
16761678 // distinct(left) == NaN, distinct(right) == NaN
16771679 (
1678- ( 10 , Some ( 1 ) , Some ( 10 ) , None ) ,
1679- ( 10 , Some ( 1 ) , Some ( 10 ) , None ) ,
1680- Some ( Precision :: Inexact ( 10 ) ) ,
1680+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Absent , Absent ) ,
1681+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Absent , Absent ) ,
1682+ Some ( Inexact ( 10 ) ) ,
16811683 ) ,
16821684 // range(left) > range(right)
16831685 (
1684- ( 10 , Some ( 6 ) , Some ( 10 ) , None ) ,
1685- ( 10 , Some ( 8 ) , Some ( 10 ) , None ) ,
1686- Some ( Precision :: Inexact ( 20 ) ) ,
1686+ ( 10 , Inexact ( 6 ) , Inexact ( 10 ) , Absent , Absent ) ,
1687+ ( 10 , Inexact ( 8 ) , Inexact ( 10 ) , Absent , Absent ) ,
1688+ Some ( Inexact ( 20 ) ) ,
16871689 ) ,
16881690 // range(right) > range(left)
16891691 (
1690- ( 10 , Some ( 8 ) , Some ( 10 ) , None ) ,
1691- ( 10 , Some ( 6 ) , Some ( 10 ) , None ) ,
1692- Some ( Precision :: Inexact ( 20 ) ) ,
1692+ ( 10 , Inexact ( 8 ) , Inexact ( 10 ) , Absent , Absent ) ,
1693+ ( 10 , Inexact ( 6 ) , Inexact ( 10 ) , Absent , Absent ) ,
1694+ Some ( Inexact ( 20 ) ) ,
16931695 ) ,
16941696 // range(left) > len(left), range(right) > len(right)
16951697 (
1696- ( 10 , Some ( 1 ) , Some ( 15 ) , None ) ,
1697- ( 20 , Some ( 1 ) , Some ( 40 ) , None ) ,
1698- Some ( Precision :: Inexact ( 10 ) ) ,
1698+ ( 10 , Inexact ( 1 ) , Inexact ( 15 ) , Absent , Absent ) ,
1699+ ( 20 , Inexact ( 1 ) , Inexact ( 40 ) , Absent , Absent ) ,
1700+ Some ( Inexact ( 10 ) ) ,
16991701 ) ,
17001702 // When we have distinct count.
17011703 (
1702- ( 10 , Some ( 1 ) , Some ( 10 ) , Some ( 10 ) ) ,
1703- ( 10 , Some ( 1 ) , Some ( 10 ) , Some ( 10 ) ) ,
1704- Some ( Precision :: Inexact ( 10 ) ) ,
1704+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Inexact ( 10 ) , Absent ) ,
1705+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Inexact ( 10 ) , Absent ) ,
1706+ Some ( Inexact ( 10 ) ) ,
17051707 ) ,
17061708 // distinct(left) > distinct(right)
17071709 (
1708- ( 10 , Some ( 1 ) , Some ( 10 ) , Some ( 5 ) ) ,
1709- ( 10 , Some ( 1 ) , Some ( 10 ) , Some ( 2 ) ) ,
1710- Some ( Precision :: Inexact ( 20 ) ) ,
1710+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Inexact ( 5 ) , Absent ) ,
1711+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Inexact ( 2 ) , Absent ) ,
1712+ Some ( Inexact ( 20 ) ) ,
17111713 ) ,
17121714 // distinct(right) > distinct(left)
17131715 (
1714- ( 10 , Some ( 1 ) , Some ( 10 ) , Some ( 2 ) ) ,
1715- ( 10 , Some ( 1 ) , Some ( 10 ) , Some ( 5 ) ) ,
1716- Some ( Precision :: Inexact ( 20 ) ) ,
1716+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Inexact ( 2 ) , Absent ) ,
1717+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Inexact ( 5 ) , Absent ) ,
1718+ Some ( Inexact ( 20 ) ) ,
17171719 ) ,
17181720 // min(left) < 0 (range(left) > range(right))
17191721 (
1720- ( 10 , Some ( -5 ) , Some ( 5 ) , None ) ,
1721- ( 10 , Some ( 1 ) , Some ( 5 ) , None ) ,
1722- Some ( Precision :: Inexact ( 10 ) ) ,
1722+ ( 10 , Inexact ( -5 ) , Inexact ( 5 ) , Absent , Absent ) ,
1723+ ( 10 , Inexact ( 1 ) , Inexact ( 5 ) , Absent , Absent ) ,
1724+ Some ( Inexact ( 10 ) ) ,
17231725 ) ,
17241726 // min(right) < 0, max(right) < 0 (range(right) > range(left))
17251727 (
1726- ( 10 , Some ( -25 ) , Some ( -20 ) , None ) ,
1727- ( 10 , Some ( -25 ) , Some ( -15 ) , None ) ,
1728- Some ( Precision :: Inexact ( 10 ) ) ,
1728+ ( 10 , Inexact ( -25 ) , Inexact ( -20 ) , Absent , Absent ) ,
1729+ ( 10 , Inexact ( -25 ) , Inexact ( -15 ) , Absent , Absent ) ,
1730+ Some ( Inexact ( 10 ) ) ,
17291731 ) ,
17301732 // range(left) < 0, range(right) >= 0
17311733 // (there isn't a case where both left and right ranges are negative
17321734 // so one of them is always going to work, this just proves negative
17331735 // ranges with bigger absolute values are not accidentally used).
17341736 (
1735- ( 10 , Some ( -10 ) , Some ( 0 ) , None ) ,
1736- ( 10 , Some ( 0 ) , Some ( 10 ) , Some ( 5 ) ) ,
1737- Some ( Precision :: Inexact ( 10 ) ) ,
1737+ ( 10 , Inexact ( -10 ) , Inexact ( 0 ) , Absent , Absent ) ,
1738+ ( 10 , Inexact ( 0 ) , Inexact ( 10 ) , Inexact ( 5 ) , Absent ) ,
1739+ Some ( Inexact ( 10 ) ) ,
17381740 ) ,
17391741 // range(left) = 1, range(right) = 1
17401742 (
1741- ( 10 , Some ( 1 ) , Some ( 1 ) , None ) ,
1742- ( 10 , Some ( 1 ) , Some ( 1 ) , None ) ,
1743- Some ( Precision :: Inexact ( 100 ) ) ,
1743+ ( 10 , Inexact ( 1 ) , Inexact ( 1 ) , Absent , Absent ) ,
1744+ ( 10 , Inexact ( 1 ) , Inexact ( 1 ) , Absent , Absent ) ,
1745+ Some ( Inexact ( 100 ) ) ,
17441746 ) ,
17451747 //
17461748 // Edge cases
17471749 // ==========
17481750 //
17491751 // No column level stats.
1750- ( ( 10 , None , None , None ) , ( 10 , None , None , None ) , None ) ,
1752+ (
1753+ ( 10 , Absent , Absent , Absent , Absent ) ,
1754+ ( 10 , Absent , Absent , Absent , Absent ) ,
1755+ None ,
1756+ ) ,
17511757 // No min or max (or both).
1752- ( ( 10 , None , None , Some ( 3 ) ) , ( 10 , None , None , Some ( 3 ) ) , None ) ,
17531758 (
1754- ( 10 , Some ( 2 ) , None , Some ( 3 ) ) ,
1755- ( 10 , None , Some ( 5 ) , Some ( 3 ) ) ,
1759+ ( 10 , Absent , Absent , Inexact ( 3 ) , Absent ) ,
1760+ ( 10 , Absent , Absent , Inexact ( 3 ) , Absent ) ,
1761+ None ,
1762+ ) ,
1763+ (
1764+ ( 10 , Inexact ( 2 ) , Absent , Inexact ( 3 ) , Absent ) ,
1765+ ( 10 , Absent , Inexact ( 5 ) , Inexact ( 3 ) , Absent ) ,
17561766 None ,
17571767 ) ,
17581768 (
1759- ( 10 , None , Some ( 3 ) , Some ( 3 ) ) ,
1760- ( 10 , Some ( 1 ) , None , Some ( 3 ) ) ,
1769+ ( 10 , Absent , Inexact ( 3 ) , Inexact ( 3 ) , Absent ) ,
1770+ ( 10 , Inexact ( 1 ) , Absent , Inexact ( 3 ) , Absent ) ,
1771+ None ,
1772+ ) ,
1773+ (
1774+ ( 10 , Absent , Inexact ( 3 ) , Absent , Absent ) ,
1775+ ( 10 , Inexact ( 1 ) , Absent , Absent , Absent ) ,
17611776 None ,
17621777 ) ,
1763- ( ( 10 , None , Some ( 3 ) , None ) , ( 10 , Some ( 1 ) , None , None ) , None ) ,
17641778 // Non-overlapping min/max (when exact=False).
17651779 (
1766- ( 10 , Some ( 0 ) , Some ( 10 ) , None ) ,
1767- ( 10 , Some ( 11 ) , Some ( 20 ) , None ) ,
1768- Some ( Precision :: Inexact ( 0 ) ) ,
1780+ ( 10 , Inexact ( 0 ) , Inexact ( 10 ) , Absent , Absent ) ,
1781+ ( 10 , Inexact ( 11 ) , Inexact ( 20 ) , Absent , Absent ) ,
1782+ Some ( Inexact ( 0 ) ) ,
17691783 ) ,
17701784 (
1771- ( 10 , Some ( 11 ) , Some ( 20 ) , None ) ,
1772- ( 10 , Some ( 0 ) , Some ( 10 ) , None ) ,
1773- Some ( Precision :: Inexact ( 0 ) ) ,
1785+ ( 10 , Inexact ( 11 ) , Inexact ( 20 ) , Absent , Absent ) ,
1786+ ( 10 , Inexact ( 0 ) , Inexact ( 10 ) , Absent , Absent ) ,
1787+ Some ( Inexact ( 0 ) ) ,
17741788 ) ,
17751789 // distinct(left) = 0, distinct(right) = 0
17761790 (
1777- ( 10 , Some ( 1 ) , Some ( 10 ) , Some ( 0 ) ) ,
1778- ( 10 , Some ( 1 ) , Some ( 10 ) , Some ( 0 ) ) ,
1791+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Inexact ( 0 ) , Absent ) ,
1792+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Inexact ( 0 ) , Absent ) ,
17791793 None ,
17801794 ) ,
1795+ // Inexact row count < exact null count with absent distinct count
1796+ (
1797+ ( 0 , Inexact ( 1 ) , Inexact ( 10 ) , Absent , Exact ( 5 ) ) ,
1798+ ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Absent , Absent ) ,
1799+ Some ( Inexact ( 0 ) ) ,
1800+ ) ,
17811801 ] ;
17821802
17831803 for ( left_info, right_info, expected_cardinality) in cases {
17841804 let left_num_rows = left_info. 0 ;
1785- let left_col_stats =
1786- vec ! [ create_column_stats( left_info. 1 , left_info. 2 , left_info. 3 ) ] ;
1805+ let left_col_stats = vec ! [ create_column_stats(
1806+ left_info. 1 ,
1807+ left_info. 2 ,
1808+ left_info. 3 ,
1809+ left_info. 4 ,
1810+ ) ] ;
17871811
17881812 let right_num_rows = right_info. 0 ;
17891813 let right_col_stats = vec ! [ create_column_stats(
17901814 right_info. 1 ,
17911815 right_info. 2 ,
17921816 right_info. 3 ,
1817+ right_info. 4 ,
17931818 ) ] ;
17941819
17951820 assert_eq ! (
17961821 estimate_inner_join_cardinality(
17971822 Statistics {
1798- num_rows: Precision :: Inexact ( left_num_rows) ,
1799- total_byte_size: Precision :: Absent ,
1823+ num_rows: Inexact ( left_num_rows) ,
1824+ total_byte_size: Absent ,
18001825 column_statistics: left_col_stats. clone( ) ,
18011826 } ,
18021827 Statistics {
1803- num_rows: Precision :: Inexact ( right_num_rows) ,
1804- total_byte_size: Precision :: Absent ,
1828+ num_rows: Inexact ( right_num_rows) ,
1829+ total_byte_size: Absent ,
18051830 column_statistics: right_col_stats. clone( ) ,
18061831 } ,
18071832 ) ,
@@ -1819,9 +1844,7 @@ mod tests {
18191844 ) ;
18201845
18211846 assert_eq ! (
1822- partial_join_stats
1823- . clone( )
1824- . map( |s| Precision :: Inexact ( s. num_rows) ) ,
1847+ partial_join_stats. clone( ) . map( |s| Inexact ( s. num_rows) ) ,
18251848 expected_cardinality. clone( )
18261849 ) ;
18271850 assert_eq ! (
@@ -1837,13 +1860,13 @@ mod tests {
18371860 #[ test]
18381861 fn test_inner_join_cardinality_multiple_column ( ) -> Result < ( ) > {
18391862 let left_col_stats = vec ! [
1840- create_column_stats( Some ( 0 ) , Some ( 100 ) , Some ( 100 ) ) ,
1841- create_column_stats( Some ( 100 ) , Some ( 500 ) , Some ( 150 ) ) ,
1863+ create_column_stats( Inexact ( 0 ) , Inexact ( 100 ) , Inexact ( 100 ) , Absent ) ,
1864+ create_column_stats( Inexact ( 100 ) , Inexact ( 500 ) , Inexact ( 150 ) , Absent ) ,
18421865 ] ;
18431866
18441867 let right_col_stats = vec ! [
1845- create_column_stats( Some ( 0 ) , Some ( 100 ) , Some ( 50 ) ) ,
1846- create_column_stats( Some ( 100 ) , Some ( 500 ) , Some ( 200 ) ) ,
1868+ create_column_stats( Inexact ( 0 ) , Inexact ( 100 ) , Inexact ( 50 ) , Absent ) ,
1869+ create_column_stats( Inexact ( 100 ) , Inexact ( 500 ) , Inexact ( 200 ) , Absent ) ,
18471870 ] ;
18481871
18491872 // We have statistics about 4 columns, where the highest distinct
@@ -1921,15 +1944,15 @@ mod tests {
19211944 ] ;
19221945
19231946 let left_col_stats = vec ! [
1924- create_column_stats( Some ( 0 ) , Some ( 100 ) , Some ( 100 ) ) ,
1925- create_column_stats( Some ( 0 ) , Some ( 500 ) , Some ( 500 ) ) ,
1926- create_column_stats( Some ( 1000 ) , Some ( 10000 ) , None ) ,
1947+ create_column_stats( Inexact ( 0 ) , Inexact ( 100 ) , Inexact ( 100 ) , Absent ) ,
1948+ create_column_stats( Inexact ( 0 ) , Inexact ( 500 ) , Inexact ( 500 ) , Absent ) ,
1949+ create_column_stats( Inexact ( 1000 ) , Inexact ( 10000 ) , Absent , Absent ) ,
19271950 ] ;
19281951
19291952 let right_col_stats = vec ! [
1930- create_column_stats( Some ( 0 ) , Some ( 100 ) , Some ( 50 ) ) ,
1931- create_column_stats( Some ( 0 ) , Some ( 2000 ) , Some ( 2500 ) ) ,
1932- create_column_stats( Some ( 0 ) , Some ( 100 ) , None ) ,
1953+ create_column_stats( Inexact ( 0 ) , Inexact ( 100 ) , Inexact ( 50 ) , Absent ) ,
1954+ create_column_stats( Inexact ( 0 ) , Inexact ( 2000 ) , Inexact ( 2500 ) , Absent ) ,
1955+ create_column_stats( Inexact ( 0 ) , Inexact ( 100 ) , Absent , Absent ) ,
19331956 ] ;
19341957
19351958 for ( join_type, expected_num_rows) in cases {
@@ -1970,15 +1993,15 @@ mod tests {
19701993 // Join on a=c, x=y (ignores b/d) where x and y do not intersect
19711994
19721995 let left_col_stats = vec ! [
1973- create_column_stats( Some ( 0 ) , Some ( 100 ) , Some ( 100 ) ) ,
1974- create_column_stats( Some ( 0 ) , Some ( 500 ) , Some ( 500 ) ) ,
1975- create_column_stats( Some ( 1000 ) , Some ( 10000 ) , None ) ,
1996+ create_column_stats( Inexact ( 0 ) , Inexact ( 100 ) , Inexact ( 100 ) , Absent ) ,
1997+ create_column_stats( Inexact ( 0 ) , Inexact ( 500 ) , Inexact ( 500 ) , Absent ) ,
1998+ create_column_stats( Inexact ( 1000 ) , Inexact ( 10000 ) , Absent , Absent ) ,
19761999 ] ;
19772000
19782001 let right_col_stats = vec ! [
1979- create_column_stats( Some ( 0 ) , Some ( 100 ) , Some ( 50 ) ) ,
1980- create_column_stats( Some ( 0 ) , Some ( 2000 ) , Some ( 2500 ) ) ,
1981- create_column_stats( Some ( 0 ) , Some ( 100 ) , None ) ,
2002+ create_column_stats( Inexact ( 0 ) , Inexact ( 100 ) , Inexact ( 50 ) , Absent ) ,
2003+ create_column_stats( Inexact ( 0 ) , Inexact ( 2000 ) , Inexact ( 2500 ) , Absent ) ,
2004+ create_column_stats( Inexact ( 0 ) , Inexact ( 100 ) , Absent , Absent ) ,
19822005 ] ;
19832006
19842007 let join_on = vec ! [
0 commit comments