From 4f769d90c1327f048ec80fa2190b276ee12a1672 Mon Sep 17 00:00:00 2001 From: Thomas Rebele Date: Mon, 17 Nov 2025 21:02:24 +0100 Subject: [PATCH 1/3] HIVE-29275: Stats autogather calculates the min statistic incorrectly --- .../apache/hadoop/hive/ql/ddl/ShowUtils.java | 9 +- .../llap/llap_decimal64_reader.q.out | 28 +++--- .../clientpositive/llap/stats_histogram.q.out | 2 +- .../llap/stats_histogram_null.q.out | 2 +- .../perf/tpcds30tb/tez/query28.q.out | 60 ++++++------- .../perf/tpcds30tb/tez/query48.q.out | 16 ++-- .../perf/tpcds30tb/tez/query49.q.out | 62 +++++++------- .../metastore/api/utils/DecimalUtils.java | 6 ++ .../merge/DecimalColumnStatsMerger.java | 20 ++++- .../metastore/utils/MetaStoreServerUtils.java | 10 +++ .../merge/DecimalColumnStatsMergerTest.java | 85 ++++++++++++------- 11 files changed, 173 insertions(+), 127 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/ddl/ShowUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/ddl/ShowUtils.java index 386bfd97748e..e96288369acb 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/ddl/ShowUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/ddl/ShowUtils.java @@ -43,6 +43,7 @@ import org.apache.hadoop.hive.metastore.api.LongColumnStatsData; import org.apache.hadoop.hive.metastore.api.StringColumnStatsData; import org.apache.hadoop.hive.metastore.api.TimestampColumnStatsData; +import org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils; import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.session.SessionState; @@ -54,7 +55,6 @@ import java.io.IOException; import java.io.OutputStream; import java.io.OutputStreamWriter; -import java.math.BigInteger; import java.nio.charset.StandardCharsets; import java.time.ZoneId; import java.util.ArrayList; @@ -233,12 +233,7 @@ public static String[] extractColumnValues(FieldSchema column, boolean isColumnS } public static String convertToString(Decimal val) { - if (val == null) { - return ""; - } - - HiveDecimal result = HiveDecimal.create(new BigInteger(val.getUnscaled()), val.getScale()); - return (result != null) ? 
result.toString() : ""; + return MetaStoreServerUtils.decimalToString(val); } public static String convertToString(org.apache.hadoop.hive.metastore.api.Date val) { diff --git a/ql/src/test/results/clientpositive/llap/llap_decimal64_reader.q.out b/ql/src/test/results/clientpositive/llap/llap_decimal64_reader.q.out index a20cedd10dd1..fcf8bf6892b5 100644 --- a/ql/src/test/results/clientpositive/llap/llap_decimal64_reader.q.out +++ b/ql/src/test/results/clientpositive/llap/llap_decimal64_reader.q.out @@ -136,25 +136,25 @@ STAGE PLANS: Statistics: Num rows: 24576 Data size: 5505024 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: (cdecimal1) IN (3.35, 4.46) (type: boolean) - Statistics: Num rows: 12288 Data size: 2752512 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 24576 Data size: 5505024 Basic stats: COMPLETE Column stats: COMPLETE Top N Key Operator sort order: ++ keys: cdecimal1 (type: decimal(10,2)), cdecimal2 (type: decimal(38,5)) null sort order: zz - Statistics: Num rows: 12288 Data size: 2752512 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 24576 Data size: 5505024 Basic stats: COMPLETE Column stats: COMPLETE top n: 2 Group By Operator keys: cdecimal1 (type: decimal(10,2)), cdecimal2 (type: decimal(38,5)) minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1 - Statistics: Num rows: 1 Data size: 224 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 448 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: decimal(10,2)), _col1 (type: decimal(38,5)) null sort order: zz sort order: ++ Map-reduce partition columns: _col0 (type: decimal(10,2)), _col1 (type: decimal(38,5)) - Statistics: Num rows: 1 Data size: 224 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 448 Basic stats: COMPLETE Column stats: COMPLETE Execution mode: vectorized, llap LLAP IO: all inputs Map Vectorization: @@ -179,13 +179,13 @@ STAGE PLANS: keys: KEY._col0 (type: decimal(10,2)), KEY._col1 (type: decimal(38,5)) mode: mergepartial outputColumnNames: _col0, _col1 - Statistics: Num rows: 1 Data size: 224 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 448 Basic stats: COMPLETE Column stats: COMPLETE Limit Number of rows: 2 - Statistics: Num rows: 1 Data size: 224 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 448 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 1 Data size: 224 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 448 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat @@ -243,25 +243,25 @@ STAGE PLANS: Statistics: Num rows: 24576 Data size: 5505024 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: (cdecimal1) IN (3.35, 4.46) (type: boolean) - Statistics: Num rows: 12288 Data size: 2752512 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 24576 Data size: 5505024 Basic stats: COMPLETE Column stats: COMPLETE Top N Key Operator sort order: ++ keys: cdecimal1 (type: decimal(10,2)), cdecimal2 (type: decimal(38,5)) null sort order: zz - Statistics: Num rows: 12288 Data size: 2752512 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 24576 Data size: 5505024 Basic stats: 
COMPLETE Column stats: COMPLETE top n: 2 Group By Operator keys: cdecimal1 (type: decimal(10,2)), cdecimal2 (type: decimal(38,5)) minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1 - Statistics: Num rows: 1 Data size: 224 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 448 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: decimal(10,2)), _col1 (type: decimal(38,5)) null sort order: zz sort order: ++ Map-reduce partition columns: _col0 (type: decimal(10,2)), _col1 (type: decimal(38,5)) - Statistics: Num rows: 1 Data size: 224 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 448 Basic stats: COMPLETE Column stats: COMPLETE Execution mode: vectorized, llap LLAP IO: all inputs Map Vectorization: @@ -287,13 +287,13 @@ STAGE PLANS: keys: KEY._col0 (type: decimal(10,2)), KEY._col1 (type: decimal(38,5)) mode: mergepartial outputColumnNames: _col0, _col1 - Statistics: Num rows: 1 Data size: 224 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 448 Basic stats: COMPLETE Column stats: COMPLETE Limit Number of rows: 2 - Statistics: Num rows: 1 Data size: 224 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 448 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 1 Data size: 224 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 448 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat diff --git a/ql/src/test/results/clientpositive/llap/stats_histogram.q.out b/ql/src/test/results/clientpositive/llap/stats_histogram.q.out index 5ff94404746f..439ebba09c5a 100644 --- a/ql/src/test/results/clientpositive/llap/stats_histogram.q.out +++ b/ql/src/test/results/clientpositive/llap/stats_histogram.q.out @@ -361,7 +361,7 @@ POSTHOOK: type: DESCTABLE POSTHOOK: Input: default@test_stats col_name e data_type decimal(5,2) -min -10.2 +min -123.2 max 12.2 num_nulls 1 distinct_count 11 diff --git a/ql/src/test/results/clientpositive/llap/stats_histogram_null.q.out b/ql/src/test/results/clientpositive/llap/stats_histogram_null.q.out index dbde9c67daea..07e48b5888ae 100644 --- a/ql/src/test/results/clientpositive/llap/stats_histogram_null.q.out +++ b/ql/src/test/results/clientpositive/llap/stats_histogram_null.q.out @@ -436,7 +436,7 @@ POSTHOOK: type: DESCTABLE POSTHOOK: Input: default@test_stats col_name e data_type decimal(5,2) -min -12.3 +min -123.2 max 12.2 num_nulls 1 distinct_count 15 diff --git a/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query28.q.out b/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query28.q.out index d0e8628d188d..025630093e3f 100644 --- a/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query28.q.out +++ b/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query28.q.out @@ -34,129 +34,129 @@ STAGE PLANS: Statistics: Num rows: 86404891377 Data size: 28054250053192 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: (ss_quantity BETWEEN 0 AND 5 and (ss_list_price BETWEEN 11 AND 21 or ss_coupon_amt BETWEEN 460 AND 1460 or ss_wholesale_cost BETWEEN 14 AND 34)) (type: boolean) - Statistics: Num rows: 1965380184 Data size: 638126687968 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1955303836 Data size: 634855063288 
Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: ss_list_price (type: decimal(7,2)) outputColumnNames: ss_list_price - Statistics: Num rows: 1965380184 Data size: 638126687968 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1955303836 Data size: 634855063288 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: sum(ss_list_price), count(ss_list_price) keys: ss_list_price (type: decimal(7,2)) minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 982690092 Data size: 223025312544 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 977651918 Data size: 221881879504 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: decimal(7,2)) null sort order: z sort order: + Map-reduce partition columns: _col0 (type: decimal(7,2)) - Statistics: Num rows: 982690092 Data size: 223025312544 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 977651918 Data size: 221881879504 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: decimal(17,2)), _col2 (type: bigint) Filter Operator predicate: (ss_quantity BETWEEN 16 AND 20 and (ss_list_price BETWEEN 142 AND 152 or ss_coupon_amt BETWEEN 3054 AND 4054 or ss_wholesale_cost BETWEEN 80 AND 100)) (type: boolean) - Statistics: Num rows: 2571445780 Data size: 834906239572 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2551512553 Data size: 828434247976 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: ss_list_price (type: decimal(7,2)) outputColumnNames: ss_list_price - Statistics: Num rows: 2571445780 Data size: 834906239572 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2551512553 Data size: 828434247976 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: sum(ss_list_price), count(ss_list_price) keys: ss_list_price (type: decimal(7,2)) minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 1285722890 Data size: 291799776608 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1275756276 Data size: 289537815088 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: decimal(7,2)) null sort order: z sort order: + Map-reduce partition columns: _col0 (type: decimal(7,2)) - Statistics: Num rows: 1285722890 Data size: 291799776608 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1275756276 Data size: 289537815088 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: decimal(17,2)), _col2 (type: bigint) Filter Operator predicate: (ss_quantity BETWEEN 21 AND 25 and (ss_list_price BETWEEN 135 AND 145 or ss_coupon_amt BETWEEN 14180 AND 15180 or ss_wholesale_cost BETWEEN 38 AND 58)) (type: boolean) - Statistics: Num rows: 3432340414 Data size: 1114424597248 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3482601258 Data size: 1130743468324 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: ss_list_price (type: decimal(7,2)) outputColumnNames: ss_list_price - Statistics: Num rows: 3432340414 Data size: 1114424597248 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3482601258 Data size: 1130743468324 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: sum(ss_list_price), count(ss_list_price) keys: ss_list_price (type: decimal(7,2)) 
minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 1716170207 Data size: 389491457960 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1741300629 Data size: 395194904312 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: decimal(7,2)) null sort order: z sort order: + Map-reduce partition columns: _col0 (type: decimal(7,2)) - Statistics: Num rows: 1716170207 Data size: 389491457960 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1741300629 Data size: 395194904312 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: decimal(17,2)), _col2 (type: bigint) Filter Operator predicate: (ss_quantity BETWEEN 26 AND 30 and (ss_list_price BETWEEN 28 AND 38 or ss_coupon_amt BETWEEN 2513 AND 3513 or ss_wholesale_cost BETWEEN 42 AND 62)) (type: boolean) - Statistics: Num rows: 2913592254 Data size: 945995583960 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2894732905 Data size: 939872262804 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: ss_list_price (type: decimal(7,2)) outputColumnNames: ss_list_price - Statistics: Num rows: 2913592254 Data size: 945995583960 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2894732905 Data size: 939872262804 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: sum(ss_list_price), count(ss_list_price) keys: ss_list_price (type: decimal(7,2)) minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 1456796127 Data size: 330625508552 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1447366452 Data size: 328485407488 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: decimal(7,2)) null sort order: z sort order: + Map-reduce partition columns: _col0 (type: decimal(7,2)) - Statistics: Num rows: 1456796127 Data size: 330625508552 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1447366452 Data size: 328485407488 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: decimal(17,2)), _col2 (type: bigint) Filter Operator predicate: (ss_quantity BETWEEN 11 AND 15 and (ss_list_price BETWEEN 66 AND 76 or ss_coupon_amt BETWEEN 920 AND 1920 or ss_wholesale_cost BETWEEN 4 AND 24)) (type: boolean) - Statistics: Num rows: 2457715925 Data size: 797980022328 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2445115019 Data size: 793888714924 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: ss_list_price (type: decimal(7,2)) outputColumnNames: ss_list_price - Statistics: Num rows: 2457715925 Data size: 797980022328 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2445115019 Data size: 793888714924 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: sum(ss_list_price), count(ss_list_price) keys: ss_list_price (type: decimal(7,2)) minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 1228857962 Data size: 278894061568 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1222557509 Data size: 277464149464 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: decimal(7,2)) null sort order: z sort order: + Map-reduce partition columns: _col0 (type: decimal(7,2)) - Statistics: Num rows: 1228857962 Data 
size: 278894061568 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1222557509 Data size: 277464149464 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: decimal(17,2)), _col2 (type: bigint) Filter Operator predicate: (ss_quantity BETWEEN 6 AND 10 and (ss_list_price BETWEEN 91 AND 101 or ss_coupon_amt BETWEEN 1430 AND 2430 or ss_wholesale_cost BETWEEN 32 AND 52)) (type: boolean) - Statistics: Num rows: 3193426694 Data size: 1036853233656 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3178448591 Data size: 1031990089344 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: ss_list_price (type: decimal(7,2)) outputColumnNames: ss_list_price - Statistics: Num rows: 3193426694 Data size: 1036853233656 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3178448591 Data size: 1031990089344 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: sum(ss_list_price), count(ss_list_price) keys: ss_list_price (type: decimal(7,2)) minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 1596713347 Data size: 362380262200 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1589224295 Data size: 360680592952 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: decimal(7,2)) null sort order: z sort order: + Map-reduce partition columns: _col0 (type: decimal(7,2)) - Statistics: Num rows: 1596713347 Data size: 362380262200 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1589224295 Data size: 360680592952 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: decimal(17,2)), _col2 (type: bigint) Execution mode: vectorized, llap LLAP IO: may be used (ACID table) @@ -168,7 +168,7 @@ STAGE PLANS: keys: KEY._col0 (type: decimal(7,2)) mode: partial2 outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 1228857962 Data size: 278894061568 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1222557509 Data size: 277464149464 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: sum(_col1), count(_col2), count(_col0) mode: partial2 @@ -204,7 +204,7 @@ STAGE PLANS: keys: KEY._col0 (type: decimal(7,2)) mode: partial2 outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 1596713347 Data size: 362380262200 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1589224295 Data size: 360680592952 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: sum(_col1), count(_col2), count(_col0) mode: partial2 @@ -240,7 +240,7 @@ STAGE PLANS: keys: KEY._col0 (type: decimal(7,2)) mode: partial2 outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 982690092 Data size: 223025312544 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 977651918 Data size: 221881879504 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: sum(_col1), count(_col2), count(_col0) mode: partial2 @@ -276,7 +276,7 @@ STAGE PLANS: keys: KEY._col0 (type: decimal(7,2)) mode: partial2 outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 1285722890 Data size: 291799776608 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1275756276 Data size: 289537815088 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: sum(_col1), count(_col2), count(_col0) mode: partial2 @@ -312,7 +312,7 @@ STAGE PLANS: keys: 
KEY._col0 (type: decimal(7,2)) mode: partial2 outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 1716170207 Data size: 389491457960 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1741300629 Data size: 395194904312 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: sum(_col1), count(_col2), count(_col0) mode: partial2 @@ -394,7 +394,7 @@ STAGE PLANS: keys: KEY._col0 (type: decimal(7,2)) mode: partial2 outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 1456796127 Data size: 330625508552 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1447366452 Data size: 328485407488 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: sum(_col1), count(_col2), count(_col0) mode: partial2 diff --git a/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query48.q.out b/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query48.q.out index 59b49bfa8fc1..bd3b9e37e5ad 100644 --- a/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query48.q.out +++ b/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query48.q.out @@ -16,15 +16,15 @@ STAGE PLANS: TableScan alias: store_sales filterExpr: (ss_sales_price BETWEEN 50 AND 200 and ss_net_profit is not null and ss_cdemo_sk is not null and ss_addr_sk is not null and ss_store_sk is not null) (type: boolean) - probeDecodeDetails: cacheKey:HASH_MAP_MAPJOIN_66_container, bigKeyColName:ss_addr_sk, smallTablePos:1, keyRatio:3.323733066508898E-4 + probeDecodeDetails: cacheKey:HASH_MAP_MAPJOIN_66_container, bigKeyColName:ss_addr_sk, smallTablePos:1, keyRatio:3.323843839779123E-4 Statistics: Num rows: 82510879939 Data size: 20962809999708 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: (ss_sales_price BETWEEN 50 AND 200 and ss_net_profit is not null and ss_cdemo_sk is not null and ss_addr_sk is not null and ss_store_sk is not null) (type: boolean) - Statistics: Num rows: 56248293349 Data size: 14290506744864 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 56250168542 Data size: 14290983158452 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: ss_cdemo_sk (type: bigint), ss_addr_sk (type: bigint), ss_quantity (type: int), ss_sold_date_sk (type: bigint), ss_net_profit BETWEEN 0 AND 2000 (type: boolean), ss_net_profit BETWEEN 150 AND 3000 (type: boolean), ss_net_profit BETWEEN 50 AND 25000 (type: boolean) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6 - Statistics: Num rows: 56248293349 Data size: 2223391490876 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 56250168542 Data size: 2223465613804 Basic stats: COMPLETE Column stats: COMPLETE Map Join Operator condition map: Inner Join 0 to 1 @@ -34,7 +34,7 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2, _col4, _col5, _col6 input vertices: 1 Map 3 - Statistics: Num rows: 11304950271 Data size: 335218165588 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 11305327153 Data size: 335229341020 Basic stats: COMPLETE Column stats: COMPLETE Map Join Operator condition map: Inner Join 0 to 1 @@ -44,7 +44,7 @@ STAGE PLANS: outputColumnNames: _col1, _col2, _col4, _col5, _col6 input vertices: 1 Map 4 - Statistics: Num rows: 322998581 Data size: 3875982984 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 323009350 Data size: 3876112212 Basic stats: COMPLETE Column stats: COMPLETE Map Join Operator condition map: Inner Join 0 to 1 @@ -54,14 +54,14 @@ STAGE PLANS: 
outputColumnNames: _col2, _col4, _col5, _col6, _col10, _col11, _col12 input vertices: 1 Map 5 - Statistics: Num rows: 27424414 Data size: 658185940 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 27425328 Data size: 658207876 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: ((_col10 and _col4) or (_col11 and _col5) or (_col12 and _col6)) (type: boolean) - Statistics: Num rows: 20568309 Data size: 493639420 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 20568996 Data size: 493655908 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col2 (type: int) outputColumnNames: _col2 - Statistics: Num rows: 20568309 Data size: 493639420 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 20568996 Data size: 493655908 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: sum(_col2) minReductionHashAggr: 0.99 diff --git a/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query49.q.out b/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query49.q.out index 98ae7182489b..276726ad4964 100644 --- a/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query49.q.out +++ b/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query49.q.out @@ -41,11 +41,11 @@ STAGE PLANS: Statistics: Num rows: 21594638446 Data size: 5441536184068 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: ((ws_quantity > 0) and (ws_net_profit > 1) and (ws_net_paid > 0)) (type: boolean) - Statistics: Num rows: 14321294654 Data size: 3608758871252 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 14390903321 Data size: 3626299247340 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: ws_item_sk (type: bigint), ws_order_number (type: bigint), ws_quantity (type: int), ws_net_paid (type: decimal(7,2)), ws_sold_date_sk (type: bigint) outputColumnNames: _col0, _col1, _col2, _col3, _col5 - Statistics: Num rows: 14321294654 Data size: 2004773870004 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 14390903321 Data size: 2014518075388 Basic stats: COMPLETE Column stats: COMPLETE Map Join Operator condition map: Inner Join 0 to 1 @@ -55,18 +55,18 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2, _col3 input vertices: 1 Map 19 - Statistics: Num rows: 243129259 Data size: 31885680632 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 244310989 Data size: 32040660996 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: bigint), _col1 (type: bigint) null sort order: zz sort order: ++ Map-reduce partition columns: _col0 (type: bigint), _col1 (type: bigint) - Statistics: Num rows: 243129259 Data size: 31885680632 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 244310989 Data size: 32040660996 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col2 (type: int), _col3 (type: decimal(7,2)) Select Operator expressions: _col0 (type: bigint), _col1 (type: bigint), hash(_col0,_col1) (type: int) outputColumnNames: _col0, _col1, _col3 - Statistics: Num rows: 243129259 Data size: 4862585180 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 244310989 Data size: 4886219780 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: min(_col0), max(_col0), min(_col1), max(_col1), bloom_filter(_col3, expectedEntries=1000000) minReductionHashAggr: 0.99 @@ -88,17 +88,17 @@ STAGE PLANS: Statistics: Num rows: 
2160007345 Data size: 273845125140 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: ((wr_return_amt > 10000) and wr_item_sk BETWEEN DynamicValue(RS[225]_col0) AND DynamicValue(RS[225]_col1) and wr_order_number BETWEEN DynamicValue(RS[225]_col2) AND DynamicValue(RS[225]_col3) and in_bloom_filter(hash(wr_item_sk,wr_order_number), DynamicValue(RS[225]_col4))) (type: boolean) - Statistics: Num rows: 1418116903 Data size: 179788463076 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1420050734 Data size: 180033633704 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: wr_item_sk (type: bigint), wr_order_number (type: bigint), wr_return_quantity (type: int), wr_return_amt (type: decimal(7,2)) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 1418116903 Data size: 179788463076 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1420050734 Data size: 180033633704 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: bigint), _col1 (type: bigint) null sort order: zz sort order: ++ Map-reduce partition columns: _col0 (type: bigint), _col1 (type: bigint) - Statistics: Num rows: 1418116903 Data size: 179788463076 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1420050734 Data size: 180033633704 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col2 (type: int), _col3 (type: decimal(7,2)) Execution mode: vectorized, llap LLAP IO: may be used (ACID table) @@ -110,11 +110,11 @@ STAGE PLANS: Statistics: Num rows: 43005109025 Data size: 10824794628716 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: ((cs_quantity > 0) and (cs_net_profit > 1) and (cs_net_paid > 0)) (type: boolean) - Statistics: Num rows: 28554178173 Data size: 7187358002848 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 28650456622 Data size: 7211592203468 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: cs_item_sk (type: bigint), cs_order_number (type: bigint), cs_quantity (type: int), cs_net_paid (type: decimal(7,2)), cs_sold_date_sk (type: bigint) outputColumnNames: _col0, _col1, _col2, _col3, _col5 - Statistics: Num rows: 28554178173 Data size: 3989290047472 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 28650456622 Data size: 4002741061804 Basic stats: COMPLETE Column stats: COMPLETE Map Join Operator condition map: Inner Join 0 to 1 @@ -124,18 +124,18 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2, _col3 input vertices: 1 Map 19 - Statistics: Num rows: 481330829 Data size: 55240772680 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 482953772 Data size: 55427032628 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: bigint), _col1 (type: bigint) null sort order: zz sort order: ++ Map-reduce partition columns: _col0 (type: bigint), _col1 (type: bigint) - Statistics: Num rows: 481330829 Data size: 55240772680 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 482953772 Data size: 55427032628 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col2 (type: int), _col3 (type: decimal(7,2)) Select Operator expressions: _col0 (type: bigint), _col1 (type: bigint), hash(_col0,_col1) (type: int) outputColumnNames: _col0, _col1, _col3 - Statistics: Num rows: 481330829 Data size: 9626616580 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 482953772 Data 
size: 9659075440 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: min(_col0), max(_col0), min(_col1), max(_col1), bloom_filter(_col3, expectedEntries=1000000) minReductionHashAggr: 0.99 @@ -260,11 +260,11 @@ STAGE PLANS: Statistics: Num rows: 82510879939 Data size: 20349734757316 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: ((ss_quantity > 0) and (ss_net_profit > 1) and (ss_net_paid > 0)) (type: boolean) - Statistics: Num rows: 40994410513 Data size: 10110489442160 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 41222412506 Data size: 10166721784872 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: ss_item_sk (type: bigint), ss_ticket_number (type: bigint), ss_quantity (type: int), ss_net_paid (type: decimal(7,2)), ss_sold_date_sk (type: bigint) outputColumnNames: _col0, _col1, _col2, _col3, _col5 - Statistics: Num rows: 40994410513 Data size: 5627405668656 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 41222412506 Data size: 5658704074760 Basic stats: COMPLETE Column stats: COMPLETE Map Join Operator condition map: Inner Join 0 to 1 @@ -274,18 +274,18 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2, _col3 input vertices: 1 Map 19 - Statistics: Num rows: 695952488 Data size: 11135239924 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 699823225 Data size: 11197171716 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: bigint), _col1 (type: bigint) null sort order: zz sort order: ++ Map-reduce partition columns: _col0 (type: bigint), _col1 (type: bigint) - Statistics: Num rows: 695952488 Data size: 11135239924 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 699823225 Data size: 11197171716 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col2 (type: int), _col3 (type: decimal(7,2)) Select Operator expressions: _col0 (type: bigint), _col1 (type: bigint), hash(_col0,_col1) (type: int) outputColumnNames: _col0, _col1, _col3 - Statistics: Num rows: 695952488 Data size: 13919049760 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 699823225 Data size: 13996464500 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: min(_col0), max(_col0), min(_col1), max(_col1), bloom_filter(_col3, expectedEntries=1000000) minReductionHashAggr: 0.99 @@ -307,17 +307,17 @@ STAGE PLANS: Statistics: Num rows: 8634166995 Data size: 1104703724476 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: ((sr_return_amt > 10000) and sr_item_sk BETWEEN DynamicValue(RS[245]_col0) AND DynamicValue(RS[245]_col1) and sr_ticket_number BETWEEN DynamicValue(RS[245]_col2) AND DynamicValue(RS[245]_col3) and in_bloom_filter(hash(sr_item_sk,sr_ticket_number), DynamicValue(RS[245]_col4))) (type: boolean) - Statistics: Num rows: 4166475379 Data size: 533082215324 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 4238623038 Data size: 542313191336 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: sr_item_sk (type: bigint), sr_ticket_number (type: bigint), sr_return_quantity (type: int), sr_return_amt (type: decimal(7,2)) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 4166475379 Data size: 533082215324 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 4238623038 Data size: 542313191336 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key 
expressions: _col0 (type: bigint), _col1 (type: bigint) null sort order: zz sort order: ++ Map-reduce partition columns: _col0 (type: bigint), _col1 (type: bigint) - Statistics: Num rows: 4166475379 Data size: 533082215324 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 4238623038 Data size: 542313191336 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col2 (type: int), _col3 (type: decimal(7,2)) Execution mode: vectorized, llap LLAP IO: may be used (ACID table) @@ -363,25 +363,25 @@ STAGE PLANS: outputColumnNames: _col0, _col2, _col3, _col9, _col10 input vertices: 1 Map 20 - Statistics: Num rows: 506324466 Data size: 106684518508 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 508031682 Data size: 107066281820 Basic stats: COMPLETE Column stats: COMPLETE DynamicPartitionHashJoin: true Select Operator expressions: _col0 (type: bigint), if(_col9 is not null, _col9, 0) (type: int), if(_col2 is not null, _col2, 0) (type: int), if(_col10 is not null, _col10, 0) (type: decimal(7,2)), if(_col3 is not null, _col3, 0) (type: decimal(7,2)) outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 506324466 Data size: 106684518508 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 508031682 Data size: 107066281820 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: sum(_col1), sum(_col2), sum(_col3), sum(_col4) keys: _col0 (type: bigint) minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 3247596 Data size: 805403808 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3263172 Data size: 809266656 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: bigint) null sort order: z sort order: + Map-reduce partition columns: _col0 (type: bigint) - Statistics: Num rows: 3247596 Data size: 805403808 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3263172 Data size: 809266656 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: bigint), _col2 (type: bigint), _col3 (type: decimal(17,2)), _col4 (type: decimal(17,2)) Reducer 15 Execution mode: vectorized, llap @@ -509,25 +509,25 @@ STAGE PLANS: outputColumnNames: _col0, _col2, _col3, _col9, _col10 input vertices: 1 Map 12 - Statistics: Num rows: 261773150 Data size: 55215206324 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 263404204 Data size: 55595556224 Basic stats: COMPLETE Column stats: COMPLETE DynamicPartitionHashJoin: true Select Operator expressions: _col0 (type: bigint), if(_col9 is not null, _col9, 0) (type: int), if(_col2 is not null, _col2, 0) (type: int), if(_col10 is not null, _col10, 0) (type: decimal(7,2)), if(_col3 is not null, _col3, 0) (type: decimal(7,2)) outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 261773150 Data size: 55215206324 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 263404204 Data size: 55595556224 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: sum(_col1), sum(_col2), sum(_col3), sum(_col4) keys: _col0 (type: bigint) minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 1694304 Data size: 420187392 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1709992 Data size: 424078016 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: 
_col0 (type: bigint) null sort order: z sort order: + Map-reduce partition columns: _col0 (type: bigint) - Statistics: Num rows: 1694304 Data size: 420187392 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1709992 Data size: 424078016 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: bigint), _col2 (type: bigint), _col3 (type: decimal(17,2)), _col4 (type: decimal(17,2)) Reducer 22 Execution mode: vectorized, llap @@ -541,12 +541,12 @@ STAGE PLANS: outputColumnNames: _col0, _col2, _col3, _col9, _col10 input vertices: 1 Map 27 - Statistics: Num rows: 695952488 Data size: 69405573924 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 699823225 Data size: 69593030336 Basic stats: COMPLETE Column stats: COMPLETE DynamicPartitionHashJoin: true Select Operator expressions: _col0 (type: bigint), if(_col9 is not null, _col9, 0) (type: int), if(_col2 is not null, _col2, 0) (type: int), if(_col10 is not null, _col10, 0) (type: decimal(7,2)), if(_col3 is not null, _col3, 0) (type: decimal(7,2)) outputColumnNames: _col0, _col1, _col2, _col3, _col4 - Statistics: Num rows: 695952488 Data size: 69405573924 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 699823225 Data size: 69593030336 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: sum(_col1), sum(_col2), sum(_col3), sum(_col4) keys: _col0 (type: bigint) diff --git a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/api/utils/DecimalUtils.java b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/api/utils/DecimalUtils.java index e5d8b0b18f58..13b8d115357a 100644 --- a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/api/utils/DecimalUtils.java +++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/api/utils/DecimalUtils.java @@ -21,6 +21,8 @@ import java.nio.ByteBuffer; import java.math.BigDecimal; import java.math.BigInteger; + +import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.metastore.api.Decimal; /** @@ -46,4 +48,8 @@ public static Decimal createThriftDecimal(String s) { public static String createJdoDecimalString(Decimal d) { return new BigDecimal(new BigInteger(d.getUnscaled()), d.getScale()).toString(); } + + public static HiveDecimal getHiveDecimal(Decimal decimal) { + return HiveDecimal.create(new BigInteger(decimal.getUnscaled()), decimal.getScale()); + } } diff --git a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/merge/DecimalColumnStatsMerger.java b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/merge/DecimalColumnStatsMerger.java index 523f848ba444..a1c69171f132 100644 --- a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/merge/DecimalColumnStatsMerger.java +++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/merge/DecimalColumnStatsMerger.java @@ -21,15 +21,17 @@ import org.apache.hadoop.hive.common.histogram.KllHistogramEstimator; import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator; +import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; import org.apache.hadoop.hive.metastore.api.Decimal; +import org.apache.hadoop.hive.metastore.api.utils.DecimalUtils; import 
org.apache.hadoop.hive.metastore.columnstats.cache.DecimalColumnStatsDataInspector; import com.google.common.base.MoreObjects; import static org.apache.hadoop.hive.metastore.columnstats.ColumnsStatsUtils.decimalInspectorFromStats; -import org.apache.commons.lang3.ObjectUtils; +import org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -82,7 +84,7 @@ public Decimal getHighValue(DecimalColumnStatsDataInspector data) { @Override public Decimal mergeLowValue(Decimal oldValue, Decimal newValue) { if (oldValue != null && newValue != null) { - return ObjectUtils.min(oldValue, newValue); + return compareDecimals(oldValue, newValue) < 0 ? oldValue : newValue; } if (oldValue != null || newValue != null) { return MoreObjects.firstNonNull(oldValue, newValue); @@ -93,11 +95,23 @@ public Decimal mergeLowValue(Decimal oldValue, Decimal newValue) { @Override public Decimal mergeHighValue(Decimal oldValue, Decimal newValue) { if (oldValue != null && newValue != null) { - return ObjectUtils.max(oldValue, newValue); + return compareDecimals(oldValue, newValue) < 0 ? newValue : oldValue; } if (oldValue != null || newValue != null) { return MoreObjects.firstNonNull(oldValue, newValue); } return null; } + + /** + * Compare two decimals. + * @param decimal1 a non-null decimal + * @param decimal2 a non-null decimal + * @return see {@link java.util.Comparator#compare(Object, Object)} + */ + private int compareDecimals(Decimal decimal1, Decimal decimal2) { + HiveDecimal d1 = DecimalUtils.getHiveDecimal(decimal1); + HiveDecimal d2 = DecimalUtils.getHiveDecimal(decimal2); + return d1.compareTo(d2); + } } diff --git a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/utils/MetaStoreServerUtils.java b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/utils/MetaStoreServerUtils.java index 84fee0c8fd8d..92eb19899ea1 100644 --- a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/utils/MetaStoreServerUtils.java +++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/utils/MetaStoreServerUtils.java @@ -68,6 +68,7 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.StatsSetupConst; import org.apache.hadoop.hive.common.TableName; +import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.metastore.ColumnType; import org.apache.hadoop.hive.metastore.ExceptionHandler; import org.apache.hadoop.hive.metastore.HiveMetaStore; @@ -252,6 +253,15 @@ public static double decimalToDouble(Decimal decimal) { return new BigDecimal(new BigInteger(decimal.getUnscaled()), decimal.getScale()).doubleValue(); } + public static String decimalToString(Decimal val) { + if (val == null) { + return ""; + } + + HiveDecimal result = HiveDecimal.create(new BigInteger(val.getUnscaled()), val.getScale()); + return (result != null) ? 
result.toString() : ""; + } + private static Pattern getPartitionValidationRegex(Configuration conf) { return Optional.ofNullable( MetastoreConf.getVar(conf, MetastoreConf.ConfVars.PARTITION_NAME_WHITELIST_PATTERN)) diff --git a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/merge/DecimalColumnStatsMergerTest.java b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/merge/DecimalColumnStatsMergerTest.java index 7e19cbfcad39..e21a2a9b0a71 100644 --- a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/merge/DecimalColumnStatsMergerTest.java +++ b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/merge/DecimalColumnStatsMergerTest.java @@ -26,9 +26,12 @@ import org.apache.hadoop.hive.metastore.api.utils.DecimalUtils; import org.apache.hadoop.hive.metastore.columnstats.ColStatsBuilder; import org.apache.hadoop.hive.metastore.columnstats.cache.DecimalColumnStatsDataInspector; +import org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils; import org.junit.Test; import org.junit.experimental.categories.Category; +import java.util.Objects; + import static org.apache.hadoop.hive.metastore.columnstats.merge.ColumnStatsMergerTest.createColumnStatisticsObj; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; @@ -38,22 +41,28 @@ @Category(MetastoreUnitTest.class) public class DecimalColumnStatsMergerTest { - private static final Decimal DECIMAL_1 = DecimalUtils.getDecimal(1, 0); - private static final Decimal DECIMAL_3 = DecimalUtils.getDecimal(3, 0); - private static final Decimal DECIMAL_5 = DecimalUtils.getDecimal(5, 0); - private static final Decimal DECIMAL_20 = DecimalUtils.getDecimal(2, 1); + /** + * Creates a decimal and checks its string representation. 
+ */ + private static Decimal getDecimal(String expected, int number, int scale) { + Decimal d = DecimalUtils.getDecimal(number, scale); + assertEquals(expected, MetaStoreServerUtils.decimalToString(d)); + return d; + } + + private static final Decimal DECIMAL_1 = getDecimal("1", 1, 0); + private static final Decimal DECIMAL_3 = getDecimal("3", 3, 0); + private static final Decimal DECIMAL_5 = getDecimal("5", 5, 0); + private static final Decimal DECIMAL_20 = getDecimal("20", 2, -1); private static final DecimalColumnStatsDataInspector DATA_3 = new DecimalColumnStatsDataInspector(); private static final DecimalColumnStatsDataInspector DATA_5 = new DecimalColumnStatsDataInspector(); - private static final DecimalColumnStatsDataInspector DATA_20 = new DecimalColumnStatsDataInspector(); static { DATA_3.setLowValue(DECIMAL_3); DATA_3.setHighValue(DECIMAL_3); DATA_5.setLowValue(DECIMAL_5); DATA_5.setHighValue(DECIMAL_5); - DATA_20.setLowValue(DECIMAL_20); - DATA_20.setHighValue(DECIMAL_20); } private final DecimalColumnStatsMerger merger = new DecimalColumnStatsMerger(); @@ -180,47 +189,59 @@ public void testMergeNonNullValues() { @Test public void testDecimalCompareEqual() { - assertTrue(DECIMAL_3.equals(DECIMAL_3)); + assertTrue(DECIMAL_3.equals(getDecimal("3", 3, 0))); + // the equals method does not check for numerical equality, + // e.g., DECIMAL_3 is not equal to getDecimal("3", 30, 1) } @Test public void testDecimalCompareDoesntEqual() { assertFalse(DECIMAL_3.equals(DECIMAL_5)); + assertFalse(DECIMAL_3.equals(getDecimal("30", 3, -1))); } - @Test - public void testCompareSimple() { - DecimalColumnStatsDataInspector data1 = new DecimalColumnStatsDataInspector(DATA_3); - DecimalColumnStatsDataInspector data2 = new DecimalColumnStatsDataInspector(DATA_5); - assertEquals(DECIMAL_5, merger.mergeHighValue(merger.getHighValue(data1), merger.getHighValue(data2))); - } - - @Test - public void testCompareSimpleFlipped() { - DecimalColumnStatsDataInspector data1 = new DecimalColumnStatsDataInspector(DATA_5); - DecimalColumnStatsDataInspector data2 = new DecimalColumnStatsDataInspector(DATA_3); - assertEquals(DECIMAL_5, merger.mergeHighValue(merger.getHighValue(data1), merger.getHighValue(data2))); + private void checkMergedValue(Decimal low, Decimal high) { + Objects.requireNonNull(low); + Objects.requireNonNull(high); + assertTrue(MetaStoreServerUtils.decimalToDouble(low) < MetaStoreServerUtils.decimalToDouble(high)); + var data1 = new DecimalColumnStatsDataInspector(); + data1.setLowValue(low); + data1.setHighValue(low); + var data2 = new DecimalColumnStatsDataInspector(); + data2.setLowValue(high); + data2.setHighValue(high); + + assertEquals(low, merger.mergeLowValue(data1.getLowValue(), data2.getLowValue())); + assertEquals(low, merger.mergeLowValue(data2.getLowValue(), data1.getLowValue())); + assertEquals(high, merger.mergeHighValue(data1.getHighValue(), data2.getHighValue())); + assertEquals(high, merger.mergeHighValue(data2.getHighValue(), data1.getHighValue())); } @Test - public void testCompareSimpleReversed() { - DecimalColumnStatsDataInspector data1 = new DecimalColumnStatsDataInspector(DATA_3); - DecimalColumnStatsDataInspector data2 = new DecimalColumnStatsDataInspector(DATA_5); - assertEquals(DECIMAL_3, merger.mergeLowValue(merger.getLowValue(data1), merger.getLowValue(data2))); + public void testCompareSimple() { + checkMergedValue(DECIMAL_3, DECIMAL_5); } @Test - public void testCompareSimpleFlippedReversed() { - DecimalColumnStatsDataInspector data1 = new 
DecimalColumnStatsDataInspector(DATA_5); - DecimalColumnStatsDataInspector data2 = new DecimalColumnStatsDataInspector(DATA_3); - assertEquals(DECIMAL_3, merger.mergeLowValue(merger.getLowValue(data1), merger.getLowValue(data2))); + public void testCompareUnscaledValue() { + checkMergedValue(DECIMAL_3, DECIMAL_20); } @Test - public void testCompareUnscaledValue() { - DecimalColumnStatsDataInspector data1 = new DecimalColumnStatsDataInspector(DATA_3); - DecimalColumnStatsDataInspector data2 = new DecimalColumnStatsDataInspector(DATA_20); - assertEquals(DECIMAL_20, merger.mergeHighValue(merger.getHighValue(data1), merger.getHighValue(data2))); + public void testCompareScaledValue() { + checkMergedValue( + getDecimal("-123.2", -1232, 1), + getDecimal("-10.2", -102, 1)); + + checkMergedValue( + getDecimal("1.02", 102, 2), + getDecimal("123.2", 1232, 1) + ); + + checkMergedValue( + getDecimal("1.02", 102, 2), + getDecimal("1232000", 1232, -3) + ); } @Test From 635e4250692feb6d1f054dd356f3f51305e727c6 Mon Sep 17 00:00:00 2001 From: Thomas Rebele Date: Tue, 18 Nov 2025 16:36:54 +0100 Subject: [PATCH 2/3] HIVE-29275: Stats autogather calculates the min statistic incorrectly --- .../merge/DecimalColumnStatsMerger.java | 1 - .../merge/DecimalColumnStatsMergerTest.java | 19 +++++++++---------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/merge/DecimalColumnStatsMerger.java b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/merge/DecimalColumnStatsMerger.java index a1c69171f132..c47448c8f7d5 100644 --- a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/merge/DecimalColumnStatsMerger.java +++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/merge/DecimalColumnStatsMerger.java @@ -31,7 +31,6 @@ import static org.apache.hadoop.hive.metastore.columnstats.ColumnsStatsUtils.decimalInspectorFromStats; -import org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/merge/DecimalColumnStatsMergerTest.java b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/merge/DecimalColumnStatsMergerTest.java index e21a2a9b0a71..a1bf34730c36 100644 --- a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/merge/DecimalColumnStatsMergerTest.java +++ b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/merge/DecimalColumnStatsMergerTest.java @@ -40,16 +40,6 @@ @Category(MetastoreUnitTest.class) public class DecimalColumnStatsMergerTest { - - /** - * Creates a decimal and checks its string representation. 
- */ - private static Decimal getDecimal(String expected, int number, int scale) { - Decimal d = DecimalUtils.getDecimal(number, scale); - assertEquals(expected, MetaStoreServerUtils.decimalToString(d)); - return d; - } - private static final Decimal DECIMAL_1 = getDecimal("1", 1, 0); private static final Decimal DECIMAL_3 = getDecimal("3", 3, 0); private static final Decimal DECIMAL_5 = getDecimal("5", 5, 0); @@ -67,6 +57,15 @@ private static Decimal getDecimal(String expected, int number, int scale) { private final DecimalColumnStatsMerger merger = new DecimalColumnStatsMerger(); + /** + * Creates a decimal and checks its string representation. + */ + private static Decimal getDecimal(String expected, int number, int scale) { + Decimal d = DecimalUtils.getDecimal(number, scale); + assertEquals(expected, MetaStoreServerUtils.decimalToString(d)); + return d; + } + @Test public void testMergeNullValues() { ColumnStatisticsObj aggrObj = createColumnStatisticsObj(new ColStatsBuilder<>(Decimal.class) From 092198460a3be78624ec35bd6473a329eb8ea1e5 Mon Sep 17 00:00:00 2001 From: Thomas Rebele Date: Mon, 24 Nov 2025 16:05:45 +0100 Subject: [PATCH 3/3] HIVE-29275: Stats autogather calculates the min statistic incorrectly --- .../metastore/api/utils/DecimalUtils.java | 6 ----- .../merge/DecimalColumnStatsMergerTest.java | 22 ++++++++----------- 2 files changed, 9 insertions(+), 19 deletions(-) diff --git a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/api/utils/DecimalUtils.java b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/api/utils/DecimalUtils.java index 13b8d115357a..8647db58faa6 100644 --- a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/api/utils/DecimalUtils.java +++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/api/utils/DecimalUtils.java @@ -30,12 +30,6 @@ */ public class DecimalUtils { - public static Decimal getDecimal(int number, int scale) { - ByteBuffer bb = ByteBuffer.allocate(4); - bb.asIntBuffer().put(number); - return new Decimal((short) scale, bb); - } - public static Decimal getDecimal(ByteBuffer unscaled, short scale) { return new Decimal((short) scale, unscaled); } diff --git a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/merge/DecimalColumnStatsMergerTest.java b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/merge/DecimalColumnStatsMergerTest.java index a1bf34730c36..90e7bbf811a8 100644 --- a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/merge/DecimalColumnStatsMergerTest.java +++ b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/merge/DecimalColumnStatsMergerTest.java @@ -23,13 +23,13 @@ import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; import org.apache.hadoop.hive.metastore.api.Decimal; -import org.apache.hadoop.hive.metastore.api.utils.DecimalUtils; import org.apache.hadoop.hive.metastore.columnstats.ColStatsBuilder; import org.apache.hadoop.hive.metastore.columnstats.cache.DecimalColumnStatsDataInspector; import org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils; import org.junit.Test; import org.junit.experimental.categories.Category; +import java.nio.ByteBuffer; import java.util.Objects; import static 
org.apache.hadoop.hive.metastore.columnstats.merge.ColumnStatsMergerTest.createColumnStatisticsObj; @@ -61,7 +61,9 @@ public class DecimalColumnStatsMergerTest { * Creates a decimal and checks its string representation. */ private static Decimal getDecimal(String expected, int number, int scale) { - Decimal d = DecimalUtils.getDecimal(number, scale); + ByteBuffer bb = ByteBuffer.allocate(4); + bb.asIntBuffer().put(number); + Decimal d = new Decimal((short) scale, bb); assertEquals(expected, MetaStoreServerUtils.decimalToString(d)); return d; } @@ -203,17 +205,11 @@ private void checkMergedValue(Decimal low, Decimal high) { Objects.requireNonNull(low); Objects.requireNonNull(high); assertTrue(MetaStoreServerUtils.decimalToDouble(low) < MetaStoreServerUtils.decimalToDouble(high)); - var data1 = new DecimalColumnStatsDataInspector(); - data1.setLowValue(low); - data1.setHighValue(low); - var data2 = new DecimalColumnStatsDataInspector(); - data2.setLowValue(high); - data2.setHighValue(high); - - assertEquals(low, merger.mergeLowValue(data1.getLowValue(), data2.getLowValue())); - assertEquals(low, merger.mergeLowValue(data2.getLowValue(), data1.getLowValue())); - assertEquals(high, merger.mergeHighValue(data1.getHighValue(), data2.getHighValue())); - assertEquals(high, merger.mergeHighValue(data2.getHighValue(), data1.getHighValue())); + + assertEquals(low, merger.mergeLowValue(low, high)); + assertEquals(low, merger.mergeLowValue(high, low)); + assertEquals(high, merger.mergeHighValue(low, high)); + assertEquals(high, merger.mergeHighValue(high, low)); } @Test
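

A minimal, self-contained sketch of the comparison this fix introduces, for illustration only (it is not part of the patch series): it reuses the Thrift Decimal(short, ByteBuffer) constructor and DecimalUtils.getHiveDecimal exactly as they appear in the patches above, while the class name DecimalMergeOrderingDemo and the main driver are hypothetical. The premise, as the diff suggests, is that the Thrift-generated compareTo consulted by the old ObjectUtils.min/ObjectUtils.max calls orders Decimal by its struct fields (the scale and the raw unscaled bytes) rather than by numeric value, which is how mergeLowValue could previously report a min of -10.2 instead of -123.2.

import java.nio.ByteBuffer;

import org.apache.hadoop.hive.metastore.api.Decimal;
import org.apache.hadoop.hive.metastore.api.utils.DecimalUtils;

public class DecimalMergeOrderingDemo {

  // Builds a Thrift Decimal from a 32-bit unscaled value and a scale,
  // the same way the patched DecimalColumnStatsMergerTest does.
  static Decimal decimal(int unscaled, int scale) {
    ByteBuffer bb = ByteBuffer.allocate(4);
    bb.asIntBuffer().put(unscaled);
    return new Decimal((short) scale, bb);
  }

  // Numeric comparison as introduced by the patch: convert both sides to
  // HiveDecimal and compare the actual values, not the Thrift fields.
  static int compareNumeric(Decimal a, Decimal b) {
    return DecimalUtils.getHiveDecimal(a).compareTo(DecimalUtils.getHiveDecimal(b));
  }

  public static void main(String[] args) {
    Decimal a = decimal(-1232, 1); // -123.2
    Decimal b = decimal(-102, 1);  // -10.2

    // Mirrors the patched mergeLowValue: keep the numerically smaller value.
    Decimal min = compareNumeric(a, b) < 0 ? a : b;
    System.out.println(DecimalUtils.getHiveDecimal(min)); // prints -123.2
  }
}

Comparing through HiveDecimal sidesteps the Thrift field ordering entirely, so values with different scales or unscaled byte lengths (e.g. 1.02 vs. 1232000, as exercised by testCompareScaledValue above) still merge to the numerically correct min and max.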