Commit 235581d

HIVE-28818: Snapshot-level Partition-Aware Optimization
1 parent cf6c961 commit 235581d

File tree

7 files changed: +91 -38 lines

iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergSplit.java

Lines changed: 3 additions & 4 deletions
@@ -24,7 +24,6 @@
 import java.io.DataOutput;
 import java.io.IOException;
 import java.util.Collection;
-import java.util.OptionalInt;
 import java.util.stream.IntStream;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.ql.exec.tez.HashableInputSplit;
@@ -98,17 +97,17 @@ public byte[] getBytesForHash() {
   }
 
   @Override
-  public OptionalInt getBucketId() {
+  public int getBucketId() {
     final StructLike key = innerSplit.taskGroup().groupingKey();
     if (key.size() == 0) {
-      return OptionalInt.empty();
+      throw new IllegalStateException("The grouping key is empty though a bucket id is requested");
     }
     final int[] bucketIds = IntStream
         .range(0, key.size())
        .map(i -> key.get(i, Integer.class))
        .toArray();
     final int hashCode = IcebergBucketFunction.getHashCode(bucketIds);
-    return OptionalInt.of(ObjectInspectorUtils.getBucketNumber(hashCode, numBuckets));
+    return ObjectInspectorUtils.getBucketNumber(hashCode, numBuckets);
   }
 
   @Override
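
Note on the computation above: the split's bucket number is derived from the bucket values stored in the task group's grouping key. A minimal sketch of the final folding step, assuming ObjectInspectorUtils.getBucketNumber follows the usual Hive convention of masking the sign bit and taking the remainder:

// Sketch only; approximates the assumed behavior of ObjectInspectorUtils.getBucketNumber.
static int toBucketNumber(int hashCode, int numBuckets) {
  // Mask the sign bit so a negative hash code still maps into [0, numBuckets).
  return (hashCode & Integer.MAX_VALUE) % numBuckets;
}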

iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java

Lines changed: 24 additions & 7 deletions
@@ -968,25 +968,42 @@ private void addCustomSortExpr(Table table, org.apache.hadoop.hive.ql.metadata.
 
   @Override
   public boolean supportsPartitionAwareOptimization(org.apache.hadoop.hive.ql.metadata.Table table) {
-    if (hasUndergonePartitionEvolution(table)) {
-      // Don't support complex cases yet
+    final Table icebergTable = IcebergTableUtil.getTable(conf, table.getTTable());
+    final Snapshot snapshot = IcebergTableUtil.getTableSnapshot(icebergTable, table);
+    if (snapshot == null) {
+      LOG.info("Partition-Aware Optimization is not supported because an unknown snapshot is specified");
+      return false;
+    }
+
+    final Set<Integer> partitionSpecIds = IcebergTableUtil.getPartitionSpecIds(snapshot, icebergTable.io());
+    if (partitionSpecIds.size() != 1) {
+      LOG.info("Partition-Aware Optimization is not supported when multiple partition specs are combined: {}",
+          partitionSpecIds);
       return false;
     }
-    final List<TransformSpec> specs = getPartitionTransformSpec(table);
+    final int partitionSpecId = partitionSpecIds.iterator().next();
+    final List<TransformSpec> specs = IcebergTableUtil.getTransformSpecs(icebergTable, partitionSpecId);
     // Currently, we support the only bucket transform
     return specs.stream().anyMatch(IcebergTableUtil::isBucket);
   }
 
   @Override
   public PartitionAwareOptimizationCtx createPartitionAwareOptimizationContext(
       org.apache.hadoop.hive.ql.metadata.Table table) {
+    final Table icebergTable = IcebergTableUtil.getTable(conf, table.getTTable());
+    final Snapshot snapshot = Objects.requireNonNull(IcebergTableUtil.getTableSnapshot(icebergTable, table));
+    final Set<Integer> partitionSpecIds = IcebergTableUtil.getPartitionSpecIds(snapshot, icebergTable.io());
+    Preconditions.checkArgument(partitionSpecIds.size() == 1);
+    final int partitionSpecId = partitionSpecIds.iterator().next();
+
     // Currently, we support the only bucket transform
     final List<String> bucketColumnNames = Lists.newArrayList();
     final List<Integer> numBuckets = Lists.newArrayList();
-    getPartitionTransformSpec(table).stream().filter(IcebergTableUtil::isBucket).forEach(spec -> {
-      bucketColumnNames.add(spec.getColumnName());
-      numBuckets.add(spec.getTransformParam().get());
-    });
+    IcebergTableUtil.getTransformSpecs(icebergTable, partitionSpecId).stream().filter(IcebergTableUtil::isBucket)
+        .forEach(spec -> {
+          bucketColumnNames.add(spec.getColumnName());
+          numBuckets.add(spec.getTransformParam().get());
+        });
 
     if (bucketColumnNames.isEmpty()) {
       return null;
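
To summarize the new snapshot-level check: a table whose metadata contains several historical partition specs remains eligible as long as every data file reachable from the scanned snapshot was written with a single spec, and that spec includes a bucket transform. A condensed sketch of the eligibility logic above; the helper name isEligible is invented for illustration:

// Condensed illustration of supportsPartitionAwareOptimization; helper name is hypothetical.
static boolean isEligible(Table icebergTable, Snapshot snapshot) {
  if (snapshot == null) {
    return false;  // unknown snapshot, cannot reason about its files
  }
  Set<Integer> specIds = IcebergTableUtil.getPartitionSpecIds(snapshot, icebergTable.io());
  if (specIds.size() != 1) {
    return false;  // data files from mixed partition specs are not supported yet
  }
  return IcebergTableUtil.getTransformSpecs(icebergTable, specIds.iterator().next()).stream()
      .anyMatch(IcebergTableUtil::isBucket);  // at least one bucket(...) transform required
}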

iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/IcebergTableUtil.java

Lines changed: 25 additions & 3 deletions
@@ -27,9 +27,11 @@
 import java.util.Map;
 import java.util.Optional;
 import java.util.Properties;
+import java.util.Set;
 import java.util.function.BinaryOperator;
 import java.util.function.Function;
 import java.util.stream.Collectors;
+import java.util.stream.StreamSupport;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
@@ -53,9 +55,13 @@
 import org.apache.hadoop.hive.ql.plan.PlanUtils;
 import org.apache.hadoop.hive.ql.session.SessionState;
 import org.apache.hadoop.hive.ql.session.SessionStateUtil;
+import org.apache.iceberg.DataFile;
 import org.apache.iceberg.DeleteFiles;
 import org.apache.iceberg.FileScanTask;
 import org.apache.iceberg.ManageSnapshots;
+import org.apache.iceberg.ManifestFile;
+import org.apache.iceberg.ManifestFiles;
+import org.apache.iceberg.ManifestReader;
 import org.apache.iceberg.MetadataTableType;
 import org.apache.iceberg.MetadataTableUtils;
 import org.apache.iceberg.PartitionData;
@@ -78,6 +84,7 @@
 import org.apache.iceberg.expressions.ResidualEvaluator;
 import org.apache.iceberg.hive.HiveSchemaUtil;
 import org.apache.iceberg.io.CloseableIterable;
+import org.apache.iceberg.io.FileIO;
 import org.apache.iceberg.mr.Catalogs;
 import org.apache.iceberg.mr.InputFormatConfig;
 import org.apache.iceberg.relocated.com.google.common.collect.FluentIterable;
@@ -546,9 +553,24 @@ public static PartitionSpec getPartitionSpec(Table icebergTable, String partitio
   }
 
   public static TransformSpec getTransformSpec(Table table, String transformName, int sourceId) {
-    TransformSpec spec = TransformSpec.fromString(transformName.toUpperCase(),
-        table.schema().findColumnName(sourceId));
-    return spec;
+    return TransformSpec.fromString(transformName.toUpperCase(), table.schema().findColumnName(sourceId));
   }
 
+  public static List<TransformSpec> getTransformSpecs(Table table, int partitionSpecId) {
+    final PartitionSpec icebergSpec = table.specs().get(partitionSpecId);
+    return icebergSpec.fields().stream()
+        .map(f -> getTransformSpec(table, f.transform().toString(), f.sourceId()))
+        .collect(Collectors.toList());
+  }
+
+  public static Set<Integer> getPartitionSpecIds(Snapshot snapshot, FileIO io) {
+    final List<ManifestFile> manifestFiles = snapshot.allManifests(io);
+    return manifestFiles.parallelStream().flatMap(manifestFile -> {
+      try (ManifestReader<DataFile> entries = ManifestFiles.read(manifestFile, io)) {
+        return StreamSupport.stream(entries.spliterator(), false).map(DataFile::specId);
+      } catch (IOException e) {
+        throw new RuntimeException("Failed to read manifest file: " + manifestFile.path(), e);
+      }
+    }).collect(Collectors.toSet());
+  }
 }
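
A short usage sketch of the two new helpers; the Table instance named table and the printing are assumptions for illustration:

// Illustrative usage; "table" is assumed to be an already loaded Iceberg Table with a snapshot.
Snapshot current = table.currentSnapshot();
Set<Integer> specIds = IcebergTableUtil.getPartitionSpecIds(current, table.io());
if (specIds.size() == 1) {
  // A single id means every data file in the snapshot was written with the same partition spec,
  // so that spec's transforms describe the whole snapshot.
  List<TransformSpec> transforms = IcebergTableUtil.getTransformSpecs(table, specIds.iterator().next());
  transforms.stream().filter(IcebergTableUtil::isBucket)
      .forEach(t -> System.out.println("bucket(" + t.getTransformParam().get() + ", " + t.getColumnName() + ")"));
}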

iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java

Lines changed: 18 additions & 5 deletions
@@ -21,8 +21,10 @@
 
 import java.io.IOException;
 import java.io.UncheckedIOException;
+import java.util.Collections;
 import java.util.List;
 import java.util.Optional;
+import java.util.Set;
 import java.util.concurrent.ExecutorService;
 import java.util.function.Consumer;
 import org.apache.commons.lang3.StringUtils;
@@ -45,10 +47,12 @@
 import org.apache.iceberg.DataTableScan;
 import org.apache.iceberg.FileScanTask;
 import org.apache.iceberg.IncrementalAppendScan;
+import org.apache.iceberg.PartitionSpec;
 import org.apache.iceberg.Partitioning;
 import org.apache.iceberg.Scan;
 import org.apache.iceberg.ScanTaskGroup;
 import org.apache.iceberg.Schema;
+import org.apache.iceberg.Snapshot;
 import org.apache.iceberg.SnapshotRef;
 import org.apache.iceberg.SystemConfigs;
 import org.apache.iceberg.Table;
@@ -60,6 +64,8 @@
 import org.apache.iceberg.mr.Catalogs;
 import org.apache.iceberg.mr.InputFormatConfig;
 import org.apache.iceberg.mr.hive.HiveIcebergStorageHandler;
+import org.apache.iceberg.mr.hive.IcebergTableUtil;
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.types.Types.StructType;
 import org.apache.iceberg.util.SerializationUtil;
@@ -197,11 +203,15 @@ private List<InputSplit> planInputSplits(Table table, Configuration conf, Execut
         InputFormatConfig.InMemoryDataModel.GENERIC);
 
     long fromVersion = conf.getLong(InputFormatConfig.SNAPSHOT_ID_INTERVAL_FROM, -1);
+    Snapshot snapshot;
     Scan<? extends Scan, FileScanTask, CombinedScanTask> scan;
     if (fromVersion != -1) {
+      snapshot = table.currentSnapshot();
       scan = applyConfig(conf, createIncrementalAppendScan(table, conf));
     } else {
-      scan = applyConfig(conf, createTableScan(table, conf));
+      TableScan tableScan = createTableScan(table, conf);
+      snapshot = tableScan.snapshot();
+      scan = applyConfig(conf, tableScan);
     }
     scan = scan.planWith(workerPool);
 
@@ -211,7 +221,7 @@ private List<InputSplit> planInputSplits(Table table, Configuration conf, Execut
     Path tableLocation = new Path(conf.get(InputFormatConfig.TABLE_LOCATION));
 
     String[] groupingPartitionColumns = conf.getStrings(InputFormatConfig.GROUPING_PARTITION_COLUMNS);
-    generateInputSplits(scan, table, groupingPartitionColumns, taskGroup -> {
+    generateInputSplits(scan, table, snapshot, groupingPartitionColumns, taskGroup -> {
       if (applyResidual && (model == InputFormatConfig.InMemoryDataModel.HIVE ||
           model == InputFormatConfig.InMemoryDataModel.PIG)) {
         // TODO: We do not support residual evaluation for HIVE and PIG in memory data model yet
@@ -241,7 +251,7 @@ private static void validateFileLocations(ScanTaskGroup<FileScanTask> split, Pat
     }
   }
 
-  private static void generateInputSplits(Scan<?, FileScanTask, CombinedScanTask> scan, Table table,
+  private static void generateInputSplits(Scan<?, FileScanTask, CombinedScanTask> scan, Table table, Snapshot snapshot,
      String[] groupingPartitionColumns, Consumer<ScanTaskGroup<FileScanTask>> consumer) {
     if (groupingPartitionColumns == null) {
      try (CloseableIterable<CombinedScanTask> tasksIterable = scan.planTasks()) {
@@ -250,8 +260,11 @@ private static void generateInputSplits(Scan<?, FileScanTask, CombinedScanTask>
        throw new UncheckedIOException(String.format("Failed to close table scan: %s", scan), e);
      }
    } else {
-      final StructType groupingKeyType = Partitioning.groupingKeyType(
-          table.schema().select(groupingPartitionColumns), table.specs().values());
+      final Schema schema = table.schemas().get(snapshot.schemaId()).select(groupingPartitionColumns);
+      final Set<Integer> specIds = IcebergTableUtil.getPartitionSpecIds(snapshot, table.io());
+      Preconditions.checkArgument(specIds.size() == 1);
+      final PartitionSpec partitionSpec = table.specs().get(specIds.iterator().next());
+      final StructType groupingKeyType = Partitioning.groupingKeyType(schema, Collections.singletonList(partitionSpec));
      try (CloseableIterable<FileScanTask> taskIterable = scan.planFiles()) {
        final List<FileScanTask> tasks = Lists.newArrayList(taskIterable);
        final List<ScanTaskGroup<FileScanTask>> partitionScanTaskGroups =
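
The central change in generateInputSplits is that the grouping key type is now derived from the snapshot's own schema and the single partition spec its data files use, rather than from every historical spec of the table. A hedged sketch of that derivation; the wrapper method is invented, and the reading of Partitioning.groupingKeyType (it keeps only fields consistent across the specs it is given) is an assumption:

// Sketch of the new grouping-key derivation; the method wrapper and comments are illustrative.
static StructType groupingKeyTypeFor(Table table, Snapshot snapshot, String[] groupingPartitionColumns) {
  Schema snapshotSchema = table.schemas().get(snapshot.schemaId()).select(groupingPartitionColumns);
  Set<Integer> specIds = IcebergTableUtil.getPartitionSpecIds(snapshot, table.io());
  Preconditions.checkArgument(specIds.size() == 1, "expected exactly one partition spec in the snapshot");
  PartitionSpec spec = table.specs().get(specIds.iterator().next());
  // Passing only the snapshot's spec keeps its bucket field in the grouping key, whereas
  // passing all historical specs could drop fields that are not common to every spec.
  return Partitioning.groupingKeyType(snapshotSchema, Collections.singletonList(spec));
}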

iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_minor_compaction_bucket.q.out

Lines changed: 15 additions & 15 deletions
@@ -291,7 +291,7 @@ POSTHOOK: Input: default@srcbucket_big
 Plan optimized by CBO.
 
 Vertex dependency in root stage
-Map 1 <- Map 3 (BROADCAST_EDGE)
+Map 1 <- Map 3 (CUSTOM_EDGE)
 Reducer 2 <- Map 1 (SIMPLE_EDGE)
 
 Stage-0
@@ -305,9 +305,9 @@ Stage-0
 <-Map 1 [SIMPLE_EDGE] vectorized, llap
 SHUFFLE [RS_35]
 Map Join Operator [MAPJOIN_34] (rows=6 width=271)
-Conds:SEL_33._col1=RS_31._col0(Inner),Output:["_col0","_col1","_col2","_col3","_col4"]
-<-Map 3 [BROADCAST_EDGE] vectorized, llap
-BROADCAST [RS_31]
+BucketMapJoin:true,Conds:SEL_33._col1=RS_31._col0(Inner),Output:["_col0","_col1","_col2","_col3","_col4"]
+<-Map 3 [CUSTOM_EDGE] vectorized, llap
+MULTICAST [RS_31]
 PartitionCols:_col0
 Select Operator [SEL_30] (rows=4 width=93)
 Output:["_col0","_col1"]
@@ -320,7 +320,7 @@ Stage-0
 Filter Operator [FIL_32] (rows=6 width=178)
 predicate:key is not null
 TableScan [TS_0] (rows=6 width=178)
-default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Output:["id","key","value"]
+default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Grouping Num Buckets:4,Grouping Partition Columns:["key"],Output:["id","key","value"]
 
 PREHOOK: query: SELECT *
 FROM default.srcbucket_big.tag_bucket_4 a
@@ -550,7 +550,7 @@ POSTHOOK: Input: default@srcbucket_big
 Plan optimized by CBO.
 
 Vertex dependency in root stage
-Map 1 <- Map 3 (BROADCAST_EDGE)
+Map 1 <- Map 3 (CUSTOM_EDGE)
 Reducer 2 <- Map 1 (SIMPLE_EDGE)
 
 Stage-0
@@ -564,9 +564,9 @@ Stage-0
 <-Map 1 [SIMPLE_EDGE] vectorized, llap
 SHUFFLE [RS_35]
 Map Join Operator [MAPJOIN_34] (rows=8 width=250)
-Conds:SEL_33._col1=RS_31._col0(Inner),Output:["_col0","_col1","_col2","_col3","_col4"]
-<-Map 3 [BROADCAST_EDGE] vectorized, llap
-BROADCAST [RS_31]
+BucketMapJoin:true,Conds:SEL_33._col1=RS_31._col0(Inner),Output:["_col0","_col1","_col2","_col3","_col4"]
+<-Map 3 [CUSTOM_EDGE] vectorized, llap
+MULTICAST [RS_31]
 PartitionCols:_col0
 Select Operator [SEL_30] (rows=4 width=93)
 Output:["_col0","_col1"]
@@ -579,7 +579,7 @@ Stage-0
 Filter Operator [FIL_32] (rows=8 width=157)
 predicate:key is not null
 TableScan [TS_0] (rows=8 width=157)
-default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Output:["id","key","value"]
+default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Grouping Num Buckets:8,Grouping Partition Columns:["key"],Output:["id","key","value"]
 
 PREHOOK: query: SELECT *
 FROM default.srcbucket_big a
@@ -626,7 +626,7 @@ POSTHOOK: Input: default@srcbucket_big
 Plan optimized by CBO.
 
 Vertex dependency in root stage
-Map 1 <- Map 3 (BROADCAST_EDGE)
+Map 1 <- Map 3 (CUSTOM_EDGE)
 Reducer 2 <- Map 1 (SIMPLE_EDGE)
 
 Stage-0
@@ -640,9 +640,9 @@ Stage-0
 <-Map 1 [SIMPLE_EDGE] vectorized, llap
 SHUFFLE [RS_35]
 Map Join Operator [MAPJOIN_34] (rows=6 width=271)
-Conds:SEL_33._col1=RS_31._col0(Inner),Output:["_col0","_col1","_col2","_col3","_col4"]
-<-Map 3 [BROADCAST_EDGE] vectorized, llap
-BROADCAST [RS_31]
+BucketMapJoin:true,Conds:SEL_33._col1=RS_31._col0(Inner),Output:["_col0","_col1","_col2","_col3","_col4"]
+<-Map 3 [CUSTOM_EDGE] vectorized, llap
+MULTICAST [RS_31]
 PartitionCols:_col0
 Select Operator [SEL_30] (rows=4 width=93)
 Output:["_col0","_col1"]
@@ -655,7 +655,7 @@ Stage-0
 Filter Operator [FIL_32] (rows=6 width=178)
 predicate:key is not null
 TableScan [TS_0] (rows=6 width=178)
-default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Output:["id","key","value"]
+default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Grouping Num Buckets:4,Grouping Partition Columns:["key"],Output:["id","key","value"]
 
 PREHOOK: query: SELECT *
 FROM default.srcbucket_big.tag_bucket_4 a

ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java

Lines changed: 4 additions & 1 deletion
@@ -71,6 +71,7 @@
 import org.apache.hadoop.mapred.RecordReader;
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.hadoop.mapred.TextInputFormat;
+import org.apache.hadoop.util.Preconditions;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hive.common.util.HiveStringUtils;
 import org.apache.hive.common.util.Ref;
@@ -174,7 +175,9 @@ public String inputFormatClassName() {
 
     public OptionalInt getBucketId() {
       if (inputSplit instanceof PartitionAwareSplit) {
-        return ((PartitionAwareSplit) inputSplit).getBucketId();
+        final int bucketId = ((PartitionAwareSplit) inputSplit).getBucketId();
+        Preconditions.checkArgument(bucketId >= 0);
+        return OptionalInt.of(bucketId);
       }
 
       final int bucketId = Utilities.parseSplitBucket(inputSplit);
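
With PartitionAwareSplit.getBucketId() now returning a plain int (see the interface change below), the contract is that a partition-aware split always yields a valid, non-negative bucket id, and the OptionalInt wrapping moves up to this caller. A condensed sketch of that contract; the helper name bucketIdOf is hypothetical:

// Caller-side illustration; mirrors the branch above with an invented helper name.
static OptionalInt bucketIdOf(InputSplit inputSplit) {
  if (inputSplit instanceof PartitionAwareSplit) {
    // The split itself throws if it has no grouping key, so any returned value is a real bucket id.
    int bucketId = ((PartitionAwareSplit) inputSplit).getBucketId();
    return OptionalInt.of(bucketId);
  }
  return OptionalInt.empty();  // other splits fall back to path-based bucket parsing
}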

ql/src/java/org/apache/hadoop/hive/ql/io/PartitionAwareSplit.java

Lines changed: 2 additions & 3 deletions
@@ -18,7 +18,6 @@
 
 package org.apache.hadoop.hive.ql.io;
 
-import java.util.OptionalInt;
 import org.apache.hadoop.hive.common.classification.InterfaceStability.Unstable;
 
 /**
@@ -27,8 +26,8 @@
 @Unstable
 public interface PartitionAwareSplit {
   /**
-   * Returns the bucket number of this split. OptionalInt.empty if this is not a bucketed split.
+   * Returns the bucket number of this split
    */
   @Unstable
-  OptionalInt getBucketId();
+  int getBucketId();
 }
