
Commit 3520f4f

HIVE-28590: Iceberg: Add support for FILE_SIZE_THRESHOLD to compaction command (#5540) (Dmitriy Fingerman reviewed by Denys Kuzmenko)
1 parent 341f597 commit 3520f4f

44 files changed: +1120 -196 lines


common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (-2)

@@ -2245,8 +2245,6 @@ public static enum ConfVars {
         "If this is set to true the URI for auth will have the default location masked with DEFAULT_TABLE_LOCATION"),
     HIVE_ICEBERG_ALLOW_DATAFILES_IN_TABLE_LOCATION_ONLY("hive.iceberg.allow.datafiles.in.table.location.only", false,
         "If this is set to true, then all the data files being read should be withing the table location"),
-    HIVE_ICEBERG_COMPACTION_TARGET_FILE_SIZE("hive.iceberg.compaction.target.file.size", "128mb",
-        new SizeValidator(), "Target file size for Iceberg compaction."),
     HIVE_USE_EXPLICIT_RCFILE_HEADER("hive.exec.rcfile.use.explicit.header", true,
         "If this is set the header for RCFiles will simply be RCF. If this is not\n" +
         "set the header will be that borrowed from sequence files, e.g. SEQ- followed\n" +

iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergOutputCommitter.java (+12 -4)

@@ -50,6 +50,7 @@
 import org.apache.hadoop.hive.ql.metadata.HiveUtils;
 import org.apache.hadoop.hive.ql.plan.MapWork;
 import org.apache.hadoop.hive.ql.session.SessionStateUtil;
+import org.apache.hadoop.hive.ql.txn.compactor.CompactorContext;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.JobContext;
 import org.apache.hadoop.mapred.JobContextImpl;
@@ -516,7 +517,13 @@ private void commitTable(FileIO io, ExecutorService executor, OutputTable output
           .map(x -> x.getJobConf().get(IcebergCompactionService.PARTITION_PATH))
           .orElse(null);

-      commitCompaction(table, snapshotId, startTime, filesForCommit, partitionPath);
+      long fileSizeThreshold = jobContexts.stream()
+          .findAny()
+          .map(x -> x.getJobConf().get(CompactorContext.COMPACTION_FILE_SIZE_THRESHOLD))
+          .map(Long::parseLong)
+          .orElse(-1L);
+
+      commitCompaction(table, snapshotId, startTime, filesForCommit, partitionPath, fileSizeThreshold);
     } else {
       commitOverwrite(table, branchName, snapshotId, startTime, filesForCommit);
     }
@@ -614,9 +621,10 @@ private void commit(SnapshotUpdate<?> update) {
    * @param partitionPath The path of the compacted partition
    */
   private void commitCompaction(Table table, Long snapshotId, long startTime, FilesForCommit results,
-      String partitionPath) {
-    List<DataFile> existingDataFiles = IcebergCompactionUtil.getDataFiles(table, partitionPath);
-    List<DeleteFile> existingDeleteFiles = IcebergCompactionUtil.getDeleteFiles(table, partitionPath);
+      String partitionPath, long fileSizeThreshold) {
+    List<DataFile> existingDataFiles = IcebergCompactionUtil.getDataFiles(table, partitionPath, fileSizeThreshold);
+    List<DeleteFile> existingDeleteFiles = fileSizeThreshold == -1 ?
+        IcebergCompactionUtil.getDeleteFiles(table, partitionPath) : Collections.emptyList();

     RewriteFiles rewriteFiles = table.newRewrite();
     existingDataFiles.forEach(rewriteFiles::deleteFile);
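
A minimal sketch of the rule this diff introduces at commit time (the helper class below is hypothetical; only the -1 sentinel and the comparisons come from the change): a positive threshold, set by the minor-compaction path, limits the rewrite to data files below that size and leaves existing delete files untouched, while -1 keeps the old behaviour of replacing every data and delete file in scope.

  // Illustration only; mirrors the threshold handling added to commitCompaction() above.
  final class ThresholdRule {
    // -1 means "no threshold": every data file in scope is rewritten (major compaction).
    static boolean rewriteDataFile(long fileSizeInBytes, long fileSizeThreshold) {
      return fileSizeThreshold == -1 || fileSizeInBytes < fileSizeThreshold;
    }

    // Existing delete files are only dropped when there is no threshold, i.e. for major compaction.
    static boolean dropExistingDeleteFiles(long fileSizeThreshold) {
      return fileSizeThreshold == -1;
    }
  }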

iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/compaction/IcebergCompactionService.java (+7 -4)

@@ -26,7 +26,7 @@
 import org.apache.hadoop.hive.ql.txn.compactor.CompactorUtil;
 import org.apache.hadoop.hive.ql.txn.compactor.service.CompactionService;
 import org.apache.iceberg.mr.hive.IcebergTableUtil;
-import org.apache.iceberg.mr.hive.compaction.evaluator.IcebergCompactionEvaluator;
+import org.apache.iceberg.mr.hive.compaction.evaluator.CompactionEvaluator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -40,8 +40,9 @@ public IcebergCompactionService() {

   public Boolean compact(Table table, CompactionInfo ci) throws Exception {

-    if (!ci.isMajorCompaction()) {
-      ci.errorMessage = "Presently Iceberg tables support only Major compaction";
+    if (!ci.isMajorCompaction() && !ci.isMinorCompaction()) {
+      ci.errorMessage = String.format(
+          "Iceberg tables do not support %s compaction type, supported types are ['MINOR', 'MAJOR']", ci.type.name());
       LOG.error(ci.errorMessage + " Compaction info: {}", ci);
       try {
         msc.markRefused(CompactionInfo.compactionInfoToStruct(ci));
@@ -53,7 +54,9 @@ public Boolean compact(Table table, CompactionInfo ci) throws Exception {
     CompactorUtil.checkInterrupt(CLASS_NAME);

     org.apache.iceberg.Table icebergTable = IcebergTableUtil.getTable(conf, table);
-    if (!IcebergCompactionEvaluator.isEligibleForCompaction(icebergTable, ci.partName, ci.type, conf)) {
+    CompactionEvaluator compactionEvaluator = new CompactionEvaluator(icebergTable, ci,
+        table.getParameters());
+    if (!compactionEvaluator.isEligibleForCompaction()) {
       LOG.info("Table={}{} doesn't meet requirements for compaction", table.getTableName(),
           ci.partName == null ? "" : ", partition=" + ci.partName);
       msc.markRefused(CompactionInfo.compactionInfoToStruct(ci));

iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/compaction/IcebergCompactionUtil.java (+8 -2)

@@ -55,6 +55,12 @@ public static boolean shouldIncludeForCompaction(Table table, String partitionPa
         table.specs().get(file.specId()).partitionToPath(file.partition()).equals(partitionPath);
   }

+  public static boolean shouldIncludeForCompaction(Table table, String partitionPath, ContentFile<?> file,
+      long fileSizeThreshold) {
+    return shouldIncludeForCompaction(table, partitionPath, file) &&
+        (fileSizeThreshold == -1 || file.fileSizeInBytes() < fileSizeThreshold);
+  }
+
   /**
    * Returns table's list of data files as following:
    * 1. If the table is unpartitioned, returns all data files.
@@ -63,13 +69,13 @@ public static boolean shouldIncludeForCompaction(Table table, String partitionPa
    * @param table the iceberg table
    * @param partitionPath partition path
    */
-  public static List<DataFile> getDataFiles(Table table, String partitionPath) {
+  public static List<DataFile> getDataFiles(Table table, String partitionPath, long fileSizeThreshold) {
     CloseableIterable<FileScanTask> fileScanTasks =
         table.newScan().useSnapshot(table.currentSnapshot().snapshotId()).ignoreResiduals().planFiles();
     CloseableIterable<FileScanTask> filteredFileScanTasks =
         CloseableIterable.filter(fileScanTasks, t -> {
           DataFile file = t.asFileScanTask().file();
-          return shouldIncludeForCompaction(table, partitionPath, file);
+          return shouldIncludeForCompaction(table, partitionPath, file, fileSizeThreshold);
         });
     return Lists.newArrayList(CloseableIterable.transform(filteredFileScanTasks, t -> t.file()));
   }
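
Hedged usage example for the widened getDataFiles signature (the table handle, partition path and the 16 MB figure are hypothetical):

  // No threshold: equivalent to the old two-argument behaviour, every data file in scope is returned.
  List<DataFile> allFiles = IcebergCompactionUtil.getDataFiles(table, partitionPath, -1L);
  // With a threshold: only data files smaller than 16 MB are returned for rewriting.
  List<DataFile> smallFiles = IcebergCompactionUtil.getDataFiles(table, partitionPath, 16L * 1024 * 1024);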

iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/compaction/IcebergMajorQueryCompactor.java → iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/compaction/IcebergQueryCompactor.java (renamed, +21 -7)

@@ -26,6 +26,7 @@
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
 import org.apache.hadoop.hive.metastore.Warehouse;
+import org.apache.hadoop.hive.metastore.api.CompactionType;
 import org.apache.hadoop.hive.metastore.txn.entities.CompactionInfo;
 import org.apache.hadoop.hive.ql.Context.RewritePolicy;
 import org.apache.hadoop.hive.ql.DriverUtils;
@@ -40,12 +41,13 @@
 import org.apache.hive.iceberg.org.apache.orc.storage.common.TableName;
 import org.apache.iceberg.Table;
 import org.apache.iceberg.mr.hive.IcebergTableUtil;
+import org.apache.iceberg.mr.hive.compaction.evaluator.CompactionEvaluator;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-public class IcebergMajorQueryCompactor extends QueryCompactor {
+public class IcebergQueryCompactor extends QueryCompactor {

-  private static final Logger LOG = LoggerFactory.getLogger(IcebergMajorQueryCompactor.class.getName());
+  private static final Logger LOG = LoggerFactory.getLogger(IcebergQueryCompactor.class.getName());

   @Override
   public boolean run(CompactorContext context) throws IOException, HiveException, InterruptedException {
@@ -62,20 +64,32 @@ public boolean run(CompactorContext context) throws IOException, HiveException,
     Table icebergTable = IcebergTableUtil.getTable(conf, table.getTTable());
     String compactionQuery;
     String orderBy = ci.orderByClause == null ? "" : ci.orderByClause;
+    String fileSizePredicate = null;
+
+    if (ci.type == CompactionType.MINOR) {
+      long fileSizeInBytesThreshold = CompactionEvaluator.getFragmentSizeBytes(table.getParameters());
+      fileSizePredicate = String.format("%1$s in (select file_path from %2$s.files where file_size_in_bytes < %3$d)",
+          VirtualColumn.FILE_PATH.getName(), compactTableName, fileSizeInBytesThreshold);
+      conf.setLong(CompactorContext.COMPACTION_FILE_SIZE_THRESHOLD, fileSizeInBytesThreshold);
+      // IOW query containing a join with Iceberg .files metadata table fails with exception that Iceberg AVRO format
+      // doesn't support vectorization, hence disabling it in this case.
+      conf.setBoolVar(ConfVars.HIVE_VECTORIZATION_ENABLED, false);
+    }

     if (partSpec == null) {
       if (!icebergTable.spec().isPartitioned()) {
         HiveConf.setVar(conf, ConfVars.REWRITE_POLICY, RewritePolicy.FULL_TABLE.name());
-        compactionQuery = String.format("insert overwrite table %s select * from %<s %2$s", compactTableName, orderBy);
+        compactionQuery = String.format("insert overwrite table %s select * from %<s %2$s %3$s", compactTableName,
+            fileSizePredicate == null ? "" : "where " + fileSizePredicate, orderBy);
       } else if (icebergTable.specs().size() > 1) {
         // Compacting partitions of old partition specs on a partitioned table with partition evolution
         HiveConf.setVar(conf, ConfVars.REWRITE_POLICY, RewritePolicy.PARTITION.name());
         // A single filter on a virtual column causes errors during compilation,
         // added another filter on file_path as a workaround.
         compactionQuery = String.format("insert overwrite table %1$s select * from %1$s " +
-            "where %2$s != %3$d and %4$s is not null %5$s",
+            "where %2$s != %3$d and %4$s is not null %5$s %6$s",
             compactTableName, VirtualColumn.PARTITION_SPEC_ID.getName(), icebergTable.spec().specId(),
-            VirtualColumn.FILE_PATH.getName(), orderBy);
+            VirtualColumn.FILE_PATH.getName(), fileSizePredicate == null ? "" : "and " + fileSizePredicate, orderBy);
       } else {
         // Partitioned table without partition evolution with partition spec as null in the compaction request - this
         // code branch is not supposed to be reachable
@@ -90,8 +104,8 @@ public boolean run(CompactorContext context) throws IOException, HiveException,
       Warehouse.makeSpecFromName(partSpecMap, new Path(partSpec), null);

       compactionQuery = String.format("insert overwrite table %1$s select * from %1$s where %2$s=%3$d " +
-          "and %4$s is not null %5$s", compactTableName, VirtualColumn.PARTITION_HASH.getName(), partitionHash,
-          VirtualColumn.FILE_PATH.getName(), orderBy);
+          "and %4$s is not null %5$s %6$s", compactTableName, VirtualColumn.PARTITION_HASH.getName(), partitionHash,
+          VirtualColumn.FILE_PATH.getName(), fileSizePredicate == null ? "" : "and " + fileSizePredicate, orderBy);
     }

     SessionState sessionState = setupQueryCompactionSession(conf, ci, tblProperties);
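
For orientation, the MINOR-compaction query built by the unpartitioned branch above roughly takes the shape below; the table name and the byte threshold are hypothetical, FILE__PATH stands in for whatever VirtualColumn.FILE_PATH.getName() returns, and any ORDER BY clause from the request is appended at the end:

  insert overwrite table db.tbl select * from db.tbl
  where FILE__PATH in (select file_path from db.tbl.files where file_size_in_bytes < 100663296)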

iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/compaction/evaluator/CompactionEvaluator.java (new file, +193)

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.iceberg.mr.hive.compaction.evaluator;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Collections;
import java.util.Map;
import java.util.Optional;
import org.apache.hadoop.hive.metastore.txn.entities.CompactionInfo;
import org.apache.hadoop.hive.ql.txn.compactor.CompactorContext;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.MetadataTableType;
import org.apache.iceberg.MetadataTableUtils;
import org.apache.iceberg.PartitionData;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Partitioning;
import org.apache.iceberg.PartitionsTable;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.Table;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.mr.hive.IcebergTableUtil;
import org.apache.iceberg.mr.hive.compaction.IcebergCompactionUtil;
import org.apache.iceberg.mr.hive.compaction.evaluator.amoro.CommonPartitionEvaluator;
import org.apache.iceberg.mr.hive.compaction.evaluator.amoro.IcebergTableFileScanHelper;
import org.apache.iceberg.mr.hive.compaction.evaluator.amoro.OptimizingConfig;
import org.apache.iceberg.mr.hive.compaction.evaluator.amoro.TableConfiguration;
import org.apache.iceberg.mr.hive.compaction.evaluator.amoro.TableFileScanHelper;
import org.apache.iceberg.mr.hive.compaction.evaluator.amoro.TableFormat;
import org.apache.iceberg.mr.hive.compaction.evaluator.amoro.TableProperties;
import org.apache.iceberg.mr.hive.compaction.evaluator.amoro.TableRuntime;
import org.apache.iceberg.mr.hive.compaction.evaluator.amoro.TableRuntimeMeta;
import org.apache.iceberg.relocated.com.google.common.collect.FluentIterable;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.util.Pair;
import org.apache.iceberg.util.StructProjection;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class CompactionEvaluator extends CommonPartitionEvaluator {

  private static final long LAST_OPTIMIZE_TIME = 0;
  private static final int TRIGGER_INTERVAL = -1;
  private final Table table;
  private final CompactionInfo ci;

  private static final Logger LOG = LoggerFactory.getLogger(CompactionEvaluator.class);

  public CompactionEvaluator(Table table, CompactionInfo ci, Map<String, String> parameters) throws IOException {
    super(
        createTableRuntime(table, parameters),
        getPartitionSpecStructPair(table, ci.partName),
        System.currentTimeMillis()
    );
    this.table = table;
    this.ci = ci;
    addFiles();
  }

  public boolean isEligibleForCompaction() {

    if (table.currentSnapshot() == null) {
      LOG.info("Table {}{} doesn't require compaction because it is empty", table,
          ci.partName == null ? "" : " partition " + ci.partName);
      return false;
    }

    addFiles();

    switch (ci.type) {
      case MINOR:
        return isMinorNecessary();
      case MAJOR:
        return isMajorNecessary();
      default:
        return false;
    }
  }

  private static TableRuntime createTableRuntime(Table icebergTable, Map<String, String> parameters) {
    OptimizingConfig optimizingConfig = OptimizingConfig.parse(Collections.emptyMap());
    optimizingConfig.setTargetSize(getTargetSizeBytes(parameters));
    optimizingConfig.setFragmentRatio(getFragmentRatio(parameters));
    optimizingConfig.setMinTargetSizeRatio(getMinTargetSizeRatio(parameters));
    optimizingConfig.setMinorLeastFileCount(getMinInputFiles(parameters));
    optimizingConfig.setMajorDuplicateRatio(getDeleteFileRatio(parameters));
    optimizingConfig.setFullTriggerInterval(TRIGGER_INTERVAL);
    optimizingConfig.setMinorLeastInterval(TRIGGER_INTERVAL);

    TableConfiguration tableConfig = new TableConfiguration();
    tableConfig.setOptimizingConfig(optimizingConfig);

    TableRuntimeMeta tableRuntimeMeta = new TableRuntimeMeta();
    tableRuntimeMeta.setTableName(icebergTable.name());
    tableRuntimeMeta.setFormat(TableFormat.ICEBERG);
    tableRuntimeMeta.setLastFullOptimizingTime(LAST_OPTIMIZE_TIME);
    tableRuntimeMeta.setLastMinorOptimizingTime(LAST_OPTIMIZE_TIME);
    tableRuntimeMeta.setTableConfig(tableConfig);

    return new HiveTableRuntime(tableRuntimeMeta);
  }

  private void addFiles() {
    TableFileScanHelper tableFileScanHelper = new IcebergTableFileScanHelper(table,
        table.currentSnapshot().snapshotId());
    try (CloseableIterable<TableFileScanHelper.FileScanResult> results =
             tableFileScanHelper.scan()) {
      for (TableFileScanHelper.FileScanResult fileScanResult : results) {
        DataFile file = fileScanResult.file();
        if (IcebergCompactionUtil.shouldIncludeForCompaction(table, ci.partName, file)) {
          addFile(fileScanResult.file(), fileScanResult.deleteFiles());
        }
      }
    } catch (IOException e) {
      throw new UncheckedIOException(e);
    }
  }

  public static long getTargetSizeBytes(Map<String, String> parameters) {
    return Optional.ofNullable(parameters.get(CompactorContext.COMPACTION_TARGET_SIZE))
        .map(Long::parseLong)
        .orElse(TableProperties.SELF_OPTIMIZING_TARGET_SIZE_DEFAULT);
  }

  public static double getMinTargetSizeRatio(Map<String, String> parameters) {
    return Optional.ofNullable(parameters.get(CompactorContext.COMPACTION_MIN_TARGET_SIZE_RATIO))
        .map(Double::parseDouble)
        .orElse(TableProperties.SELF_OPTIMIZING_MIN_TARGET_SIZE_RATIO_DEFAULT);
  }

  public static int getFragmentRatio(Map<String, String> parameters) {
    return Optional.ofNullable(parameters.get(CompactorContext.COMPACTION_MIN_FRAGMENT_RATIO))
        .map(x -> (int) (1 / Double.parseDouble(x)))
        .orElse(TableProperties.SELF_OPTIMIZING_FRAGMENT_RATIO_DEFAULT);
  }

  public static int getFragmentSizeBytes(Map<String, String> parameters) {
    return (int) (getTargetSizeBytes(parameters) * getMinTargetSizeRatio(parameters));
  }

  public static int getMinInputFiles(Map<String, String> parameters) {
    return Optional.ofNullable(parameters.get(CompactorContext.COMPACTION_MIN_INPUT_FILES))
        .map(Integer::parseInt)
        .orElse(TableProperties.SELF_OPTIMIZING_MINOR_TRIGGER_FILE_CNT_DEFAULT);
  }

  public static double getDeleteFileRatio(Map<String, String> parameters) {
    return Optional.ofNullable(parameters.get(CompactorContext.COMPACTION_DELETE_FILE_RATIO))
        .map(Double::parseDouble)
        .orElse(TableProperties.SELF_OPTIMIZING_MAJOR_TRIGGER_DUPLICATE_RATIO_DEFAULT);
  }

  private static Pair<Integer, StructLike> getPartitionSpecStructPair(Table table, String partitionPath)
      throws IOException {
    if (!table.spec().isPartitioned() || partitionPath == null) {
      return null;
    }
    PartitionsTable partitionsTable = (PartitionsTable) MetadataTableUtils
        .createMetadataTableInstance(table, MetadataTableType.PARTITIONS);
    try (CloseableIterable<FileScanTask> fileScanTasks = partitionsTable.newScan().planFiles()) {
      return FluentIterable.from(fileScanTasks)
          .transformAndConcat(task -> task.asDataTask().rows())
          .transform(row -> {
            StructLike data = row.get(IcebergTableUtil.PART_IDX, StructProjection.class);
            PartitionSpec spec = table.specs().get(row.get(IcebergTableUtil.SPEC_IDX, Integer.class));
            PartitionData partitionData = IcebergTableUtil.toPartitionData(data,
                Partitioning.partitionType(table), spec.partitionType());
            String path = spec.partitionToPath(partitionData);
            return Maps.immutableEntry(path, Pair.of(spec.specId(), data));
          })
          .filter(e -> e.getKey().equals(partitionPath))
          .transform(Map.Entry::getValue)
          .get(0);
    }
  }
}
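
The threshold that IcebergQueryCompactor plugs into the file_size_in_bytes predicate (and into COMPACTION_FILE_SIZE_THRESHOLD) is getFragmentSizeBytes(), i.e. the target size scaled by the minimum target-size ratio. A worked example with hypothetical values:

  // Hypothetical inputs, for illustration of getFragmentSizeBytes():
  long targetSizeBytes = 128L * 1024 * 1024;   // compaction target size, 128 MB
  double minTargetSizeRatio = 0.75;            // minimum acceptable fraction of the target size
  int fragmentSizeBytes = (int) (targetSizeBytes * minTargetSizeRatio);   // 100663296 bytes = 96 MB
  // Data files smaller than fragmentSizeBytes count as fragments; MINOR compaction rewrites only those.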
