fix: Avoid calling import and export of Arrow arrays for native execution #1055

Closed

Commits (68)
50d686e
fix: Optimize not to call getNullCount as much as possible
kazuyukitanimura Aug 9, 2024
532b9f6
Merge remote-tracking branch 'upstream/main' into optimize-null-count
kazuyukitanimura Aug 13, 2024
388ea7f
fix: Optimize not to call getNullCount as much as possible
kazuyukitanimura Aug 13, 2024
dbd4016
fix: Optimize not to call getNullCount as much as possible
kazuyukitanimura Aug 13, 2024
a773832
fix: Optimize not to call getNullCount as much as possible
kazuyukitanimura Aug 13, 2024
f6176fb
fix ci
kazuyukitanimura Aug 14, 2024
87bcd03
fix ci
kazuyukitanimura Aug 15, 2024
a903b41
fix ci
kazuyukitanimura Aug 15, 2024
bc7136e
Merge remote-tracking branch 'upstream/main' into optimize-null-count
kazuyukitanimura Aug 17, 2024
0d87c6e
fix: Optimize CheckOverflow
kazuyukitanimura Aug 19, 2024
0cd44c0
fix: Optimize CheckOverflow
kazuyukitanimura Aug 19, 2024
9d68eed
fix: Optimize CheckOverflow
kazuyukitanimura Aug 19, 2024
d0569bc
fix: Optimize CheckOverflow
kazuyukitanimura Aug 20, 2024
7385cc8
Merge remote-tracking branch 'upstream/main' into optimize-null-count
kazuyukitanimura Aug 20, 2024
efb4a50
fix: Remove export
kazuyukitanimura Aug 20, 2024
4c64265
Merge remote-tracking branch 'upstream/main' into optimize-null-count
kazuyukitanimura Sep 5, 2024
027dd06
Merge remote-tracking branch 'upstream/main' into optimize-null-count
kazuyukitanimura Sep 7, 2024
8e4ea8d
fix: Remove export
kazuyukitanimura Sep 7, 2024
5df5309
fix: Remove export
kazuyukitanimura Sep 10, 2024
c2e2874
fix: Remove export
kazuyukitanimura Sep 12, 2024
5045374
fix: Remove export
kazuyukitanimura Sep 17, 2024
2da220f
Merge remote-tracking branch 'upstream/main' into optimize-null-count
kazuyukitanimura Sep 27, 2024
d4d03e8
chore: fix compatibility guide
kazuyukitanimura Sep 27, 2024
513acd3
fix: Remove export
kazuyukitanimura Sep 28, 2024
2af8b36
fix: Remove export
kazuyukitanimura Sep 28, 2024
18fff6e
fix: Remove export
kazuyukitanimura Oct 2, 2024
b50348f
Merge remote-tracking branch 'upstream/main' into optimize-null-count
kazuyukitanimura Oct 2, 2024
eaebd6d
fix: Remove export
kazuyukitanimura Oct 2, 2024
93e350d
Merge remote-tracking branch 'upstream/main' into optimize-null-count
kazuyukitanimura Oct 2, 2024
f93ba1d
fix: Remove export
kazuyukitanimura Oct 3, 2024
49f1903
Merge remote-tracking branch 'upstream/main' into optimize-null-count
kazuyukitanimura Oct 3, 2024
8032c26
fix: Remove export
kazuyukitanimura Oct 4, 2024
bf4d3e4
fix: Remove export
kazuyukitanimura Oct 8, 2024
a6d5e77
fix: Remove export
kazuyukitanimura Oct 15, 2024
1967a68
Revert "fix: Remove export"
kazuyukitanimura Oct 15, 2024
7299ae9
Merge remote-tracking branch 'upstream/main' into optimize-null-count
kazuyukitanimura Oct 15, 2024
81339a8
fix: Remove export
kazuyukitanimura Oct 15, 2024
41a6695
Revert "fix: Remove export"
kazuyukitanimura Oct 15, 2024
e356b8a
"fix: Remove export"
kazuyukitanimura Oct 15, 2024
282c3e3
fix: Fallback to Spark if named_struct contains duplicate field names…
viirya Oct 13, 2024
7fae9c5
chore: Reserve memory for native shuffle writer per partition (#988)
viirya Oct 14, 2024
37b1e9c
chore: Bump arrow-rs to 53.1.0 and datafusion (#1001)
kazuyukitanimura Oct 14, 2024
a3602df
fix: Remove export
kazuyukitanimura Oct 16, 2024
765a387
Merge remote-tracking branch 'upstream/main' into optimize-null-count
kazuyukitanimura Oct 23, 2024
cd0ae7e
fix: Remove export
kazuyukitanimura Oct 25, 2024
bee3740
fix: Remove export
kazuyukitanimura Oct 26, 2024
c2f7288
fix: Remove export
kazuyukitanimura Oct 28, 2024
59ed00c
fix: Remove export
kazuyukitanimura Oct 28, 2024
75dd04c
fix: Remove export
kazuyukitanimura Nov 1, 2024
dc2c0bf
Merge remote-tracking branch 'upstream/main' into optimize-null-count
kazuyukitanimura Nov 1, 2024
58cb5a2
fix: Remove export
kazuyukitanimura Nov 1, 2024
df03371
fix: Remove export
kazuyukitanimura Nov 4, 2024
01fd025
Merge remote-tracking branch 'upstream/main' into optimize-null-count
kazuyukitanimura Nov 4, 2024
74043b5
fix: Remove export
kazuyukitanimura Nov 5, 2024
5ac4408
fix: Remove export
kazuyukitanimura Nov 5, 2024
10ffc64
fix: Remove export
kazuyukitanimura Nov 8, 2024
14053ec
fix: Remove export
kazuyukitanimura Nov 8, 2024
ed8f546
fix: Remove export
kazuyukitanimura Nov 8, 2024
7ede86d
fix: Remove export
kazuyukitanimura Nov 9, 2024
e49d092
fix: Remove export
kazuyukitanimura Nov 9, 2024
a8a38f7
fix: Remove export
kazuyukitanimura Nov 9, 2024
adf5a90
fix: Remove export
kazuyukitanimura Nov 9, 2024
0bf2415
fix: Remove export
kazuyukitanimura Nov 9, 2024
203e16e
fix: Remove export
kazuyukitanimura Nov 11, 2024
97960d7
fix: Remove export
kazuyukitanimura Nov 12, 2024
ec72117
fix: Remove export
kazuyukitanimura Nov 12, 2024
8a01335
Merge remote-tracking branch 'upstream/main' into optimize-null-count
kazuyukitanimura Nov 13, 2024
0c7a228
fix: Remove export
kazuyukitanimura Nov 13, 2024
common/src/main/java/org/apache/comet/parquet/BatchReader.java

@@ -152,6 +152,8 @@ public class BatchReader extends RecordReader<Void, ColumnarBatch> implements Closeable
   /** The TaskContext object for executing this task. */
   private final TaskContext taskContext;

+  private boolean hasNativeOperations = false;
+
   // Only for testing
   public BatchReader(String file, int capacity) {
     this(file, capacity, null, null);
@@ -215,7 +217,8 @@ public BatchReader(AbstractColumnReader[] columnReaders) {
       boolean useLegacyDateTimestamp,
       StructType partitionSchema,
       InternalRow partitionValues,
-      Map<String, SQLMetric> metrics) {
+      Map<String, SQLMetric> metrics,
+      boolean hasNativeOperations) {
     this.conf = conf;
     this.capacity = capacity;
     this.sparkSchema = sparkSchema;
@@ -229,6 +232,7 @@ public BatchReader(AbstractColumnReader[] columnReaders) {
     this.footer = footer;
     this.metrics = metrics;
     this.taskContext = TaskContext$.MODULE$.get();
+    this.hasNativeOperations = hasNativeOperations;
   }

   /**
@@ -586,7 +590,8 @@ private boolean loadNextRowGroupIfNecessary() throws Throwable {
               capacity,
               useDecimal128,
               useLazyMaterialization,
-              useLegacyDateTimestamp);
+              useLegacyDateTimestamp,
+              hasNativeOperations);
       reader.setPageReader(rowGroupReader.getPageReader(columns.get(i)));
       columnReaders[i] = reader;
     }
34 changes: 25 additions & 9 deletions common/src/main/java/org/apache/comet/parquet/ColumnReader.java

@@ -44,11 +44,7 @@
 import org.apache.spark.sql.types.DataType;

 import org.apache.comet.CometConf;
-import org.apache.comet.vector.CometDecodedVector;
-import org.apache.comet.vector.CometDictionary;
-import org.apache.comet.vector.CometDictionaryVector;
-import org.apache.comet.vector.CometPlainVector;
-import org.apache.comet.vector.CometVector;
+import org.apache.comet.vector.*;

 public class ColumnReader extends AbstractColumnReader {
   protected static final Logger LOG = LoggerFactory.getLogger(ColumnReader.class);
@@ -58,7 +54,7 @@ public class ColumnReader extends AbstractColumnReader {
    * The current Comet vector holding all the values read by this column reader. Owned by this
    * reader and MUST be closed after use.
    */
-  private CometDecodedVector currentVector;
+  private CometVector currentVector;

   /** Dictionary values for this column. Only set if the column is using dictionary encoding. */
   protected CometDictionary dictionary;
@@ -90,6 +86,8 @@ public class ColumnReader extends AbstractColumnReader {

   private final CometSchemaImporter importer;

+  private final boolean hasNativeOperations;
+
   private ArrowArray array = null;
   private ArrowSchema schema = null;

@@ -99,11 +97,13 @@ public ColumnReader(
       CometSchemaImporter importer,
       int batchSize,
       boolean useDecimal128,
-      boolean useLegacyDateTimestamp) {
+      boolean useLegacyDateTimestamp,
+      boolean hasNativeOperations) {
     super(type, descriptor, useDecimal128, useLegacyDateTimestamp);
     assert batchSize > 0 : "Batch size must be positive, found " + batchSize;
     this.batchSize = batchSize;
     this.importer = importer;
+    this.hasNativeOperations = hasNativeOperations;
     initNative();
   }

@@ -171,7 +171,23 @@ public void close() {
   }

   /** Returns a decoded {@link CometDecodedVector Comet vector}. */
-  public CometDecodedVector loadVector() {
+  public CometVector loadVector() {
+    if (hasNativeOperations) {
+      if (currentVector != null) {
+        currentVector.close();
+      }
+
+      array = ArrowArray.allocateNew(ALLOCATOR);
+      schema = ArrowSchema.allocateNew(ALLOCATOR);
+
+      long arrayAddr = array.memoryAddress();
+      long schemaAddr = schema.memoryAddress();
+
+      Native.currentBatch(nativeHandle, arrayAddr, schemaAddr);
+      currentVector = new CometNativeVector(null, useDecimal128, arrayAddr, schemaAddr);
+      return currentVector;
+    }
+
     // Only re-use Comet vector iff:
     // 1. if we're not using dictionary encoding, since with dictionary encoding, the native
     //    side may fallback to plain encoding and the underlying memory address for the vector
@@ -264,7 +280,7 @@ protected void readPage() {
     if (page == null) {
       throw new RuntimeException("overreading: returned DataPage is null");
     }
-    ;
+
     int pageValueCount = page.getValueCount();
     page.accept(
         new DataPage.Visitor<Void>() {
common/src/main/java/org/apache/comet/parquet/LazyColumnReader.java

@@ -50,7 +50,14 @@ public LazyColumnReader(
       int batchSize,
       boolean useDecimal128,
       boolean useLegacyDateTimestamp) {
-    super(sparkReadType, descriptor, importer, batchSize, useDecimal128, useLegacyDateTimestamp);
+    super(
+        sparkReadType,
+        descriptor,
+        importer,
+        batchSize,
+        useDecimal128,
+        useLegacyDateTimestamp,
+        false);
     this.batchSize = 0; // the batch size is set later in `readBatch`
     this.vector = new CometLazyVector(sparkReadType, this, useDecimal128);
   }
25 changes: 20 additions & 5 deletions common/src/main/java/org/apache/comet/parquet/Utils.java

@@ -32,10 +32,18 @@ public static ColumnReader getColumnReader(
       CometSchemaImporter importer,
       int batchSize,
       boolean useDecimal128,
-      boolean useLazyMaterialization) {
+      boolean useLazyMaterialization,
+      boolean hasNativeOperations) {
     // TODO: support `useLegacyDateTimestamp` for Iceberg
     return getColumnReader(
-        type, descriptor, importer, batchSize, useDecimal128, useLazyMaterialization, true);
+        type,
+        descriptor,
+        importer,
+        batchSize,
+        useDecimal128,
+        useLazyMaterialization,
+        true,
+        hasNativeOperations);
   }

   public static ColumnReader getColumnReader(
@@ -45,13 +53,20 @@ public static ColumnReader getColumnReader(
       int batchSize,
       boolean useDecimal128,
       boolean useLazyMaterialization,
-      boolean useLegacyDateTimestamp) {
-    if (useLazyMaterialization && supportLazyMaterialization(type)) {
+      boolean useLegacyDateTimestamp,
+      boolean hasNativeOperations) {
+    if (useLazyMaterialization && !hasNativeOperations && supportLazyMaterialization(type)) {
       return new LazyColumnReader(
           type, descriptor, importer, batchSize, useDecimal128, useLegacyDateTimestamp);
     } else {
       return new ColumnReader(
-          type, descriptor, importer, batchSize, useDecimal128, useLegacyDateTimestamp);
+          type,
+          descriptor,
+          importer,
+          batchSize,
+          useDecimal128,
+          useLegacyDateTimestamp,
+          hasNativeOperations);
     }
   }
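The dispatch above encodes a simple rule: lazy materialization only pays off when the JVM itself will decode column values, so it is disabled whenever the batch will be handed to native operators as raw FFI addresses. A compact restatement of the condition (an illustrative sketch mirroring the Java code above, not code from this PR):

// Sketch: mirrors Utils.getColumnReader's dispatch. Lazy materialization
// defers decoding until a value is actually read, which only helps when the
// JVM reads values; with native operations the batch is forwarded to native
// code as (array address, schema address) pairs and is never decoded here.
def useLazyReader(
    lazyEnabled: Boolean,
    hasNativeOperations: Boolean,
    typeSupported: Boolean): Boolean =
  lazyEnabled && !hasNativeOperations && typeSupported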
common/src/main/java/org/apache/comet/vector/CometNativeVector.java (new file)

@@ -0,0 +1,82 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.comet.vector;

import org.apache.arrow.vector.ValueVector;
import org.apache.spark.sql.types.DataType;

public class CometNativeVector extends CometVector {
  private final long arrayAddress;
  private final long schemaAddress;

  public CometNativeVector(
      DataType type, boolean useDecimal128, long arrayAddress, long schemaAddress) {
    super(type, useDecimal128);
    this.arrayAddress = arrayAddress;
    this.schemaAddress = schemaAddress;
  }

  @Override
  public void setNumNulls(int numNulls) {}

  @Override
  public void setNumValues(int numValues) {}

  @Override
  public int numValues() {
    return 0;
  }

  @Override
  public ValueVector getValueVector() {
    return null;
  }

  @Override
  public CometVector slice(int offset, int length) {
    return null;
  }

  @Override
  public boolean hasNull() {
    return false;
  }

  @Override
  public int numNulls() {
    return 0;
  }

  @Override
  public boolean isNullAt(int i) {
    return false;
  }

  @Override
  public void close() {}

  public long getArrayAddress() {
    return arrayAddress;
  }

  public long getSchemaAddress() {
    return schemaAddress;
  }
}
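CometNativeVector is deliberately a stub: it implements none of the value-access methods and exists only to carry a pair of Arrow C Data Interface addresses from ColumnReader.loadVector() to NativeUtil.exportBatch(). A minimal sketch of how a consumer can tell it apart from decoded vectors (the helper name is hypothetical; it mirrors the match in exportBatch below):

import org.apache.comet.vector.{CometNativeVector, CometVector}

// Hypothetical helper: recover the FFI addresses if (and only if) the column
// skipped JVM-side decoding. For any other CometVector the data still lives
// in JVM Arrow buffers and must be exported via the Arrow C Data Interface.
def ffiAddresses(col: CometVector): Option[(Long, Long)] = col match {
  case v: CometNativeVector => Some((v.getArrayAddress, v.getSchemaAddress))
  case _                    => None
}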
45 changes: 24 additions & 21 deletions common/src/main/scala/org/apache/comet/vector/NativeUtil.scala
@@ -88,14 +88,16 @@ class NativeUtil {
    * an exported batch: an array containing the number of rows followed by pairs of memory
    * addresses, in the format of (address of Arrow array, address of Arrow schema)
    */
-  def exportBatch(
-      arrayAddrs: Array[Long],
-      schemaAddrs: Array[Long],
-      batch: ColumnarBatch): Int = {
+  def exportBatch(batch: ColumnarBatch): Array[Long] = {
     val numRows = mutable.ArrayBuffer.empty[Int]
+    val builder = Array.newBuilder[Long]
+    builder += batch.numRows()

     (0 until batch.numCols()).foreach { index =>
       batch.column(index) match {
+        case a: CometNativeVector =>
+          builder += a.getArrayAddress
+          builder += a.getSchemaAddress
         case a: CometVector =>
           val valueVector = a.getValueVector

@@ -107,16 +109,16 @@
             null
           }

-          // The array and schema structures are allocated by native side.
-          // Don't need to deallocate them here.
-          val arrowSchema = ArrowSchema.wrap(schemaAddrs(index))
-          val arrowArray = ArrowArray.wrap(arrayAddrs(index))
+          val arrowSchema = ArrowSchema.allocateNew(allocator)
+          val arrowArray = ArrowArray.allocateNew(allocator)
           Data.exportVector(
             allocator,
             getFieldVector(valueVector, "export"),
             provider,
             arrowArray,
             arrowSchema)
+          builder += arrowArray.memoryAddress()
+          builder += arrowSchema.memoryAddress()
         case c =>
           throw new SparkException(
             "Comet execution only takes Arrow Arrays, but got " +
@@ -133,34 +135,35 @@
     // the Arrow arrays. For example, Iceberg column reader will skip deleted rows internally in
     // its `CometVector` implementation. The `ColumnarBatch` returned by the reader will report
     // logical number of rows which is less than actual number of rows due to row deletion.
-    numRows.headOption.getOrElse(batch.numRows())
+
+    builder.result()
   }

   /**
    * Gets the next batch from native execution.
    *
-   * @param numOutputCols
-   *   The number of output columns
    * @param func
    *   The function to call to get the next batch
    * @return
-   *   The number of rows of the next batch, or None if there are no more batches
+   *   The next batch as a ColumnarBatch, or None if there are no more batches
    */
-  def getNextBatch(
-      numOutputCols: Int,
-      func: (Array[Long], Array[Long]) => Long): Option[ColumnarBatch] = {
-    val (arrays, schemas) = allocateArrowStructs(numOutputCols)
-
-    val arrayAddrs = arrays.map(_.memoryAddress())
-    val schemaAddrs = schemas.map(_.memoryAddress())
-
-    val result = func(arrayAddrs, schemaAddrs)
+  def getNextBatch(func: () => Array[Long]): Option[ColumnarBatch] = {
+    val cometBatchElements = func()
+    val result = cometBatchElements(0)
+    val arrayBuilder = Array.newBuilder[ArrowArray]
+    val schemaBuilder = Array.newBuilder[ArrowSchema]
+    for (i <- 1 until cometBatchElements.length by 2) {
+      arrayBuilder += ArrowArray.wrap(cometBatchElements(i))
+      schemaBuilder += ArrowSchema.wrap(cometBatchElements(i + 1))
+    }
+    val arrays = arrayBuilder.result()
+    val schemas = schemaBuilder.result()

     result match {
       case -1 =>
         // EOF
         None
-      case numRows =>
+      case numRows if numRows >= 0 =>
         val cometVectors = importVector(arrays, schemas)
         Some(new ColumnarBatch(cometVectors.toArray, numRows.toInt))
+      case flag =>
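Taken together, exportBatch and getNextBatch now share a flat Array[Long] contract in place of pre-allocated FFI structs: element 0 carries the row count (or -1 for end of stream), followed by one (Arrow array address, Arrow schema address) pair per column. A usage sketch under assumed names (executePlan and process are hypothetical stand-ins for the real JNI entry point and the downstream consumer):

import org.apache.comet.vector.NativeUtil
import org.apache.spark.sql.vectorized.ColumnarBatch

// Hypothetical stand-ins for illustration only.
def executePlan(planHandle: Long): Array[Long] = ???
def process(batch: ColumnarBatch): Unit = ???

val nativeUtil = new NativeUtil
val planHandle: Long = ???

// getNextBatch wraps each returned address pair with ArrowArray.wrap /
// ArrowSchema.wrap and imports them into Comet vectors; Array(-1) ends the stream.
var done = false
while (!done) {
  nativeUtil.getNextBatch(() => executePlan(planHandle)) match {
    case Some(batch) => process(batch)
    case None        => done = true
  }
}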