
Commit 3763efa

initial offset
1 parent 2a51205 commit 3763efa

2 files changed: +131, -24 lines


kernel-spark/src/main/java/io/delta/kernel/spark/read/SparkMicroBatchStream.java

Lines changed: 41 additions & 3 deletions
@@ -63,6 +63,9 @@ public class SparkMicroBatchStream implements MicroBatchStream, SupportsAdmissio
   private final boolean shouldValidateOffsets;
   private final SparkSession spark;

+  // Tracks whether this is the initial batch for this stream (no checkpointed offset).
+  private boolean isInitialBatch = false;
+
   public SparkMicroBatchStream(DeltaSnapshotManager snapshotManager, Configuration hadoopConf) {
     this(
         snapshotManager,
@@ -95,10 +98,34 @@ public SparkMicroBatchStream(
   // offset //
   ////////////

+  /**
+   * Returns the initial offset for a streaming query to start reading from (if there's no
+   * checkpointed offset). Returns null if there's no data to read.
+   */
   @Override
   public Offset initialOffset() {
-    // TODO(#5318): Implement initialOffset
-    throw new UnsupportedOperationException("initialOffset is not supported");
+    Optional<Long> startingVersionOpt = getStartingVersion();
+    long version;
+    boolean isInitialSnapshot;
+
+    if (startingVersionOpt.isPresent()) {
+      version = startingVersionOpt.get();
+      isInitialSnapshot = false;
+    } else {
+      // TODO(#5318): Support initial snapshot case (isInitialSnapshot == true)
+      throw new UnsupportedOperationException(
+          "initialOffset with initial snapshot is not supported yet");
+    }
+
+    if (version < 0) {
+      // This shouldn't happen; defensively return null.
+      return null;
+    }
+
+    isInitialBatch = true;
+
+    return DeltaSourceOffset.apply(
+        tableId, version, DeltaSourceOffset.BASE_INDEX(), isInitialSnapshot);
   }

   @Override
@@ -133,6 +160,8 @@ public Offset latestOffset(Offset startOffset, ReadLimit limit) {
       DeltaSourceOffset.validateOffsets(deltaStartOffset, endOffset.get());
     }

+    isInitialBatch = false;
+
     // endOffset is null: no data is available to read for this batch.
     return endOffset.orElse(null);
   }
@@ -153,7 +182,9 @@ public ReadLimit getDefaultReadLimit() {
    *
    * @param previousOffset The previous offset
    * @param limits Rate limits for this batch (Optional.empty() for no limits)
-   * @return The next offset, or the previous offset if no new data is available
+   * @return The next offset, or the previous offset if no new data is available (except on the
+   *     initial batch where we return empty to match DSv1's
+   *     getStartingOffsetFromSpecificDeltaVersion behavior)
    */
   private Optional<DeltaSourceOffset> getNextOffsetFromPreviousOffset(
       DeltaSourceOffset previousOffset, Optional<DeltaSource.AdmissionLimits> limits) {
@@ -169,6 +200,11 @@ private Optional<DeltaSourceOffset> getNextOffsetFromPreviousOffset(
     Optional<IndexedFile> lastFileChange = StreamingHelper.iteratorLast(changes);

     if (!lastFileChange.isPresent()) {
+      // On the initial batch, return empty to match DSv1's
+      // getStartingOffsetFromSpecificDeltaVersion
+      if (isInitialBatch) {
+        return Optional.empty();
+      }
       return Optional.of(previousOffset);
     }
     // TODO(#5318): Check read-incompatible schema changes during stream start
@@ -218,6 +254,8 @@ public void stop() {
    * Extracts whether users provided the option to time travel a relation. If a query restarts from
    * a checkpoint and the checkpoint has recorded the offset, this method should never be called.
    *
+   * <p>Returns Optional.empty() if no starting version is provided.
+   *
    * <p>This is the DSv2 Kernel-based implementation of DeltaSource.getStartingVersion.
    */
   Optional<Long> getStartingVersion() {
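
To make the new control flow concrete, here is a minimal, self-contained sketch (hypothetical class and method names, not the real Kernel or Spark types) of the initial-batch handling introduced above: initialOffset() marks the stream as being on its first, un-checkpointed batch; an empty initial batch then resolves to no offset at all (matching DSv1's getStartingOffsetFromSpecificDeltaVersion), while later empty batches fall back to the previous offset.

import java.util.Optional;

// Hypothetical simplification of the isInitialBatch logic; offsets are plain longs here.
final class InitialBatchSketch {
  private boolean isInitialBatch = false;

  // Called only when there is no checkpointed offset, mirroring initialOffset().
  long initialOffset(long startingVersion) {
    isInitialBatch = true;
    return startingVersion;
  }

  // Mirrors latestOffset() + getNextOffsetFromPreviousOffset(): return the last file change if
  // present, the previous offset on later empty batches, or empty on an empty initial batch.
  Optional<Long> latestOffset(long previousOffset, Optional<Long> lastFileChange) {
    Optional<Long> end =
        lastFileChange.isPresent()
            ? lastFileChange
            : (isInitialBatch ? Optional.<Long>empty() : Optional.of(previousOffset));
    isInitialBatch = false; // only the very first batch gets the empty-result behavior
    return end;
  }
}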

kernel-spark/src/test/java/io/delta/kernel/spark/read/SparkMicroBatchStreamTest.java

Lines changed: 90 additions & 21 deletions
@@ -32,10 +32,12 @@
 import java.util.stream.Stream;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
+import org.apache.spark.sql.catalyst.expressions.Expression;
 import org.apache.spark.sql.connector.read.streaming.Offset;
 import org.apache.spark.sql.connector.read.streaming.ReadLimit;
 import org.apache.spark.sql.delta.DeltaLog;
 import org.apache.spark.sql.delta.DeltaOptions;
+import org.apache.spark.sql.delta.Snapshot;
 import org.apache.spark.sql.delta.sources.DeltaSource;
 import org.apache.spark.sql.delta.sources.DeltaSourceOffset;
 import org.apache.spark.sql.delta.sources.ReadMaxBytes;
@@ -46,7 +48,9 @@
 import org.junit.jupiter.params.provider.Arguments;
 import org.junit.jupiter.params.provider.MethodSource;
 import scala.Option;
+import scala.collection.JavaConverters;
 import scala.collection.immutable.Map$;
+import scala.collection.immutable.Seq;

 public class SparkMicroBatchStreamTest extends SparkDsv2TestBase {

@@ -100,7 +104,8 @@ public void testInitialOffset_throwsUnsupportedOperationException(@TempDir File
     SparkMicroBatchStream microBatchStream = createTestStream(tempDir);
     UnsupportedOperationException exception =
         assertThrows(UnsupportedOperationException.class, () -> microBatchStream.initialOffset());
-    assertEquals("initialOffset is not supported", exception.getMessage());
+    assertEquals(
+        "initialOffset with initial snapshot is not supported yet", exception.getMessage());
   }

   @Test
@@ -129,6 +134,71 @@ public void testStop_throwsUnsupportedOperationException(@TempDir File tempDir)
     assertEquals("stop is not supported", exception.getMessage());
   }

+  // ================================================================================================
+  // Tests for initialOffset parity between DSv1 and DSv2
+  // ================================================================================================
+
+  @ParameterizedTest
+  @MethodSource("initialOffsetParameters")
+  public void testInitialOffset_FirstBatchParity(
+      String startingVersion,
+      ReadLimitConfig limitConfig,
+      String testDescription,
+      @TempDir File tempDir)
+      throws Exception {
+    String testTablePath = tempDir.getAbsolutePath();
+    String testTableName = "test_initial_" + System.nanoTime();
+    createEmptyTestTable(testTablePath, testTableName);
+    insertVersions(
+        testTableName,
+        /* numVersions= */ 5,
+        /* rowsPerVersion= */ 10,
+        /* includeEmptyVersion= */ false);
+
+    DeltaLog deltaLog = DeltaLog.forTable(spark, new Path(testTablePath));
+    ReadLimit readLimit = limitConfig.toReadLimit();
+    DeltaOptions options;
+    if (startingVersion == null) {
+      options = new DeltaOptions(Map$.MODULE$.empty(), spark.sessionState().conf());
+    } else {
+      scala.collection.immutable.Map<String, String> scalaMap =
+          Map$.MODULE$.<String, String>empty().updated("startingVersion", startingVersion);
+      options = new DeltaOptions(scalaMap, spark.sessionState().conf());
+    }
+
+    // DSv1
+    DeltaSource deltaSource = createDeltaSource(deltaLog, testTablePath, options);
+    // DSv1 sources don't have an initialOffset() method.
+    // Batch 0 is called with startOffset=null.
+    Offset dsv1Offset = deltaSource.latestOffset(/* startOffset= */ null, readLimit);
+
+    // DSv2
+    Configuration hadoopConf = new Configuration();
+    PathBasedSnapshotManager snapshotManager =
+        new PathBasedSnapshotManager(testTablePath, hadoopConf);
+    SparkMicroBatchStream stream =
+        new SparkMicroBatchStream(snapshotManager, hadoopConf, spark, options);
+    Offset initialOffset = stream.initialOffset();
+    Offset dsv2Offset = stream.latestOffset(initialOffset, readLimit);
+
+    compareOffsets(dsv1Offset, dsv2Offset, testDescription);
+  }
+
+  /** Provides test parameters for the initialOffset parity test. */
+  private static Stream<Arguments> initialOffsetParameters() {
+    return Stream.of(
+        Arguments.of("0", ReadLimitConfig.noLimit(), "NoLimit1"),
+        Arguments.of("1", ReadLimitConfig.noLimit(), "NoLimit2"),
+        Arguments.of("3", ReadLimitConfig.noLimit(), "NoLimit3"),
+        Arguments.of("latest", ReadLimitConfig.noLimit(), "LatestNoLimit"),
+        Arguments.of("latest", ReadLimitConfig.maxFiles(1000), "LatestMaxFiles"),
+        Arguments.of("latest", ReadLimitConfig.maxBytes(1000), "LatestMaxBytes"),
+        Arguments.of("0", ReadLimitConfig.maxFiles(5), "MaxFiles1"),
+        Arguments.of("1", ReadLimitConfig.maxFiles(10), "MaxFiles2"),
+        Arguments.of("0", ReadLimitConfig.maxBytes(1000), "MaxBytes1"),
+        Arguments.of("1", ReadLimitConfig.maxBytes(2000), "MaxBytes2"));
+  }
+
   // ================================================================================================
   // Tests for getFileChanges parity between DSv1 and DSv2
   // ================================================================================================
@@ -1095,26 +1165,6 @@ private Optional<DeltaSource.AdmissionLimits> createAdmissionLimits(
     return Optional.of(new DeltaSource.AdmissionLimits(options, scalaMaxFiles, scalaMaxBytes));
   }

-  /** Helper method to create a DeltaSource instance for testing. */
-  private DeltaSource createDeltaSource(DeltaLog deltaLog, String tablePath) {
-    DeltaOptions options = new DeltaOptions(Map$.MODULE$.empty(), spark.sessionState().conf());
-    scala.collection.immutable.Seq<org.apache.spark.sql.catalyst.expressions.Expression> emptySeq =
-        scala.collection.JavaConverters.asScalaBuffer(
-                new java.util.ArrayList<org.apache.spark.sql.catalyst.expressions.Expression>())
-            .toList();
-    org.apache.spark.sql.delta.Snapshot snapshot =
-        deltaLog.update(false, scala.Option.empty(), scala.Option.empty());
-    return new DeltaSource(
-        spark,
-        deltaLog,
-        /* catalogTableOpt= */ scala.Option.empty(),
-        options,
-        /* snapshotAtSourceInit= */ snapshot,
-        /* metadataPath= */ tablePath + "/_checkpoint",
-        /* metadataTrackingLog= */ scala.Option.empty(),
-        /* filters= */ emptySeq);
-  }
-
   /** Helper method to format a DSv1 IndexedFile for debugging. */
   private String formatIndexedFile(org.apache.spark.sql.delta.sources.IndexedFile file) {
     return String.format(
@@ -1173,4 +1223,23 @@ private void compareOffsetSequence(
           String.format("%s (iteration %d)", testDescription, i));
     }
   }
+
+  private DeltaSource createDeltaSource(DeltaLog deltaLog, String tablePath) {
+    DeltaOptions options = new DeltaOptions(Map$.MODULE$.empty(), spark.sessionState().conf());
+    return createDeltaSource(deltaLog, tablePath, options);
+  }
+
+  private DeltaSource createDeltaSource(DeltaLog deltaLog, String tablePath, DeltaOptions options) {
+    Seq<Expression> emptySeq = JavaConverters.asScalaBuffer(new ArrayList<Expression>()).toList();
+    Snapshot snapshot = deltaLog.update(false, Option.empty(), Option.empty());
+    return new DeltaSource(
+        spark,
+        deltaLog,
+        /* catalogTableOpt= */ Option.empty(),
+        options,
+        /* snapshotAtSourceInit= */ snapshot,
+        /* metadataPath= */ tablePath + "/_checkpoint",
+        /* metadataTrackingLog= */ Option.empty(),
+        /* filters= */ emptySeq);
+  }
 }
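
For readers who have not used it before, the new parity test relies on JUnit 5's parameterized-test support: @ParameterizedTest runs the method once per Arguments row returned by the static factory named in @MethodSource. A minimal, unrelated toy sketch of that pattern (assuming only the standard JUnit Jupiter test dependencies):

import static org.junit.jupiter.api.Assertions.assertEquals;

import java.util.stream.Stream;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;

class MethodSourceSketchTest {
  @ParameterizedTest
  @MethodSource("cases")
  void addsUp(int a, int b, int expected) {
    // Runs once per Arguments row supplied by cases() below.
    assertEquals(expected, a + b);
  }

  private static Stream<Arguments> cases() {
    return Stream.of(Arguments.of(1, 2, 3), Arguments.of(2, 2, 4), Arguments.of(-1, 1, 0));
  }
}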
