From 89737b89003ea38f7eb79b1734a6a32bdc5b062d Mon Sep 17 00:00:00 2001 From: arnavb Date: Sun, 17 Aug 2025 08:46:20 +0000 Subject: [PATCH 1/3] update --- .../parquet/hadoop/ParquetFileWriter.java | 66 +++++++++++++++++-- 1 file changed, 60 insertions(+), 6 deletions(-) diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java index 4d17a1d6e4..0886fa2f25 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java @@ -1804,14 +1804,27 @@ private static void copy(SeekableInputStream from, PositionOutputStream to, long * @throws IOException if there is an error while writing */ public void end(Map extraMetaData) throws IOException { + final long footerStart = out.getPos(); + + // Build the footer metadata in memory using the helper stream + InMemoryPositionOutputStream buffer = new InMemoryPositionOutputStream(footerStart); + + serializeColumnIndexes(columnIndexes, blocks, buffer, fileEncryptor); + serializeOffsetIndexes(offsetIndexes, blocks, buffer, fileEncryptor); + serializeBloomFilters(bloomFilters, blocks, buffer, fileEncryptor); + + ParquetMetadata localFooter = + new ParquetMetadata(new FileMetaData(schema, extraMetaData, Version.FULL_VERSION), blocks); + serializeFooter(localFooter, buffer, fileEncryptor, metadataConverter); + + byte[] footerBytes = buffer.toByteArray(); + try { + out.write(footerBytes); + out.flush(); + state = state.end(); - serializeColumnIndexes(columnIndexes, blocks, out, fileEncryptor); - serializeOffsetIndexes(offsetIndexes, blocks, out, fileEncryptor); - serializeBloomFilters(bloomFilters, blocks, out, fileEncryptor); - LOG.debug("{}: end", out.getPos()); - this.footer = new ParquetMetadata(new FileMetaData(schema, extraMetaData, Version.FULL_VERSION), blocks); - serializeFooter(footer, out, fileEncryptor, 
metadataConverter); + this.footer = localFooter; } finally { close(); } @@ -2441,4 +2454,45 @@ protected boolean isPaddingNeeded(long remaining) { return (remaining <= maxPaddingSize); } } + + /** + * Lightweight {@link PositionOutputStream} that writes into a byte buffer while + * keeping a virtual position that can be initialised to an arbitrary offset. + * The position offset lets us build the footer in memory but still record the + * *final* absolute offsets that will appear once the buffer is flushed to the + * underlying file. + */ + private static final class InMemoryPositionOutputStream extends PositionOutputStream { + private final java.io.ByteArrayOutputStream buffer = new java.io.ByteArrayOutputStream(); + private long pos; + + InMemoryPositionOutputStream(long startPos) { + this.pos = startPos; + } + + @Override + public long getPos() { + return pos; + } + + @Override + public void write(int b) { + buffer.write(b); + pos++; + } + + @Override + public void write(byte[] b, int off, int len) { + buffer.write(b, off, len); + pos += len; + } + + @Override + public void flush() { + } + + byte[] toByteArray() { + return buffer.toByteArray(); + } + } } From 0943a33d3765a662275b11d6c66c60302aa32465 Mon Sep 17 00:00:00 2001 From: arnavb Date: Sun, 17 Aug 2025 13:16:12 +0000 Subject: [PATCH 2/3] update --- .../main/java/org/apache/parquet/hadoop/ParquetFileWriter.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java index 0886fa2f25..26a543777a 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java @@ -2488,8 +2488,7 @@ public void write(byte[] b, int off, int len) { } @Override - public void flush() { - } + public void flush() {} byte[] toByteArray() { return 
buffer.toByteArray(); From 258ee14f1fa431ebf6df2a5fd696337a5c20e31e Mon Sep 17 00:00:00 2001 From: arnavb Date: Mon, 25 Aug 2025 07:20:02 +0000 Subject: [PATCH 3/3] update --- .../java/org/apache/parquet/hadoop/ParquetFileWriter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java index 26a543777a..c8d8bf7534 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java @@ -1820,10 +1820,10 @@ public void end(Map extraMetaData) throws IOException { byte[] footerBytes = buffer.toByteArray(); try { + state = state.end(); + out.write(footerBytes); out.flush(); - - state = state.end(); this.footer = localFooter; } finally { close();