Skip to content

[Parquet] Cannot enable statistics for specific fields when writing a dataset with encryption #48168

@adamreeve

Description

@adamreeve

Describe the bug, including details regarding any error messages, version, and platform.

When writing a Parquet Dataset and enabling statistics for specific fields, no fields have statistics written. This works correctly when writing a single Parquet file without using the Dataset API.

Python repro:

import base64
import datetime
from pathlib import Path
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import pyarrow.parquet.encryption as pe


# Output locations: a standalone file for write_table, a directory for the Dataset API.
data_file_path = Path("./tmp_data.parquet")
dataset_path = Path("./tmp_dataset")


class MockKmsClient(pe.KmsClient):
    """Toy KMS client that "wraps" keys by base64-encoding them.

    Not remotely secure — it exists only so the repro can exercise the
    Parquet encryption code path without a real key-management service.
    """

    def __init__(self, _kms_connection_configuration):
        # The connection configuration is irrelevant for this mock.
        super().__init__()

    def wrap_key(self, key_bytes, _master_key_identifier):
        """Return the "encrypted" form of ``key_bytes`` (plain base64)."""
        return base64.b64encode(key_bytes)

    def unwrap_key(self, wrapped_key, _master_key_identifier):
        """Invert :meth:`wrap_key` by base64-decoding ``wrapped_key``."""
        return base64.b64decode(wrapped_key)


# Encryption plumbing: a CryptoFactory backed by the mock KMS client.
# (The class itself is a callable taking the connection config, so no
# wrapping lambda is needed.)
crypto_factory = pe.CryptoFactory(MockKmsClient)
config = pe.KmsConnectionConfig()
encryption_config = pe.EncryptionConfiguration(
    "kf",
    uniform_encryption=True,
    plaintext_footer=False,
)
decryption_properties = crypto_factory.file_decryption_properties(
    config, pe.DecryptionConfiguration()
)

# A small three-column table: int32 id, millisecond timestamps, float32 x.
num_rows = 100
start = datetime.datetime(2025, 9, 16, 0, 0, 0)
timestamps = [start + datetime.timedelta(seconds=i) for i in range(num_rows)]
data = pa.Table.from_pydict({
    'id': pa.array(list(range(num_rows)), type=pa.int32()),
    'timestamp': pa.array(timestamps, type=pa.timestamp('ms')),
    'x': pa.array(list(range(num_rows)), type=pa.float32()),
})

ds_encryption_config = ds.ParquetEncryptionConfig(
    crypto_factory, config, encryption_config)
encryption_properties = crypto_factory.file_encryption_properties(
    config, encryption_config)

# Write with dataset
# Write the same table twice: once through the Dataset API, once directly.
pq.write_to_dataset(
    data,
    dataset_path,
    compression=None,
    encryption_config=ds_encryption_config,
    write_statistics=["id", "timestamp"],
)

pq.write_table(
    data,
    data_file_path,
    encryption_properties=encryption_properties,
    write_statistics=["id", "timestamp"],
)

# Locate the single data file the dataset writer produced, then dump the
# first row group's per-column statistics from both outputs for comparison.
ds_file_path = next(p for p in dataset_path.iterdir() if p.suffix == ".parquet")

for file_path in (ds_file_path, data_file_path):
    print(f"\nReading {file_path}")
    with pq.ParquetFile(
            file_path,
            decryption_properties=decryption_properties) as f:
        rg = f.metadata.row_group(0)
        for i in range(f.metadata.num_columns):
            col = rg.column(i)
            statistics = "None" if col.statistics is None else str(col.statistics)
            print(f"Column '{col.path_in_schema}' statistics:\n  {statistics}")

This outputs:

Reading tmp_dataset/02779199901f4b51a6eb343881ba1a0f-0.parquet
Column 'id' statistics:
  None
Column 'timestamp' statistics:
  None
Column 'x' statistics:
  None

Reading tmp_data.parquet
Column 'id' statistics:
  <pyarrow._parquet.Statistics object at 0x7f11cc9e5580>
  has_min_max: True
  min: 0
  max: 99
  null_count: 0
  distinct_count: None
  num_values: 100
  physical_type: INT32
  logical_type: None
  converted_type (legacy): NONE
Column 'timestamp' statistics:
  <pyarrow._parquet.Statistics object at 0x7f11cc9e5670>
  has_min_max: True
  min: 2025-09-16 00:00:00
  max: 2025-09-16 00:01:39
  null_count: 0
  distinct_count: None
  num_values: 100
  physical_type: INT64
  logical_type: Timestamp(isAdjustedToUTC=false, timeUnit=milliseconds, is_from_converted_type=false, force_set_converted_type=false)
  converted_type (legacy): NONE
Column 'x' statistics:
  None

This is caused by adding the file encryption properties to the writer properties here:

auto writer_properties =
    parquet::WriterProperties::Builder(*parquet_options->writer_properties)
        .encryption(std::move(file_encryption_prop))
        ->build();

Constructing a `WriterProperties::Builder` from an existing `WriterProperties` is lossy: it only copies the default (file-level) settings and drops any settings that were overridden for specific columns — such as the per-column `write_statistics` selection — so rebuilding the properties to attach encryption discards them.

Component(s)

Parquet, C++

Metadata

Metadata

Assignees

Type

No type

Projects

No projects

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions