Describe the bug, including details regarding any error messages, version, and platform.
When writing an encrypted Parquet dataset with statistics enabled only for specific fields, no fields have statistics written. The same settings work correctly when writing a single Parquet file directly, without the Dataset API.
Python repro:
import base64
import datetime
from pathlib import Path

import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import pyarrow.parquet.encryption as pe

dataset_path = Path("./tmp_dataset")
data_file_path = Path("./tmp_data.parquet")


class MockKmsClient(pe.KmsClient):
    def __init__(self, _kms_connection_configuration):
        super().__init__()

    def wrap_key(self, key_bytes, _master_key_identifier):
        return base64.b64encode(key_bytes)

    def unwrap_key(self, wrapped_key, _master_key_identifier):
        return base64.b64decode(wrapped_key)


crypto_factory = pe.CryptoFactory(lambda config: MockKmsClient(config))
config = pe.KmsConnectionConfig()
encryption_config = pe.EncryptionConfiguration(
    "kf",
    uniform_encryption=True,
    plaintext_footer=False)
decryption_config = pe.DecryptionConfiguration()
decryption_properties = crypto_factory.file_decryption_properties(config, decryption_config)

num_rows = 100
data = pa.Table.from_pydict({
    'id': pa.array(list(range(num_rows)), type=pa.int32()),
    'timestamp': pa.array([datetime.datetime(2025, 9, 16, 0, 0, 0) + datetime.timedelta(seconds=i) for i in range(num_rows)], type=pa.timestamp('ms')),
    'x': pa.array(list(range(num_rows)), type=pa.float32()),
})

ds_encryption_config = ds.ParquetEncryptionConfig(crypto_factory, config, encryption_config)
encryption_properties = crypto_factory.file_encryption_properties(config, encryption_config)

# Write with dataset
pq.write_to_dataset(
    data,
    dataset_path,
    compression=None,
    encryption_config=ds_encryption_config,
    write_statistics=["id", "timestamp"])

# Write Parquet file directly
pq.write_table(
    data,
    data_file_path,
    encryption_properties=encryption_properties,
    write_statistics=["id", "timestamp"])

# Read back stats
ds_file_path = next(f for f in dataset_path.iterdir() if f.suffix == ".parquet")
for file_path in [ds_file_path, data_file_path]:
    print(f"\nReading {file_path}")
    with pq.ParquetFile(
            file_path,
            decryption_properties=decryption_properties) as f:
        rg = f.metadata.row_group(0)
        for col_idx in range(f.metadata.num_columns):
            col = rg.column(col_idx)
            statistics = "None" if col.statistics is None else str(col.statistics)
            print(f"Column '{col.path_in_schema}' statistics:\n {statistics}")

This outputs:
Reading tmp_dataset/02779199901f4b51a6eb343881ba1a0f-0.parquet
Column 'id' statistics:
None
Column 'timestamp' statistics:
None
Column 'x' statistics:
None
Reading tmp_data.parquet
Column 'id' statistics:
<pyarrow._parquet.Statistics object at 0x7f11cc9e5580>
has_min_max: True
min: 0
max: 99
null_count: 0
distinct_count: None
num_values: 100
physical_type: INT32
logical_type: None
converted_type (legacy): NONE
Column 'timestamp' statistics:
<pyarrow._parquet.Statistics object at 0x7f11cc9e5670>
has_min_max: True
min: 2025-09-16 00:00:00
max: 2025-09-16 00:01:39
null_count: 0
distinct_count: None
num_values: 100
physical_type: INT64
logical_type: Timestamp(isAdjustedToUTC=false, timeUnit=milliseconds, is_from_converted_type=false, force_set_converted_type=false)
converted_type (legacy): NONE
Column 'x' statistics:
None
This is caused by adding the file encryption properties to the writer properties here:
arrow/cpp/src/arrow/dataset/file_parquet.cc, lines 736 to 739 (at 5a48044):

auto writer_properties =
    parquet::WriterProperties::Builder(*parquet_options->writer_properties)
        .encryption(std::move(file_encryption_prop))
        ->build();
The conversion from a WriterProperties to a WriterProperties::Builder is lossy: any settings overridden for specific columns (such as the per-column write_statistics selection above) are dropped, so the rebuilt properties only keep the column defaults.
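For illustration only, here is a minimal standalone C++ sketch of that round trip using the public parquet::WriterProperties API (the main() wrapper and includes are mine, not Arrow code). It mirrors write_statistics=["id", "timestamp"], which PyArrow roughly translates into disable_statistics() plus per-column enable_statistics() calls, and checks the per-column setting before and after rebuilding through a Builder:

#include <iostream>
#include <memory>

#include "parquet/properties.h"
#include "parquet/schema.h"

int main() {
  // Per-column override: statistics off by default, on for "id" and "timestamp".
  std::shared_ptr<parquet::WriterProperties> original =
      parquet::WriterProperties::Builder()
          .disable_statistics()
          ->enable_statistics("id")
          ->enable_statistics("timestamp")
          ->build();

  // Round trip through a Builder, as file_parquet.cc does when attaching the
  // file encryption properties.
  std::shared_ptr<parquet::WriterProperties> rebuilt =
      parquet::WriterProperties::Builder(*original).build();

  auto id = parquet::schema::ColumnPath::FromDotString("id");
  std::cout << original->statistics_enabled(id) << "\n";  // 1: stats enabled for "id"
  std::cout << rebuilt->statistics_enabled(id) << "\n";   // 0 per the reported behavior:
                                                          // the per-column override is lost
}

If the Builder(const WriterProperties&) constructor carried the per-column overrides over, the dataset path would write the same statistics as pq.write_table does.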
Component(s)
Parquet, C++