-
Notifications
You must be signed in to change notification settings - Fork 1.1k
GH-7686: [Parquet] Fix int96 min/max stats #7687
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 16 commits
Commits
Show all changes
28 commits
Select commit
Hold shift + click to select a range
3b371ac
Add int96 stats test
fd51210
fix conversions
72687dd
asserts
46d98e8
printing change
93a780c
Create int96 from time since epoch
a5b9eb7
Add ways to set int96 from timestamps
796243b
simplify
34c928d
Add correct ordering for int96
eb8a77c
Add tests for int96ordering
ef07163
rename tests
6abd75f
Simplify test
6108b14
Refactor test
5155556
simplify test
abc743a
Merge remote-tracking branch 'origin/main' into add-tests-for-int-96-…
4b0b94d
Improve testcase
325d335
shuffle data before writing to file
6036398
change int96 internal format
a4b0049
Revert "change int96 internal format"
4bca51b
Make the changes minimal
f184429
Save instructions in the comparison
081d20f
Fix test
2a3e802
Add comments explaining the reasoning behind the new ordering
3c1d4b0
Remove stale comment
ecf5802
Merge remote-tracking branch 'origin/main' into pr/rahulketch/7687
etseidl 84b05ce
clippy
etseidl c008cad
lint
etseidl cd63d0a
add license statement
etseidl 810b25c
Update parquet/src/data_type.rs
alamb File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,120 @@ | ||
| use parquet::basic::Type; | ||
| use parquet::data_type::{Int96, Int96Type}; | ||
| use parquet::file::properties::{EnabledStatistics, WriterProperties}; | ||
| use parquet::file::reader::{FileReader, SerializedFileReader}; | ||
| use parquet::file::statistics::Statistics; | ||
| use parquet::file::writer::SerializedFileWriter; | ||
| use parquet::schema::parser::parse_message_type; | ||
| use rand::seq::SliceRandom; | ||
| use std::fs::File; | ||
| use std::sync::Arc; | ||
| use tempfile::Builder; | ||
| use chrono::{DateTime, NaiveDateTime, Utc}; | ||
|
|
||
| fn datetime_to_int96(dt: &str) -> Int96 { | ||
| let naive = NaiveDateTime::parse_from_str(dt, "%Y-%m-%d %H:%M:%S%.f").unwrap(); | ||
| let datetime: DateTime<Utc> = DateTime::from_naive_utc_and_offset(naive, Utc); | ||
| let nanos = datetime.timestamp_nanos_opt().unwrap(); | ||
| let mut int96 = Int96::new(); | ||
| int96.set_data_from_nanos(nanos); | ||
| int96 | ||
| } | ||
|
|
||
| fn verify_ordering(data: Vec<Int96>) { | ||
| // Create a temporary file | ||
| let tmp = Builder::new() | ||
| .prefix("test_int96_stats") | ||
| .tempfile() | ||
| .unwrap(); | ||
| let file_path = tmp.path().to_owned(); | ||
|
|
||
| // Create schema with INT96 field | ||
| let message_type = " | ||
| message test { | ||
| REQUIRED INT96 timestamp; | ||
| } | ||
| "; | ||
| let schema = parse_message_type(message_type).unwrap(); | ||
|
|
||
| // Configure writer properties to enable statistics | ||
| let props = WriterProperties::builder() | ||
| .set_statistics_enabled(EnabledStatistics::Page) | ||
| .build(); | ||
|
|
||
| let expected_min = data[0]; | ||
| let expected_max = data[data.len() - 1]; | ||
|
|
||
| { | ||
| let file = File::create(&file_path).unwrap(); | ||
| let mut writer = SerializedFileWriter::new(file, schema.into(), Arc::new(props)).unwrap(); | ||
| let mut row_group = writer.next_row_group().unwrap(); | ||
| let mut col_writer = row_group.next_column().unwrap().unwrap(); | ||
|
|
||
| { | ||
| let writer = col_writer.typed::<Int96Type>(); | ||
| let mut shuffled_data = data.clone(); | ||
| shuffled_data.shuffle(&mut rand::rng()); | ||
| writer.write_batch(&shuffled_data, None, None).unwrap(); | ||
| } | ||
| col_writer.close().unwrap(); | ||
| row_group.close().unwrap(); | ||
| writer.close().unwrap(); | ||
| } | ||
|
|
||
| let file = File::open(&file_path).unwrap(); | ||
| let reader = SerializedFileReader::new(file).unwrap(); | ||
| let metadata = reader.metadata(); | ||
| let row_group = metadata.row_group(0); | ||
| let column = row_group.column(0); | ||
|
|
||
| let stats = column.statistics().unwrap(); | ||
| assert_eq!(stats.physical_type(), Type::INT96); | ||
|
|
||
| if let Statistics::Int96(stats) = stats { | ||
| let min = stats.min_opt().unwrap(); | ||
| let max = stats.max_opt().unwrap(); | ||
|
|
||
| assert_eq!(*min, expected_min, "Min value should be {} but was {}", expected_min, min); | ||
| assert_eq!(*max, expected_max, "Max value should be {} but was {}", expected_max, max); | ||
| assert_eq!(stats.null_count_opt(), Some(0)); | ||
| } else { | ||
| panic!("Expected Int96 statistics"); | ||
| } | ||
| } | ||
|
|
||
/// Timestamps spread over several years (including the 2020 and 2024 leap
/// days), listed in ascending order as `verify_ordering` expects.
#[test]
fn test_multiple_dates() {
    let timestamps = [
        "2020-01-01 00:00:00.000",
        "2020-02-29 23:59:59.000",
        "2020-12-31 23:59:59.000",
        "2021-01-01 00:00:00.000",
        "2023-06-15 12:30:45.000",
        "2024-02-29 15:45:30.000",
        "2024-12-25 07:00:00.000",
        "2025-01-01 00:00:00.000",
        "2025-07-04 20:00:00.000",
        "2025-12-31 23:59:59.000",
    ];
    let data = timestamps.iter().copied().map(datetime_to_int96).collect();
    verify_ordering(data);
}
|
|
||
/// Timestamps that share one calendar date and differ only in time of day,
/// listed in ascending order as `verify_ordering` expects.
#[test]
fn test_same_day_different_time() {
    let timestamps = [
        "2020-01-01 00:01:00.000",
        "2020-01-01 00:02:00.000",
        "2020-01-01 00:03:00.000",
    ];
    let data = timestamps.iter().copied().map(datetime_to_int96).collect();
    verify_ordering(data);
}
|
|
||
/// Each successive timestamp has a later date but an earlier time of day, so
/// the chronological order and the time-of-day order run in opposite
/// directions. The list is ascending chronologically, as `verify_ordering`
/// expects.
#[test]
fn test_increasing_day_decreasing_time() {
    let timestamps = [
        "2020-01-01 12:00:00.000",
        "2020-02-01 11:00:00.000",
        "2020-03-01 10:00:00.000",
    ];
    let data = timestamps.iter().copied().map(datetime_to_int96).collect();
    verify_ordering(data);
}
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.