diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index ba8ffc2e92c3..0d66c118a4f2 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -28,7 +28,7 @@ pub use crate::compression::{BrotliLevel, GzipLevel, ZstdLevel}; use crate::file::metadata::HeapSize; use crate::parquet_thrift::{ ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, - WriteThrift, WriteThriftField, + WriteThrift, WriteThriftField, validate_list_type, }; use crate::{thrift_enum, thrift_struct, thrift_union_all_empty, write_thrift_field}; @@ -771,6 +771,8 @@ impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for EncodingMask { // This reads a Thrift `list` and turns it into a bitmask let list_ident = prot.read_list_begin()?; + // check for enum (encoded as I32) + validate_list_type(ElementType::I32, &list_ident)?; for _ in 0..list_ident.size { let val = Encoding::read_thrift(prot)?; mask |= 1 << val as i32; diff --git a/parquet/src/file/metadata/thrift/mod.rs b/parquet/src/file/metadata/thrift/mod.rs index ad8f6312c215..b138de319368 100644 --- a/parquet/src/file/metadata/thrift/mod.rs +++ b/parquet/src/file/metadata/thrift/mod.rs @@ -51,7 +51,7 @@ use crate::{ parquet_thrift::{ ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, ThriftSliceInputProtocol, WriteThrift, WriteThriftField, - read_thrift_vec, + read_thrift_vec, validate_list_type, }, schema::types::{ ColumnDescriptor, SchemaDescriptor, TypePtr, num_nodes, parquet_schema_from_array, @@ -387,6 +387,9 @@ fn read_encoding_stats_as_mask<'a>( // read the vector of stats, setting mask bits for data pages let mut mask = 0i32; let list_ident = prot.read_list_begin()?; + // check for PageEncodingStats struct + validate_list_type(ElementType::Struct, &list_ident)?; + for _ in 0..list_ident.size { let pes = PageEncodingStats::read_thrift(prot)?; match pes.page_type { @@ -652,6 +655,8 @@ fn read_row_group( match field_ident.id { 1 => { let list_ident = prot.read_list_begin()?; + // check for list of struct + validate_list_type(ElementType::Struct, &list_ident)?; if schema_descr.num_columns() != list_ident.size as usize { return Err(general_err!( "Column count mismatch. Schema has {} columns while Row Group has {}", @@ -801,6 +806,8 @@ pub(crate) fn parquet_metadata_from_bytes( } let schema_descr = schema_descr.as_ref().unwrap(); let list_ident = prot.read_list_begin()?; + // check for list of struct + validate_list_type(ElementType::Struct, &list_ident)?; let mut rg_vec = Vec::with_capacity(list_ident.size as usize); // Read row groups and handle ordinal assignment diff --git a/parquet/src/file/page_index/offset_index.rs b/parquet/src/file/page_index/offset_index.rs index b1e30dd4590c..06d21efb68c8 100644 --- a/parquet/src/file/page_index/offset_index.rs +++ b/parquet/src/file/page_index/offset_index.rs @@ -23,7 +23,7 @@ use std::io::Write; use crate::parquet_thrift::{ ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol, ThriftCompactOutputProtocol, - WriteThrift, WriteThriftField, read_thrift_vec, + WriteThrift, WriteThriftField, read_thrift_vec, validate_list_type, }; use crate::{ errors::{ParquetError, Result}, @@ -90,6 +90,7 @@ impl OffsetIndexMetaData { // we have to do this manually because we want to use the fast PageLocation decoder let list_ident = prot.read_list_begin()?; + validate_list_type(ElementType::Struct, &list_ident)?; let mut page_locations = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { page_locations.push(read_page_location(prot)?); diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 254ccb779a4a..4b71e3c14e6d 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -2117,7 +2117,7 @@ mod tests { let ret = SerializedFileReader::new(Bytes::copy_from_slice(&data)); assert_eq!( ret.err().unwrap().to_string(), - "Parquet error: Received empty union from remote ColumnOrder" + "Parquet error: Expected list element type of Struct but got List" ); } diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs index a13baa0924e0..5d9c09b37718 100644 --- a/parquet/src/parquet_thrift.rs +++ b/parquet/src/parquet_thrift.rs @@ -702,9 +702,10 @@ impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for &'a [u8] { pub(crate) fn read_thrift_vec<'a, T, R>(prot: &mut R) -> Result> where R: ThriftCompactInputProtocol<'a>, - T: ReadThrift<'a, R>, + T: ReadThrift<'a, R> + WriteThrift, { let list_ident = prot.read_list_begin()?; + validate_list_type(T::ELEMENT_TYPE, &list_ident)?; let mut res = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { let val = T::read_thrift(prot)?; @@ -713,6 +714,17 @@ where Ok(res) } +pub(crate) fn validate_list_type(expected: ElementType, got: &ListIdentifier) -> Result<()> { + if got.element_type != expected { + return Err(general_err!( + "Expected list element type of {:?} but got {:?}", + expected, + got.element_type + )); + } + Ok(()) +} + ///////////////////////// // thrift compact output @@ -1138,4 +1150,20 @@ pub(crate) mod tests { let result = prot.read_list_begin(); assert!(result.is_err(), "expected error, got {result:?}"); } + + #[test] + fn test_read_list_wrong_type() { + // list header: 4 elements of `Boolean` + let data = [0x42, 0x01]; + let mut prot = ThriftSliceInputProtocol::new(&data); + // try to read as list + let result = read_thrift_vec::(&mut prot); + println!("{result:?}"); + assert!( + result + .unwrap_err() + .to_string() + .contains("Expected list element type of I32 but got Bool") + ); + } } diff --git a/parquet/tests/arrow_reader/bad_data.rs b/parquet/tests/arrow_reader/bad_data.rs index d9b0d89e2c5f..56fddf505dac 100644 --- a/parquet/tests/arrow_reader/bad_data.rs +++ b/parquet/tests/arrow_reader/bad_data.rs @@ -98,7 +98,7 @@ fn test_arrow_gh_41317() { let err = read_file("ARROW-GH-41317.parquet").unwrap_err(); assert_eq!( err.to_string(), - "External: Parquet argument error: Parquet error: StructArrayReader out of sync in read_records, expected 5 read, got 2" + "Parquet error: Expected list element type of I32 but got I16" ); }