apache · alamb · Jul 17, 2025 · Jul 16, 2025 · Jul 16, 2025 · Jul 16, 2025
diff --git a/parquet-variant/src/variant/metadata.rs b/parquet-variant/src/variant/metadata.rs
@@ -234,44 +234,58 @@ impl<'m> VariantMetadata<'m> {
                 self.header.first_offset_byte() as _..self.first_value_byte as _,
             )?;
 
-            let offsets =
-                map_bytes_to_offsets(offset_bytes, self.header.offset_size).collect::<Vec<_>>();
-
             // Verify the string values in the dictionary are UTF-8 encoded strings.
             let value_buffer =
                 string_from_slice(self.bytes, 0, self.first_value_byte as _..self.bytes.len())?;
 
+            let mut offsets_iter = map_bytes_to_offsets(offset_bytes, self.header.offset_size);
+            let mut current_offset = offsets_iter.next().unwrap_or(0);
+
             if self.header.is_sorted {
                 // Validate the dictionary values are unique and lexicographically sorted
                 //
                 // Since we use the offsets to access dictionary values, this also validates
                 // offsets are in-bounds and monotonically increasing
-                let are_dictionary_values_unique_and_sorted = (1..offsets.len())
-                    .map(|i| {
-                        let field_range = offsets[i - 1]..offsets[i];
-                        value_buffer.get(field_range)
-                    })
-                    .is_sorted_by(|a, b| match (a, b) {
-                        (Some(a), Some(b)) => a < b,
-                        _ => false,
-                    });
-
-                if !are_dictionary_values_unique_and_sorted {
-                    return Err(ArrowError::InvalidArgumentError(
-                        "dictionary values are not unique and ordered".to_string(),
-                    ));
+                let mut prev_value: Option<&str> = None;
+
+                for next_offset in offsets_iter {
+                    if next_offset <= current_offset {
+                        return Err(ArrowError::InvalidArgumentError(
+                            "offsets not monotonically increasing".to_string(),
+                        ));
+                    }
+
+                    let current_value =
+                        value_buffer
+                            .get(current_offset..next_offset)
+                            .ok_or_else(|| {
+                                ArrowError::InvalidArgumentError("offset out of bounds".to_string())
+                            })?;
+
+                    if let Some(prev_val) = prev_value {
+                        if current_value <= prev_val {
+                            return Err(ArrowError::InvalidArgumentError(
+                                "dictionary values are not unique and ordered".to_string(),
+                            ));
+                        }
+                    }
+
+                    prev_value = Some(current_value);
+                    current_offset = next_offset;
                 }
             } else {
                 // Validate offsets are in-bounds and monotonically increasing
                 //
                 // Since shallow validation ensures the first and last offsets are in bounds,
                 // we can also verify all offsets are in-bounds by checking if
                 // offsets are monotonically increasing
-                let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b);
-                if !are_offsets_monotonic {
-                    return Err(ArrowError::InvalidArgumentError(
-                        "offsets not monotonically increasing".to_string(),
-                    ));
+                for next_offset in offsets_iter {
+                    if next_offset <= current_offset {
+                        return Err(ArrowError::InvalidArgumentError(
+                            "offsets not monotonically increasing".to_string(),
+                        ));
+                    }
+                    current_offset = next_offset;
                 }
             }
 

diff --git a/parquet-variant/src/variant/object.rs b/parquet-variant/src/variant/object.rs
@@ -217,23 +217,31 @@ impl<'m, 'v> VariantObject<'m, 'v> {
                 self.header.field_ids_start_byte() as _..self.first_field_offset_byte as _,
             )?;
 
-            let field_ids = map_bytes_to_offsets(field_id_buffer, self.header.field_id_size)
-                .collect::<Vec<_>>();
-
+            let mut field_ids_iter =
+                map_bytes_to_offsets(field_id_buffer, self.header.field_id_size);
             // Validate all field ids exist in the metadata dictionary and the corresponding field names are lexicographically sorted
             if self.metadata.is_sorted() {
                 // Since the metadata dictionary has unique and sorted field names, we can also guarantee this object's field names
                 // are lexicographically sorted by their field id ordering
-                if !field_ids.is_sorted() {
-                    return Err(ArrowError::InvalidArgumentError(
-                        "field names not sorted".to_string(),
-                    ));
-                }
+                let dictionary_size = self.metadata.dictionary_size();
+
+                if let Some(mut current_id) = field_ids_iter.next() {
+                    for next_id in field_ids_iter {
+                        if current_id >= dictionary_size {
+                            return Err(ArrowError::InvalidArgumentError(
+                                "field id is not valid".to_string(),
+                            ));
+                        }
+
+                        if next_id <= current_id {
+                            return Err(ArrowError::InvalidArgumentError(
+                                "field names not sorted".to_string(),
+                            ));
+                        }
+                        current_id = next_id;
+                    }
 
-                // Since field ids are sorted, if the last field is smaller than the dictionary size,
-                // we also know all field ids are smaller than the dictionary size and in-bounds.
-                if let Some(&last_field_id) = field_ids.last() {
-                    if last_field_id >= self.metadata.dictionary_size() {
+                    if current_id >= dictionary_size {
                         return Err(ArrowError::InvalidArgumentError(
                             "field id is not valid".to_string(),
                         ));
@@ -244,16 +252,22 @@ impl<'m, 'v> VariantObject<'m, 'v> {
                 // to check lexicographical order
                 //
                 // Since we are probing the metadata dictionary by field id, this also verifies field ids are in-bounds
-                let are_field_names_sorted = field_ids
-                    .iter()
-                    .map(|&i| self.metadata.get(i))
-                    .collect::<Result<Vec<_>, _>>()?
-                    .is_sorted();
-
-                if !are_field_names_sorted {
-                    return Err(ArrowError::InvalidArgumentError(
-                        "field names not sorted".to_string(),
-                    ));
+                let mut current_field_name = match field_ids_iter.next() {
+                    Some(field_id) => Some(self.metadata.get(field_id)?),
+                    None => None,
+                };
+
+                for field_id in field_ids_iter {
+                    let next_field_name = self.metadata.get(field_id)?;
+
+                    if let Some(current_name) = current_field_name {
+                        if next_field_name <= current_name {
+                            return Err(ArrowError::InvalidArgumentError(
+                                "field names not sorted".to_string(),
+                            ));
+                        }
+                    }
+                    current_field_name = Some(next_field_name);
                 }
             }