-
Notifications
You must be signed in to change notification settings - Fork 974
[Variant] Impl PartialEq
for VariantObject
#7943
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,11 +16,12 @@ | |
// under the License. | ||
use crate::decoder::{VariantBasicType, VariantPrimitiveType}; | ||
use crate::{ | ||
ShortString, Variant, VariantDecimal16, VariantDecimal4, VariantDecimal8, VariantMetadata, | ||
ShortString, Variant, VariantDecimal16, VariantDecimal4, VariantDecimal8, VariantList, | ||
VariantMetadata, VariantObject, | ||
}; | ||
use arrow_schema::ArrowError; | ||
use indexmap::{IndexMap, IndexSet}; | ||
use std::collections::{HashMap, HashSet}; | ||
use std::collections::HashSet; | ||
|
||
const BASIC_TYPE_BITS: u8 = 2; | ||
const UNIX_EPOCH_DATE: chrono::NaiveDate = chrono::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); | ||
|
@@ -216,6 +217,57 @@ impl ValueBuffer { | |
self.append_slice(value.as_bytes()); | ||
} | ||
|
||
fn append_object(&mut self, metadata_builder: &mut MetadataBuilder, obj: VariantObject) { | ||
let mut object_builder = self.new_object(metadata_builder); | ||
|
||
for (field_name, value) in obj.iter() { | ||
object_builder.insert(field_name, value); | ||
} | ||
|
||
object_builder.finish().unwrap(); | ||
} | ||
|
||
fn try_append_object( | ||
&mut self, | ||
metadata_builder: &mut MetadataBuilder, | ||
obj: VariantObject, | ||
) -> Result<(), ArrowError> { | ||
let mut object_builder = self.new_object(metadata_builder); | ||
|
||
for res in obj.iter_try() { | ||
let (field_name, value) = res?; | ||
object_builder.try_insert(field_name, value)?; | ||
} | ||
|
||
object_builder.finish()?; | ||
|
||
Ok(()) | ||
} | ||
|
||
fn append_list(&mut self, metadata_builder: &mut MetadataBuilder, list: VariantList) { | ||
let mut list_builder = self.new_list(metadata_builder); | ||
for value in list.iter() { | ||
list_builder.append_value(value); | ||
} | ||
list_builder.finish(); | ||
} | ||
|
||
fn try_append_list( | ||
&mut self, | ||
metadata_builder: &mut MetadataBuilder, | ||
list: VariantList, | ||
) -> Result<(), ArrowError> { | ||
let mut list_builder = self.new_list(metadata_builder); | ||
for res in list.iter_try() { | ||
let value = res?; | ||
list_builder.try_append_value(value)?; | ||
} | ||
|
||
list_builder.finish(); | ||
|
||
Ok(()) | ||
} | ||
|
||
fn offset(&self) -> usize { | ||
self.0.len() | ||
} | ||
|
@@ -252,9 +304,31 @@ impl ValueBuffer { | |
variant: Variant<'m, 'd>, | ||
metadata_builder: &mut MetadataBuilder, | ||
) { | ||
self.try_append_variant(variant, metadata_builder).unwrap(); | ||
match variant { | ||
Variant::Null => self.append_null(), | ||
Variant::BooleanTrue => self.append_bool(true), | ||
Variant::BooleanFalse => self.append_bool(false), | ||
Variant::Int8(v) => self.append_int8(v), | ||
Variant::Int16(v) => self.append_int16(v), | ||
Variant::Int32(v) => self.append_int32(v), | ||
Variant::Int64(v) => self.append_int64(v), | ||
Variant::Date(v) => self.append_date(v), | ||
Variant::TimestampMicros(v) => self.append_timestamp_micros(v), | ||
Variant::TimestampNtzMicros(v) => self.append_timestamp_ntz_micros(v), | ||
Variant::Decimal4(decimal4) => self.append_decimal4(decimal4), | ||
Variant::Decimal8(decimal8) => self.append_decimal8(decimal8), | ||
Variant::Decimal16(decimal16) => self.append_decimal16(decimal16), | ||
Variant::Float(v) => self.append_float(v), | ||
Variant::Double(v) => self.append_double(v), | ||
Variant::Binary(v) => self.append_binary(v), | ||
Variant::String(s) => self.append_string(s), | ||
Variant::ShortString(s) => self.append_short_string(s), | ||
Variant::Object(obj) => self.append_object(metadata_builder, obj), | ||
Variant::List(list) => self.append_list(metadata_builder, list), | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is unfortunate to have to copy the same match statement as There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see -- I think it is the unfortunate consequence of having the distinction between the "validated" and "non validated" variant APIs Maybe we can add a few comments explaining the practical difference (performance vs extra error checking) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I also left a suggestion above about how to potentially unify the code paths |
||
} | ||
|
||
/// Appends a variant to the buffer | ||
fn try_append_variant<'m, 'd>( | ||
&mut self, | ||
variant: Variant<'m, 'd>, | ||
|
@@ -279,35 +353,8 @@ impl ValueBuffer { | |
Variant::Binary(v) => self.append_binary(v), | ||
Variant::String(s) => self.append_string(s), | ||
Variant::ShortString(s) => self.append_short_string(s), | ||
Variant::Object(obj) => { | ||
let metadata_field_names = metadata_builder | ||
.field_names | ||
.iter() | ||
.enumerate() | ||
.map(|(i, f)| (f.clone(), i)) | ||
.collect::<HashMap<_, _>>(); | ||
|
||
let mut object_builder = self.new_object(metadata_builder); | ||
|
||
// first add all object fields that exist in metadata builder | ||
let mut object_fields = obj.iter().collect::<Vec<_>>(); | ||
|
||
object_fields | ||
.sort_by_key(|(field_name, _)| metadata_field_names.get(field_name as &str)); | ||
|
||
for (field_name, value) in object_fields { | ||
object_builder.insert(field_name, value); | ||
} | ||
|
||
object_builder.finish()?; | ||
} | ||
Variant::List(list) => { | ||
let mut list_builder = self.new_list(metadata_builder); | ||
for value in list.iter() { | ||
list_builder.append_value(value); | ||
} | ||
list_builder.finish(); | ||
} | ||
Variant::Object(obj) => self.try_append_object(metadata_builder, obj)?, | ||
Variant::List(list) => self.try_append_list(metadata_builder, list)?, | ||
} | ||
|
||
Ok(()) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,6 +15,8 @@ | |
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
use std::collections::HashSet; | ||
|
||
use crate::decoder::{map_bytes_to_offsets, OffsetSizeBytes}; | ||
use crate::utils::{first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice}; | ||
|
||
|
@@ -125,7 +127,7 @@ impl VariantMetadataHeader { | |
/// | ||
/// [`Variant`]: crate::Variant | ||
/// [Variant Spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#metadata-encoding | ||
#[derive(Debug, Clone, PartialEq)] | ||
#[derive(Debug, Clone)] | ||
pub struct VariantMetadata<'m> { | ||
pub(crate) bytes: &'m [u8], | ||
header: VariantMetadataHeader, | ||
|
@@ -332,6 +334,30 @@ impl<'m> VariantMetadata<'m> { | |
} | ||
} | ||
|
||
// According to the spec, metadata dictionaries are not required to be in a specific order, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need a special is_equal check for VariantMetadata? It seems like now since VariantObject doesn't include a check for the metadata being equal we could avoid a special equality 🤔 |
||
// to enable flexibility when constructing Variant values | ||
// | ||
// Instead of comparing the raw bytes of 2 variant metadata instances, this implementation | ||
// checks whether the dictionary entries are equal -- regardless of their sorting order | ||
impl<'m> PartialEq for VariantMetadata<'m> { | ||
fn eq(&self, other: &Self) -> bool { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Given the cost of constructing hash maps etc, is it worth adding the following quick-check in case the dictionaries are both sorted? if self.is_ordered() && other.is_ordered() {
if self.len() != other.len() {
return false;
}
let self_value_bytes = ... all string value bytes ...;
let other_value_bytes = ... all string value bytes ...;
return self_value_bytes == other_value_bytes;
} Before trusting a single long string compare tho, we would need to convince ourselves that there's no way two dictionaries with different offsets can have identical value bytes. Otherwise, we'd have to loop over the two sets of strings manually. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Very nice. I will think about this and push up a PR. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
let is_equal = self.is_empty() == other.is_empty() | ||
&& self.is_fully_validated() == other.is_fully_validated() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems like the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would expect the following to pass for all variants and metadata let variant1 = Variant::new(metadata, buffers);
let variant2 = Variant::new(metadata, buffers).with_full_validation();
assert_eq!(variant1, variant2) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
&& self.first_value_byte == other.first_value_byte | ||
&& self.validated == other.validated; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You could break early here if is_equal is false and not check the fields |
||
|
||
let other_field_names: HashSet<&'m str> = HashSet::from_iter(other.iter()); | ||
|
||
for field_name in self.iter() { | ||
if !other_field_names.contains(field_name) { | ||
return false; | ||
Comment on lines
+351
to
+353
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems one-sided? Don't we need to prove the symmetric difference is empty? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. You are right. I think @friendlymatthew has a fix for it in |
||
} | ||
} | ||
|
||
is_equal | ||
} | ||
} | ||
|
||
/// Retrieves the ith dictionary entry, panicking if the index is out of bounds. Accessing | ||
/// [unvalidated] input could also panic if the underlying bytes are invalid. | ||
/// | ||
|
@@ -346,6 +372,7 @@ impl std::ops::Index<usize> for VariantMetadata<'_> { | |
|
||
#[cfg(test)] | ||
mod tests { | ||
|
||
use super::*; | ||
|
||
/// `"cat"`, `"dog"` – valid metadata | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You might be able to consolidate the apis if you checked the
is_validated
dynamicallyFor example, have a single try_append_object and internally try
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Except
append_object
is infallible... so even if we "consolidate" by havingtry_append_object
fall back to its infallible partner, both similar-but-different for loops will still exist.