Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 50 additions & 2 deletions parquet-variant-compute/benches/variant_kernels.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,11 @@
// specific language governing permissions and limitations
// under the License.

use arrow::array::{Array, ArrayRef, StringArray};
use arrow::array::{Array, ArrayRef, BinaryViewArray, StringArray, StructArray};
use arrow::util::test_util::seedable_rng;
use arrow_schema::{DataType, Field, FieldRef, Fields};
use criterion::{Criterion, criterion_group, criterion_main};
use parquet_variant::{Variant, VariantBuilder};
use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, Variant, VariantBuilder};
use parquet_variant_compute::{
GetOptions, VariantArray, VariantArrayBuilder, json_to_variant, variant_get,
};
Expand Down Expand Up @@ -98,9 +99,26 @@ pub fn variant_get_bench(c: &mut Criterion) {
});
}

pub fn variant_get_shredded_utf8_bench(c: &mut Criterion) {
let variant_array = create_shredded_utf8_variant_array(8192);
let input = ArrayRef::from(variant_array);

let field: FieldRef = Arc::new(Field::new("typed_value", DataType::Utf8, true));
let options = GetOptions {
path: vec![].into(),
as_type: Some(field),
cast_options: Default::default(),
};

c.bench_function("variant_get_shredded_utf8", |b| {
b.iter(|| variant_get(&input.clone(), options.clone()))
});
}

criterion_group!(
benches,
variant_get_bench,
variant_get_shredded_utf8_bench,
benchmark_batch_json_string_to_variant
);
criterion_main!(benches);
Expand All @@ -121,6 +139,36 @@ fn create_primitive_variant_array(size: usize) -> VariantArray {
variant_builder.build()
}

/// Creates a `VariantArray` where the values are already shredded as UTF8.
fn create_shredded_utf8_variant_array(size: usize) -> VariantArray {
let metadata = BinaryViewArray::from_iter_values(
std::iter::repeat(EMPTY_VARIANT_METADATA_BYTES).take(size),
);
let typed_value = StringArray::from_iter_values((0..size).map(|i| format!("value_{i}")));

let metadata_ref: ArrayRef = Arc::new(metadata);
let typed_value_ref: ArrayRef = Arc::new(typed_value);

let fields = Fields::from(vec![
Arc::new(Field::new(
"metadata",
metadata_ref.data_type().clone(),
false,
)),
Arc::new(Field::new(
"typed_value",
typed_value_ref.data_type().clone(),
true,
)),
]);

let struct_array = StructArray::new(fields, vec![metadata_ref, typed_value_ref], None);
let struct_array_ref: ArrayRef = Arc::new(struct_array);

VariantArray::try_new(struct_array_ref.as_ref())
.expect("created struct should be a valid shredded variant")
}

/// Return an iterator off JSON strings, each representing a person
/// with random first name, last name, and age.
///
Expand Down
68 changes: 67 additions & 1 deletion parquet-variant-compute/src/variant_get.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
// specific language governing permissions and limitations
// under the License.
use arrow::{
array::{self, Array, ArrayRef, BinaryViewArray, StructArray},
array::{self, Array, ArrayRef, BinaryViewArray, StructArray, make_array},
buffer::NullBuffer,
compute::CastOptions,
datatypes::Field,
error::Result,
Expand Down Expand Up @@ -109,6 +110,30 @@ pub(crate) fn follow_shredded_path_element<'a>(
}
}

/// Returns a cloned `ArrayRef` whose null mask is the union of the array's existing mask and
/// `parent_nulls`. If `parent_nulls` is `None` or contains no nulls, the original array is returned.
///
/// This necessary because the null of the shredded value is the union of parent null and current nulls.
fn clone_with_parent_nulls(
array: &ArrayRef,
parent_nulls: Option<&NullBuffer>,
) -> Result<ArrayRef> {
let Some(parent_nulls) = parent_nulls else {
return Ok(array.clone());
};
if parent_nulls.null_count() == 0 {
return Ok(array.clone());
}

let combined_nulls = NullBuffer::union(array.as_ref().nulls(), Some(parent_nulls));
let data = array
.to_data()
.into_builder()
.nulls(combined_nulls)
.build()?;
Ok(make_array(data))
}

/// Follows the given path as far as possible through shredded variant fields. If the path ends on a
/// shredded field, return it directly. Otherwise, use a row shredder to follow the rest of the path
/// and extract the requested value on a per-row basis.
Expand Down Expand Up @@ -208,6 +233,21 @@ fn shredded_get_path(
return Ok(ArrayRef::from(target));
};

// Try to return the typed value directly when we have a perfect shredding match.
if !matches!(as_field.data_type(), DataType::Struct(_)) {
if let Some(typed_value) = target.typed_value_field() {
let types_match = typed_value.data_type() == as_field.data_type();
Copy link
Contributor Author

@XiangpengHao XiangpengHao Nov 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is probably overly restrictive, Utf8, LargeUtf8, and Utf8View should be allowed. Maybe check if cast-able.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AFAIK, the current code is correct unless we're willing to inject a typecast.
Otherwise, the caller could end up with a different type than they requested.

That said, casting does seem reasonable -- it would certainly be faster than a variant builder, and the caller did request a specific type.

let value_not_present = target.value_field().is_none();
// this is a perfect shredding, where the value is entirely shredded out, so we can just return the typed value
// note that we MUST check value_not_present, because some of the `typed_value` might be null but data is present in the `value` column.
// an alternative is to count whether typed_value has any non-nulls, or check every row in `value` is null,
// but this is too complicated and might be slow.
if types_match && value_not_present {
return clone_with_parent_nulls(typed_value, target.nulls());
}
}
}

// Structs are special. Recurse into each field separately, hoping to follow the shredding even
// further, and build up the final struct from those individually shredded results.
if let DataType::Struct(fields) = as_field.data_type() {
Expand Down Expand Up @@ -604,6 +644,32 @@ mod test {
assert_eq!(&result, &expected)
}

// on a perfect shredding, we must still obey the parent nulls
#[test]
fn get_variant_shredded_utf8_respects_parent_nulls() {
let metadata =
BinaryViewArray::from_iter_values(std::iter::repeat_n(EMPTY_VARIANT_METADATA_BYTES, 2));
let typed_value = Arc::new(StringArray::from(vec![Some("foo"), Some("bar")]));
let parent_nulls = NullBuffer::from(vec![false, true]);

let variant_array = VariantArray::from_parts(
metadata,
None,
Some(typed_value.clone()),
Some(parent_nulls),
);
let input: ArrayRef = ArrayRef::from(variant_array);

let field = Field::new("result", DataType::Utf8, true);
let options = GetOptions::new().with_as_type(Some(FieldRef::from(field)));
let result = variant_get(&input, options).unwrap();
let strings = result.as_any().downcast_ref::<StringArray>().unwrap();

assert_eq!(strings.len(), 2);
assert!(strings.is_null(0));
assert_eq!(strings.value(1), "bar");
}

/// Shredding: extract a value as an Int32Array, unsafe cast (should error on "n/a")
#[test]
fn get_variant_shredded_int32_as_int32_unsafe_cast() {
Expand Down
Loading